deepresearch-flow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1041 -34
- deepresearch_flow/paper/db_ops.py +145 -26
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +40 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/paper/web/markdown.py +174 -8
- deepresearch_flow/paper/web/static/css/main.css +8 -1
- deepresearch_flow/paper/web/static/js/detail.js +46 -12
- deepresearch_flow/paper/web/templates/detail.html +9 -0
- deepresearch_flow/paper/web/text.py +8 -4
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +35 -16
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/fixers.py +15 -0
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/snapshot/builder.py
@@ -0,0 +1,965 @@
from __future__ import annotations

from dataclasses import dataclass
import base64
from datetime import datetime, timezone
import hashlib
import json
import mimetypes
from pathlib import Path
import re
import sqlite3
from typing import Any
import uuid

from deepresearch_flow.paper.db_ops import build_index, load_and_merge_papers

from deepresearch_flow.paper.render import load_default_template
from deepresearch_flow.paper.template_registry import load_render_template
from deepresearch_flow.paper.snapshot.identity import (
    PaperKeyCandidate,
    build_paper_key_candidates,
    choose_preferred_key,
    meta_fingerprint_divergent,
    paper_id_for_key,
)
from deepresearch_flow.paper.snapshot.schema import (
    init_snapshot_db,
    recompute_facet_counts,
    recompute_paper_index,
)
from deepresearch_flow.paper.snapshot.text import (
    insert_cjk_spaces,
    markdown_to_plain_text,
)
from deepresearch_flow.paper.utils import stable_hash

@dataclass(frozen=True)
class SnapshotBuildOptions:
    input_paths: list[Path]
    bibtex_path: Path | None
    md_roots: list[Path]
    md_translated_roots: list[Path]
    pdf_roots: list[Path]
    output_db: Path
    static_export_dir: Path
    previous_snapshot_db: Path | None
    min_meta_title_similarity: float = 0.6
    min_meta_author_jaccard: float = 0.4


@dataclass(frozen=True)
class PreviousAlias:
    paper_id: str
    paper_key_type: str
    meta_fingerprint: str | None

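# Content-addressing helpers: exported assets are named by SHA-256 digest,
# so identical content is written once and paths stay stable across rebuilds.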
def _hash_file(path: Path) -> str:
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _hash_text(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8", errors="ignore")).hexdigest()


def _hash_bytes(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()


def _safe_read_text(path: Path) -> str:
    try:
        return path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        return path.read_text(encoding="latin-1")

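# Patterns for the markdown image rewriter below: markdown image syntax,
# data: URLs, inline <img> tags, and their src attribute.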
_MD_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
_DATA_URL_PATTERN = re.compile(r"^data:([^;,]+)(;base64)?,(.*)$", re.DOTALL)
_IMG_TAG_PATTERN = re.compile(r"<img\b[^>]*>", re.IGNORECASE)
_SRC_ATTR_PATTERN = re.compile(r"\bsrc\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]+)", re.IGNORECASE | re.DOTALL)
_EXTENSION_OVERRIDES = {".jpe": ".jpg"}
_WHITESPACE_RE = re.compile(r"\s+")

def _split_link_target(raw_link: str) -> tuple[str, str, str, str]:
    link = raw_link.strip()
    if link.startswith("<"):
        end = link.find(">")
        if end != -1:
            return link[1:end], link[end + 1 :], "<", ">"
    parts = link.split()
    if not parts:
        return "", "", "", ""
    target = parts[0]
    suffix = link[len(target) :]
    return target, suffix, "", ""


def _normalize_facet_value(value: str | None) -> str:
    cleaned = str(value or "").strip().lower()
    cleaned = _WHITESPACE_RE.sub(" ", cleaned)
    return cleaned


def _extension_from_mime(mime: str) -> str | None:
    ext = mimetypes.guess_extension(mime, strict=False)
    if ext in _EXTENSION_OVERRIDES:
        return _EXTENSION_OVERRIDES[ext]
    return ext


def _parse_data_url(target: str) -> tuple[str, bytes] | None:
    match = _DATA_URL_PATTERN.match(target)
    if not match:
        return None
    mime = match.group(1) or ""
    if not mime.startswith("image/"):
        return None
    if match.group(2) != ";base64":
        return None
    payload = match.group(3) or ""
    try:
        return mime, base64.b64decode(payload)
    except Exception:
        return None


def _is_absolute_url(target: str) -> bool:
    lowered = target.lower()
    return lowered.startswith(("http://", "https://", "data:", "mailto:", "file:", "#")) or target.startswith("/")

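# Rewrites every image reference in a markdown document to a content-addressed
# path under images/: base64 data: URLs are decoded and materialized as files,
# relative paths are resolved against the source document and copied, and
# absolute URLs are left alone. Both ![...](...) syntax and inline <img> tags
# are handled; `written` tracks filenames already exported so shared images
# are written only once per build.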
def _rewrite_markdown_images(
    markdown: str,
    *,
    source_path: Path,
    images_output_dir: Path,
    written: set[str],
) -> tuple[str, list[dict[str, Any]]]:
    images: list[dict[str, Any]] = []

    def store_bytes(mime: str, data: bytes) -> str | None:
        ext = _extension_from_mime(mime)
        if not ext:
            return None
        digest = _hash_bytes(data)
        filename = f"{digest}{ext}"
        rel = f"images/{filename}"
        if filename not in written:
            images_output_dir.mkdir(parents=True, exist_ok=True)
            dest = images_output_dir / filename
            if not dest.exists():
                dest.write_bytes(data)
            written.add(filename)
        images.append({"path": rel, "sha256": digest, "ext": ext.lstrip("."), "status": "available"})
        return rel

    def store_local(target: str) -> str | None:
        cleaned = target.strip()
        while cleaned.startswith("../"):
            cleaned = cleaned[3:]
        cleaned = cleaned.replace("\\", "/")
        cleaned = cleaned.lstrip("./")
        cleaned = cleaned.lstrip("/")

        local_path = (source_path.parent / cleaned).resolve()
        if local_path.exists() and local_path.is_file():
            ext = local_path.suffix.lower()
            digest = _hash_file(local_path)
            filename = f"{digest}{ext}" if ext else digest
            rel = f"images/{filename}"
            if filename not in written:
                images_output_dir.mkdir(parents=True, exist_ok=True)
                dest = images_output_dir / filename
                if not dest.exists():
                    dest.write_bytes(local_path.read_bytes())
                written.add(filename)
            images.append({"path": rel, "sha256": digest, "ext": ext.lstrip("."), "status": "available"})
            return rel

        images.append({"path": cleaned, "sha256": None, "ext": Path(cleaned).suffix.lstrip("."), "status": "missing"})
        return None

    def replace(match) -> str:
        alt_text = match.group(1)
        raw_link = match.group(2)
        target, suffix, prefix, postfix = _split_link_target(raw_link)
        parsed = _parse_data_url(target)
        if parsed is not None:
            mime, data = parsed
            replacement = store_bytes(mime, data)
            if not replacement:
                return match.group(0)
            new_link = f"{prefix}{replacement}{postfix}{suffix}"
            return f"![{alt_text}]({new_link})"
        if not target or _is_absolute_url(target):
            return match.group(0)

        rel = store_local(target)
        if not rel:
            return match.group(0)
        new_link = f"{prefix}{rel}{postfix}{suffix}"
        return f"![{alt_text}]({new_link})"

    rewritten = _MD_IMAGE_RE.sub(replace, markdown)

    def replace_img(match: re.Match[str]) -> str:
        tag = match.group(0)
        src_match = _SRC_ATTR_PATTERN.search(tag)
        if not src_match:
            return tag
        raw_value = src_match.group(1)
        quote = ""
        if raw_value and raw_value[0] in {"\"", "'"}:
            quote = raw_value[0]
            value = raw_value[1:-1]
        else:
            value = raw_value
        parsed = _parse_data_url(value)
        if parsed is not None:
            mime, data = parsed
            replacement = store_bytes(mime, data)
        elif not _is_absolute_url(value):
            replacement = store_local(value)
        else:
            replacement = None
        if not replacement:
            return tag
        new_src = f"{quote}{replacement}{quote}" if quote else replacement
        return tag[: src_match.start(1)] + new_src + tag[src_match.end(1) :]

    rewritten = _IMG_TAG_PATTERN.sub(replace_img, rewritten)
    return rewritten, images

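# Filesystem-safe naming for exported folders and PDF filenames.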
def _sanitize_component(value: str) -> str:
    import re

    text = (value or "").strip()
    text = re.sub(r'[\\/:\*\?"<>\|]+', "_", text)
    text = re.sub(r"\s+", "_", text)
    text = re.sub(r"_+", "_", text)
    return text.strip("_")


def _normalize_display_venue(value: str) -> str:
    if not value:
        return ""
    text = re.sub(r"\{\{|\}\}", "", value)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def _truncate(value: str, max_len: int) -> str:
    if max_len <= 0:
        return value
    return value if len(value) <= max_len else value[:max_len].rstrip("_")


def _folder_names(first_author: str, year: str, title: str, paper_id: str) -> tuple[str, str]:
    base_author = _truncate(_sanitize_component(first_author) or "unknown", 32)
    base_year = _sanitize_component(year) or "unknown"
    base_title = _truncate(_sanitize_component(title) or "untitled", 80)
    full = _sanitize_component(f"{base_author}_{base_year}_{base_title}__{paper_id}")
    short = _sanitize_component(f"{base_author}_{base_year}__{paper_id}")
    if len(full) > 200:
        return short, _sanitize_component(paper_id)
    return full, short

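# Best-effort year/month extraction from free-form publication-date strings:
# numeric "YYYY-MM" forms are tried first, then English month names.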
_MONTH_WORDS = {
    "jan": "01",
    "january": "01",
    "feb": "02",
    "february": "02",
    "mar": "03",
    "march": "03",
    "apr": "04",
    "april": "04",
    "may": "05",
    "jun": "06",
    "june": "06",
    "jul": "07",
    "july": "07",
    "aug": "08",
    "august": "08",
    "sep": "09",
    "sept": "09",
    "september": "09",
    "oct": "10",
    "october": "10",
    "nov": "11",
    "november": "11",
    "dec": "12",
    "december": "12",
}


def _parse_year_month_from_text(text: str) -> tuple[str | None, str | None]:
    if not text:
        return None, None
    value = str(text).strip()
    if not value:
        return None, None
    year_match = re.search(r"(19|20)\d{2}", value)
    year = year_match.group(0) if year_match else None

    numeric_match = re.search(r"(19|20)\d{2}[-/](\d{1,2})", value)
    if numeric_match:
        m = int(numeric_match.group(2))
        month = f"{m:02d}" if 1 <= m <= 12 else None
        return year, month

    word_match = re.search(
        r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|"
        r"january|february|march|april|june|july|august|september|october|november|december)\b",
        value.lower(),
    )
    if word_match:
        return year, _MONTH_WORDS.get(word_match.group(0))

    return year, None


def _extract_publication_date(paper: dict[str, Any]) -> str:
    value = paper.get("publication_date") or paper.get("paper_publication_date") or ""
    return str(value).strip()

def _load_previous_aliases(db_path: Path) -> dict[str, PreviousAlias]:
    if not db_path:
        return {}
    if not db_path.exists():
        return {}
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT paper_key, paper_id, paper_key_type, meta_fingerprint FROM paper_key_alias"
        ).fetchall()
    except sqlite3.Error:
        return {}
    finally:
        conn.close()
    out: dict[str, PreviousAlias] = {}
    for paper_key, paper_id, paper_key_type, meta_fingerprint in rows:
        out[str(paper_key)] = PreviousAlias(
            paper_id=str(paper_id),
            paper_key_type=str(paper_key_type),
            meta_fingerprint=str(meta_fingerprint) if meta_fingerprint is not None else None,
        )
    return out

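# Chooses a stable paper_id. If any key candidate already appears in the
# previous snapshot, that paper_id is reused so identities survive rebuilds;
# conflicting alias mappings and diverging metadata fingerprints are returned
# as conflict strings rather than silently merged.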
def _pick_paper_id(
    candidates: list[PaperKeyCandidate],
    *,
    previous: dict[str, PreviousAlias],
    min_meta_title_similarity: float,
    min_meta_author_jaccard: float,
) -> tuple[str, PaperKeyCandidate, list[str]]:
    preferred = choose_preferred_key(candidates)
    matched: list[tuple[PaperKeyCandidate, PreviousAlias]] = []
    for cand in candidates:
        prev = previous.get(cand.paper_key)
        if prev:
            matched.append((cand, prev))
    if not matched:
        return paper_id_for_key(preferred.paper_key), preferred, []

    matched.sort(key=lambda pair: pair[0].strength, reverse=True)
    chosen_cand, chosen_prev = matched[0]
    conflicts = []
    for cand, prev in matched[1:]:
        if prev.paper_id != chosen_prev.paper_id:
            conflicts.append(
                f"key_conflict:{cand.paper_key} maps {prev.paper_id} vs {chosen_prev.paper_id}"
            )

    if chosen_cand.key_type == "meta":
        if meta_fingerprint_divergent(
            chosen_prev.meta_fingerprint,
            chosen_cand.meta_fingerprint,
            min_title_similarity=min_meta_title_similarity,
            min_author_jaccard=min_meta_author_jaccard,
        ):
            conflicts.append(f"meta_divergent:{chosen_cand.paper_key}")
            return paper_id_for_key(preferred.paper_key), preferred, conflicts

    return chosen_prev.paper_id, preferred, conflicts

def _extract_summary_markdown(paper: dict[str, Any]) -> str:
    if isinstance(paper.get("summary"), str) and paper.get("summary").strip():
        return str(paper.get("summary"))
    templates = paper.get("templates")
    if isinstance(templates, dict):
        for template_tag in ("simple", "simple_phi"):
            tmpl = templates.get(template_tag)
            if isinstance(tmpl, dict) and isinstance(tmpl.get("summary"), str) and tmpl.get("summary").strip():
                return str(tmpl.get("summary"))
    if isinstance(paper.get("abstract"), str) and paper.get("abstract").strip():
        return str(paper.get("abstract"))
    return ""


def _canonical_template_tag(value: str) -> str:
    tag = (value or "").strip().lower()
    tag = re.sub(r"[^a-z0-9_-]+", "_", tag)
    tag = re.sub(r"_+", "_", tag).strip("_")
    return tag or "default"

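# Collects one summary per prompt template. Explicit summary/abstract text
# wins; otherwise the payload is rendered through its markdown template, and
# a top-level summary is slotted under its declared tag.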
def _extract_template_summaries(paper: dict[str, Any]) -> dict[str, str]:
    summaries: dict[str, str] = {}
    templates = paper.get("templates")
    if isinstance(templates, dict):
        for tag, payload in templates.items():
            if not isinstance(tag, str) or not tag.strip():
                continue
            canonical_tag = _canonical_template_tag(tag)
            if not isinstance(payload, dict):
                continue
            for key in ("summary", "abstract"):
                value = payload.get(key)
                if isinstance(value, str) and value.strip():
                    summaries[canonical_tag] = value.strip()
                    break
            if canonical_tag not in summaries:
                summaries[canonical_tag] = _render_template_fallback_markdown(
                    paper, template_tag=canonical_tag, template_payload=payload
                )

    top_level = paper.get("summary")
    if isinstance(top_level, str) and top_level.strip():
        tag = _canonical_template_tag(
            str(paper.get("default_template") or paper.get("prompt_template") or paper.get("template_tag") or "default")
        )
        summaries.setdefault(tag, top_level.strip())

    if not summaries:
        fallback = _extract_summary_markdown(paper)
        if fallback:
            summaries["default"] = fallback

    return summaries

def _render_template_fallback_markdown(
    paper: dict[str, Any],
    *,
    template_tag: str,
    template_payload: dict[str, Any],
) -> str:
    context = dict(paper)
    context.update(template_payload)
    context.setdefault("output_language", paper.get("output_language") or "en")

    try:
        template = load_render_template(template_tag)
    except Exception:
        template = load_default_template()

    try:
        rendered = template.render(**context)
        return rendered.strip() if isinstance(rendered, str) else ""
    except Exception:
        payload = json.dumps(template_payload, ensure_ascii=False, indent=2)
        return f"```json\n{payload}\n```"

def _choose_preferred_summary_template(paper: dict[str, Any], summaries: dict[str, str]) -> str:
    if not summaries:
        return "default"
    preferred = _canonical_template_tag(str(paper.get("prompt_template") or paper.get("template_tag") or ""))
    if preferred and preferred in summaries:
        return preferred
    for key in ("simple", "simple_phi"):
        if key in summaries:
            return key
    return sorted(summaries.keys(), key=lambda item: item.lower())[0]


def _summary_preview(markdown: str, *, max_len: int = 320) -> str:
    if not markdown:
        return ""
    text = markdown_to_plain_text(markdown)
    if len(text) > max_len:
        return text[: max_len - 1].rstrip() + "…"
    return text


def _write_json(path: Path, data: Any) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

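# Entry point: rebuilds the snapshot SQLite database from scratch and exports
# the static asset tree (pdf/, md/, md_translate/, images/, summary/,
# manifest/) next to it, carrying paper identities over from
# previous_snapshot_db when one is supplied.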
def build_snapshot(opts: SnapshotBuildOptions) -> None:
    if opts.output_db.exists():
        opts.output_db.unlink()

    papers = load_and_merge_papers(
        opts.input_paths,
        opts.bibtex_path,
        cache_dir=None,
        use_cache=False,
        pdf_roots=opts.pdf_roots,
    )
    index = build_index(
        papers,
        md_roots=opts.md_roots,
        md_translated_roots=opts.md_translated_roots,
        pdf_roots=opts.pdf_roots,
    )

    previous_aliases = _load_previous_aliases(opts.previous_snapshot_db) if opts.previous_snapshot_db else {}
    snapshot_build_id = uuid.uuid4().hex

    opts.output_db.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(opts.output_db))
    conn.row_factory = sqlite3.Row
    try:
        init_snapshot_db(conn)
        conn.execute(
            "INSERT OR REPLACE INTO snapshot_meta(key, value) VALUES (?, ?)",
            ("snapshot_build_id", snapshot_build_id),
        )
        conn.execute(
            "INSERT OR REPLACE INTO snapshot_meta(key, value) VALUES (?, ?)",
            ("built_at", datetime.now(timezone.utc).isoformat()),
        )

        static_root = opts.static_export_dir
        (static_root / "pdf").mkdir(parents=True, exist_ok=True)
        (static_root / "md").mkdir(parents=True, exist_ok=True)
        (static_root / "md_translate").mkdir(parents=True, exist_ok=True)
        (static_root / "images").mkdir(parents=True, exist_ok=True)
        (static_root / "summary").mkdir(parents=True, exist_ok=True)
        (static_root / "manifest").mkdir(parents=True, exist_ok=True)

        written_images: set[str] = set()
        facet_node_cache: dict[tuple[str, str], int] = {}

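        # Upserts a (facet_type, value) node for the facet co-occurrence graph
        # and memoizes its rowid for the duration of the build.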
        def get_facet_node_id(facet_type: str, value: str | None) -> int | None:
            normalized = _normalize_facet_value(value)
            if not normalized or normalized == "unknown":
                return None
            key = (facet_type, normalized)
            cached = facet_node_cache.get(key)
            if cached:
                return cached
            conn.execute(
                "INSERT OR IGNORE INTO facet_node(facet_type, value) VALUES (?, ?)",
                (facet_type, normalized),
            )
            row = conn.execute(
                "SELECT node_id FROM facet_node WHERE facet_type = ? AND value = ?",
                (facet_type, normalized),
            ).fetchone()
            if not row:
                return None
            node_id = int(row["node_id"])
            facet_node_cache[key] = node_id
            return node_id

        with conn:
            for idx, paper in enumerate(index.papers):
                candidates = build_paper_key_candidates(paper)
                paper_id, preferred, conflicts = _pick_paper_id(
                    candidates,
                    previous=previous_aliases,
                    min_meta_title_similarity=opts.min_meta_title_similarity,
                    min_meta_author_jaccard=opts.min_meta_author_jaccard,
                )

                title = str(paper.get("paper_title") or "").strip()
                year = str(paper.get("_year") or "unknown").strip() or "unknown"
                year = year if year.isdigit() else year.lower()
                month = "unknown"
                pub_date = _extract_publication_date(paper)

                bib = paper.get("bibtex") if isinstance(paper.get("bibtex"), dict) else None
                bib_fields = (bib.get("fields") if isinstance(bib, dict) else None) or {}
                bib_year = str(bib_fields.get("year") or "").strip()
                bib_month = str(bib_fields.get("month") or "").strip()
                if bib_year and not year.isdigit():
                    parsed_year, _ = _parse_year_month_from_text(bib_year)
                    if parsed_year:
                        year = parsed_year
                if bib_month:
                    _, parsed_month = _parse_year_month_from_text(f"2000-{bib_month}")
                    if parsed_month:
                        month = parsed_month
                if month == "unknown" and pub_date:
                    _, parsed_month = _parse_year_month_from_text(pub_date)
                    if parsed_month:
                        month = parsed_month

                if not pub_date:
                    pub_date = year if year.isdigit() else ""
                venue = _normalize_display_venue(str(paper.get("_venue") or "").strip()) or "unknown"
                source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))

                authors = paper.get("_authors") or paper.get("paper_authors") or []
                if not isinstance(authors, list):
                    authors = [str(authors)]
                first_author = str(authors[0]) if authors else "unknown"

                pdf_hash = None
                source_md_hash = None
                translated_hashes: dict[str, str] = {}
                images: list[dict[str, Any]] = []

                md_path = index.md_path_by_hash.get(source_hash)
                if md_path:
                    raw_md = _safe_read_text(md_path)
                    rewritten_md, md_images = _rewrite_markdown_images(
                        raw_md,
                        source_path=md_path,
                        images_output_dir=static_root / "images",
                        written=written_images,
                    )
                    source_md_hash = _hash_text(rewritten_md)
                    md_target = static_root / "md" / f"{source_md_hash}.md"
                    if not md_target.exists():
                        md_target.write_text(rewritten_md, encoding="utf-8")
                    images.extend(md_images)

                translations = index.translated_md_by_hash.get(source_hash, {})
                for lang, t_path in translations.items():
                    raw_md = _safe_read_text(t_path)
                    rewritten_md, md_images = _rewrite_markdown_images(
                        raw_md,
                        source_path=t_path,
                        images_output_dir=static_root / "images",
                        written=written_images,
                    )
                    md_hash = _hash_text(rewritten_md)
                    lang_norm = str(lang).lower()
                    (static_root / "md_translate" / lang_norm).mkdir(parents=True, exist_ok=True)
                    md_target = static_root / "md_translate" / lang_norm / f"{md_hash}.md"
                    if not md_target.exists():
                        md_target.write_text(rewritten_md, encoding="utf-8")
                    translated_hashes[lang_norm] = md_hash
                    images.extend(md_images)

                pdf_path = index.pdf_path_by_hash.get(source_hash)
                if pdf_path:
                    pdf_hash = _hash_file(pdf_path)
                    pdf_target = static_root / "pdf" / f"{pdf_hash}.pdf"
                    if not pdf_target.exists():
                        pdf_target.write_bytes(pdf_path.read_bytes())

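                # Summary selection: the preferred template's markdown feeds the
                # exports; the "simple" template, when present, drives the short
                # plain-text preview.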
                template_summaries = _extract_template_summaries(paper)
                preferred_summary_template = _choose_preferred_summary_template(paper, template_summaries)
                preferred_summary_markdown = template_summaries.get(preferred_summary_template) or ""
                preview_source = template_summaries.get("simple") or preferred_summary_markdown
                summary_preview = _summary_preview(preview_source)

                base_summary_payload = {
                    "paper_id": paper_id,
                    "paper_title": title,
                    "paper_authors": authors,
                    "publication_date": paper.get("publication_date") or "",
                    "publication_venue": _normalize_display_venue(str(paper.get("publication_venue") or venue)),
                    "abstract": paper.get("abstract") or "",
                    "keywords": paper.get("keywords") or paper.get("_keywords") or [],
                    "paper_institutions": paper.get("paper_institutions") or [],
                    "output_language": paper.get("output_language") or "",
                    "provider": paper.get("provider") or "",
                    "model": paper.get("model") or "",
                    "prompt_template": paper.get("prompt_template") or paper.get("template_tag") or "",
                    "extracted_at": paper.get("extracted_at") or "",
                }

                # Back-compat + convenience: summary/<paper_id>.json always exists and points to the preferred template.
                _write_json(
                    static_root / "summary" / f"{paper_id}.json",
                    {
                        **base_summary_payload,
                        "template_tag": preferred_summary_template,
                        "summary": preferred_summary_markdown,
                        "available_templates": sorted(template_summaries.keys(), key=lambda item: item.lower()),
                    },
                )

                # Per-template summary exports.
                summary_dir = static_root / "summary" / paper_id
                for template_tag, summary_markdown in template_summaries.items():
                    _write_json(
                        summary_dir / f"{template_tag}.json",
                        {
                            **base_summary_payload,
                            "template_tag": template_tag,
                            "summary": summary_markdown,
                        },
                    )

                folder_name, folder_name_short = _folder_names(first_author, year, title, paper_id)
                pdf_filename = _sanitize_component(f"{first_author}_{year}_{title}") or f"{paper_id}"
                pdf_filename = _truncate(pdf_filename, 120) + ".pdf"

                manifest_payload = {
                    "paper_id": paper_id,
                    "folder_name": folder_name,
                    "folder_name_short": folder_name_short,
                    "assets": {
                        "pdf": {
                            "static_path": f"pdf/{pdf_hash}.pdf" if pdf_hash else None,
                            "zip_path": pdf_filename if pdf_hash else None,
                            "sha256": pdf_hash,
                        },
                        "source_md": {
                            "static_path": f"md/{source_md_hash}.md" if source_md_hash else None,
                            "zip_path": "source.md" if source_md_hash else None,
                            "sha256": source_md_hash,
                        },
                        "translated_md": [
                            {
                                "lang": lang,
                                "static_path": f"md_translate/{lang}/{md_hash}.md",
                                "zip_path": f"translated/{lang}.md",
                                "sha256": md_hash,
                            }
                            for lang, md_hash in sorted(translated_hashes.items())
                        ],
                        "summary": {
                            "static_path": f"summary/{paper_id}.json",
                            "zip_path": "summary.json",
                        },
                        "summary_templates": [
                            {
                                "template_tag": template_tag,
                                "static_path": f"summary/{paper_id}/{template_tag}.json",
                                "zip_path": f"summaries/{template_tag}.json",
                            }
                            for template_tag in sorted(template_summaries.keys(), key=lambda item: item.lower())
                        ],
                    },
                    "images": [
                        {
                            "static_path": item.get("path"),
                            "zip_path": item.get("path"),
                            "sha256": item.get("sha256"),
                            "ext": item.get("ext"),
                            "status": item.get("status"),
                        }
                        for item in images
                    ],
                    "conflicts": conflicts,
                }
                if images:
                    deduped: dict[str, dict[str, Any]] = {}
                    for item in manifest_payload["images"]:
                        key = str(item.get("static_path") or "")
                        if not key:
                            continue
                        if key not in deduped:
                            deduped[key] = item
                        elif deduped[key].get("status") != "available" and item.get("status") == "available":
                            deduped[key] = item
                    manifest_payload["images"] = list(deduped.values())
                _write_json(static_root / "manifest" / f"{paper_id}.json", manifest_payload)

                conn.execute(
                    """
                    INSERT OR REPLACE INTO paper(
                        paper_id, paper_key, paper_key_type, title, year, month, publication_date, venue, preferred_summary_template, summary_preview, paper_index,
                        source_hash, output_language, provider, model, prompt_template, extracted_at,
                        pdf_content_hash, source_md_content_hash
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
                        paper_id,
                        preferred.paper_key,
                        preferred.key_type,
                        title,
                        year,
                        month,
                        pub_date,
                        venue,
                        preferred_summary_template,
                        summary_preview,
                        0,
                        source_hash,
                        str(paper.get("output_language") or ""),
                        str(paper.get("provider") or ""),
                        str(paper.get("model") or ""),
                        str(paper.get("prompt_template") or paper.get("template_tag") or ""),
                        str(paper.get("extracted_at") or ""),
                        pdf_hash,
                        source_md_hash,
                    ),
                )

                for template_tag in sorted(template_summaries.keys(), key=lambda item: item.lower()):
                    conn.execute(
                        "INSERT OR IGNORE INTO paper_summary(paper_id, template_tag) VALUES (?, ?)",
                        (paper_id, template_tag),
                    )

                for lang, md_hash in translated_hashes.items():
                    conn.execute(
                        "INSERT OR REPLACE INTO paper_translation(paper_id, lang, md_content_hash) VALUES (?, ?, ?)",
                        (paper_id, lang, md_hash),
                    )

                for cand in candidates:
                    conn.execute(
                        """
                        INSERT OR REPLACE INTO paper_key_alias(paper_key, paper_id, paper_key_type, meta_fingerprint)
                        VALUES (?, ?, ?, ?)
                        """,
                        (
                            cand.paper_key,
                            paper_id,
                            cand.key_type,
                            cand.meta_fingerprint if cand.key_type == "meta" else None,
                        ),
                    )

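                # Facet bookkeeping: one lookup table plus a join table per facet
                # kind, then a co-occurrence edge between every pair of facet
                # nodes attached to this paper.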
                def upsert_facet(table: str, join_table: str, id_col: str, value: str) -> None:
                    normalized = _normalize_facet_value(value)
                    if not normalized or normalized == "unknown":
                        return
                    conn.execute(
                        f"INSERT OR IGNORE INTO {table}(value) VALUES (?)",
                        (normalized,),
                    )
                    row = conn.execute(
                        f"SELECT {id_col} FROM {table} WHERE value = ?",
                        (normalized,),
                    ).fetchone()
                    if not row:
                        return
                    facet_id = int(row[0])
                    conn.execute(
                        f"INSERT OR IGNORE INTO {join_table}(paper_id, {id_col}) VALUES (?, ?)",
                        (paper_id, facet_id),
                    )

                for author in authors:
                    upsert_facet("author", "paper_author", "author_id", str(author))
                keywords = paper.get("keywords") or paper.get("_keywords") or []
                if isinstance(keywords, list):
                    for kw in keywords:
                        upsert_facet("keyword", "paper_keyword", "keyword_id", str(kw))
                institutions = paper.get("paper_institutions") or []
                if isinstance(institutions, list):
                    for inst in institutions:
                        upsert_facet("institution", "paper_institution", "institution_id", str(inst))
                tags = paper.get("ai_generated_tags") or paper.get("_tags") or []
                if isinstance(tags, list):
                    for tag in tags:
                        upsert_facet("tag", "paper_tag", "tag_id", str(tag))
                upsert_facet("venue", "paper_venue", "venue_id", venue)

                graph_nodes: set[int] = set()

                def add_graph_nodes(facet_type: str, values: Any) -> None:
                    if values is None:
                        return
                    if isinstance(values, (list, tuple, set)):
                        iterable = values
                    else:
                        iterable = [values]
                    for item in iterable:
                        node_id = get_facet_node_id(facet_type, item)
                        if node_id is not None:
                            graph_nodes.add(node_id)

                add_graph_nodes("author", authors)
                if isinstance(keywords, list):
                    add_graph_nodes("keyword", keywords)
                if isinstance(institutions, list):
                    add_graph_nodes("institution", institutions)
                if isinstance(tags, list):
                    add_graph_nodes("tag", tags)
                add_graph_nodes("venue", venue)
                add_graph_nodes("year", year)
                add_graph_nodes("month", month)
                add_graph_nodes("summary_template", list(template_summaries.keys()))
                add_graph_nodes("output_language", paper.get("output_language"))
                add_graph_nodes("provider", paper.get("provider"))
                add_graph_nodes("model", paper.get("model"))
                add_graph_nodes("prompt_template", paper.get("prompt_template") or paper.get("template_tag"))
                add_graph_nodes("translation_lang", list(translated_hashes.keys()))

                for node_id in graph_nodes:
                    conn.execute(
                        "INSERT OR IGNORE INTO paper_facet(paper_id, node_id) VALUES (?, ?)",
                        (paper_id, node_id),
                    )

                node_list = sorted(graph_nodes)
                if len(node_list) > 1:
                    edge_rows = []
                    for idx, left in enumerate(node_list):
                        for right in node_list[idx + 1 :]:
                            edge_rows.append((left, right))
                    conn.executemany(
                        """
                        INSERT INTO facet_edge(node_id_a, node_id_b, paper_count)
                        VALUES (?, ?, 1)
                        ON CONFLICT(node_id_a, node_id_b)
                        DO UPDATE SET paper_count = paper_count + 1
                        """,
                        edge_rows,
                    )

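                # Full-text indexing: plain-text renditions of title, summaries,
                # source and translated markdown populate paper_fts, with
                # insert_cjk_spaces separating CJK runs for the tokenizer; the
                # paper_fts_trigram table presumably backs substring matching on
                # the lowercased title and venue.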
                summary_text = markdown_to_plain_text(" ".join(template_summaries.values()))
                source_text = ""
                translated_text = ""
                if source_md_hash and md_path:
                    source_text = markdown_to_plain_text(_safe_read_text(static_root / "md" / f"{source_md_hash}.md"))
                if translated_hashes:
                    translated_parts: list[str] = []
                    for lang, md_hash in translated_hashes.items():
                        translated_parts.append(
                            markdown_to_plain_text(
                                _safe_read_text(static_root / "md_translate" / lang / f"{md_hash}.md")
                            )
                        )
                    translated_text = " ".join(part for part in translated_parts if part)

                metadata_text = " ".join(
                    part
                    for part in [
                        title,
                        " ".join(str(a) for a in authors),
                        venue,
                        " ".join(str(k) for k in (keywords if isinstance(keywords, list) else [])),
                        " ".join(str(i) for i in (institutions if isinstance(institutions, list) else [])),
                        year,
                    ]
                    if part
                )

                conn.execute(
                    """
                    INSERT INTO paper_fts(paper_id, title, summary, source, translated, metadata)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    (
                        paper_id,
                        insert_cjk_spaces(title),
                        insert_cjk_spaces(summary_text),
                        insert_cjk_spaces(source_text),
                        insert_cjk_spaces(translated_text),
                        insert_cjk_spaces(metadata_text),
                    ),
                )
                conn.execute(
                    "INSERT INTO paper_fts_trigram(paper_id, title, venue) VALUES (?, ?, ?)",
                    (paper_id, title.lower(), venue.lower()),
                )

            recompute_paper_index(conn)
            recompute_facet_counts(conn)
    finally:
        conn.close()