deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

Files changed (52)
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1154 -35
  4. deepresearch_flow/paper/db_ops.py +124 -19
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
  15. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  16. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  17. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  18. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  19. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
  20. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  21. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  22. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  23. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  24. deepresearch_flow/paper/snapshot/api.py +941 -0
  25. deepresearch_flow/paper/snapshot/builder.py +965 -0
  26. deepresearch_flow/paper/snapshot/identity.py +239 -0
  27. deepresearch_flow/paper/snapshot/schema.py +245 -0
  28. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  29. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  30. deepresearch_flow/paper/snapshot/text.py +154 -0
  31. deepresearch_flow/paper/template_registry.py +1 -0
  32. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  33. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  35. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  36. deepresearch_flow/paper/web/app.py +10 -3
  37. deepresearch_flow/recognize/cli.py +380 -103
  38. deepresearch_flow/recognize/markdown.py +31 -7
  39. deepresearch_flow/recognize/math.py +47 -12
  40. deepresearch_flow/recognize/mermaid.py +320 -10
  41. deepresearch_flow/recognize/organize.py +29 -7
  42. deepresearch_flow/translator/cli.py +71 -20
  43. deepresearch_flow/translator/engine.py +220 -81
  44. deepresearch_flow/translator/prompts.py +19 -2
  45. deepresearch_flow/translator/protector.py +15 -3
  46. deepresearch_flow-0.6.1.dist-info/METADATA +849 -0
  47. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/RECORD +51 -43
  48. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/WHEEL +1 -1
  49. deepresearch_flow-0.5.1.dist-info/METADATA +0 -440
  50. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/entry_points.txt +0 -0
  51. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/licenses/LICENSE +0 -0
  52. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/snapshot/identity.py (new file)
@@ -0,0 +1,239 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ import difflib
+ import hashlib
+ import json
+ import re
+ import unicodedata
+ from typing import Any
+ from urllib.parse import unquote
+
+
+ _DOI_PREFIX_RE = re.compile(r"^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)", re.IGNORECASE)
+ _ARXIV_PREFIX_RE = re.compile(r"^(?:https?://arxiv\.org/abs/|arxiv:\s*)", re.IGNORECASE)
+ _ARXIV_VERSION_RE = re.compile(r"v\d+$", re.IGNORECASE)
+ _YEAR_RE = re.compile(r"(19|20)\d{2}")
+
+
+ def canonicalize_doi(raw: str | None) -> str | None:
+     if not raw:
+         return None
+     value = unquote(str(raw).strip())
+     if not value:
+         return None
+     value = _DOI_PREFIX_RE.sub("", value).strip().lower()
+     value = value.rstrip()
+     value = value.rstrip(".,;)")
+     return value or None
+
+
+ def canonicalize_arxiv(raw: str | None) -> str | None:
+     if not raw:
+         return None
+     value = str(raw).strip()
+     if not value:
+         return None
+     value = _ARXIV_PREFIX_RE.sub("", value).strip().lower()
+     value = _ARXIV_VERSION_RE.sub("", value)
+     return value or None
+
+
+ def _collapse_ws(text: str) -> str:
+     return re.sub(r"\s+", " ", text).strip()
+
+
+ def _strip_punct_symbols(text: str) -> str:
+     out: list[str] = []
+     for ch in text:
+         cat = unicodedata.category(ch)
+         if cat and cat[0] in {"P", "S"}:
+             out.append(" ")
+         else:
+             out.append(ch)
+     return "".join(out)
+
+
+ def normalize_meta_title(raw: str | None) -> str:
+     if not raw:
+         return ""
+     text = unicodedata.normalize("NFKC", str(raw)).lower()
+     text = _strip_punct_symbols(text)
+     return _collapse_ws(text)
+
+
+ def normalize_meta_name(raw: str | None) -> str:
+     if not raw:
+         return ""
+     text = unicodedata.normalize("NFKC", str(raw)).lower()
+     return _collapse_ws(text)
+
+
+ def normalize_meta_venue(raw: str | None) -> str:
+     if not raw:
+         return ""
+     text = unicodedata.normalize("NFKC", str(raw)).lower()
+     return _collapse_ws(text)
+
+
+ def extract_year(value: str | None) -> str | None:
+     if not value:
+         return None
+     match = _YEAR_RE.search(str(value))
+     return match.group(0) if match else None
+
+
+ def normalized_authors(raw: Any) -> list[str]:
+     if raw is None:
+         return []
+     if isinstance(raw, list):
+         items = [normalize_meta_name(item) for item in raw]
+         return sorted([item for item in items if item])
+     if isinstance(raw, str):
+         parts = [normalize_meta_name(part) for part in raw.split(",")]
+         return sorted([part for part in parts if part])
+     value = normalize_meta_name(str(raw))
+     return [value] if value else []
+
+
+ def meta_fingerprint_json(*, title: str, authors: list[str], year: str, venue: str) -> str:
+     payload = {"title": title, "authors": authors, "year": year, "venue": venue}
+     return json.dumps(payload, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
+
+
+ def meta_hash(*, title: str, authors: list[str], year: str, venue: str) -> str:
+     payload = meta_fingerprint_json(title=title, authors=authors, year=year, venue=venue)
+     return hashlib.sha256(payload.encode("utf-8", errors="ignore")).hexdigest()
+
+
+ def paper_id_for_key(paper_key: str) -> str:
+     digest = hashlib.sha256(f"v1|{paper_key}".encode("utf-8", errors="ignore")).hexdigest()
+     return digest[:32]
+
+
+ @dataclass(frozen=True)
+ class PaperKeyCandidate:
+     key_type: str  # doi|arxiv|bib|meta
+     paper_key: str
+     meta_fingerprint: str | None = None
+
+     @property
+     def strength(self) -> int:
+         order = {"doi": 4, "arxiv": 3, "bib": 2, "meta": 1}
+         return order.get(self.key_type, 0)
+
+
+ def _bib_fields_lower(paper: dict[str, Any]) -> dict[str, str]:
+     bib = paper.get("bibtex")
+     if not isinstance(bib, dict):
+         return {}
+     fields = bib.get("fields")
+     if not isinstance(fields, dict):
+         return {}
+     out: dict[str, str] = {}
+     for key, value in fields.items():
+         if value is None:
+             continue
+         out[str(key).lower()] = str(value)
+     return out
+
+
+ def _extract_doi(paper: dict[str, Any]) -> str | None:
+     if isinstance(paper.get("doi"), str):
+         return paper.get("doi")
+     if isinstance(paper.get("paper_doi"), str):
+         return paper.get("paper_doi")
+     fields = _bib_fields_lower(paper)
+     return fields.get("doi")
+
+
+ def _extract_arxiv(paper: dict[str, Any]) -> str | None:
+     fields = _bib_fields_lower(paper)
+     for key in ("arxiv", "arxivid", "arxiv_id", "arxiv-id"):
+         if key in fields:
+             return fields[key]
+     archive_prefix = (fields.get("archiveprefix") or fields.get("archive_prefix") or "").strip().lower()
+     if archive_prefix == "arxiv" and fields.get("eprint"):
+         return fields.get("eprint")
+     if isinstance(paper.get("arxiv"), str):
+         return paper.get("arxiv")
+     if isinstance(paper.get("arxiv_id"), str):
+         return paper.get("arxiv_id")
+     return None
+
+
+ def _extract_bib_key(paper: dict[str, Any]) -> str | None:
+     bib = paper.get("bibtex")
+     if not isinstance(bib, dict):
+         return None
+     key = bib.get("key")
+     if isinstance(key, str) and key.strip():
+         return key.strip()
+     return None
+
+
+ def build_paper_key_candidates(paper: dict[str, Any]) -> list[PaperKeyCandidate]:
+     candidates: list[PaperKeyCandidate] = []
+
+     doi = canonicalize_doi(_extract_doi(paper))
+     if doi:
+         candidates.append(PaperKeyCandidate(key_type="doi", paper_key=f"doi:{doi}"))
+
+     arxiv = canonicalize_arxiv(_extract_arxiv(paper))
+     if arxiv:
+         candidates.append(PaperKeyCandidate(key_type="arxiv", paper_key=f"arxiv:{arxiv}"))
+
+     bib_key = _extract_bib_key(paper)
+     if bib_key:
+         candidates.append(PaperKeyCandidate(key_type="bib", paper_key=f"bib:{bib_key}"))
+
+     title = normalize_meta_title(str(paper.get("paper_title") or ""))
+     authors = normalized_authors(paper.get("paper_authors"))
+     year = (
+         extract_year(str(_bib_fields_lower(paper).get("year") or ""))
+         or extract_year(str(paper.get("publication_date") or ""))
+         or "unknown"
+     )
+     venue_raw = (
+         _bib_fields_lower(paper).get("journal")
+         or _bib_fields_lower(paper).get("booktitle")
+         or str(paper.get("publication_venue") or "")
+     )
+     venue = normalize_meta_venue(venue_raw)
+     fingerprint = meta_fingerprint_json(title=title, authors=authors, year=year, venue=venue)
+     candidates.append(
+         PaperKeyCandidate(
+             key_type="meta",
+             paper_key=f"meta:{meta_hash(title=title, authors=authors, year=year, venue=venue)}",
+             meta_fingerprint=fingerprint,
+         )
+     )
+
+     return candidates
+
+
+ def choose_preferred_key(candidates: list[PaperKeyCandidate]) -> PaperKeyCandidate:
+     if not candidates:
+         raise ValueError("At least one candidate key is required")
+     return max(candidates, key=lambda item: item.strength)
+
+
+ def meta_fingerprint_divergent(
+     previous_fingerprint: str | None,
+     current_fingerprint: str | None,
+     *,
+     min_title_similarity: float,
+     min_author_jaccard: float,
+ ) -> bool:
+     if not previous_fingerprint or not current_fingerprint:
+         return False
+     try:
+         prev = json.loads(previous_fingerprint)
+         cur = json.loads(current_fingerprint)
+     except Exception:
+         return True
+     prev_title = str(prev.get("title") or "")
+     cur_title = str(cur.get("title") or "")
+     title_similarity = difflib.SequenceMatcher(a=prev_title, b=cur_title).ratio()
+
+     prev_authors = {str(item) for item in (prev.get("authors") or []) if str(item)}
+     cur_authors = {str(item) for item in (cur.get("authors") or []) if str(item)}
+     union = prev_authors | cur_authors
+     jaccard = (len(prev_authors & cur_authors) / len(union)) if union else 1.0
+
+     return title_similarity < min_title_similarity and jaccard < min_author_jaccard
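For orientation, a minimal usage sketch of the new identity helpers follows (not part of the packaged code; the example paper dict and its values are hypothetical, shaped after the fields the module reads: paper_title, paper_authors, publication_date, and bibtex):

# Sketch only: exercise the public helpers added in identity.py.
from deepresearch_flow.paper.snapshot.identity import (
    build_paper_key_candidates,
    choose_preferred_key,
    paper_id_for_key,
)

paper = {
    "paper_title": "Deep Learning for X",
    "paper_authors": ["Alice A.", "Bob B."],
    "publication_date": "2023-05-01",
    "bibtex": {"key": "alice2023deep", "fields": {"doi": "10.1000/XYZ", "year": "2023"}},
}

candidates = build_paper_key_candidates(paper)
preferred = choose_preferred_key(candidates)   # DOI outranks arxiv, bib, and meta keys
print(preferred.paper_key)                     # doi:10.1000/xyz
print(paper_id_for_key(preferred.paper_key))   # stable 32-hex-char paper_id

With this input the DOI candidate wins over the bib and meta candidates, so the snapshot would key the paper by its canonical, lowercased DOI.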
deepresearch_flow/paper/snapshot/schema.py (new file)
@@ -0,0 +1,245 @@
+ from __future__ import annotations
+
+ import sqlite3
+
+
+ def init_snapshot_db(conn: sqlite3.Connection) -> None:
+     conn.execute("PRAGMA foreign_keys=ON;")
+     conn.execute("PRAGMA journal_mode=WAL;")
+
+     conn.executescript(
+         """
+         CREATE TABLE IF NOT EXISTS snapshot_meta (
+             key TEXT PRIMARY KEY,
+             value TEXT NOT NULL
+         );
+
+         CREATE TABLE IF NOT EXISTS paper (
+             paper_id TEXT PRIMARY KEY,
+             paper_key TEXT NOT NULL,
+             paper_key_type TEXT NOT NULL,
+             title TEXT NOT NULL,
+             year TEXT NOT NULL,
+             month TEXT NOT NULL,
+             publication_date TEXT NOT NULL,
+             venue TEXT NOT NULL,
+             preferred_summary_template TEXT NOT NULL,
+             summary_preview TEXT NOT NULL,
+             paper_index INTEGER NOT NULL DEFAULT 0,
+             source_hash TEXT,
+             output_language TEXT,
+             provider TEXT,
+             model TEXT,
+             prompt_template TEXT,
+             extracted_at TEXT,
+             pdf_content_hash TEXT,
+             source_md_content_hash TEXT
+         );
+
+         CREATE TABLE IF NOT EXISTS paper_summary (
+             paper_id TEXT NOT NULL,
+             template_tag TEXT NOT NULL,
+             PRIMARY KEY (paper_id, template_tag),
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_paper_summary_template ON paper_summary(template_tag);
+
+         CREATE TABLE IF NOT EXISTS paper_translation (
+             paper_id TEXT NOT NULL,
+             lang TEXT NOT NULL,
+             md_content_hash TEXT NOT NULL,
+             PRIMARY KEY (paper_id, lang),
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE
+         );
+
+         CREATE TABLE IF NOT EXISTS paper_key_alias (
+             paper_key TEXT PRIMARY KEY,
+             paper_id TEXT NOT NULL,
+             paper_key_type TEXT NOT NULL,
+             meta_fingerprint TEXT,
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_paper_key_alias_paper_id ON paper_key_alias(paper_id);
+
+         CREATE TABLE IF NOT EXISTS author (
+             author_id INTEGER PRIMARY KEY,
+             value TEXT NOT NULL UNIQUE,
+             paper_count INTEGER NOT NULL DEFAULT 0
+         );
+         CREATE TABLE IF NOT EXISTS paper_author (
+             paper_id TEXT NOT NULL,
+             author_id INTEGER NOT NULL,
+             PRIMARY KEY (paper_id, author_id),
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE,
+             FOREIGN KEY (author_id) REFERENCES author(author_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_paper_author_author_id ON paper_author(author_id);
+
+         CREATE TABLE IF NOT EXISTS keyword (
+             keyword_id INTEGER PRIMARY KEY,
+             value TEXT NOT NULL UNIQUE,
+             paper_count INTEGER NOT NULL DEFAULT 0
+         );
+         CREATE TABLE IF NOT EXISTS paper_keyword (
+             paper_id TEXT NOT NULL,
+             keyword_id INTEGER NOT NULL,
+             PRIMARY KEY (paper_id, keyword_id),
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE,
+             FOREIGN KEY (keyword_id) REFERENCES keyword(keyword_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_paper_keyword_keyword_id ON paper_keyword(keyword_id);
+
+         CREATE TABLE IF NOT EXISTS institution (
+             institution_id INTEGER PRIMARY KEY,
+             value TEXT NOT NULL UNIQUE,
+             paper_count INTEGER NOT NULL DEFAULT 0
+         );
+         CREATE TABLE IF NOT EXISTS paper_institution (
+             paper_id TEXT NOT NULL,
+             institution_id INTEGER NOT NULL,
+             PRIMARY KEY (paper_id, institution_id),
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE,
+             FOREIGN KEY (institution_id) REFERENCES institution(institution_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_paper_institution_institution_id ON paper_institution(institution_id);
+
+         CREATE TABLE IF NOT EXISTS tag (
+             tag_id INTEGER PRIMARY KEY,
+             value TEXT NOT NULL UNIQUE,
+             paper_count INTEGER NOT NULL DEFAULT 0
+         );
+         CREATE TABLE IF NOT EXISTS paper_tag (
+             paper_id TEXT NOT NULL,
+             tag_id INTEGER NOT NULL,
+             PRIMARY KEY (paper_id, tag_id),
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE,
+             FOREIGN KEY (tag_id) REFERENCES tag(tag_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_paper_tag_tag_id ON paper_tag(tag_id);
+
+         CREATE TABLE IF NOT EXISTS venue (
+             venue_id INTEGER PRIMARY KEY,
+             value TEXT NOT NULL UNIQUE,
+             paper_count INTEGER NOT NULL DEFAULT 0
+         );
+         CREATE TABLE IF NOT EXISTS paper_venue (
+             paper_id TEXT NOT NULL,
+             venue_id INTEGER NOT NULL,
+             PRIMARY KEY (paper_id, venue_id),
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE,
+             FOREIGN KEY (venue_id) REFERENCES venue(venue_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_paper_venue_venue_id ON paper_venue(venue_id);
+
+         CREATE TABLE IF NOT EXISTS facet_node (
+             node_id INTEGER PRIMARY KEY,
+             facet_type TEXT NOT NULL,
+             value TEXT NOT NULL,
+             paper_count INTEGER NOT NULL DEFAULT 0,
+             UNIQUE(facet_type, value)
+         );
+         CREATE INDEX IF NOT EXISTS idx_facet_node_type ON facet_node(facet_type);
+         CREATE INDEX IF NOT EXISTS idx_facet_node_value ON facet_node(value);
+
+         CREATE TABLE IF NOT EXISTS paper_facet (
+             paper_id TEXT NOT NULL,
+             node_id INTEGER NOT NULL,
+             PRIMARY KEY (paper_id, node_id),
+             FOREIGN KEY (paper_id) REFERENCES paper(paper_id) ON DELETE CASCADE,
+             FOREIGN KEY (node_id) REFERENCES facet_node(node_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_paper_facet_node_id ON paper_facet(node_id);
+
+         CREATE TABLE IF NOT EXISTS facet_edge (
+             node_id_a INTEGER NOT NULL,
+             node_id_b INTEGER NOT NULL,
+             paper_count INTEGER NOT NULL DEFAULT 0,
+             PRIMARY KEY (node_id_a, node_id_b),
+             FOREIGN KEY (node_id_a) REFERENCES facet_node(node_id) ON DELETE CASCADE,
+             FOREIGN KEY (node_id_b) REFERENCES facet_node(node_id) ON DELETE CASCADE
+         );
+         CREATE INDEX IF NOT EXISTS idx_facet_edge_a ON facet_edge(node_id_a);
+         CREATE INDEX IF NOT EXISTS idx_facet_edge_b ON facet_edge(node_id_b);
+
+         CREATE TABLE IF NOT EXISTS year_count (
+             year TEXT PRIMARY KEY,
+             paper_count INTEGER NOT NULL
+         );
+
+         CREATE TABLE IF NOT EXISTS month_count (
+             month TEXT PRIMARY KEY,
+             paper_count INTEGER NOT NULL
+         );
+
+         CREATE VIRTUAL TABLE IF NOT EXISTS paper_fts USING fts5(
+             paper_id UNINDEXED,
+             title,
+             summary,
+             source,
+             translated,
+             metadata,
+             tokenize='unicode61'
+         );
+
+         CREATE VIRTUAL TABLE IF NOT EXISTS paper_fts_trigram USING fts5(
+             paper_id UNINDEXED,
+             title,
+             venue,
+             tokenize='trigram'
+         );
+         """
+     )
+
+
+ def recompute_facet_counts(conn: sqlite3.Connection) -> None:
+     conn.execute(
+         "UPDATE author SET paper_count = (SELECT COUNT(*) FROM paper_author WHERE author_id = author.author_id);"
+     )
+     conn.execute(
+         "UPDATE keyword SET paper_count = (SELECT COUNT(*) FROM paper_keyword WHERE keyword_id = keyword.keyword_id);"
+     )
+     conn.execute(
+         "UPDATE institution SET paper_count = (SELECT COUNT(*) FROM paper_institution WHERE institution_id = institution.institution_id);"
+     )
+     conn.execute(
+         "UPDATE tag SET paper_count = (SELECT COUNT(*) FROM paper_tag WHERE tag_id = tag.tag_id);"
+     )
+     conn.execute(
+         "UPDATE venue SET paper_count = (SELECT COUNT(*) FROM paper_venue WHERE venue_id = venue.venue_id);"
+     )
+     conn.execute(
+         "UPDATE facet_node SET paper_count = (SELECT COUNT(*) FROM paper_facet WHERE node_id = facet_node.node_id);"
+     )
+
+     conn.execute("DELETE FROM year_count;")
+     conn.execute(
+         """
+         INSERT INTO year_count(year, paper_count)
+         SELECT year, COUNT(*) AS paper_count
+         FROM paper
+         GROUP BY year
+         """
+     )
+
+     conn.execute("DELETE FROM month_count;")
+     conn.execute(
+         """
+         INSERT INTO month_count(month, paper_count)
+         SELECT month, COUNT(*) AS paper_count
+         FROM paper
+         GROUP BY month
+         """
+     )
+
+
+ def recompute_paper_index(conn: sqlite3.Connection) -> None:
+     conn.execute(
+         """
+         WITH ordered AS (
+             SELECT paper_id, ROW_NUMBER() OVER (ORDER BY paper_id ASC) AS idx
+             FROM paper
+         )
+         UPDATE paper
+         SET paper_index = (SELECT idx FROM ordered WHERE ordered.paper_id = paper.paper_id);
+         """
+     )
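A minimal sketch of how these schema helpers can be driven (illustrative only; the real population logic lives in snapshot/builder.py, and this assumes an SQLite build with FTS5 and the trigram tokenizer available, since init_snapshot_db creates both virtual tables; the example row values are hypothetical):

# Sketch only: create an in-memory snapshot DB, insert one paper row,
# and refresh the derived counts using only functions from schema.py.
import sqlite3
from deepresearch_flow.paper.snapshot.schema import (
    init_snapshot_db,
    recompute_facet_counts,
    recompute_paper_index,
)

conn = sqlite3.connect(":memory:")
init_snapshot_db(conn)
conn.execute(
    "INSERT INTO paper (paper_id, paper_key, paper_key_type, title, year, month, "
    "publication_date, venue, preferred_summary_template, summary_preview) "
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
    ("abc123", "doi:10.1000/xyz", "doi", "Deep Learning for X",
     "2023", "2023-05", "2023-05-01", "Some Venue", "deep_read", "preview..."),
)
recompute_facet_counts(conn)   # refreshes per-facet counts plus year_count/month_count
recompute_paper_index(conn)    # assigns a 1-based paper_index ordered by paper_id
conn.commit()
print(conn.execute("SELECT year, paper_count FROM year_count").fetchall())  # [('2023', 1)]
conn.close()

Both recompute_* helpers rebuild their targets from scratch, so re-running them after any batch of inserts keeps the derived counts and paper_index consistent.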
deepresearch_flow/paper/snapshot/tests/__init__.py (new file)
@@ -0,0 +1,2 @@
+ """Unit tests for snapshot build + API helpers."""
+
deepresearch_flow/paper/snapshot/tests/test_identity.py (new file)
@@ -0,0 +1,123 @@
+ from __future__ import annotations
+
+ import unittest
+
+ from deepresearch_flow.paper.snapshot.identity import (
+     canonicalize_arxiv,
+     canonicalize_doi,
+     meta_fingerprint_divergent,
+     paper_id_for_key,
+ )
+ from deepresearch_flow.paper.snapshot.text import (
+     insert_cjk_spaces,
+     markdown_to_plain_text,
+     merge_adjacent_markers,
+     remove_cjk_spaces,
+     rewrite_search_query,
+ )
+
+
+ class TestIdentity(unittest.TestCase):
+     def test_canonicalize_doi_prefix_decode_and_case(self) -> None:
+         self.assertEqual(
+             canonicalize_doi("https://doi.org/10.1000%2FXYZ."),
+             "10.1000/xyz",
+         )
+
+     def test_canonicalize_arxiv_strips_version(self) -> None:
+         self.assertEqual(
+             canonicalize_arxiv("https://arxiv.org/abs/2301.00001v3"),
+             "2301.00001",
+         )
+
+     def test_paper_id_is_stable(self) -> None:
+         key = "doi:10.1000/xyz"
+         self.assertEqual(paper_id_for_key(key), paper_id_for_key(key))
+
+     def test_meta_fingerprint_divergence_requires_both_signals(self) -> None:
+         prev = '{"authors":["a","b"],"title":"deep learning","venue":"x","year":"2020"}'
+         cur = '{"authors":["c"],"title":"completely different","venue":"y","year":"2020"}'
+         self.assertTrue(
+             meta_fingerprint_divergent(
+                 prev,
+                 cur,
+                 min_title_similarity=0.8,
+                 min_author_jaccard=0.5,
+             )
+         )
+         cur_same_authors = '{"authors":["a","b"],"title":"completely different","venue":"y","year":"2020"}'
+         self.assertFalse(
+             meta_fingerprint_divergent(
+                 prev,
+                 cur_same_authors,
+                 min_title_similarity=0.8,
+                 min_author_jaccard=0.5,
+             )
+         )
+
+
+ class TestSearchText(unittest.TestCase):
+     def test_rewrite_search_query_cjk_phrase(self) -> None:
+         self.assertEqual(rewrite_search_query("深度学习"), "\"深 度 学 习\"")
+
+     def test_rewrite_search_query_mixed(self) -> None:
+         self.assertEqual(rewrite_search_query("深度学习 transformer"), "\"深 度 学 习\" transformer")
+
+     def test_rewrite_search_query_boolean(self) -> None:
+         self.assertEqual(rewrite_search_query("lidar AND localization"), "lidar AND localization")
+
+     def test_markdown_to_plain_text_strips_tables(self) -> None:
+         md = "hello\n\n| a | b |\n|---|---|\n| 1 | 2 |\n\nworld"
+         plain = markdown_to_plain_text(md)
+         self.assertIn("hello", plain)
+         self.assertIn("world", plain)
+         self.assertNotIn("1", plain)
+         self.assertNotIn("2", plain)
+
+     def test_cjk_spacing_roundtrip(self) -> None:
+         original = "深度学习"
+         spaced = insert_cjk_spaces(original)
+         self.assertEqual(spaced, "深 度 学 习")
+         self.assertEqual(remove_cjk_spaces(spaced), original)
+
+     def test_merge_adjacent_markers(self) -> None:
+         self.assertEqual(
+             merge_adjacent_markers("[[[深]]][[[度]]]"),
+             "[[[深度]]]",
+         )
+
+     def test_markdown_monthly_facets_exist_after_build(self) -> None:
+         # This is a lightweight schema sanity check (no full build here).
+         # The snapshot DB is expected to include month support via schema tables.
+         import sqlite3
+         from deepresearch_flow.paper.snapshot.schema import init_snapshot_db
+
+         conn = sqlite3.connect(":memory:")
+         try:
+             init_snapshot_db(conn)
+             tables = {row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'")}
+             self.assertIn("month_count", tables)
+             cols = {row[1] for row in conn.execute("PRAGMA table_info(paper)")}
+             self.assertIn("month", cols)
+             self.assertIn("publication_date", cols)
+         finally:
+             conn.close()
+
+     def test_extract_template_summaries(self) -> None:
+         from deepresearch_flow.paper.snapshot.builder import (
+             _extract_template_summaries,
+             _choose_preferred_summary_template,
+         )
+
+         paper = {
+             "templates": {
+                 "simple": {"summary": "s1"},
+                 "deep_read": {"summary": "s2"},
+             },
+             "prompt_template": "deep_read",
+         }
+         summaries = _extract_template_summaries(paper)
+         self.assertEqual(summaries["simple"], "s1")
+         self.assertEqual(summaries["deep_read"], "s2")
+         self.assertEqual(_choose_preferred_summary_template(paper, summaries), "deep_read")
+
+
+ if __name__ == "__main__":
+     unittest.main()
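To run only these new tests against an installed copy of the package, a plain unittest load by dotted name should be enough (a sketch, assuming deepresearch_flow is importable from the current environment):

# Sketch only: load and run the new snapshot identity/text tests programmatically.
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName(
    "deepresearch_flow.paper.snapshot.tests.test_identity"
)
unittest.TextTestRunner(verbosity=2).run(suite)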