deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. deepresearch_flow/paper/db.py +184 -0
  2. deepresearch_flow/paper/db_ops.py +1939 -0
  3. deepresearch_flow/paper/web/app.py +38 -3705
  4. deepresearch_flow/paper/web/constants.py +23 -0
  5. deepresearch_flow/paper/web/filters.py +255 -0
  6. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  7. deepresearch_flow/paper/web/handlers/api.py +217 -0
  8. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  9. deepresearch_flow/paper/web/markdown.py +549 -0
  10. deepresearch_flow/paper/web/static/css/main.css +857 -0
  11. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  12. deepresearch_flow/paper/web/static/js/index.js +266 -0
  13. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  14. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  15. deepresearch_flow/paper/web/templates/base.html +43 -0
  16. deepresearch_flow/paper/web/templates/detail.html +332 -0
  17. deepresearch_flow/paper/web/templates/index.html +114 -0
  18. deepresearch_flow/paper/web/templates/stats.html +29 -0
  19. deepresearch_flow/paper/web/templates.py +85 -0
  20. deepresearch_flow/paper/web/text.py +68 -0
  21. deepresearch_flow/recognize/cli.py +805 -26
  22. deepresearch_flow/recognize/katex_check.js +29 -0
  23. deepresearch_flow/recognize/math.py +719 -0
  24. deepresearch_flow/recognize/mermaid.py +690 -0
  25. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/METADATA +78 -4
  26. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/RECORD +30 -9
  27. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/WHEEL +0 -0
  28. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/entry_points.txt +0 -0
  29. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/licenses/LICENSE +0 -0
  30. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,23 @@
1
"""Constants for paper web UI."""

from pathlib import Path

# CDN URLs for external libraries rendered into the HTML templates.
CDN_ECHARTS = "https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"
CDN_MERMAID = "https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"
# KaTeX is pinned to one exact version so the CSS and both JS bundles stay in sync.
CDN_KATEX = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/katex.min.css"
CDN_KATEX_JS = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/katex.min.js"
CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/contrib/auto-render.min.js"

# Use legacy builds to ensure `pdfjsLib` is available as a global.
CDN_PDFJS = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.min.js"
CDN_PDFJS_WORKER = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.worker.min.js"

# PDF.js viewer configuration: the URL path served to the browser, plus
# on-disk asset directories resolved relative to this module's location.
PDFJS_VIEWER_PATH = "/pdfjs/web/viewer.html"
PDFJS_STATIC_DIR = Path(__file__).resolve().parent / "pdfjs"
STATIC_DIR = Path(__file__).resolve().parent / "static"
TEMPLATES_DIR = Path(__file__).resolve().parent / "templates"

# Metadata
REPO_URL = "https://github.com/nerdneilsfield/ai-deepresearch-flow"
@@ -0,0 +1,255 @@
1
+ """Filter, query, and statistics utilities for paper web UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from starlette.requests import Request
8
+
9
+ from deepresearch_flow.paper.db_ops import PaperIndex
10
+ from deepresearch_flow.paper.utils import stable_hash
11
+
12
+
13
# Accepted spellings for presence filters (matched case-insensitively).
BOOL_TRUE = {"1", "true", "yes", "with", "has"}
BOOL_FALSE = {"0", "false", "no", "without"}


def tokenize_filter_query(text: str) -> list[str]:
    """Split a filter query into tokens, honoring double-quoted phrases.

    Quote characters are dropped, each token is stripped of surrounding
    whitespace, and empty tokens are discarded.
    """
    tokens: list[str] = []
    current: list[str] = []
    quoted = False

    def _flush() -> None:
        piece = "".join(current).strip()
        if piece:
            tokens.append(piece)
        current.clear()

    for char in text:
        if char == '"':
            quoted = not quoted
        elif char.isspace() and not quoted:
            _flush()
        else:
            current.append(char)
    _flush()
    return tokens


def normalize_presence_value(value: str) -> str | None:
    """Map a raw presence value to 'with'/'without', or None if unrecognized."""
    lowered = value.strip().lower()
    if lowered in BOOL_TRUE:
        return "with"
    return "without" if lowered in BOOL_FALSE else None


def parse_filter_query(text: str) -> dict[str, set[str]]:
    """Parse a structured filter string (e.g. "pdf:yes tmpl:survey") into sets.

    The result maps ``pdf``/``source``/``summary``/``translated`` to presence
    markers ("with"/"without") and ``template`` to lowercase template tags.
    Tokens without a ``key:value`` shape and unknown keys are ignored.
    """
    presence_keys = ("pdf", "source", "summary", "translated")
    result: dict[str, set[str]] = {name: set() for name in (*presence_keys, "template")}

    for token in tokenize_filter_query(text):
        key, sep, value = token.partition(":")
        if not sep:
            continue
        key = key.strip().lower()
        value = value.strip()
        if not value:
            continue
        parts = [part.strip() for part in value.split(",")]
        if key in ("tmpl", "template"):
            result["template"].update(part.lower() for part in parts if part)
        elif key in presence_keys:
            for part in parts:
                marker = normalize_presence_value(part)
                if marker:
                    result[key].add(marker)
        elif key in ("has", "no"):
            # "has:pdf,summary" / "no:source" shorthand for presence filters.
            marker = "with" if key == "has" else "without"
            for target in (part.lower() for part in parts if part):
                if target in presence_keys:
                    result[target].add(marker)
    return result


def presence_filter(values: list[str]) -> set[str] | None:
    """Reduce raw presence values to a constraint set; None means unconstrained.

    A set covering both "with" and "without" (or nothing recognized at all)
    filters nothing, so None is returned in those cases.
    """
    markers = {normalize_presence_value(value) for value in values}
    markers.discard(None)
    if not markers or markers == {"with", "without"}:
        return None
    return markers
99
+
100
+
101
def merge_filter_set(primary: set[str] | None, secondary: set[str] | None) -> set[str] | None:
    """Combine two filter sets by intersection (AND semantics).

    A falsy side imposes no constraint, so the other side wins unchanged.
    NOTE(review): two disjoint sets intersect to an empty set, which
    downstream `matches_presence` treats as "no constraint" — confirm that
    contradictory filters are meant to match everything.
    """
    if primary and secondary:
        return primary & secondary
    return primary or secondary
108
+
109
+
110
def matches_presence(allowed: set[str] | None, has_value: bool) -> bool:
    """Return True when `has_value` satisfies the presence constraint.

    An empty or None constraint matches everything; otherwise the item
    matches when its presence marker ("with"/"without") is in the set.
    """
    if not allowed:
        return True
    marker = "with" if has_value else "without"
    return marker in allowed
119
+
120
+
121
def template_tag_map(index: PaperIndex) -> dict[str, str]:
    """Map each template tag's lowercase form to its display spelling."""
    mapping: dict[str, str] = {}
    for display_tag in index.template_tags:
        mapping[display_tag.lower()] = display_tag
    return mapping


def compute_counts(index: PaperIndex, ids: set[int]) -> dict[str, Any]:
    """Aggregate presence and per-template statistics over the given paper IDs.

    PDF-only placeholder records are skipped entirely; every other paper
    contributes to the total and to each presence bucket it satisfies.
    """
    template_order = list(index.template_tags)
    template_counts = dict.fromkeys(template_order, 0)
    display_by_lc = template_tag_map(index)
    counters = {"total": 0, "pdf": 0, "source": 0, "summary": 0, "translated": 0}

    for paper_id in ids:
        paper = index.papers[paper_id]
        if paper.get("_is_pdf_only"):
            continue
        counters["total"] += 1
        # Fall back to hashing the source path when no precomputed hash exists.
        source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or paper_id)))
        if source_hash in index.md_path_by_hash:
            counters["source"] += 1
        if source_hash in index.pdf_path_by_hash:
            counters["pdf"] += 1
        if paper.get("_has_summary"):
            counters["summary"] += 1
        if index.translated_md_by_hash.get(source_hash):
            counters["translated"] += 1
        for tag_lc in paper.get("_template_tags_lc") or []:
            display = display_by_lc.get(tag_lc)
            if display:
                template_counts[display] = template_counts.get(display, 0) + 1

    return {
        "total": counters["total"],
        "pdf": counters["pdf"],
        "source": counters["source"],
        "summary": counters["summary"],
        "translated": counters["translated"],
        "templates": template_counts,
        "template_order": template_order,
    }
169
+
170
+
171
def parse_filters(request: Request) -> dict[str, list[str] | str | int]:
    """Parse list-view filters from the request's query parameters.

    Returns a dict with pagination (``page`` >= 1, ``page_size`` clamped to
    1..200), free-text query ``q``, structured filter string ``filter_query``,
    per-facet value lists, and sort settings (``sort_dir`` normalized to
    "asc"/"desc").

    Fix: the numeric params are user-controlled; ``int("abc")`` previously
    raised ValueError and surfaced as a 500. Malformed values now fall back
    to their defaults instead of raising.
    """
    qp = request.query_params

    def _to_int(raw: str, default: int) -> int:
        # Never let a hand-edited URL crash the endpoint.
        try:
            return int(raw)
        except (TypeError, ValueError):
            return default

    page = max(1, _to_int(qp.get("page", "1"), 1))
    page_size = min(max(1, _to_int(qp.get("page_size", "30"), 30)), 200)

    q = qp.get("q", "").strip()
    filter_query = qp.get("fq", "").strip()
    # Drop empty facet values so blank form fields impose no constraint.
    pdf_filters = [item for item in qp.getlist("pdf") if item]
    source_filters = [item for item in qp.getlist("source") if item]
    summary_filters = [item for item in qp.getlist("summary") if item]
    translated_filters = [item for item in qp.getlist("translated") if item]
    template_filters = [item for item in qp.getlist("template") if item]
    sort_by = qp.get("sort_by", "").strip()
    sort_dir = qp.get("sort_dir", "desc").strip().lower()
    if sort_dir not in {"asc", "desc"}:
        sort_dir = "desc"

    return {
        "page": page,
        "page_size": page_size,
        "q": q,
        "filter_query": filter_query,
        "pdf": pdf_filters,
        "source": source_filters,
        "summary": summary_filters,
        "translated": translated_filters,
        "template": template_filters,
        "sort_by": sort_by,
        "sort_dir": sort_dir,
    }
204
+
205
+
206
def safe_int(value: Any) -> int:
    """Coerce *value* to int, treating anything unconvertible as 0."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def normalize_sort_value(value: Any) -> str:
    """Render *value* as a trimmed lowercase string (falsy values become '')."""
    text = str(value) if value else ""
    return text.strip().lower()


def sorted_ids(
    index: PaperIndex,
    ids: set[int],
    sort_by: str,
    sort_dir: str,
) -> list[int]:
    """Order paper IDs by the requested column and direction.

    Without a sort column the index's natural ordering is preserved. Papers
    whose sort value is missing always sink to the end, in either direction.
    """
    if not sort_by:
        return [paper_id for paper_id in index.ordered_ids if paper_id in ids]
    descending = sort_dir == "desc"

    def extract(paper_id: int) -> tuple[Any, bool]:
        # Returns (comparable value, missing?) for the active sort column.
        paper = index.papers[paper_id]
        if sort_by == "year":
            year = safe_int(paper.get("_year"))
            return (year, safe_int(paper.get("_month"))), year == 0
        if sort_by == "title":
            title = normalize_sort_value(paper.get("paper_title"))
            return title, not title
        if sort_by == "venue":
            venue = normalize_sort_value(paper.get("_venue"))
            return venue, not venue
        if sort_by == "author":
            authors = paper.get("_authors") or paper.get("authors") or []
            lead = normalize_sort_value(authors[0] if authors else "")
            return lead, not lead
        return normalize_sort_value(paper.get("paper_title")), False

    def ranking(paper_id: int) -> tuple[int, Any, int]:
        value, missing = extract(paper_id)
        # The marker flips with the direction so that, after the (possibly
        # reversed) sort, missing values land at the tail either way.
        if descending:
            score = 0 if missing else 1
        else:
            score = 1 if missing else 0
        return (score, value, paper_id)

    return sorted(ids, key=ranking, reverse=descending)
@@ -0,0 +1,14 @@
1
"""Route handlers for paper web UI."""

from .api import api_papers, api_pdf, api_stats
from .pages import index_page, paper_detail, robots_txt, stats_page

# Public surface of the handlers package: application wiring imports the
# handlers from here rather than from the submodules directly.
__all__ = [
    "api_papers",
    "api_pdf",
    "api_stats",
    "index_page",
    "paper_detail",
    "robots_txt",
    "stats_page",
]
@@ -0,0 +1,217 @@
1
+ """API route handlers for paper web UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from starlette.requests import Request
9
+ from starlette.responses import FileResponse, JSONResponse, Response
10
+
11
+ from deepresearch_flow.paper.db_ops import PaperIndex
12
+ from deepresearch_flow.paper.utils import stable_hash
13
+ from deepresearch_flow.paper.web.filters import (
14
+ compute_counts,
15
+ matches_presence,
16
+ merge_filter_set,
17
+ parse_filters,
18
+ parse_filter_query,
19
+ presence_filter,
20
+ sorted_ids,
21
+ )
22
+ from deepresearch_flow.paper.web.text import extract_summary_snippet, normalize_title, normalize_venue
23
+ from deepresearch_flow.paper.web.query import Query, QueryTerm, parse_query
24
+
25
+
26
+ def _ensure_under_roots(path: Path, roots: list[Path]) -> bool:
27
+ """Check if path is under one of the allowed root directories."""
28
+ resolved = path.resolve()
29
+ for root in roots:
30
+ try:
31
+ resolved.relative_to(root.resolve())
32
+ return True
33
+ except Exception:
34
+ continue
35
+ return False
36
+
37
+
38
def _apply_query(index: PaperIndex, query: Query) -> set[int]:
    """Apply a search query to the paper index and return matching IDs.

    The query is a disjunction of groups (OR); within a group, terms are
    conjoined (AND) and a negated term removes its matches from the group.
    Field lookups prefer the prebuilt exact-match indexes and fall back to
    substring scans; an unrecognized field matches nothing.
    """
    all_ids = set(index.ordered_ids)

    def ids_for_term(term: QueryTerm, base: set[int]) -> set[int]:
        # Resolve one term against `base` and return the matching subset.
        value_lc = term.value.lower()
        if term.field is None:
            # Fieldless term: substring match over the precomputed search blob.
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_search_lc") or "")}
        if term.field == "title":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_title_lc") or "")}
        if term.field == "venue":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_venue") or "").lower()}
        if term.field == "tag":
            # Exact hit on the tag index wins; otherwise substring-scan tags.
            exact = index.by_tag.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in t.lower() for t in (index.papers[idx].get("_tags") or []))}
        if term.field == "author":
            exact = index.by_author.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in a.lower() for a in (index.papers[idx].get("_authors") or []))}
        if term.field == "month":
            exact = index.by_month.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc == str(index.papers[idx].get("_month") or "").lower()}
        if term.field == "year":
            # Support inclusive ranges like "2019..2021" (order-insensitive).
            if ".." in term.value:
                start_str, end_str = term.value.split("..", 1)
                if start_str.strip().isdigit() and end_str.strip().isdigit():
                    start = int(start_str.strip())
                    end = int(end_str.strip())
                    ids: set[int] = set()
                    for y in range(min(start, end), max(start, end) + 1):
                        ids |= index.by_year.get(str(y), set())
                    return ids & base
            exact = index.by_year.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_year") or "").lower()}
        return set()

    result: set[int] = set()
    for group in query.groups:
        group_ids = set(all_ids)
        for term in group:
            # Negated terms match against the full corpus so the subtraction
            # removes every hit, not just those still surviving in the group.
            matched = ids_for_term(term, group_ids if not term.negated else all_ids)
            if term.negated:
                group_ids -= matched
            else:
                group_ids &= matched
        result |= group_ids

    return result
93
+
94
+
95
async def api_papers(request: Request) -> JSONResponse:
    """API endpoint for paper list with filtering, sorting, and pagination.

    Combines the free-text query (`q`), the structured filter string (`fq`),
    and the per-facet query parameters (merged with AND semantics), then
    sorts and paginates the surviving IDs. Aggregate stats are computed only
    for page 1 to keep subsequent page fetches cheap.
    """
    index: PaperIndex = request.app.state.index
    filters = parse_filters(request)
    page = int(filters["page"])
    page_size = int(filters["page_size"])
    q = str(filters["q"])
    filter_query = str(filters["filter_query"])
    sort_by = str(filters["sort_by"]).strip().lower()
    sort_dir = str(filters["sort_dir"]).strip().lower()
    # Unknown sort columns fall back to the index's natural ordering.
    if sort_by not in {"year", "title", "venue", "author"}:
        sort_by = ""
    query = parse_query(q)
    candidate = _apply_query(index, query)
    filter_terms = parse_filter_query(filter_query)
    # Facet constraints from query params and from the `fq` string are merged
    # by intersection (AND).
    pdf_filter = merge_filter_set(presence_filter(filters["pdf"]), presence_filter(list(filter_terms["pdf"])))
    source_filter = merge_filter_set(
        presence_filter(filters["source"]), presence_filter(list(filter_terms["source"]))
    )
    summary_filter = merge_filter_set(
        presence_filter(filters["summary"]), presence_filter(list(filter_terms["summary"]))
    )
    translated_filter = merge_filter_set(
        presence_filter(filters["translated"]), presence_filter(list(filter_terms["translated"]))
    )
    template_selected = {item.lower() for item in filters["template"] if item}
    template_filter = merge_filter_set(
        template_selected or None,
        filter_terms["template"] or None,
    )

    # NOTE(review): when the text query matches nothing, facet filtering is
    # skipped — the result is already (vacuously) empty.
    if candidate:
        filtered: set[int] = set()
        for idx in candidate:
            paper = index.papers[idx]
            source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
            has_source = source_hash in index.md_path_by_hash
            has_pdf = source_hash in index.pdf_path_by_hash
            has_summary = bool(paper.get("_has_summary"))
            has_translated = bool(index.translated_md_by_hash.get(source_hash))
            if not matches_presence(pdf_filter, has_pdf):
                continue
            if not matches_presence(source_filter, has_source):
                continue
            if not matches_presence(summary_filter, has_summary):
                continue
            if not matches_presence(translated_filter, has_translated):
                continue
            if template_filter:
                tags = paper.get("_template_tags_lc") or []
                if not any(tag in template_filter for tag in tags):
                    continue
            filtered.add(idx)
        candidate = filtered
    ordered = sorted_ids(index, candidate, sort_by, sort_dir)
    total = len(ordered)
    start = (page - 1) * page_size
    end = min(start + page_size, total)
    page_ids = ordered[start:end]
    stats_payload = None
    if page == 1:
        # Stats cover both the whole corpus and the filtered subset.
        all_ids = set(index.ordered_ids)
        stats_payload = {
            "all": compute_counts(index, all_ids),
            "filtered": compute_counts(index, candidate),
        }

    items: list[dict[str, Any]] = []
    for idx in page_ids:
        paper = index.papers[idx]
        source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
        translations = index.translated_md_by_hash.get(source_hash, {})
        translation_languages = sorted(translations.keys(), key=str.lower)
        items.append(
            {
                "source_hash": source_hash,
                "title": normalize_title(paper.get("paper_title") or ""),
                "summary_excerpt": extract_summary_snippet(paper),
                "summary_full": paper.get("summary") or "",
                "authors": paper.get("_authors") or [],
                "year": paper.get("_year") or "",
                "month": paper.get("_month") or "",
                "venue": normalize_venue(paper.get("_venue") or ""),
                "tags": paper.get("_tags") or [],
                "template_tags": paper.get("_template_tags") or [],
                "has_source": source_hash in index.md_path_by_hash,
                "has_translation": bool(translation_languages),
                "has_pdf": source_hash in index.pdf_path_by_hash,
                "has_summary": bool(paper.get("_has_summary")),
                "is_pdf_only": bool(paper.get("_is_pdf_only")),
                "translation_languages": translation_languages,
            }
        )

    return JSONResponse(
        {
            "page": page,
            "page_size": page_size,
            "total": total,
            "has_more": end < total,
            "items": items,
            "stats": stats_payload,
        }
    )
199
+
200
+
201
async def api_stats(request: Request) -> JSONResponse:
    """API endpoint returning the index's precomputed statistics payload as JSON."""
    index: PaperIndex = request.app.state.index
    return JSONResponse(index.stats)
205
+
206
+
207
async def api_pdf(request: Request) -> Response:
    """Serve the PDF file registered for a paper's source hash.

    Returns 404 when no PDF is known for the hash, and 403 when the stored
    path falls outside the configured allow-listed roots. An empty root
    list disables the containment check entirely.
    """
    index: PaperIndex = request.app.state.index
    source_hash = request.path_params["source_hash"]
    pdf_path = index.pdf_path_by_hash.get(source_hash)
    if not pdf_path:
        return Response("PDF not found", status_code=404)
    allowed_roots: list[Path] = request.app.state.pdf_roots
    if allowed_roots and not _ensure_under_roots(pdf_path, allowed_roots):
        return Response("Forbidden", status_code=403)
    return FileResponse(pdf_path)