deepresearch-flow 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. deepresearch_flow/cli.py +2 -0
  2. deepresearch_flow/paper/config.py +15 -0
  3. deepresearch_flow/paper/db.py +193 -0
  4. deepresearch_flow/paper/db_ops.py +1939 -0
  5. deepresearch_flow/paper/llm.py +2 -0
  6. deepresearch_flow/paper/web/app.py +46 -3320
  7. deepresearch_flow/paper/web/constants.py +23 -0
  8. deepresearch_flow/paper/web/filters.py +255 -0
  9. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  10. deepresearch_flow/paper/web/handlers/api.py +217 -0
  11. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  12. deepresearch_flow/paper/web/markdown.py +549 -0
  13. deepresearch_flow/paper/web/static/css/main.css +857 -0
  14. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  15. deepresearch_flow/paper/web/static/js/index.js +266 -0
  16. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  17. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  18. deepresearch_flow/paper/web/templates/base.html +43 -0
  19. deepresearch_flow/paper/web/templates/detail.html +332 -0
  20. deepresearch_flow/paper/web/templates/index.html +114 -0
  21. deepresearch_flow/paper/web/templates/stats.html +29 -0
  22. deepresearch_flow/paper/web/templates.py +85 -0
  23. deepresearch_flow/paper/web/text.py +68 -0
  24. deepresearch_flow/recognize/cli.py +157 -3
  25. deepresearch_flow/recognize/organize.py +58 -0
  26. deepresearch_flow/translator/__init__.py +1 -0
  27. deepresearch_flow/translator/cli.py +451 -0
  28. deepresearch_flow/translator/config.py +19 -0
  29. deepresearch_flow/translator/engine.py +959 -0
  30. deepresearch_flow/translator/fixers.py +451 -0
  31. deepresearch_flow/translator/placeholder.py +62 -0
  32. deepresearch_flow/translator/prompts.py +116 -0
  33. deepresearch_flow/translator/protector.py +291 -0
  34. deepresearch_flow/translator/segment.py +180 -0
  35. deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
  36. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
  37. deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
  38. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
  39. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  40. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  41. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,549 @@
1
+ """Markdown rendering utilities for paper web UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import html
6
+ import re
7
+ from html.parser import HTMLParser
8
+ from typing import Any
9
+
10
+ from markdown_it import MarkdownIt
11
+
12
+ try:
13
+ from mdit_py_plugins.footnote import footnote_plugin as footnote
14
+ except ImportError: # pragma: no cover - compatibility with older names
15
+ from mdit_py_plugins.footnote import footnote
16
+
17
+ from deepresearch_flow.paper.db_ops import _available_templates
18
+ from deepresearch_flow.paper.render import load_default_template
19
+ from deepresearch_flow.paper.template_registry import load_render_template
20
+ from deepresearch_flow.paper.web.text import normalize_venue
21
+
22
+ _HTML_TABLE_TOKEN_RE = re.compile(r"@@HTML_TABLE_\d+@@")
23
+
24
+
25
def create_md_renderer() -> MarkdownIt:
    """Build a MarkdownIt instance configured for paper rendering.

    CommonMark base with raw HTML disabled and autolinking enabled;
    the footnote plugin and table support are turned on afterwards.
    """
    renderer = MarkdownIt("commonmark", {"html": False, "linkify": True})
    renderer.use(footnote)
    renderer.enable("table")
    return renderer
31
+
32
+
33
def strip_paragraph_wrapped_tables(text: str) -> str:
    """Drop ``<p>`` tags that wrap markdown table rows.

    Some upstream content arrives with pipe-table rows enclosed in
    paragraph tags, which stops the markdown table parser from
    recognizing them; this strips the wrappers line by line.
    """
    cleaned = [
        re.sub(
            r"\|\s*</p>\s*$",
            "|",
            re.sub(r"^\s*<p>\s*\|", "|", row),
        )
        for row in text.splitlines()
    ]
    return "\n".join(cleaned)
41
+
42
+
43
def normalize_footnote_definitions(text: str) -> str:
    """Rewrite bare footnote definitions into markdown-it syntax.

    Lines shaped like ``[^1] text`` become ``[^1]: text`` so the
    footnote plugin recognizes them as definitions; everything else
    passes through unchanged.
    """
    definition = re.compile(r"^\[\^([0-9]+)\]\s+")
    normalized = [definition.sub(r"[^\1]: ", row) for row in text.splitlines()]
    return "\n".join(normalized)
51
+
52
+
53
def normalize_markdown_images(text: str) -> str:
    """Promote inline markdown images to standalone block-level lines.

    An image embedded in a paragraph is split onto its own line with a
    blank line before it so it renders as a figure. Lines inside code
    fences, list items, and pipe-table rows are left untouched.
    """
    image_pat = re.compile(r"!\[[^\]]*\]\((?:[^)\\]|\\.)*\)")
    list_item_pat = re.compile(r"^\s{0,3}(-|\*|\+|\d{1,9}\.)\s+")

    result: list[str] = []
    fenced = False
    open_char = ""
    open_len = 0

    for raw in text.splitlines():
        body = raw.lstrip()
        # Track fenced code blocks so their contents are never rewritten.
        if body.startswith(("```", "~~~")):
            marker = body[0]
            count = len(body) - len(body.lstrip(marker))
            if not fenced:
                fenced, open_char, open_len = True, marker, count
            elif marker == open_char and count >= open_len:
                fenced = False
            result.append(raw)
            continue
        if fenced:
            result.append(raw)
            continue
        hit = image_pat.search(raw)
        if hit is None:
            result.append(raw)
            continue
        # Images inside list items or table rows must stay inline.
        is_table_row = body.startswith("|") and raw.count("|") >= 2
        if list_item_pat.match(raw) or is_table_row:
            result.append(raw)
            continue
        before = raw[:hit.start()]
        if before.strip():
            # Text precedes the image: split it off, then blank-line-separate.
            result.append(before.rstrip())
            result.append("")
            result.append(raw[hit.start():].lstrip())
            continue
        # Image already leads the line; just ensure a preceding blank line.
        if result and result[-1].strip():
            result.append("")
        result.append(raw)
    return "\n".join(result)
97
+
98
+
99
def extract_math_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Extract math expressions and replace with placeholders.

    Scans character by character, replacing ``$$...$$`` (block, may span
    lines) and ``$...$`` (inline, single line) spans with opaque
    ``@@MATH_n@@`` tokens so the markdown renderer cannot mangle the TeX.
    Content inside fenced code blocks and inline backtick code is left
    untouched. Returns the rewritten text and the token->TeX mapping.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fenced-code state: inside a ```/~~~ block, with its marker char/length.
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Length of the backtick run that opened an inline code span (0 = none).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        # Token index follows insertion order, so keys are unique per call.
        key = f"@@MATH_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # At the start of a line (and outside inline code), detect fence
        # open/close markers: up to 3 leading spaces, then >=3 of ` or ~.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing marker must match char and be at least
                            # as long as the opener (CommonMark rule).
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        # Inside a fence: copy verbatim, one character at a time.
        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        # Inside an inline code span: copy until the matching backtick run.
        if inline_delim_len > 0:
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        ch = text[idx]
        # Backtick run opens an inline code span of the same length.
        if ch == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        # Block math: $$...$$ (can span lines); a backslash-escaped \$$
        # neither opens nor closes a span.
        if text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            search_from = idx + 2
            end = text.find("$$", search_from)
            while end != -1 and text[end - 1] == "\\":
                search_from = end + 2
                end = text.find("$$", search_from)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 2]))
                idx = end + 2
                continue
            # No closer found: fall through and emit the "$" literally.

        # Inline math: $...$ (single-line); the closer must appear before
        # the end of the current line and must not be escaped.
        if ch == "$" and not text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            line_end = text.find("\n", idx + 1)
            if line_end == -1:
                line_end = len(text)
            search_from = idx + 1
            end = text.find("$", search_from, line_end)
            while end != -1 and text[end - 1] == "\\":
                search_from = end + 1
                end = text.find("$", search_from, line_end)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 1]))
                idx = end + 1
                continue

        out.append(ch)
        idx += 1

    return "".join(out), placeholders
200
+
201
+
202
+ class _TableSanitizer(HTMLParser):
203
+ """HTML parser for sanitizing table HTML."""
204
+
205
+ def __init__(self) -> None:
206
+ super().__init__(convert_charrefs=True)
207
+ self._out: list[str] = []
208
+ self._stack: list[str] = []
209
+
210
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
211
+ t = tag.lower()
212
+ if t not in {
213
+ "table",
214
+ "thead",
215
+ "tbody",
216
+ "tfoot",
217
+ "tr",
218
+ "th",
219
+ "td",
220
+ "caption",
221
+ "colgroup",
222
+ "col",
223
+ "br",
224
+ }:
225
+ return
226
+
227
+ allowed: dict[str, str] = {}
228
+ for name, value in attrs:
229
+ if value is None:
230
+ continue
231
+ n = name.lower()
232
+ v = value.strip()
233
+ if t in {"td", "th"} and n in {"colspan", "rowspan"} and v.isdigit():
234
+ allowed[n] = v
235
+ elif t in {"td", "th"} and n == "align" and v.lower() in {"left", "right", "center"}:
236
+ allowed[n] = v.lower()
237
+
238
+ attr_text = "".join(f' {k}="{html.escape(v, quote=True)}"' for k, v in allowed.items())
239
+ self._out.append(f"<{t}{attr_text}>")
240
+ if t not in {"br", "col"}:
241
+ self._stack.append(t)
242
+
243
+ def handle_endtag(self, tag: str) -> None:
244
+ t = tag.lower()
245
+ if t not in self._stack:
246
+ return
247
+ while self._stack:
248
+ popped = self._stack.pop()
249
+ self._out.append(f"</{popped}>")
250
+ if popped == t:
251
+ break
252
+
253
+ def handle_data(self, data: str) -> None:
254
+ self._out.append(html.escape(data))
255
+
256
+ def handle_entityref(self, name: str) -> None:
257
+ self._out.append(f"&{name};")
258
+
259
+ def handle_charref(self, name: str) -> None:
260
+ self._out.append(f"&#{name};")
261
+
262
+ def close(self) -> None:
263
+ super().close()
264
+ while self._stack:
265
+ self._out.append(f"</{self._stack.pop()}>")
266
+
267
+ def get_html(self) -> str:
268
+ return "".join(self._out)
269
+
270
+
271
def sanitize_table_html(raw: str) -> str:
    """Sanitize table HTML, falling back to escaped verbatim text.

    Feeds *raw* through :class:`_TableSanitizer`; if parsing fails for
    any reason, the original markup is shown (escaped) inside a
    ``<pre><code>`` block rather than being rendered as HTML.
    """
    sanitizer = _TableSanitizer()
    try:
        sanitizer.feed(raw)
        sanitizer.close()
    except Exception:
        # Parser blew up on malformed input: degrade to plain text.
        return f"<pre><code>{html.escape(raw)}</code></pre>"
    return sanitizer.get_html()
280
+
281
+
282
+ def sanitize_img_html(raw: str) -> str | None:
283
+ """Sanitize image HTML to only allow base64 data images."""
284
+ attrs = {}
285
+ for match in re.finditer(r"(\w+)\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]+)", raw):
286
+ name = match.group(1).lower()
287
+ value = match.group(2).strip()
288
+ if value and value[0] in {"\"", "'"} and value[-1] == value[0]:
289
+ value = value[1:-1]
290
+ attrs[name] = value
291
+
292
+ src = attrs.get("src", "")
293
+ src_lower = src.lower()
294
+ if not src_lower.startswith("data:image/") or ";base64," not in src_lower:
295
+ return None
296
+
297
+ alt = attrs.get("alt", "")
298
+ alt_attr = f' alt="{html.escape(alt, quote=True)}"' if alt else ""
299
+ return f'<img src="{html.escape(src, quote=True)}"{alt_attr} />'
300
+
301
+
302
def extract_html_img_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Extract HTML img tags and replace with placeholders.

    Replaces raw ``<img ...>`` tags (sanitized first; only base64 data
    images survive) with opaque ``@@HTML_IMG_n@@`` tokens so the
    html-disabled markdown renderer does not escape them. Content inside
    fenced code blocks and inline backtick code is left untouched.
    Returns the rewritten text and the token->sanitized-tag mapping.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fenced-code state: inside a ```/~~~ block, with its marker char/length.
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Length of the backtick run that opened an inline code span (0 = none).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_IMG_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    # Lowercased copy for case-insensitive tag matching; indices line up
    # with `text` because lower() preserves length for ASCII tags.
    lower = text.lower()
    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # At the start of a line (and outside inline code), detect fence
        # open/close markers: up to 3 leading spaces, then >=3 of ` or ~.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        # Inside a fence: copy verbatim, one character at a time.
        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        # Inside an inline code span: copy until the matching backtick run.
        if inline_delim_len > 0:
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        # Backtick run opens an inline code span of the same length.
        if text[idx] == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if lower.startswith("<img", idx):
            end = text.find(">", idx)
            if end != -1:
                raw = text[idx : end + 1]
                safe_html = sanitize_img_html(raw)
                # Only substitute when sanitization accepted the tag;
                # otherwise the raw text falls through and gets escaped.
                if safe_html:
                    out.append(next_placeholder(safe_html))
                    idx = end + 1
                    continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
386
+
387
+
388
def extract_html_table_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Extract HTML table tags and replace with placeholders.

    Replaces raw ``<table>...</table>`` spans with opaque
    ``@@HTML_TABLE_n@@`` tokens (padded with blank lines so markdown
    treats each token as its own paragraph). Content inside fenced code
    blocks and inline backtick code is left untouched. Returns the
    rewritten text and the token->raw-table mapping; sanitization of the
    stored HTML happens later, at restore time.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fenced-code state: inside a ```/~~~ block, with its marker char/length.
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Length of the backtick run that opened an inline code span (0 = none).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_TABLE_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    # Lowercased copy for case-insensitive tag matching; indices line up
    # with `text` because lower() preserves length for ASCII tags.
    lower = text.lower()
    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # At the start of a line (and outside inline code), detect fence
        # open/close markers: up to 3 leading spaces, then >=3 of ` or ~.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        # Inside a fence: copy verbatim, one character at a time.
        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        # Inside an inline code span: copy until the matching backtick run.
        if inline_delim_len > 0:
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        # Backtick run opens an inline code span of the same length.
        if text[idx] == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if lower.startswith("<table", idx):
            end = lower.find("</table>", idx)
            if end != -1:
                end += len("</table>")
                raw = text[idx:end]
                key = next_placeholder(raw)
                # Surround the token with blank lines so markdown renders
                # it as a standalone paragraph that can be swapped later.
                if out and not out[-1].endswith("\n"):
                    out.append("\n\n")
                out.append(key)
                out.append("\n\n")
                idx = end
                continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
475
+
476
+
477
def render_markdown_with_math_placeholders(md: MarkdownIt, text: str) -> str:
    """Render markdown to HTML while protecting math, images, and tables.

    Raw HTML is disabled in the renderer, so embedded ``<table>``/``<img>``
    fragments and TeX math are swapped out for opaque placeholder tokens
    before rendering, then substituted back (sanitized or escaped)
    afterwards.
    """
    source = normalize_footnote_definitions(strip_paragraph_wrapped_tables(text))
    source, tables = extract_html_table_placeholders(source)
    source, images = extract_html_img_placeholders(source)
    source, math_spans = extract_math_placeholders(source)
    result = md.render(source)

    # Math goes back escaped so the client-side math renderer sees raw TeX.
    for token, tex in math_spans.items():
        result = result.replace(token, html.escape(tex))

    # Images were sanitized at extraction time; unwrap any <p> the markdown
    # renderer added. The lambda replacement keeps backslashes in the value
    # from being treated as regex group references.
    for token, tag in images.items():
        result = re.sub(rf"<p>\s*{re.escape(token)}\s*</p>", lambda _: tag, result)
        result = result.replace(token, tag)

    # Tables are sanitized now, at restore time.
    for token, raw_table in tables.items():
        cleaned = sanitize_table_html(raw_table)
        result = re.sub(rf"<p>\s*{re.escape(token)}\s*</p>", lambda _: cleaned, result)
        result = result.replace(token, cleaned)

    # Any table token still present could not be restored; show a warning
    # box instead of leaking the placeholder text.
    if _HTML_TABLE_TOKEN_RE.search(result):
        result = _HTML_TABLE_TOKEN_RE.sub(
            '<div class="warning">Table placeholder could not be restored.</div>',
            result,
        )
    # Re-enable purely numeric super/subscripts that the renderer escaped.
    result = re.sub(r"&lt;sup&gt;([0-9]+)&lt;/sup&gt;", r"<sup>\1</sup>", result)
    result = re.sub(r"&lt;sub&gt;([0-9]+)&lt;/sub&gt;", r"<sub>\1</sub>", result)
    return result
502
+
503
+
504
def select_template_tag(
    paper: dict[str, Any], requested: str | None
) -> tuple[str | None, list[str]]:
    """Pick the template tag to render *paper* with.

    Preference order: the explicitly *requested* tag when it is one of
    the paper's available templates; otherwise the paper's stored
    default; otherwise ``"simple"`` when available; otherwise the first
    available tag.

    Returns:
        ``(selected_tag, available_tags)``, or ``(None, [])`` when the
        paper has no templates at all. ``selected_tag`` is always a
        member of ``available_tags`` when it is not None.
    """
    available = _available_templates(paper)
    if not available:
        return None, []
    default_tag = paper.get("default_template")
    # Validate the stored default: a stale/unknown "default_template"
    # value must not leak out as the selected tag, so fall back to a tag
    # that actually exists.
    if not default_tag or default_tag not in available:
        default_tag = "simple" if "simple" in available else available[0]
    selected = requested if requested in available else default_tag
    return selected, available
516
+
517
+
518
def render_paper_markdown(
    paper: dict[str, Any],
    fallback_language: str,
    *,
    template_tag: str | None = None,
) -> tuple[str, str, str | None]:
    """Render a paper dict through its selected template.

    Returns a ``(markdown, template_name, warning)`` triple; *warning*
    is None unless rendering had to fall back to the default template.
    """
    tag, _ = select_template_tag(paper, template_tag)
    # When a tag is selected, render the per-template payload if present;
    # otherwise fall back to the paper record itself.
    payload = (paper.get("templates") or {}).get(tag, paper) if tag else paper

    name = tag or payload.get("prompt_template")
    warning: str | None = None
    template = None
    if name:
        try:
            template = load_render_template(str(name))
        except Exception:
            warning = "Rendered using default template (missing template)."
    else:
        warning = "Rendered using default template (no template specified)."
    if template is None:
        # Either no template was named or loading it failed.
        template = load_default_template()
        name = "default_paper"

    context = dict(payload)
    if not context.get("output_language"):
        context["output_language"] = fallback_language
    if "publication_venue" in context:
        context["publication_venue"] = normalize_venue(str(context.get("publication_venue") or ""))
    return template.render(**context), str(name), warning