deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1154 -35
  4. deepresearch_flow/paper/db_ops.py +124 -19
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
  15. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  16. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  17. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  18. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  19. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
  20. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  21. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  22. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  23. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  24. deepresearch_flow/paper/snapshot/api.py +941 -0
  25. deepresearch_flow/paper/snapshot/builder.py +965 -0
  26. deepresearch_flow/paper/snapshot/identity.py +239 -0
  27. deepresearch_flow/paper/snapshot/schema.py +245 -0
  28. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  29. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  30. deepresearch_flow/paper/snapshot/text.py +154 -0
  31. deepresearch_flow/paper/template_registry.py +1 -0
  32. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  33. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  35. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  36. deepresearch_flow/paper/web/app.py +10 -3
  37. deepresearch_flow/recognize/cli.py +380 -103
  38. deepresearch_flow/recognize/markdown.py +31 -7
  39. deepresearch_flow/recognize/math.py +47 -12
  40. deepresearch_flow/recognize/mermaid.py +320 -10
  41. deepresearch_flow/recognize/organize.py +29 -7
  42. deepresearch_flow/translator/cli.py +71 -20
  43. deepresearch_flow/translator/engine.py +220 -81
  44. deepresearch_flow/translator/prompts.py +19 -2
  45. deepresearch_flow/translator/protector.py +15 -3
  46. deepresearch_flow-0.6.1.dist-info/METADATA +849 -0
  47. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/RECORD +51 -43
  48. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/WHEEL +1 -1
  49. deepresearch_flow-0.5.1.dist-info/METADATA +0 -440
  50. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/entry_points.txt +0 -0
  51. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/licenses/LICENSE +0 -0
  52. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,154 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Iterable
5
+
6
+ from markdown_it import MarkdownIt
7
+
8
+
9
# Matches one raw HTML <table>...</table> element (non-greedy), ignoring case
# and letting "." span newlines so multi-line tables are matched as a unit.
_HTML_TABLE_RE = re.compile(r"<table\b.*?</table>", re.IGNORECASE | re.DOTALL)
# Matches a single angle-bracket tag such as <b> or </span>.
_TAG_RE = re.compile(r"<[^>]+>")
# Matches a run of one or more whitespace characters.
_WS_RE = re.compile(r"\s+")
12
+
13
+
14
+ def _is_cjk_char(ch: str) -> bool:
15
+ code = ord(ch)
16
+ return (
17
+ 0x3400 <= code <= 0x4DBF # CJK Unified Ideographs Extension A
18
+ or 0x4E00 <= code <= 0x9FFF # CJK Unified Ideographs
19
+ or 0xF900 <= code <= 0xFAFF # CJK Compatibility Ideographs
20
+ or 0x3040 <= code <= 0x309F # Hiragana
21
+ or 0x30A0 <= code <= 0x30FF # Katakana
22
+ or 0xAC00 <= code <= 0xD7AF # Hangul syllables
23
+ )
24
+
25
+
26
def insert_cjk_spaces(text: str) -> str:
    """Return *text* with one space inserted between adjacent CJK characters.

    A space is added only where a CJK character directly follows another
    CJK character (as judged by ``_is_cjk_char``); everything else is
    copied through unchanged.
    """
    pieces: list[str] = []
    last_was_cjk = False
    for ch in text:
        is_cjk = _is_cjk_char(ch)
        # Prefix the separator onto the character itself when both sides are CJK.
        pieces.append(f" {ch}" if last_was_cjk and is_cjk else ch)
        last_was_cjk = is_cjk
    return "".join(pieces)
36
+
37
+
38
def remove_cjk_spaces(text: str) -> str:
    """Return *text* with single spaces between two CJK characters removed.

    A space is dropped only when it is not the first or last character and
    both of its immediate neighbours in the original string are CJK
    characters (as judged by ``_is_cjk_char``). Strings without any space
    are returned as-is.
    """
    if " " not in text:
        return text
    last = len(text) - 1
    kept: list[str] = []
    for pos, ch in enumerate(text):
        # Neighbours are always read from the original text, not from `kept`.
        squeezed = (
            ch == " "
            and 0 < pos < last
            and _is_cjk_char(text[pos - 1])
            and _is_cjk_char(text[pos + 1])
        )
        if not squeezed:
            kept.append(ch)
    return "".join(kept)
49
+
50
+
51
def merge_adjacent_markers(text: str, *, start_marker: str = "[[[", end_marker: str = "]]]") -> str:
    """Fuse back-to-back marked spans by deleting every ``end+start`` seam.

    Args:
        text: The string containing marker-delimited spans.
        start_marker: Opening marker of a span.
        end_marker: Closing marker of a span.

    Returns:
        *text* with every occurrence of ``end_marker`` immediately followed
        by ``start_marker`` removed. Removal repeats until no seam is left,
        because deleting one seam can bring a new one together.
    """
    seam = f"{end_marker}{start_marker}"
    if not seam:
        # An empty seam is "in" every string, so the loop below would never
        # terminate; with no markers there is nothing to merge.
        return text
    while seam in text:
        text = text.replace(seam, "")
    return text
56
+
57
+
58
def _md_renderer() -> MarkdownIt:
    """Build a fresh CommonMark parser with raw HTML and linkify off, tables on."""
    parser_options = {"html": False, "linkify": False}
    # enable() returns the parser itself, so construction can be chained.
    return MarkdownIt("commonmark", parser_options).enable("table")
62
+
63
+
64
def markdown_to_plain_text(markdown: str) -> str:
    """Flatten Markdown into a single line of whitespace-normalised text.

    Raw HTML ``<table>`` elements are blanked out first, then the remainder
    is parsed and only inline content outside Markdown tables is kept:
    text and inline-code content, line breaks, and the non-empty content
    of image tokens. Any leftover angle-bracket tags are replaced by
    spaces and whitespace runs are collapsed.

    Returns an empty string for empty input.
    """
    if not markdown:
        return ""

    without_html_tables = _HTML_TABLE_RE.sub(" ", markdown)

    fragments: list[str] = []
    table_depth = 0
    for block in _md_renderer().parse(without_html_tables):
        kind = block.type
        if kind == "table_open":
            table_depth += 1
        elif kind == "table_close":
            table_depth = max(0, table_depth - 1)
        elif table_depth == 0 and kind == "inline":
            for node in block.children or []:
                node_kind = node.type
                if node_kind in ("text", "code_inline"):
                    fragments.append(node.content)
                elif node_kind in ("softbreak", "hardbreak"):
                    fragments.append("\n")
                elif node_kind == "image" and node.content:
                    fragments.append(node.content)

    flattened = _WS_RE.sub(" ", " ".join(fragments)).strip()
    # With HTML parsing disabled, inline tags arrive as literal text; strip them here.
    untagged = _TAG_RE.sub(" ", flattened)
    return _WS_RE.sub(" ", untagged).strip()
98
+
99
+
100
+ def normalize_query_punctuation(text: str) -> str:
101
+ if not text:
102
+ return ""
103
+ return re.sub(r"[,。、《》、;:!?()【】「」『』·…—]+", " ", text)
104
+
105
+
106
def split_mixed_cjk_latin(token: str) -> list[str]:
    """Split *token* into maximal runs that are all-CJK or all-non-CJK.

    Runs are returned in their original order and concatenate back to
    *token*. An empty token yields an empty list.
    """
    if not token:
        return []

    runs: list[str] = []
    run_start = 0
    run_is_cjk: bool | None = None
    for pos, ch in enumerate(token):
        is_cjk = _is_cjk_char(ch)
        # A change of script class closes the current run at this position.
        if run_is_cjk is not None and is_cjk != run_is_cjk:
            runs.append(token[run_start:pos])
            run_start = pos
        run_is_cjk = is_cjk
    runs.append(token[run_start:])
    return runs
124
+
125
+
126
def rewrite_search_query(user_query: str) -> str:
    """Rewrite a free-form user query into a normalised, space-joined term list.

    Steps, in order:
      * full-width/CJK punctuation is replaced by spaces and whitespace is
        collapsed;
      * the words ``AND`` / ``OR`` (any case) are emitted upper-cased;
      * every other word is split into CJK and non-CJK runs; a CJK run
        becomes a double-quoted phrase with its characters space-separated,
        while a non-CJK run is reduced to ``[0-9A-Za-z._+-]`` characters,
        lower-cased, and dropped if nothing remains.

    Returns an empty string when the query has no usable content.
    """
    normalized = _WS_RE.sub(" ", normalize_query_punctuation(user_query)).strip()
    if not normalized:
        return ""

    terms: list[str] = []
    for word in normalized.split(" "):
        if not word:
            continue

        as_operator = word.upper()
        if as_operator in ("AND", "OR"):
            terms.append(as_operator)
            continue

        for piece in split_mixed_cjk_latin(word):
            if not piece:
                continue
            if all(map(_is_cjk_char, piece)):
                spaced = insert_cjk_spaces(piece)
                terms.append(f'"{spaced}"')
                continue
            stripped = re.sub(r"[^0-9A-Za-z._+-]+", "", piece)
            if stripped:
                terms.append(stripped.lower())

    return " ".join(terms)
154
+
@@ -24,6 +24,7 @@ class TemplateBundle:
24
24
  class StageDefinition:
25
25
  name: str
26
26
  fields: list[str]
27
+ depends_on: list[str] | None = None
27
28
 
28
29
 
29
30
  _TEMPLATES: dict[str, TemplateBundle] = {
@@ -3,6 +3,10 @@
3
3
  {% set is_zh = output_language == "zh" %}
4
4
  **{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
5
5
 
6
+ {% if paper_institutions %}
7
+ **{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
8
+ {% endif %}
9
+
6
10
  {% if output_language %}
7
11
  **{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
8
12
  {% endif %}
@@ -3,6 +3,10 @@
3
3
  {% set is_zh = output_language == "zh" %}
4
4
  **{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
5
5
 
6
+ {% if paper_institutions %}
7
+ **{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
8
+ {% endif %}
9
+
6
10
  {% if output_language %}
7
11
  **{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
8
12
  {% endif %}
@@ -3,6 +3,10 @@
3
3
  {% set is_zh = output_language == "zh" %}
4
4
  **{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
5
5
 
6
+ {% if paper_institutions %}
7
+ **{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
8
+ {% endif %}
9
+
6
10
  {% if output_language %}
7
11
  **{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
8
12
  {% endif %}
@@ -4,6 +4,10 @@
4
4
 
5
5
  **{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
6
6
 
7
+ {% if paper_institutions %}
8
+ **{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
9
+ {% endif %}
10
+
7
11
  {% if output_language %}
8
12
  **{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
9
13
  {% endif %}
@@ -91,7 +91,7 @@ def create_app(
91
91
  pdf_roots=pdf_roots,
92
92
  )
93
93
  md = create_md_renderer()
94
- static_base_url = static_base_url or os.getenv("PAPER_DB_STATIC_BASE_URL")
94
+ static_base_url = static_base_url or os.getenv("PAPER_DB_STATIC_BASE") or os.getenv("PAPER_DB_STATIC_BASE_URL")
95
95
  static_mode = _normalize_static_mode(static_mode or os.getenv("PAPER_DB_STATIC_MODE"))
96
96
  resolved_mode = _resolve_static_mode(static_mode, static_base_url)
97
97
  export_dir_value = static_export_dir or os.getenv("PAPER_DB_STATIC_EXPORT_DIR")
@@ -111,7 +111,12 @@ def create_app(
111
111
  asset_config = None
112
112
  if resolved_mode == "prod":
113
113
  if not static_base_url:
114
- logger.warning("Static mode set to prod without base URL; falling back to dev asset routes.")
114
+ logger.warning(
115
+ "Static mode set to prod without base URL; falling back to dev asset routes "
116
+ "(static_mode=%s, static_base_url=%s)",
117
+ static_mode,
118
+ static_base_url or "<empty>",
119
+ )
115
120
  resolved_mode = "dev"
116
121
  else:
117
122
  asset_config = build_static_assets(
@@ -149,8 +154,10 @@ def create_app(
149
154
  )
150
155
  elif pdf_roots:
151
156
  logger.warning(
152
- "PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable.",
157
+ "PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable "
158
+ "(pdf_roots=%d).",
153
159
  PDFJS_STATIC_DIR,
160
+ len(pdf_roots),
154
161
  )
155
162
  if STATIC_DIR.exists():
156
163
  routes.append(