deepresearch-flow 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. deepresearch_flow/cli.py +2 -0
  2. deepresearch_flow/paper/config.py +15 -0
  3. deepresearch_flow/paper/db.py +193 -0
  4. deepresearch_flow/paper/db_ops.py +1939 -0
  5. deepresearch_flow/paper/llm.py +2 -0
  6. deepresearch_flow/paper/web/app.py +46 -3320
  7. deepresearch_flow/paper/web/constants.py +23 -0
  8. deepresearch_flow/paper/web/filters.py +255 -0
  9. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  10. deepresearch_flow/paper/web/handlers/api.py +217 -0
  11. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  12. deepresearch_flow/paper/web/markdown.py +549 -0
  13. deepresearch_flow/paper/web/static/css/main.css +857 -0
  14. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  15. deepresearch_flow/paper/web/static/js/index.js +266 -0
  16. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  17. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  18. deepresearch_flow/paper/web/templates/base.html +43 -0
  19. deepresearch_flow/paper/web/templates/detail.html +332 -0
  20. deepresearch_flow/paper/web/templates/index.html +114 -0
  21. deepresearch_flow/paper/web/templates/stats.html +29 -0
  22. deepresearch_flow/paper/web/templates.py +85 -0
  23. deepresearch_flow/paper/web/text.py +68 -0
  24. deepresearch_flow/recognize/cli.py +157 -3
  25. deepresearch_flow/recognize/organize.py +58 -0
  26. deepresearch_flow/translator/__init__.py +1 -0
  27. deepresearch_flow/translator/cli.py +451 -0
  28. deepresearch_flow/translator/config.py +19 -0
  29. deepresearch_flow/translator/engine.py +959 -0
  30. deepresearch_flow/translator/fixers.py +451 -0
  31. deepresearch_flow/translator/placeholder.py +62 -0
  32. deepresearch_flow/translator/prompts.py +116 -0
  33. deepresearch_flow/translator/protector.py +291 -0
  34. deepresearch_flow/translator/segment.py +180 -0
  35. deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
  36. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
  37. deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
  38. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
  39. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  40. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  41. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,451 @@
1
+ """OCR markdown repair utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ import re
7
+ from typing import Iterable, Optional
8
+
9
+
10
+ class ReferenceProcessor:
11
+ def __init__(self) -> None:
12
+ self._patterns = {
13
+ "reference_def": re.compile(
14
+ r"^\[(\d+)\]((?:(?!\[\d+\])[^\n])*)\n(?=^\[\d+\]|$)",
15
+ re.MULTILINE,
16
+ ),
17
+ "reference_range": re.compile(r"\[(\d+)\-(\d+)\]"),
18
+ "reference_multi": re.compile(r"\[(\d+(?:,\s*\d+)*)\]"),
19
+ "reference_single": re.compile(r"\[(\d+)\]"),
20
+ }
21
+
22
+ def fix_references(self, text: str) -> str:
23
+ for match in re.findall(self._patterns["reference_def"], text):
24
+ original = f"[{match[0]}] {match[1].strip()}"
25
+ replacement = f"[^{match[0]}]: {match[1].strip()}\n"
26
+ text = text.replace(original, replacement)
27
+
28
+ for match in re.findall(self._patterns["reference_range"], text):
29
+ original = f"[{match[0]}-{match[1]}]"
30
+ expanded = " ".join(
31
+ f"[^{i}]" for i in range(int(match[0]), int(match[1]) + 1)
32
+ )
33
+ text = text.replace(original, expanded)
34
+
35
+ for match in re.findall(self._patterns["reference_multi"], text):
36
+ original = f"[{match}]"
37
+ numbers = [n.strip() for n in match.split(",")]
38
+ expanded = " ".join(f"[^{n}]" for n in numbers)
39
+ text = text.replace(original, expanded)
40
+
41
+ for match in re.findall(self._patterns["reference_single"], text):
42
+ original = f"[{match}]"
43
+ replacement = f"[^{match}]"
44
+ if original in text and f"[^{match}]" not in text.replace(replacement, ""):
45
+ text = text.replace(original, replacement)
46
+
47
+ return text
48
+
49
+
50
+ class LinkProcessor:
51
+ def __init__(self) -> None:
52
+ self._patterns = {
53
+ "url": re.compile(
54
+ r"(?<!<)(?<!]\()(?:(?<=^)|(?<=\s)|(?<=[\(\[{\"“]))"
55
+ r"(https?://[^\s\)\]\}>]+)"
56
+ r"(?=[\s\)\]\}>.,!?;:,。!?;:]|$)"
57
+ ),
58
+ "email": re.compile(
59
+ r"(?<!<)(?<!]\()(?<![\w.%+-])"
60
+ r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})"
61
+ r"(?=[\s\)\]\}>.,!?;:,。!?;:]|$)"
62
+ ),
63
+ "phone": re.compile(
64
+ r"(?<!<)(?<!]\()(?:(?<=^)|(?<=\s)|(?<=[\(\[{\"“]))"
65
+ r"(\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})"
66
+ r"(?=[\s\)\]\}>.,!?;:,。!?;:]|$)"
67
+ ),
68
+ }
69
+
70
+ def fix_links(self, text: str) -> str:
71
+ def bracket_urls(value: str) -> str:
72
+ def repl(match: re.Match) -> str:
73
+ url = match.group(1)
74
+ if len(url) > 1 and url[-1] in ".,!?;:,。!?;:" and url[-2].isalnum():
75
+ return f"<{url[:-1]}>{url[-1]}"
76
+ return f"<{url}>"
77
+
78
+ return self._patterns["url"].sub(repl, value)
79
+
80
+ def bracket_emails(value: str) -> str:
81
+ return self._patterns["email"].sub(r"<mailto:\1>", value)
82
+
83
+ def bracket_phones(value: str) -> str:
84
+ return self._patterns["phone"].sub(lambda m: f"<tel:{m.group(1)}>", value)
85
+
86
+ text = bracket_urls(text)
87
+ text = bracket_emails(text)
88
+ text = bracket_phones(text)
89
+ return text
90
+
91
+
92
+ class PseudocodeProcessor:
93
+ def __init__(self) -> None:
94
+ self._header_pattern = re.compile(
95
+ r"^\s*\*?\*?\s*(Algorithm|算法)\s+([A-Za-z0-9.-]+)?\*?\*?\s*(.*)$",
96
+ re.IGNORECASE,
97
+ )
98
+
99
+ def wrap_pseudocode_blocks(self, text: str, lang: str = "pseudo") -> str:
100
+ lines = text.splitlines()
101
+ out: list[str] = []
102
+ i = 0
103
+ in_fence = False
104
+
105
+ while i < len(lines):
106
+ line = lines[i]
107
+ if line.strip().startswith("```"):
108
+ in_fence = not in_fence
109
+ out.append(line)
110
+ i += 1
111
+ continue
112
+
113
+ if not in_fence and self._header_pattern.match(line):
114
+ header_line = line
115
+ block = [header_line]
116
+ i += 1
117
+ while i < len(lines):
118
+ peek = lines[i]
119
+ if peek.strip().startswith("```") or re.match(r"^\s*#{1,6}\s", peek):
120
+ break
121
+ if not self._is_algo_continuation(peek):
122
+ break
123
+ block.append(peek)
124
+ i += 1
125
+
126
+ out.append(f"```{lang}")
127
+ title = self._format_title(header_line)
128
+ if title:
129
+ out.append(f"// {title}")
130
+ for raw in block[1:]:
131
+ s = raw.strip()
132
+ if s == "***":
133
+ out.append("// " + "-" * 40)
134
+ continue
135
+ out.append(self._clean_inline(raw))
136
+ out.append("```")
137
+ continue
138
+
139
+ out.append(line)
140
+ i += 1
141
+
142
+ return "\n".join(out)
143
+
144
+ def _format_title(self, header_line: str) -> str | None:
145
+ match = self._header_pattern.match(header_line)
146
+ if not match:
147
+ return None
148
+ alg_no = (match.group(2) or "").strip()
149
+ rest = (match.group(3) or "").strip()
150
+ rest = self._clean_inline(rest)
151
+ if alg_no and rest:
152
+ return f"Algorithm {alg_no}: {rest}"
153
+ if alg_no:
154
+ return f"Algorithm {alg_no}"
155
+ if rest:
156
+ return f"Algorithm: {rest}"
157
+ return "Algorithm"
158
+
159
+ def _is_algo_continuation(self, line: str) -> bool:
160
+ s = line.strip()
161
+ if s == "" or s == "***":
162
+ return True
163
+ if re.match(r"^\s*\d+\s*[:.)]\ ", s):
164
+ return True
165
+ if re.match(r"^\s*(Input|Output|Require|Ensure):\s*", s, re.I):
166
+ return True
167
+ if re.match(
168
+ r"^\s*(function|procedure|for|while|if|else|repeat|return|end)\b",
169
+ s,
170
+ re.I,
171
+ ):
172
+ return True
173
+ return False
174
+
175
+ def _clean_inline(self, text: str) -> str:
176
+ text = re.sub(r"<\s*sub\s*>\s*(.*?)\s*<\s*/\s*sub\s*>", lambda m: "_" + re.sub(r"\*", "", m.group(1)), text, flags=re.I)
177
+ text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
178
+ text = re.sub(r"\*([^\*]+)\*", r"\1", text)
179
+ text = re.sub(r"\*+$", "", text)
180
+ text = re.sub(r"^\*+", "", text)
181
+ return text.strip()
182
+
183
+
184
+ class TitleProcessor:
185
+ def __init__(self) -> None:
186
+ self._patterns = {
187
+ "roman_with_sec": re.compile(
188
+ r"^(#{1,6})?\s*(Sec(?:tion)?\.\s*)?([IVX]+(?:\.[IVX]+)*)(\.?)\s+(.+)$"
189
+ ),
190
+ "number": re.compile(r"^\s*(#{1,6})?\s*(\d+(?:\.\d+)*)(\.?)\s+(.+)$"),
191
+ "letter_upper": re.compile(r"^(#{1,6})?\s*([A-Z])\.\s+(.+)$"),
192
+ "letter_lower": re.compile(r"^(#{1,6})?\s*([a-z])\.\s+(.+)$"),
193
+ }
194
+
195
+ def fix_titles(self, text: str) -> str:
196
+ lines = text.split("\n")
197
+ new_lines: list[str] = []
198
+
199
+ def is_title(line: str) -> bool:
200
+ return re.match(r"^#{1,6}\s+", line) is not None
201
+
202
+ has_roman = bool(
203
+ re.search(
204
+ r"^#{1,6}?\s*(?:Sec(?:tion)?\.\s*)?[IVX]+(?:\.[IVX]+)*\.?\s+",
205
+ text,
206
+ re.MULTILINE,
207
+ )
208
+ )
209
+
210
+ for line in lines:
211
+ if not is_title(line):
212
+ new_lines.append(line)
213
+ continue
214
+ modified = False
215
+
216
+ match = self._patterns["roman_with_sec"].match(line)
217
+ if match:
218
+ section_prefix = match.group(2) or ""
219
+ roman_num = match.group(3)
220
+ dot = match.group(4)
221
+ title = match.group(5)
222
+ level = len(roman_num.split(".")) + 1
223
+ new_hashes = "#" * level
224
+ new_line = f"{new_hashes} {section_prefix}{roman_num}{dot or '.'} {title}"
225
+ new_lines.append(new_line)
226
+ modified = True
227
+
228
+ if not modified:
229
+ match = self._patterns["number"].match(line)
230
+ if match:
231
+ number = match.group(2)
232
+ dot = match.group(3)
233
+ title = match.group(4)
234
+ level = len(number.split(".")) + 1
235
+ if has_roman:
236
+ level += 1
237
+ new_hashes = "#" * min(level, 6)
238
+ trail_dot = dot if has_roman else (dot or ".")
239
+ new_line = f"{new_hashes} {number}{trail_dot} {title}"
240
+ new_lines.append(new_line)
241
+ modified = True
242
+
243
+ if not modified:
244
+ for pattern_name in ["letter_upper", "letter_lower"]:
245
+ match = self._patterns[pattern_name].match(line)
246
+ if match and not re.match(r"^[A-Z][a-z]", match.group(3)):
247
+ letter = match.group(2)
248
+ title = match.group(3)
249
+ level = 4 if pattern_name == "letter_upper" else 5
250
+ new_hashes = "#" * level
251
+ new_line = f"{new_hashes} {letter}. {title}"
252
+ new_lines.append(new_line)
253
+ modified = True
254
+ break
255
+
256
+ if not modified:
257
+ new_lines.append(line)
258
+
259
+ return "\n".join(new_lines)
260
+
261
+
262
+ @dataclass
263
+ class Block:
264
+ kind: str
265
+ content: str
266
+
267
+
268
+ def _is_blank(line: str) -> bool:
269
+ return len(line.strip()) == 0
270
+
271
+
272
+ def _line_starts_with_fence(line: str) -> Optional[str]:
273
+ match = re.match(r"^\s*(`{3,}|~{3,})", line)
274
+ return match.group(1) if match else None
275
+
276
+
277
+ def _looks_like_table_header(line: str) -> bool:
278
+ return "|" in line
279
+
280
+
281
+ def _looks_like_table_delim(line: str) -> bool:
282
+ return (
283
+ re.match(r"^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$", line)
284
+ is not None
285
+ )
286
+
287
+
288
+ def _is_image_line(line: str) -> bool:
289
+ return re.match(r"^\s*!\[.*?\]\(.*?\)\s*$", line) is not None
290
+
291
+
292
+ def _parse_blocks(text: str) -> list[Block]:
293
+ lines = text.splitlines(keepends=True)
294
+ blocks: list[Block] = []
295
+ i = 0
296
+ n = len(lines)
297
+
298
+ while i < n:
299
+ line = lines[i]
300
+
301
+ if _is_blank(line):
302
+ blocks.append(Block(kind="sep", content=line))
303
+ i += 1
304
+ continue
305
+
306
+ if line.strip() == "---":
307
+ blocks.append(Block(kind="page", content=line))
308
+ i += 1
309
+ continue
310
+
311
+ fence = _line_starts_with_fence(line)
312
+ if fence:
313
+ j = i + 1
314
+ while j < n and not re.match(rf"^\s*{re.escape(fence)}", lines[j]):
315
+ j += 1
316
+ if j < n:
317
+ block = "".join(lines[i : j + 1])
318
+ blocks.append(Block(kind="code", content=block))
319
+ i = j + 1
320
+ continue
321
+ block = "".join(lines[i:])
322
+ blocks.append(Block(kind="code", content=block))
323
+ break
324
+
325
+ if _is_image_line(line):
326
+ blocks.append(Block(kind="image", content=line))
327
+ i += 1
328
+ continue
329
+
330
+ if i + 1 < n and _looks_like_table_header(line) and _looks_like_table_delim(lines[i + 1]):
331
+ j = i + 2
332
+ while j < n and ("|" in lines[j]) and not _is_blank(lines[j]):
333
+ j += 1
334
+ block = "".join(lines[i:j])
335
+ blocks.append(Block(kind="table", content=block))
336
+ i = j
337
+ continue
338
+
339
+ if line.strip() == "$$":
340
+ j = i + 1
341
+ while j < n and lines[j].strip() != "$$":
342
+ j += 1
343
+ if j < n:
344
+ block = "".join(lines[i : j + 1])
345
+ blocks.append(Block(kind="math", content=block))
346
+ i = j + 1
347
+ continue
348
+ block = "".join(lines[i:])
349
+ blocks.append(Block(kind="math", content=block))
350
+ break
351
+
352
+ text_lines = [line]
353
+ j = i + 1
354
+ while j < n:
355
+ peek = lines[j]
356
+ if _is_blank(peek) or _is_image_line(peek) or peek.strip() == "---":
357
+ break
358
+ if _line_starts_with_fence(peek):
359
+ break
360
+ if j + 1 < n and _looks_like_table_header(peek) and _looks_like_table_delim(lines[j + 1]):
361
+ break
362
+ if peek.strip() == "$$":
363
+ break
364
+ text_lines.append(peek)
365
+ j += 1
366
+ blocks.append(Block(kind="text", content="".join(text_lines)))
367
+ i = j
368
+
369
+ return blocks
370
+
371
+
372
+ def _word_set(text: str) -> set[str]:
373
+ return {w for w in re.split(r"\W+", text.lower()) if w}
374
+
375
+
376
+ def _split_confidence(before_text: str, after_text: str) -> float:
377
+ confidence = 0.0
378
+ if not re.search(r"[.!?。!?]\s*$", before_text):
379
+ confidence += 0.3
380
+ if after_text and after_text[0].islower():
381
+ confidence += 0.4
382
+ common_words = len(_word_set(before_text) & _word_set(after_text))
383
+ if common_words > 1:
384
+ confidence += min(0.3, common_words * 0.1)
385
+ return min(confidence, 1.0)
386
+
387
+
388
+ def _merge_blocks(blocks: list[Block]) -> list[Block]:
389
+ idx = 0
390
+ while idx + 2 < len(blocks):
391
+ before = blocks[idx]
392
+ middle = blocks[idx + 1]
393
+ after = blocks[idx + 2]
394
+ if before.kind != "text" or after.kind != "text":
395
+ idx += 1
396
+ continue
397
+ if middle.kind not in {"page", "image", "table", "code"}:
398
+ idx += 1
399
+ continue
400
+
401
+ before_text = before.content.strip()
402
+ after_text = after.content.strip()
403
+ if before_text == "" or after_text == "":
404
+ idx += 1
405
+ continue
406
+
407
+ if middle.kind == "page" and before_text.endswith("-") and after_text[0].islower():
408
+ merged_text = before.content.rstrip("-") + after.content.lstrip()
409
+ blocks = blocks[:idx] + [Block(kind="text", content=merged_text)] + blocks[idx + 3 :]
410
+ continue
411
+
412
+ confidence = _split_confidence(before_text, after_text)
413
+ if confidence < 0.7:
414
+ idx += 1
415
+ continue
416
+
417
+ merged_text = before.content.rstrip() + " " + after.content.lstrip()
418
+ if middle.kind == "page":
419
+ blocks = blocks[:idx] + [Block(kind="text", content=merged_text)] + blocks[idx + 3 :]
420
+ continue
421
+
422
+ blocks = blocks[:idx] + [Block(kind="text", content=merged_text), middle] + blocks[idx + 3 :]
423
+ idx += 1
424
+
425
+ return blocks
426
+
427
+
428
+ def merge_paragraphs(text: str) -> str:
429
+ blocks = _parse_blocks(text)
430
+ merged = _merge_blocks(blocks)
431
+ return "".join(block.content for block in merged)
432
+
433
+
434
+ def fix_markdown(text: str, level: str) -> str:
435
+ if level == "off":
436
+ return text
437
+
438
+ ref_processor = ReferenceProcessor()
439
+ link_processor = LinkProcessor()
440
+ pseudo_processor = PseudocodeProcessor()
441
+ title_processor = TitleProcessor()
442
+
443
+ text = merge_paragraphs(text)
444
+ text = ref_processor.fix_references(text)
445
+ text = link_processor.fix_links(text)
446
+ text = pseudo_processor.wrap_pseudocode_blocks(text)
447
+
448
+ if level == "aggressive":
449
+ text = title_processor.fix_titles(text)
450
+
451
+ return text
@@ -0,0 +1,62 @@
1
+ """Placeholder store for protected markdown segments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Dict, List
7
+
8
+
9
+ class PlaceHolderStore:
10
+ def __init__(self) -> None:
11
+ self._map: dict[str, str] = {}
12
+ self._rev: dict[str, str] = {}
13
+ self._kind_count: dict[str, int] = {}
14
+ self.length = 0
15
+
16
+ def add(self, kind: str, text: str) -> str:
17
+ if text in self._map:
18
+ return self._map[text]
19
+
20
+ self.length += 1
21
+ length_str = str(self.length).zfill(6)
22
+ placeholder = f"__PH_{kind}_{length_str}__"
23
+ self._map[text] = placeholder
24
+ self._rev[placeholder] = text
25
+ self._kind_count[kind] = self._kind_count.get(kind, 0) + 1
26
+ return placeholder
27
+
28
+ def save(self, file_path: str) -> None:
29
+ payload = {
30
+ "map": self._map,
31
+ "rev": self._rev,
32
+ "kind_count": self._kind_count,
33
+ }
34
+ with open(file_path, "w", encoding="utf-8") as handle:
35
+ json.dump(payload, handle, indent=2, ensure_ascii=False)
36
+
37
+ def load(self, file_path: str) -> None:
38
+ with open(file_path, "r", encoding="utf-8") as handle:
39
+ payload = json.load(handle)
40
+ self._map = payload.get("map", {})
41
+ self._rev = payload.get("rev", {})
42
+ self._kind_count = payload.get("kind_count", {})
43
+ self.length = len(self._map)
44
+
45
+ def restore_all(self, text: str) -> str:
46
+ for placeholder, raw in sorted(self._rev.items(), key=lambda item: -len(item[0])):
47
+ if raw.endswith("\n"):
48
+ text = text.replace(f"{placeholder}\n", raw)
49
+ text = text.replace(placeholder, raw)
50
+ return text
51
+
52
+ def contains_all(self, text: str) -> bool:
53
+ return all(placeholder in text for placeholder in self._map.values())
54
+
55
+ def diff_missing(self, text: str) -> List[str]:
56
+ return [ph for ph in self._map.values() if ph not in text]
57
+
58
+ def snapshot(self) -> Dict[str, str]:
59
+ return dict(self._map)
60
+
61
+ def kind_counts(self) -> Dict[str, int]:
62
+ return dict(self._kind_count)
@@ -0,0 +1,116 @@
1
+ """Prompt templates for markdown translation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from string import Template
6
+ from textwrap import dedent
7
+
8
+
9
+ def _cdata_wrap(text: str) -> str:
10
+ return text.replace("]]>", "]]]]><![CDATA[>")
11
+
12
+
13
+ SYSTEM_RULES = dedent(
14
+ """\
15
+ You are a professional translation engine. Follow these invariant rules:
16
+ - Preserve all original formatting exactly (Markdown, whitespace, paragraph breaks).
17
+ - Do NOT translate LaTeX ($...$, $$...$$, \\( ... \\), \\[ ... \\]) or LaTeX commands/environments.
18
+ - Keep all HTML tags intact.
19
+ - Do NOT alter abbreviations, technical terms, or code identifiers.
20
+ - Handle NODE styles: @@NODE_START_{n}@@/@@NODE_END_{n}@@ and <NODE_START_{n}></NODE_END_{n}>.
21
+ - Respect PRESERVE spans: @@PRESERVE_{n}@@ ... @@/PRESERVE_{n}@@ (leave markers and enclosed text unchanged).
22
+ - Placeholders like __PH_[A-Z0-9_]+__ must remain unchanged.
23
+ - Output ONLY the NODE blocks in original order; no extra commentary.
24
+ - If markers malformed: reproduce original block verbatim and append <!-- VIOLATION: reason --> once.
25
+ """
26
+ )
27
+
28
+
29
+ TRANSLATE_XML_TEMPLATE = Template(
30
+ dedent(
31
+ """\
32
+ <TranslationTask version="1.0">
33
+ <meta>
34
+ <source_lang>$SOURCE_LANG</source_lang>
35
+ <target_lang>$TARGET_LANG</target_lang>
36
+ <visibility_note>Sections with visibility="internal" are instructions and MUST NOT appear in the final output.</visibility_note>
37
+ </meta>
38
+
39
+ <constraints visibility="internal">
40
+ <rule id="fmt-1">Preserve ALL original formatting exactly: Markdown, whitespace, line breaks, paragraph spacing.</rule>
41
+ <rule id="fmt-2">Do NOT translate any content inside LaTeX ($$...$$, $$$$...$$$$, \\( ... \\), \\[ ... \\]) or LaTeX commands/environments.</rule>
42
+ <rule id="fmt-3">Keep ALL HTML tags intact.</rule>
43
+ <rule id="fmt-4">Do NOT alter abbreviations, technical terms, or code identifiers; translate surrounding prose only.</rule>
44
+ <rule id="fmt-5">Document structure must be preserved, including blank lines (double newlines) between blocks.</rule>
45
+ </constraints>
46
+
47
+ <markers visibility="internal">
48
+ <preserve>
49
+ <open>@@PRESERVE_{n}@@</open>
50
+ <close>@@/PRESERVE_{n}@@</close>
51
+ <instruction>Leave both markers and enclosed text EXACTLY unchanged.</instruction>
52
+ </preserve>
53
+ <node accepted_styles="double">
54
+ <style type="at">
55
+ <open>@@NODE_START_{n}@@</open>
56
+ <close>@@NODE_END_{n}@@</close>
57
+ </style>
58
+ <style type="angle">
59
+ <open>&lt;NODE_START_{n}&gt;</open>
60
+ <close>&lt;/NODE_END_{n}&gt;</close>
61
+ </style>
62
+ <scope>Translate ONLY the text inside each NODE block.</scope>
63
+ <layout>
64
+ <rule>Preserve the exact presence/absence of newlines around the content.</rule>
65
+ <rule>Preserve all spaces and blank lines BETWEEN NODE blocks exactly.</rule>
66
+ </layout>
67
+ </node>
68
+ <placeholders>
69
+ <pattern>__PH_[A-Z0-9_]+__</pattern>
70
+ <instruction>All placeholders matching this regex MUST be left unchanged.</instruction>
71
+ </placeholders>
72
+ </markers>
73
+
74
+ <output_spec visibility="internal">
75
+ <rule id="out-1">Output ONLY the NODE blocks in the original order. Non-NODE text must NOT be echoed.</rule>
76
+ <rule id="out-2">For each NODE: emit the exact START marker, then the translated content, then the exact END marker.</rule>
77
+ <rule id="out-3">Do NOT reveal or restate any instructions with visibility="internal".</rule>
78
+ </output_spec>
79
+
80
+ <quality_checks visibility="internal">
81
+ <check>Count of START and END NODE markers is identical to input; indices {n} match 1:1.</check>
82
+ <check>No PRESERVE spans were altered; byte-for-byte identical.</check>
83
+ <check>No LaTeX/HTML/code tokens changed; only prose translated.</check>
84
+ <check>Paragraph breaks and intra-block whitespace unchanged.</check>
85
+ </quality_checks>
86
+
87
+ <fallback visibility="internal">
88
+ <strategy>If a block violates constraints or markers are malformed, do NOT guess. Reproduce the original block unchanged and append a single-line comment &lt;!-- VIOLATION: reason --&gt; after the block.</strategy>
89
+ </fallback>
90
+
91
+ <io>
92
+ <input>
93
+ <![CDATA[
94
+ $TEXT_TO_TRANSLATE
95
+ ]]>
96
+ </input>
97
+ <expected_output visibility="internal">
98
+ <note>Emit only transformed NODE blocks per output_spec. Nothing else.</note>
99
+ </expected_output>
100
+ </io>
101
+ </TranslationTask>
102
+ """
103
+ )
104
+ )
105
+
106
+
107
+ def build_translation_messages(source_lang: str | None, target_lang: str, text: str) -> list[dict[str, str]]:
108
+ user_xml = TRANSLATE_XML_TEMPLATE.substitute(
109
+ SOURCE_LANG=source_lang or "auto",
110
+ TARGET_LANG=target_lang,
111
+ TEXT_TO_TRANSLATE=_cdata_wrap(text),
112
+ )
113
+ return [
114
+ {"role": "system", "content": SYSTEM_RULES},
115
+ {"role": "user", "content": user_xml},
116
+ ]