deepresearch-flow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1041 -34
- deepresearch_flow/paper/db_ops.py +145 -26
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +40 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/paper/web/markdown.py +174 -8
- deepresearch_flow/paper/web/static/css/main.css +8 -1
- deepresearch_flow/paper/web/static/js/detail.js +46 -12
- deepresearch_flow/paper/web/templates/detail.html +9 -0
- deepresearch_flow/paper/web/text.py +8 -4
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +35 -16
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/fixers.py +15 -0
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -137,7 +137,13 @@ def parse_data_url(target: str) -> Optional[tuple[str, bytes]]:
|
|
|
137
137
|
try:
|
|
138
138
|
return mime, base64.b64decode(payload)
|
|
139
139
|
except Exception as exc: # pragma: no cover - defensive
|
|
140
|
-
|
|
140
|
+
message = str(exc).strip() or "unknown error"
|
|
141
|
+
logger.warning(
|
|
142
|
+
"Failed to decode base64 image (mime=%s, chars=%d): %s",
|
|
143
|
+
mime or "<unknown>",
|
|
144
|
+
len(payload),
|
|
145
|
+
message,
|
|
146
|
+
)
|
|
141
147
|
return None
|
|
142
148
|
|
|
143
149
|
|
|
@@ -218,17 +224,26 @@ async def embed_markdown_images(
|
|
|
218
224
|
try:
|
|
219
225
|
response = await http_client.get(target)
|
|
220
226
|
except Exception as exc:
|
|
221
|
-
|
|
227
|
+
message = str(exc).strip() or "unknown error"
|
|
228
|
+
logger.warning("Failed to fetch %s (md=%s): %s", target, md_path, message)
|
|
222
229
|
return None
|
|
223
230
|
if response.status_code >= 400:
|
|
224
|
-
logger.warning(
|
|
231
|
+
logger.warning(
|
|
232
|
+
"Failed to fetch %s (md=%s): HTTP %d",
|
|
233
|
+
target,
|
|
234
|
+
md_path,
|
|
235
|
+
response.status_code,
|
|
236
|
+
)
|
|
225
237
|
return None
|
|
226
238
|
content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
|
|
227
239
|
if not content_type.startswith("image/"):
|
|
228
240
|
guessed = mime_from_path(Path(urlparse(target).path))
|
|
229
241
|
if not guessed or not guessed.startswith("image/"):
|
|
230
242
|
logger.warning(
|
|
231
|
-
"Skipping non-image URL %s (Content-Type
|
|
243
|
+
"Skipping non-image URL %s (md=%s, Content-Type=%s)",
|
|
244
|
+
target,
|
|
245
|
+
md_path,
|
|
246
|
+
content_type,
|
|
232
247
|
)
|
|
233
248
|
return None
|
|
234
249
|
content_type = guessed
|
|
@@ -236,11 +251,16 @@ async def embed_markdown_images(
|
|
|
236
251
|
|
|
237
252
|
local_path = resolve_local_path(md_path, target)
|
|
238
253
|
if not local_path.exists() or not local_path.is_file():
|
|
239
|
-
logger.warning("Image not found: %s", local_path)
|
|
254
|
+
logger.warning("Image not found: %s (md=%s, target=%s)", local_path, md_path, target)
|
|
240
255
|
return None
|
|
241
256
|
mime = mime_from_path(local_path)
|
|
242
257
|
if not mime or not mime.startswith("image/"):
|
|
243
|
-
logger.warning(
|
|
258
|
+
logger.warning(
|
|
259
|
+
"Unsupported image type: %s (md=%s, mime=%s)",
|
|
260
|
+
local_path,
|
|
261
|
+
md_path,
|
|
262
|
+
mime or "unknown",
|
|
263
|
+
)
|
|
244
264
|
return None
|
|
245
265
|
data = await asyncio.to_thread(local_path.read_bytes)
|
|
246
266
|
return data_url_from_bytes(mime, data)
|
|
@@ -264,7 +284,11 @@ async def unpack_markdown_images(
|
|
|
264
284
|
mime, data = parsed
|
|
265
285
|
ext = extension_from_mime(mime)
|
|
266
286
|
if not ext:
|
|
267
|
-
logger.warning(
|
|
287
|
+
logger.warning(
|
|
288
|
+
"Unsupported MIME type: %s (alt=%s)",
|
|
289
|
+
mime,
|
|
290
|
+
alt_text or "<empty>",
|
|
291
|
+
)
|
|
268
292
|
return None
|
|
269
293
|
base_name = base_name_from_alt(alt_text)
|
|
270
294
|
if not base_name:
|
|
@@ -392,7 +392,7 @@ def _ensure_node_validator() -> NodeKatexValidator | None:
|
|
|
392
392
|
node_path = shutil.which("node")
|
|
393
393
|
if not node_path:
|
|
394
394
|
if not _KATEX_WARNED:
|
|
395
|
-
logger.warning("node not
|
|
395
|
+
logger.warning("node binary not found; skip KaTeX validation")
|
|
396
396
|
_KATEX_WARNED = True
|
|
397
397
|
return None
|
|
398
398
|
if _NODE_KATEX_READY is None:
|
|
@@ -408,7 +408,10 @@ def _ensure_node_validator() -> NodeKatexValidator | None:
|
|
|
408
408
|
_NODE_KATEX_READY = False
|
|
409
409
|
if not _NODE_KATEX_READY:
|
|
410
410
|
if not _KATEX_WARNED:
|
|
411
|
-
logger.warning(
|
|
411
|
+
logger.warning(
|
|
412
|
+
"katex npm package not available; skip KaTeX validation (node=%s)",
|
|
413
|
+
node_path,
|
|
414
|
+
)
|
|
412
415
|
_KATEX_WARNED = True
|
|
413
416
|
return None
|
|
414
417
|
script_path = str((Path(__file__).with_name("katex_check.js")).resolve())
|
|
@@ -594,12 +597,22 @@ async def fix_math_text(
|
|
|
594
597
|
stats: MathFixStats,
|
|
595
598
|
repair_enabled: bool = True,
|
|
596
599
|
spans: list[FormulaSpan] | None = None,
|
|
600
|
+
allowed_keys: set[tuple[int, str | None, int | None]] | None = None,
|
|
597
601
|
progress_cb: Callable[[], None] | None = None,
|
|
598
602
|
) -> tuple[str, list[dict[str, Any]]]:
|
|
599
603
|
replacements: list[tuple[int, int, str]] = []
|
|
600
604
|
issues: list[FormulaIssue] = []
|
|
601
605
|
if spans is None:
|
|
602
606
|
spans = extract_math_spans(text, context_chars)
|
|
607
|
+
if allowed_keys:
|
|
608
|
+
filtered: list[FormulaSpan] = []
|
|
609
|
+
for span in spans:
|
|
610
|
+
line_no = line_offset + span.line - 1
|
|
611
|
+
if (line_no, field_path, item_index) in allowed_keys:
|
|
612
|
+
filtered.append(span)
|
|
613
|
+
spans = filtered
|
|
614
|
+
if not spans:
|
|
615
|
+
return text, []
|
|
603
616
|
stats.formulas_total += len(spans)
|
|
604
617
|
file_id = short_hash(file_path)
|
|
605
618
|
for idx, span in enumerate(spans):
|
|
@@ -638,16 +651,38 @@ async def fix_math_text(
|
|
|
638
651
|
|
|
639
652
|
error_records: list[dict[str, Any]] = []
|
|
640
653
|
if issues and repair_enabled:
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
timeout,
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
654
|
+
# Convert to list for parallel processing
|
|
655
|
+
batches = list(iter_batches(issues, batch_size))
|
|
656
|
+
|
|
657
|
+
# Parallel batch repair
|
|
658
|
+
batch_results = await asyncio.gather(
|
|
659
|
+
*[
|
|
660
|
+
repair_batch(batch, provider, model_name, api_key, timeout, max_retries, client)
|
|
661
|
+
for batch in batches
|
|
662
|
+
],
|
|
663
|
+
return_exceptions=True,
|
|
664
|
+
)
|
|
665
|
+
|
|
666
|
+
# Process results
|
|
667
|
+
for batch, result in zip(batches, batch_results):
|
|
668
|
+
if isinstance(result, Exception):
|
|
669
|
+
# Entire batch failed with exception
|
|
670
|
+
error = str(result)
|
|
671
|
+
for issue in batch:
|
|
672
|
+
stats.formulas_failed += 1
|
|
673
|
+
error_records.append({
|
|
674
|
+
"path": file_path,
|
|
675
|
+
"line": line_offset + issue.span.line - 1,
|
|
676
|
+
"delimiter": issue.span.delimiter,
|
|
677
|
+
"latex": issue.span.content,
|
|
678
|
+
"errors": issue.errors + [f"batch_exception: {error}"],
|
|
679
|
+
"field_path": issue.field_path,
|
|
680
|
+
"item_index": issue.item_index,
|
|
681
|
+
})
|
|
682
|
+
continue
|
|
683
|
+
|
|
684
|
+
repairs, error = result
|
|
685
|
+
|
|
651
686
|
if error:
|
|
652
687
|
for issue in batch:
|
|
653
688
|
stats.formulas_failed += 1
|
|
@@ -40,6 +40,17 @@ class MermaidIssue:
|
|
|
40
40
|
item_index: int | None
|
|
41
41
|
|
|
42
42
|
|
|
43
|
+
@dataclass
|
|
44
|
+
class DiagramTask:
|
|
45
|
+
"""Global diagram task for parallel processing."""
|
|
46
|
+
file_path: Path
|
|
47
|
+
file_line_offset: int
|
|
48
|
+
field_path: str | None
|
|
49
|
+
item_index: int | None
|
|
50
|
+
span: MermaidSpan
|
|
51
|
+
issue: MermaidIssue | None
|
|
52
|
+
|
|
53
|
+
|
|
43
54
|
@dataclass
|
|
44
55
|
class MermaidFixStats:
|
|
45
56
|
diagrams_total: int = 0
|
|
@@ -574,12 +585,22 @@ async def fix_mermaid_text(
|
|
|
574
585
|
stats: MermaidFixStats,
|
|
575
586
|
repair_enabled: bool = True,
|
|
576
587
|
spans: list[MermaidSpan] | None = None,
|
|
588
|
+
allowed_keys: set[tuple[int, str | None, int | None]] | None = None,
|
|
577
589
|
progress_cb: Callable[[], None] | None = None,
|
|
578
590
|
) -> tuple[str, list[dict[str, Any]]]:
|
|
579
591
|
replacements: list[tuple[int, int, str]] = []
|
|
580
592
|
issues: list[MermaidIssue] = []
|
|
581
593
|
if spans is None:
|
|
582
594
|
spans = extract_mermaid_spans(text, context_chars)
|
|
595
|
+
if allowed_keys:
|
|
596
|
+
filtered: list[MermaidSpan] = []
|
|
597
|
+
for span in spans:
|
|
598
|
+
line_no = line_offset + span.line - 1
|
|
599
|
+
if (line_no, field_path, item_index) in allowed_keys:
|
|
600
|
+
filtered.append(span)
|
|
601
|
+
spans = filtered
|
|
602
|
+
if not spans:
|
|
603
|
+
return text, []
|
|
583
604
|
stats.diagrams_total += len(spans)
|
|
584
605
|
file_id = short_hash(file_path)
|
|
585
606
|
for idx, span in enumerate(spans):
|
|
@@ -614,16 +635,37 @@ async def fix_mermaid_text(
|
|
|
614
635
|
|
|
615
636
|
error_records: list[dict[str, Any]] = []
|
|
616
637
|
if issues and repair_enabled:
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
timeout,
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
638
|
+
# Convert to list for parallel processing
|
|
639
|
+
batches = list(iter_batches(issues, batch_size))
|
|
640
|
+
|
|
641
|
+
# Parallel batch repair
|
|
642
|
+
batch_results = await asyncio.gather(
|
|
643
|
+
*[
|
|
644
|
+
repair_batch(batch, provider, model_name, api_key, timeout, max_retries, client)
|
|
645
|
+
for batch in batches
|
|
646
|
+
],
|
|
647
|
+
return_exceptions=True,
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
# Process results
|
|
651
|
+
for batch, result in zip(batches, batch_results):
|
|
652
|
+
if isinstance(result, Exception):
|
|
653
|
+
# Entire batch failed with exception
|
|
654
|
+
error = str(result)
|
|
655
|
+
for issue in batch:
|
|
656
|
+
stats.diagrams_failed += 1
|
|
657
|
+
error_records.append({
|
|
658
|
+
"path": file_path,
|
|
659
|
+
"line": line_offset + issue.span.line - 1,
|
|
660
|
+
"mermaid": issue.span.content,
|
|
661
|
+
"errors": issue.errors + [f"batch_exception: {error}"],
|
|
662
|
+
"field_path": issue.field_path,
|
|
663
|
+
"item_index": issue.item_index,
|
|
664
|
+
})
|
|
665
|
+
continue
|
|
666
|
+
|
|
667
|
+
repairs, error = result
|
|
668
|
+
|
|
627
669
|
if error:
|
|
628
670
|
for issue in batch:
|
|
629
671
|
stats.diagrams_failed += 1
|
|
@@ -688,3 +730,271 @@ async def fix_mermaid_text(
|
|
|
688
730
|
|
|
689
731
|
updated = apply_replacements(text, replacements)
|
|
690
732
|
return updated, error_records
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def extract_diagrams_from_text(
|
|
736
|
+
text: str,
|
|
737
|
+
file_path: Path,
|
|
738
|
+
line_offset: int,
|
|
739
|
+
field_path: str | None,
|
|
740
|
+
item_index: int | None,
|
|
741
|
+
context_chars: int,
|
|
742
|
+
skip_validation: bool = False,
|
|
743
|
+
) -> list[DiagramTask]:
|
|
744
|
+
"""Extract all diagram tasks from a text block.
|
|
745
|
+
|
|
746
|
+
Args:
|
|
747
|
+
skip_validation: If True, skip validation and mark all diagrams as having issues.
|
|
748
|
+
This is faster for initial extraction when you'll validate later.
|
|
749
|
+
"""
|
|
750
|
+
tasks: list[DiagramTask] = []
|
|
751
|
+
spans = extract_mermaid_spans(text, context_chars)
|
|
752
|
+
file_id = short_hash(str(file_path))
|
|
753
|
+
|
|
754
|
+
for idx, span in enumerate(spans):
|
|
755
|
+
issue: MermaidIssue | None = None
|
|
756
|
+
|
|
757
|
+
if skip_validation:
|
|
758
|
+
# Mark all diagrams as needing validation (skip expensive mmdc call)
|
|
759
|
+
issue_id = f"{file_id}:{line_offset + span.line - 1}:{idx}"
|
|
760
|
+
issue = MermaidIssue(
|
|
761
|
+
issue_id=issue_id,
|
|
762
|
+
span=span,
|
|
763
|
+
errors=["not_validated"],
|
|
764
|
+
field_path=field_path,
|
|
765
|
+
item_index=item_index,
|
|
766
|
+
)
|
|
767
|
+
else:
|
|
768
|
+
# Full validation (expensive)
|
|
769
|
+
validation = validate_mermaid(span.content)
|
|
770
|
+
|
|
771
|
+
if validation:
|
|
772
|
+
# Try cleanup first
|
|
773
|
+
candidate = cleanup_mermaid(span.content)
|
|
774
|
+
if candidate != span.content:
|
|
775
|
+
candidate_validation = validate_mermaid(candidate)
|
|
776
|
+
if not candidate_validation:
|
|
777
|
+
# Cleanup fixed it, no issue
|
|
778
|
+
pass
|
|
779
|
+
else:
|
|
780
|
+
validation = candidate_validation
|
|
781
|
+
|
|
782
|
+
if validation:
|
|
783
|
+
# Still invalid after cleanup
|
|
784
|
+
issue_id = f"{file_id}:{line_offset + span.line - 1}:{idx}"
|
|
785
|
+
issue = MermaidIssue(
|
|
786
|
+
issue_id=issue_id,
|
|
787
|
+
span=span,
|
|
788
|
+
errors=[validation],
|
|
789
|
+
field_path=field_path,
|
|
790
|
+
item_index=item_index,
|
|
791
|
+
)
|
|
792
|
+
|
|
793
|
+
tasks.append(
|
|
794
|
+
DiagramTask(
|
|
795
|
+
file_path=file_path,
|
|
796
|
+
file_line_offset=line_offset,
|
|
797
|
+
field_path=field_path,
|
|
798
|
+
item_index=item_index,
|
|
799
|
+
span=span,
|
|
800
|
+
issue=issue,
|
|
801
|
+
)
|
|
802
|
+
)
|
|
803
|
+
|
|
804
|
+
return tasks
|
|
805
|
+
|
|
806
|
+
|
|
807
|
+
async def repair_all_diagrams_global(
|
|
808
|
+
tasks: list[DiagramTask],
|
|
809
|
+
batch_size: int,
|
|
810
|
+
max_concurrent_batches: int,
|
|
811
|
+
provider,
|
|
812
|
+
model_name: str,
|
|
813
|
+
api_key: str | None,
|
|
814
|
+
timeout: float,
|
|
815
|
+
max_retries: int,
|
|
816
|
+
client: httpx.AsyncClient,
|
|
817
|
+
stats: MermaidFixStats,
|
|
818
|
+
progress_cb: Callable[[], None] | None = None,
|
|
819
|
+
) -> tuple[dict[Path, list[tuple[int, int, str]]], list[dict[str, Any]]]:
|
|
820
|
+
"""
|
|
821
|
+
Globally repair all diagrams in parallel.
|
|
822
|
+
|
|
823
|
+
Returns:
|
|
824
|
+
- dict mapping file paths to list of (start, end, replacement) tuples
|
|
825
|
+
- list of error records
|
|
826
|
+
"""
|
|
827
|
+
from collections import defaultdict
|
|
828
|
+
|
|
829
|
+
stats.diagrams_total += len(tasks)
|
|
830
|
+
|
|
831
|
+
file_replacements: dict[Path, list[tuple[int, int, str]]] = defaultdict(list)
|
|
832
|
+
error_records: list[dict[str, Any]] = []
|
|
833
|
+
|
|
834
|
+
clean_tasks: list[DiagramTask] = []
|
|
835
|
+
invalid_tasks: list[DiagramTask] = []
|
|
836
|
+
needs_validation: list[DiagramTask] = []
|
|
837
|
+
task_by_issue_id: dict[str, DiagramTask] = {}
|
|
838
|
+
|
|
839
|
+
for task in tasks:
|
|
840
|
+
if not task.issue:
|
|
841
|
+
clean_tasks.append(task)
|
|
842
|
+
continue
|
|
843
|
+
if task.issue.errors == ["not_validated"]:
|
|
844
|
+
needs_validation.append(task)
|
|
845
|
+
continue
|
|
846
|
+
invalid_tasks.append(task)
|
|
847
|
+
task_by_issue_id[task.issue.issue_id] = task
|
|
848
|
+
|
|
849
|
+
if progress_cb:
|
|
850
|
+
for _ in clean_tasks:
|
|
851
|
+
progress_cb()
|
|
852
|
+
|
|
853
|
+
if needs_validation:
|
|
854
|
+
validate_limit = max(1, min(8, max_concurrent_batches))
|
|
855
|
+
validate_semaphore = asyncio.Semaphore(validate_limit)
|
|
856
|
+
|
|
857
|
+
def validate_and_cleanup(text: str) -> tuple[str, str | None]:
|
|
858
|
+
validation = validate_mermaid(text)
|
|
859
|
+
if not validation:
|
|
860
|
+
return "clean", None
|
|
861
|
+
cleaned = cleanup_mermaid(text)
|
|
862
|
+
if cleaned != text and not validate_mermaid(cleaned):
|
|
863
|
+
return "cleaned", cleaned
|
|
864
|
+
return "invalid", validation
|
|
865
|
+
|
|
866
|
+
async def validate_one(task: DiagramTask) -> tuple[str, str | None]:
|
|
867
|
+
async with validate_semaphore:
|
|
868
|
+
return await asyncio.to_thread(validate_and_cleanup, task.span.content)
|
|
869
|
+
|
|
870
|
+
results = await asyncio.gather(*[validate_one(task) for task in needs_validation])
|
|
871
|
+
for task, (status, payload) in zip(needs_validation, results):
|
|
872
|
+
if status == "clean":
|
|
873
|
+
if progress_cb:
|
|
874
|
+
progress_cb()
|
|
875
|
+
continue
|
|
876
|
+
if status == "cleaned":
|
|
877
|
+
stats.diagrams_repaired += 1
|
|
878
|
+
file_replacements[task.file_path].append((task.span.start, task.span.end, payload or task.span.content))
|
|
879
|
+
if progress_cb:
|
|
880
|
+
progress_cb()
|
|
881
|
+
continue
|
|
882
|
+
|
|
883
|
+
# Still invalid: attach validation errors and send to LLM repair.
|
|
884
|
+
task.issue.errors = [payload] if payload else ["invalid"]
|
|
885
|
+
invalid_tasks.append(task)
|
|
886
|
+
task_by_issue_id[task.issue.issue_id] = task
|
|
887
|
+
|
|
888
|
+
stats.diagrams_invalid += len(invalid_tasks)
|
|
889
|
+
|
|
890
|
+
if not invalid_tasks:
|
|
891
|
+
return file_replacements, error_records
|
|
892
|
+
|
|
893
|
+
issues = [task.issue for task in invalid_tasks if task.issue]
|
|
894
|
+
batches = list(iter_batches(issues, batch_size))
|
|
895
|
+
|
|
896
|
+
semaphore = asyncio.Semaphore(max_concurrent_batches)
|
|
897
|
+
|
|
898
|
+
async def process_batch(batch: list[MermaidIssue]) -> tuple[dict[str, str], str | None]:
|
|
899
|
+
async with semaphore:
|
|
900
|
+
return await repair_batch(batch, provider, model_name, api_key, timeout, max_retries, client)
|
|
901
|
+
|
|
902
|
+
results = await asyncio.gather(
|
|
903
|
+
*[process_batch(batch) for batch in batches],
|
|
904
|
+
return_exceptions=True,
|
|
905
|
+
)
|
|
906
|
+
|
|
907
|
+
for batch, result in zip(batches, results):
|
|
908
|
+
if isinstance(result, Exception):
|
|
909
|
+
error_msg = str(result)
|
|
910
|
+
for issue in batch:
|
|
911
|
+
stats.diagrams_failed += 1
|
|
912
|
+
task = task_by_issue_id.get(issue.issue_id)
|
|
913
|
+
if not task:
|
|
914
|
+
continue
|
|
915
|
+
error_records.append(
|
|
916
|
+
{
|
|
917
|
+
"path": str(task.file_path),
|
|
918
|
+
"line": task.file_line_offset + issue.span.line - 1,
|
|
919
|
+
"mermaid": issue.span.content,
|
|
920
|
+
"errors": issue.errors + [f"batch_error: {error_msg}"],
|
|
921
|
+
"field_path": issue.field_path,
|
|
922
|
+
"item_index": issue.item_index,
|
|
923
|
+
}
|
|
924
|
+
)
|
|
925
|
+
if progress_cb:
|
|
926
|
+
progress_cb()
|
|
927
|
+
continue
|
|
928
|
+
|
|
929
|
+
repairs, batch_error = result
|
|
930
|
+
|
|
931
|
+
if batch_error:
|
|
932
|
+
for issue in batch:
|
|
933
|
+
stats.diagrams_failed += 1
|
|
934
|
+
task = task_by_issue_id.get(issue.issue_id)
|
|
935
|
+
if not task:
|
|
936
|
+
continue
|
|
937
|
+
error_records.append(
|
|
938
|
+
{
|
|
939
|
+
"path": str(task.file_path),
|
|
940
|
+
"line": task.file_line_offset + issue.span.line - 1,
|
|
941
|
+
"mermaid": issue.span.content,
|
|
942
|
+
"errors": issue.errors + [f"llm_error: {batch_error}"],
|
|
943
|
+
"field_path": issue.field_path,
|
|
944
|
+
"item_index": issue.item_index,
|
|
945
|
+
}
|
|
946
|
+
)
|
|
947
|
+
if progress_cb:
|
|
948
|
+
progress_cb()
|
|
949
|
+
continue
|
|
950
|
+
|
|
951
|
+
for issue in batch:
|
|
952
|
+
task = task_by_issue_id.get(issue.issue_id)
|
|
953
|
+
if not task:
|
|
954
|
+
if progress_cb:
|
|
955
|
+
progress_cb()
|
|
956
|
+
continue
|
|
957
|
+
repaired = repairs.get(issue.issue_id)
|
|
958
|
+
|
|
959
|
+
if not repaired:
|
|
960
|
+
stats.diagrams_failed += 1
|
|
961
|
+
error_records.append(
|
|
962
|
+
{
|
|
963
|
+
"path": str(task.file_path),
|
|
964
|
+
"line": task.file_line_offset + issue.span.line - 1,
|
|
965
|
+
"mermaid": issue.span.content,
|
|
966
|
+
"errors": issue.errors + ["llm_missing_output"],
|
|
967
|
+
"field_path": issue.field_path,
|
|
968
|
+
"item_index": issue.item_index,
|
|
969
|
+
}
|
|
970
|
+
)
|
|
971
|
+
if progress_cb:
|
|
972
|
+
progress_cb()
|
|
973
|
+
continue
|
|
974
|
+
|
|
975
|
+
repaired = strip_mermaid_fences(repaired)
|
|
976
|
+
repaired = cleanup_mermaid(repaired)
|
|
977
|
+
validation = validate_mermaid(repaired)
|
|
978
|
+
|
|
979
|
+
if validation:
|
|
980
|
+
stats.diagrams_failed += 1
|
|
981
|
+
error_records.append(
|
|
982
|
+
{
|
|
983
|
+
"path": str(task.file_path),
|
|
984
|
+
"line": task.file_line_offset + issue.span.line - 1,
|
|
985
|
+
"mermaid": issue.span.content,
|
|
986
|
+
"errors": issue.errors + [f"repair_still_invalid: {validation}"],
|
|
987
|
+
"field_path": issue.field_path,
|
|
988
|
+
"item_index": issue.item_index,
|
|
989
|
+
}
|
|
990
|
+
)
|
|
991
|
+
if progress_cb:
|
|
992
|
+
progress_cb()
|
|
993
|
+
continue
|
|
994
|
+
|
|
995
|
+
stats.diagrams_repaired += 1
|
|
996
|
+
file_replacements[task.file_path].append((issue.span.start, issue.span.end, repaired))
|
|
997
|
+
if progress_cb:
|
|
998
|
+
progress_cb()
|
|
999
|
+
|
|
1000
|
+
return file_replacements, error_records
|
|
@@ -31,7 +31,7 @@ async def _format_markdown(text: str) -> str:
|
|
|
31
31
|
global _RUMDL_WARNED
|
|
32
32
|
if not _RUMDL_PATH:
|
|
33
33
|
if not _RUMDL_WARNED:
|
|
34
|
-
logger.warning("rumdl not available; skip markdown formatting")
|
|
34
|
+
logger.warning("rumdl not available; skip markdown formatting (recognize)")
|
|
35
35
|
_RUMDL_WARNED = True
|
|
36
36
|
return text
|
|
37
37
|
|
|
@@ -45,10 +45,15 @@ async def _format_markdown(text: str) -> str:
|
|
|
45
45
|
check=False,
|
|
46
46
|
)
|
|
47
47
|
except OSError as exc:
|
|
48
|
-
|
|
48
|
+
message = str(exc).strip() or "unknown error"
|
|
49
|
+
logger.warning("rumdl fmt failed (oserror=%s): %s", type(exc).__name__, message)
|
|
49
50
|
return text
|
|
50
51
|
if proc.returncode != 0:
|
|
51
|
-
logger.warning(
|
|
52
|
+
logger.warning(
|
|
53
|
+
"rumdl fmt failed (rc=%s): %s",
|
|
54
|
+
proc.returncode,
|
|
55
|
+
proc.stderr.strip() or "unknown error",
|
|
56
|
+
)
|
|
52
57
|
return text
|
|
53
58
|
return proc.stdout or text
|
|
54
59
|
|
|
@@ -80,26 +85,35 @@ def discover_mineru_dirs(inputs: Iterable[str], recursive: bool) -> list[Path]:
|
|
|
80
85
|
if path.name != "full.md":
|
|
81
86
|
raise FileNotFoundError(f"Expected full.md file but got: {path}")
|
|
82
87
|
parent = path.parent.resolve()
|
|
83
|
-
if (parent / "images").is_dir():
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
88
|
+
if not (parent / "images").is_dir():
|
|
89
|
+
logger.warning(
|
|
90
|
+
"Missing images/ for %s; continuing (expected=%s)",
|
|
91
|
+
parent,
|
|
92
|
+
parent / "images",
|
|
93
|
+
)
|
|
94
|
+
results.add(parent)
|
|
87
95
|
continue
|
|
88
96
|
if not path.exists():
|
|
89
97
|
raise FileNotFoundError(f"Input path not found: {path}")
|
|
90
98
|
if path.is_dir():
|
|
91
99
|
if (path / "full.md").is_file():
|
|
92
|
-
if (path / "images").is_dir():
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
100
|
+
if not (path / "images").is_dir():
|
|
101
|
+
logger.warning(
|
|
102
|
+
"Missing images/ for %s; continuing (expected=%s)",
|
|
103
|
+
path,
|
|
104
|
+
path / "images",
|
|
105
|
+
)
|
|
106
|
+
results.add(path.resolve())
|
|
96
107
|
pattern = path.rglob("full.md") if recursive else path.glob("full.md")
|
|
97
108
|
for full_path in pattern:
|
|
98
109
|
parent = full_path.parent.resolve()
|
|
99
|
-
if (parent / "images").is_dir():
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
110
|
+
if not (parent / "images").is_dir():
|
|
111
|
+
logger.warning(
|
|
112
|
+
"Missing images/ for %s; continuing (expected=%s)",
|
|
113
|
+
parent,
|
|
114
|
+
parent / "images",
|
|
115
|
+
)
|
|
116
|
+
results.add(parent)
|
|
103
117
|
continue
|
|
104
118
|
raise FileNotFoundError(f"Input path not found: {path}")
|
|
105
119
|
return sorted(results)
|
|
@@ -129,7 +143,12 @@ async def organize_mineru_dir(
|
|
|
129
143
|
return None
|
|
130
144
|
source_path = resolve_local_path(md_path, target)
|
|
131
145
|
if not source_path.exists() or not source_path.is_file():
|
|
132
|
-
logger.warning(
|
|
146
|
+
logger.warning(
|
|
147
|
+
"Image not found: %s (md=%s, target=%s)",
|
|
148
|
+
source_path,
|
|
149
|
+
md_path,
|
|
150
|
+
target,
|
|
151
|
+
)
|
|
133
152
|
return None
|
|
134
153
|
if source_path in image_map:
|
|
135
154
|
return f"images/{image_map[source_path]}"
|