deepresearch-flow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1041 -34
  4. deepresearch_flow/paper/db_ops.py +145 -26
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
  15. deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
  16. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  17. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  18. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  19. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  20. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
  21. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  22. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  23. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  24. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  25. deepresearch_flow/paper/snapshot/api.py +941 -0
  26. deepresearch_flow/paper/snapshot/builder.py +965 -0
  27. deepresearch_flow/paper/snapshot/identity.py +239 -0
  28. deepresearch_flow/paper/snapshot/schema.py +245 -0
  29. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  30. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  31. deepresearch_flow/paper/snapshot/text.py +154 -0
  32. deepresearch_flow/paper/template_registry.py +40 -0
  33. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
  35. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  36. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  37. deepresearch_flow/paper/web/app.py +10 -3
  38. deepresearch_flow/paper/web/markdown.py +174 -8
  39. deepresearch_flow/paper/web/static/css/main.css +8 -1
  40. deepresearch_flow/paper/web/static/js/detail.js +46 -12
  41. deepresearch_flow/paper/web/templates/detail.html +9 -0
  42. deepresearch_flow/paper/web/text.py +8 -4
  43. deepresearch_flow/recognize/cli.py +380 -103
  44. deepresearch_flow/recognize/markdown.py +31 -7
  45. deepresearch_flow/recognize/math.py +47 -12
  46. deepresearch_flow/recognize/mermaid.py +320 -10
  47. deepresearch_flow/recognize/organize.py +35 -16
  48. deepresearch_flow/translator/cli.py +71 -20
  49. deepresearch_flow/translator/engine.py +220 -81
  50. deepresearch_flow/translator/fixers.py +15 -0
  51. deepresearch_flow/translator/prompts.py +19 -2
  52. deepresearch_flow/translator/protector.py +15 -3
  53. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
  54. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
  55. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
  56. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
  57. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
  58. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
@@ -137,7 +137,13 @@ def parse_data_url(target: str) -> Optional[tuple[str, bytes]]:
137
137
  try:
138
138
  return mime, base64.b64decode(payload)
139
139
  except Exception as exc: # pragma: no cover - defensive
140
- logger.warning("Failed to decode base64 image: %s", exc)
140
+ message = str(exc).strip() or "unknown error"
141
+ logger.warning(
142
+ "Failed to decode base64 image (mime=%s, chars=%d): %s",
143
+ mime or "<unknown>",
144
+ len(payload),
145
+ message,
146
+ )
141
147
  return None
142
148
 
143
149
 
@@ -218,17 +224,26 @@ async def embed_markdown_images(
218
224
  try:
219
225
  response = await http_client.get(target)
220
226
  except Exception as exc:
221
- logger.warning("Failed to fetch %s: %s", target, exc)
227
+ message = str(exc).strip() or "unknown error"
228
+ logger.warning("Failed to fetch %s (md=%s): %s", target, md_path, message)
222
229
  return None
223
230
  if response.status_code >= 400:
224
- logger.warning("Failed to fetch %s: HTTP %d", target, response.status_code)
231
+ logger.warning(
232
+ "Failed to fetch %s (md=%s): HTTP %d",
233
+ target,
234
+ md_path,
235
+ response.status_code,
236
+ )
225
237
  return None
226
238
  content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
227
239
  if not content_type.startswith("image/"):
228
240
  guessed = mime_from_path(Path(urlparse(target).path))
229
241
  if not guessed or not guessed.startswith("image/"):
230
242
  logger.warning(
231
- "Skipping non-image URL %s (Content-Type %s)", target, content_type
243
+ "Skipping non-image URL %s (md=%s, Content-Type=%s)",
244
+ target,
245
+ md_path,
246
+ content_type,
232
247
  )
233
248
  return None
234
249
  content_type = guessed
@@ -236,11 +251,16 @@ async def embed_markdown_images(
236
251
 
237
252
  local_path = resolve_local_path(md_path, target)
238
253
  if not local_path.exists() or not local_path.is_file():
239
- logger.warning("Image not found: %s", local_path)
254
+ logger.warning("Image not found: %s (md=%s, target=%s)", local_path, md_path, target)
240
255
  return None
241
256
  mime = mime_from_path(local_path)
242
257
  if not mime or not mime.startswith("image/"):
243
- logger.warning("Unsupported image type: %s", local_path)
258
+ logger.warning(
259
+ "Unsupported image type: %s (md=%s, mime=%s)",
260
+ local_path,
261
+ md_path,
262
+ mime or "unknown",
263
+ )
244
264
  return None
245
265
  data = await asyncio.to_thread(local_path.read_bytes)
246
266
  return data_url_from_bytes(mime, data)
@@ -264,7 +284,11 @@ async def unpack_markdown_images(
264
284
  mime, data = parsed
265
285
  ext = extension_from_mime(mime)
266
286
  if not ext:
267
- logger.warning("Unsupported MIME type: %s", mime)
287
+ logger.warning(
288
+ "Unsupported MIME type: %s (alt=%s)",
289
+ mime,
290
+ alt_text or "<empty>",
291
+ )
268
292
  return None
269
293
  base_name = base_name_from_alt(alt_text)
270
294
  if not base_name:
@@ -392,7 +392,7 @@ def _ensure_node_validator() -> NodeKatexValidator | None:
392
392
  node_path = shutil.which("node")
393
393
  if not node_path:
394
394
  if not _KATEX_WARNED:
395
- logger.warning("node not available; skip KaTeX validation")
395
+ logger.warning("node binary not found; skip KaTeX validation")
396
396
  _KATEX_WARNED = True
397
397
  return None
398
398
  if _NODE_KATEX_READY is None:
@@ -408,7 +408,10 @@ def _ensure_node_validator() -> NodeKatexValidator | None:
408
408
  _NODE_KATEX_READY = False
409
409
  if not _NODE_KATEX_READY:
410
410
  if not _KATEX_WARNED:
411
- logger.warning("katex npm package not available; skip KaTeX validation")
411
+ logger.warning(
412
+ "katex npm package not available; skip KaTeX validation (node=%s)",
413
+ node_path,
414
+ )
412
415
  _KATEX_WARNED = True
413
416
  return None
414
417
  script_path = str((Path(__file__).with_name("katex_check.js")).resolve())
@@ -594,12 +597,22 @@ async def fix_math_text(
594
597
  stats: MathFixStats,
595
598
  repair_enabled: bool = True,
596
599
  spans: list[FormulaSpan] | None = None,
600
+ allowed_keys: set[tuple[int, str | None, int | None]] | None = None,
597
601
  progress_cb: Callable[[], None] | None = None,
598
602
  ) -> tuple[str, list[dict[str, Any]]]:
599
603
  replacements: list[tuple[int, int, str]] = []
600
604
  issues: list[FormulaIssue] = []
601
605
  if spans is None:
602
606
  spans = extract_math_spans(text, context_chars)
607
+ if allowed_keys:
608
+ filtered: list[FormulaSpan] = []
609
+ for span in spans:
610
+ line_no = line_offset + span.line - 1
611
+ if (line_no, field_path, item_index) in allowed_keys:
612
+ filtered.append(span)
613
+ spans = filtered
614
+ if not spans:
615
+ return text, []
603
616
  stats.formulas_total += len(spans)
604
617
  file_id = short_hash(file_path)
605
618
  for idx, span in enumerate(spans):
@@ -638,16 +651,38 @@ async def fix_math_text(
638
651
 
639
652
  error_records: list[dict[str, Any]] = []
640
653
  if issues and repair_enabled:
641
- for batch in iter_batches(issues, batch_size):
642
- repairs, error = await repair_batch(
643
- batch,
644
- provider,
645
- model_name,
646
- api_key,
647
- timeout,
648
- max_retries,
649
- client,
650
- )
654
+ # Convert to list for parallel processing
655
+ batches = list(iter_batches(issues, batch_size))
656
+
657
+ # Parallel batch repair
658
+ batch_results = await asyncio.gather(
659
+ *[
660
+ repair_batch(batch, provider, model_name, api_key, timeout, max_retries, client)
661
+ for batch in batches
662
+ ],
663
+ return_exceptions=True,
664
+ )
665
+
666
+ # Process results
667
+ for batch, result in zip(batches, batch_results):
668
+ if isinstance(result, Exception):
669
+ # Entire batch failed with exception
670
+ error = str(result)
671
+ for issue in batch:
672
+ stats.formulas_failed += 1
673
+ error_records.append({
674
+ "path": file_path,
675
+ "line": line_offset + issue.span.line - 1,
676
+ "delimiter": issue.span.delimiter,
677
+ "latex": issue.span.content,
678
+ "errors": issue.errors + [f"batch_exception: {error}"],
679
+ "field_path": issue.field_path,
680
+ "item_index": issue.item_index,
681
+ })
682
+ continue
683
+
684
+ repairs, error = result
685
+
651
686
  if error:
652
687
  for issue in batch:
653
688
  stats.formulas_failed += 1
@@ -40,6 +40,17 @@ class MermaidIssue:
40
40
  item_index: int | None
41
41
 
42
42
 
43
+ @dataclass
44
+ class DiagramTask:
45
+ """Global diagram task for parallel processing."""
46
+ file_path: Path
47
+ file_line_offset: int
48
+ field_path: str | None
49
+ item_index: int | None
50
+ span: MermaidSpan
51
+ issue: MermaidIssue | None
52
+
53
+
43
54
  @dataclass
44
55
  class MermaidFixStats:
45
56
  diagrams_total: int = 0
@@ -574,12 +585,22 @@ async def fix_mermaid_text(
574
585
  stats: MermaidFixStats,
575
586
  repair_enabled: bool = True,
576
587
  spans: list[MermaidSpan] | None = None,
588
+ allowed_keys: set[tuple[int, str | None, int | None]] | None = None,
577
589
  progress_cb: Callable[[], None] | None = None,
578
590
  ) -> tuple[str, list[dict[str, Any]]]:
579
591
  replacements: list[tuple[int, int, str]] = []
580
592
  issues: list[MermaidIssue] = []
581
593
  if spans is None:
582
594
  spans = extract_mermaid_spans(text, context_chars)
595
+ if allowed_keys:
596
+ filtered: list[MermaidSpan] = []
597
+ for span in spans:
598
+ line_no = line_offset + span.line - 1
599
+ if (line_no, field_path, item_index) in allowed_keys:
600
+ filtered.append(span)
601
+ spans = filtered
602
+ if not spans:
603
+ return text, []
583
604
  stats.diagrams_total += len(spans)
584
605
  file_id = short_hash(file_path)
585
606
  for idx, span in enumerate(spans):
@@ -614,16 +635,37 @@ async def fix_mermaid_text(
614
635
 
615
636
  error_records: list[dict[str, Any]] = []
616
637
  if issues and repair_enabled:
617
- for batch in iter_batches(issues, batch_size):
618
- repairs, error = await repair_batch(
619
- batch,
620
- provider,
621
- model_name,
622
- api_key,
623
- timeout,
624
- max_retries,
625
- client,
626
- )
638
+ # Convert to list for parallel processing
639
+ batches = list(iter_batches(issues, batch_size))
640
+
641
+ # Parallel batch repair
642
+ batch_results = await asyncio.gather(
643
+ *[
644
+ repair_batch(batch, provider, model_name, api_key, timeout, max_retries, client)
645
+ for batch in batches
646
+ ],
647
+ return_exceptions=True,
648
+ )
649
+
650
+ # Process results
651
+ for batch, result in zip(batches, batch_results):
652
+ if isinstance(result, Exception):
653
+ # Entire batch failed with exception
654
+ error = str(result)
655
+ for issue in batch:
656
+ stats.diagrams_failed += 1
657
+ error_records.append({
658
+ "path": file_path,
659
+ "line": line_offset + issue.span.line - 1,
660
+ "mermaid": issue.span.content,
661
+ "errors": issue.errors + [f"batch_exception: {error}"],
662
+ "field_path": issue.field_path,
663
+ "item_index": issue.item_index,
664
+ })
665
+ continue
666
+
667
+ repairs, error = result
668
+
627
669
  if error:
628
670
  for issue in batch:
629
671
  stats.diagrams_failed += 1
@@ -688,3 +730,271 @@ async def fix_mermaid_text(
688
730
 
689
731
  updated = apply_replacements(text, replacements)
690
732
  return updated, error_records
733
+
734
+
735
+ def extract_diagrams_from_text(
736
+ text: str,
737
+ file_path: Path,
738
+ line_offset: int,
739
+ field_path: str | None,
740
+ item_index: int | None,
741
+ context_chars: int,
742
+ skip_validation: bool = False,
743
+ ) -> list[DiagramTask]:
744
+ """Extract all diagram tasks from a text block.
745
+
746
+ Args:
747
+ skip_validation: If True, skip validation and mark all diagrams as having issues.
748
+ This is faster for initial extraction when you'll validate later.
749
+ """
750
+ tasks: list[DiagramTask] = []
751
+ spans = extract_mermaid_spans(text, context_chars)
752
+ file_id = short_hash(str(file_path))
753
+
754
+ for idx, span in enumerate(spans):
755
+ issue: MermaidIssue | None = None
756
+
757
+ if skip_validation:
758
+ # Mark all diagrams as needing validation (skip expensive mmdc call)
759
+ issue_id = f"{file_id}:{line_offset + span.line - 1}:{idx}"
760
+ issue = MermaidIssue(
761
+ issue_id=issue_id,
762
+ span=span,
763
+ errors=["not_validated"],
764
+ field_path=field_path,
765
+ item_index=item_index,
766
+ )
767
+ else:
768
+ # Full validation (expensive)
769
+ validation = validate_mermaid(span.content)
770
+
771
+ if validation:
772
+ # Try cleanup first
773
+ candidate = cleanup_mermaid(span.content)
774
+ if candidate != span.content:
775
+ candidate_validation = validate_mermaid(candidate)
776
+ if not candidate_validation:
777
+ # Cleanup fixed it, no issue
778
+ pass
779
+ else:
780
+ validation = candidate_validation
781
+
782
+ if validation:
783
+ # Still invalid after cleanup
784
+ issue_id = f"{file_id}:{line_offset + span.line - 1}:{idx}"
785
+ issue = MermaidIssue(
786
+ issue_id=issue_id,
787
+ span=span,
788
+ errors=[validation],
789
+ field_path=field_path,
790
+ item_index=item_index,
791
+ )
792
+
793
+ tasks.append(
794
+ DiagramTask(
795
+ file_path=file_path,
796
+ file_line_offset=line_offset,
797
+ field_path=field_path,
798
+ item_index=item_index,
799
+ span=span,
800
+ issue=issue,
801
+ )
802
+ )
803
+
804
+ return tasks
805
+
806
+
807
+ async def repair_all_diagrams_global(
808
+ tasks: list[DiagramTask],
809
+ batch_size: int,
810
+ max_concurrent_batches: int,
811
+ provider,
812
+ model_name: str,
813
+ api_key: str | None,
814
+ timeout: float,
815
+ max_retries: int,
816
+ client: httpx.AsyncClient,
817
+ stats: MermaidFixStats,
818
+ progress_cb: Callable[[], None] | None = None,
819
+ ) -> tuple[dict[Path, list[tuple[int, int, str]]], list[dict[str, Any]]]:
820
+ """
821
+ Globally repair all diagrams in parallel.
822
+
823
+ Returns:
824
+ - dict mapping file paths to list of (start, end, replacement) tuples
825
+ - list of error records
826
+ """
827
+ from collections import defaultdict
828
+
829
+ stats.diagrams_total += len(tasks)
830
+
831
+ file_replacements: dict[Path, list[tuple[int, int, str]]] = defaultdict(list)
832
+ error_records: list[dict[str, Any]] = []
833
+
834
+ clean_tasks: list[DiagramTask] = []
835
+ invalid_tasks: list[DiagramTask] = []
836
+ needs_validation: list[DiagramTask] = []
837
+ task_by_issue_id: dict[str, DiagramTask] = {}
838
+
839
+ for task in tasks:
840
+ if not task.issue:
841
+ clean_tasks.append(task)
842
+ continue
843
+ if task.issue.errors == ["not_validated"]:
844
+ needs_validation.append(task)
845
+ continue
846
+ invalid_tasks.append(task)
847
+ task_by_issue_id[task.issue.issue_id] = task
848
+
849
+ if progress_cb:
850
+ for _ in clean_tasks:
851
+ progress_cb()
852
+
853
+ if needs_validation:
854
+ validate_limit = max(1, min(8, max_concurrent_batches))
855
+ validate_semaphore = asyncio.Semaphore(validate_limit)
856
+
857
+ def validate_and_cleanup(text: str) -> tuple[str, str | None]:
858
+ validation = validate_mermaid(text)
859
+ if not validation:
860
+ return "clean", None
861
+ cleaned = cleanup_mermaid(text)
862
+ if cleaned != text and not validate_mermaid(cleaned):
863
+ return "cleaned", cleaned
864
+ return "invalid", validation
865
+
866
+ async def validate_one(task: DiagramTask) -> tuple[str, str | None]:
867
+ async with validate_semaphore:
868
+ return await asyncio.to_thread(validate_and_cleanup, task.span.content)
869
+
870
+ results = await asyncio.gather(*[validate_one(task) for task in needs_validation])
871
+ for task, (status, payload) in zip(needs_validation, results):
872
+ if status == "clean":
873
+ if progress_cb:
874
+ progress_cb()
875
+ continue
876
+ if status == "cleaned":
877
+ stats.diagrams_repaired += 1
878
+ file_replacements[task.file_path].append((task.span.start, task.span.end, payload or task.span.content))
879
+ if progress_cb:
880
+ progress_cb()
881
+ continue
882
+
883
+ # Still invalid: attach validation errors and send to LLM repair.
884
+ task.issue.errors = [payload] if payload else ["invalid"]
885
+ invalid_tasks.append(task)
886
+ task_by_issue_id[task.issue.issue_id] = task
887
+
888
+ stats.diagrams_invalid += len(invalid_tasks)
889
+
890
+ if not invalid_tasks:
891
+ return file_replacements, error_records
892
+
893
+ issues = [task.issue for task in invalid_tasks if task.issue]
894
+ batches = list(iter_batches(issues, batch_size))
895
+
896
+ semaphore = asyncio.Semaphore(max_concurrent_batches)
897
+
898
+ async def process_batch(batch: list[MermaidIssue]) -> tuple[dict[str, str], str | None]:
899
+ async with semaphore:
900
+ return await repair_batch(batch, provider, model_name, api_key, timeout, max_retries, client)
901
+
902
+ results = await asyncio.gather(
903
+ *[process_batch(batch) for batch in batches],
904
+ return_exceptions=True,
905
+ )
906
+
907
+ for batch, result in zip(batches, results):
908
+ if isinstance(result, Exception):
909
+ error_msg = str(result)
910
+ for issue in batch:
911
+ stats.diagrams_failed += 1
912
+ task = task_by_issue_id.get(issue.issue_id)
913
+ if not task:
914
+ continue
915
+ error_records.append(
916
+ {
917
+ "path": str(task.file_path),
918
+ "line": task.file_line_offset + issue.span.line - 1,
919
+ "mermaid": issue.span.content,
920
+ "errors": issue.errors + [f"batch_error: {error_msg}"],
921
+ "field_path": issue.field_path,
922
+ "item_index": issue.item_index,
923
+ }
924
+ )
925
+ if progress_cb:
926
+ progress_cb()
927
+ continue
928
+
929
+ repairs, batch_error = result
930
+
931
+ if batch_error:
932
+ for issue in batch:
933
+ stats.diagrams_failed += 1
934
+ task = task_by_issue_id.get(issue.issue_id)
935
+ if not task:
936
+ continue
937
+ error_records.append(
938
+ {
939
+ "path": str(task.file_path),
940
+ "line": task.file_line_offset + issue.span.line - 1,
941
+ "mermaid": issue.span.content,
942
+ "errors": issue.errors + [f"llm_error: {batch_error}"],
943
+ "field_path": issue.field_path,
944
+ "item_index": issue.item_index,
945
+ }
946
+ )
947
+ if progress_cb:
948
+ progress_cb()
949
+ continue
950
+
951
+ for issue in batch:
952
+ task = task_by_issue_id.get(issue.issue_id)
953
+ if not task:
954
+ if progress_cb:
955
+ progress_cb()
956
+ continue
957
+ repaired = repairs.get(issue.issue_id)
958
+
959
+ if not repaired:
960
+ stats.diagrams_failed += 1
961
+ error_records.append(
962
+ {
963
+ "path": str(task.file_path),
964
+ "line": task.file_line_offset + issue.span.line - 1,
965
+ "mermaid": issue.span.content,
966
+ "errors": issue.errors + ["llm_missing_output"],
967
+ "field_path": issue.field_path,
968
+ "item_index": issue.item_index,
969
+ }
970
+ )
971
+ if progress_cb:
972
+ progress_cb()
973
+ continue
974
+
975
+ repaired = strip_mermaid_fences(repaired)
976
+ repaired = cleanup_mermaid(repaired)
977
+ validation = validate_mermaid(repaired)
978
+
979
+ if validation:
980
+ stats.diagrams_failed += 1
981
+ error_records.append(
982
+ {
983
+ "path": str(task.file_path),
984
+ "line": task.file_line_offset + issue.span.line - 1,
985
+ "mermaid": issue.span.content,
986
+ "errors": issue.errors + [f"repair_still_invalid: {validation}"],
987
+ "field_path": issue.field_path,
988
+ "item_index": issue.item_index,
989
+ }
990
+ )
991
+ if progress_cb:
992
+ progress_cb()
993
+ continue
994
+
995
+ stats.diagrams_repaired += 1
996
+ file_replacements[task.file_path].append((issue.span.start, issue.span.end, repaired))
997
+ if progress_cb:
998
+ progress_cb()
999
+
1000
+ return file_replacements, error_records
@@ -31,7 +31,7 @@ async def _format_markdown(text: str) -> str:
31
31
  global _RUMDL_WARNED
32
32
  if not _RUMDL_PATH:
33
33
  if not _RUMDL_WARNED:
34
- logger.warning("rumdl not available; skip markdown formatting")
34
+ logger.warning("rumdl not available; skip markdown formatting (recognize)")
35
35
  _RUMDL_WARNED = True
36
36
  return text
37
37
 
@@ -45,10 +45,15 @@ async def _format_markdown(text: str) -> str:
45
45
  check=False,
46
46
  )
47
47
  except OSError as exc:
48
- logger.warning("rumdl fmt failed: %s", exc)
48
+ message = str(exc).strip() or "unknown error"
49
+ logger.warning("rumdl fmt failed (oserror=%s): %s", type(exc).__name__, message)
49
50
  return text
50
51
  if proc.returncode != 0:
51
- logger.warning("rumdl fmt failed (%s): %s", proc.returncode, proc.stderr.strip())
52
+ logger.warning(
53
+ "rumdl fmt failed (rc=%s): %s",
54
+ proc.returncode,
55
+ proc.stderr.strip() or "unknown error",
56
+ )
52
57
  return text
53
58
  return proc.stdout or text
54
59
 
@@ -80,26 +85,35 @@ def discover_mineru_dirs(inputs: Iterable[str], recursive: bool) -> list[Path]:
80
85
  if path.name != "full.md":
81
86
  raise FileNotFoundError(f"Expected full.md file but got: {path}")
82
87
  parent = path.parent.resolve()
83
- if (parent / "images").is_dir():
84
- results.add(parent)
85
- else:
86
- logger.warning("Skipping %s (missing images/)", parent)
88
+ if not (parent / "images").is_dir():
89
+ logger.warning(
90
+ "Missing images/ for %s; continuing (expected=%s)",
91
+ parent,
92
+ parent / "images",
93
+ )
94
+ results.add(parent)
87
95
  continue
88
96
  if not path.exists():
89
97
  raise FileNotFoundError(f"Input path not found: {path}")
90
98
  if path.is_dir():
91
99
  if (path / "full.md").is_file():
92
- if (path / "images").is_dir():
93
- results.add(path.resolve())
94
- else:
95
- logger.warning("Skipping %s (missing images/)", path)
100
+ if not (path / "images").is_dir():
101
+ logger.warning(
102
+ "Missing images/ for %s; continuing (expected=%s)",
103
+ path,
104
+ path / "images",
105
+ )
106
+ results.add(path.resolve())
96
107
  pattern = path.rglob("full.md") if recursive else path.glob("full.md")
97
108
  for full_path in pattern:
98
109
  parent = full_path.parent.resolve()
99
- if (parent / "images").is_dir():
100
- results.add(parent)
101
- else:
102
- logger.warning("Skipping %s (missing images/)", parent)
110
+ if not (parent / "images").is_dir():
111
+ logger.warning(
112
+ "Missing images/ for %s; continuing (expected=%s)",
113
+ parent,
114
+ parent / "images",
115
+ )
116
+ results.add(parent)
103
117
  continue
104
118
  raise FileNotFoundError(f"Input path not found: {path}")
105
119
  return sorted(results)
@@ -129,7 +143,12 @@ async def organize_mineru_dir(
129
143
  return None
130
144
  source_path = resolve_local_path(md_path, target)
131
145
  if not source_path.exists() or not source_path.is_file():
132
- logger.warning("Image not found: %s", source_path)
146
+ logger.warning(
147
+ "Image not found: %s (md=%s, target=%s)",
148
+ source_path,
149
+ md_path,
150
+ target,
151
+ )
133
152
  return None
134
153
  if source_path in image_map:
135
154
  return f"images/{image_map[source_path]}"