deepresearch-flow 0.5.0-py3-none-any.whl → 0.6.0-py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (58)
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1041 -34
  4. deepresearch_flow/paper/db_ops.py +145 -26
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
  15. deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
  16. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  17. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  18. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  19. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  20. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
  21. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  22. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  23. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  24. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  25. deepresearch_flow/paper/snapshot/api.py +941 -0
  26. deepresearch_flow/paper/snapshot/builder.py +965 -0
  27. deepresearch_flow/paper/snapshot/identity.py +239 -0
  28. deepresearch_flow/paper/snapshot/schema.py +245 -0
  29. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  30. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  31. deepresearch_flow/paper/snapshot/text.py +154 -0
  32. deepresearch_flow/paper/template_registry.py +40 -0
  33. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
  35. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  36. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  37. deepresearch_flow/paper/web/app.py +10 -3
  38. deepresearch_flow/paper/web/markdown.py +174 -8
  39. deepresearch_flow/paper/web/static/css/main.css +8 -1
  40. deepresearch_flow/paper/web/static/js/detail.js +46 -12
  41. deepresearch_flow/paper/web/templates/detail.html +9 -0
  42. deepresearch_flow/paper/web/text.py +8 -4
  43. deepresearch_flow/recognize/cli.py +380 -103
  44. deepresearch_flow/recognize/markdown.py +31 -7
  45. deepresearch_flow/recognize/math.py +47 -12
  46. deepresearch_flow/recognize/mermaid.py +320 -10
  47. deepresearch_flow/recognize/organize.py +35 -16
  48. deepresearch_flow/translator/cli.py +71 -20
  49. deepresearch_flow/translator/engine.py +220 -81
  50. deepresearch_flow/translator/fixers.py +15 -0
  51. deepresearch_flow/translator/prompts.py +19 -2
  52. deepresearch_flow/translator/protector.py +15 -3
  53. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
  54. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
  55. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
  56. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
  57. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
  58. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
@@ -42,6 +42,10 @@ from deepresearch_flow.recognize.mermaid import (
     extract_mermaid_spans,
     fix_mermaid_text,
     require_mmdc,
+    extract_diagrams_from_text,
+    repair_all_diagrams_global,
+    DiagramTask,
+    apply_replacements,
 )
 from deepresearch_flow.recognize.organize import (
     discover_mineru_dirs,
@@ -73,7 +77,8 @@ def _relative_path(path: Path) -> str:
 
 def _warn_if_not_empty(output_dir: Path) -> None:
     if output_dir.exists() and any(output_dir.iterdir()):
-        logger.warning("Output directory not empty: %s", output_dir)
+        item_count = sum(1 for _ in output_dir.iterdir())
+        logger.warning("Output directory not empty: %s (items=%d)", output_dir, item_count)
 
 
 def _print_summary(title: str, rows: list[tuple[str, str]]) -> None:
@@ -114,6 +119,60 @@ def _map_output_files(
     return mapping
 
 
+RetryKey = tuple[int, str | None, int | None]
+
+
+def _load_retry_targets(report_path: Path) -> dict[Path, set[RetryKey]]:
+    if not report_path.exists():
+        raise click.ClickException(f"Retry report not found: {report_path}")
+    try:
+        payload = json.loads(read_text(report_path))
+    except json.JSONDecodeError as exc:
+        raise click.ClickException(f"Retry report is not valid JSON: {exc}") from exc
+    if not isinstance(payload, list) or not payload:
+        raise click.ClickException(f"Retry report is empty: {report_path}")
+    targets: dict[Path, set[RetryKey]] = {}
+    for entry in payload:
+        if not isinstance(entry, dict):
+            continue
+        path_raw = entry.get("path")
+        line_raw = entry.get("line")
+        if not path_raw or line_raw is None:
+            continue
+        try:
+            line_no = int(line_raw)
+        except (TypeError, ValueError):
+            continue
+        field_path = entry.get("field_path")
+        if not isinstance(field_path, str):
+            field_path = None
+        item_index = entry.get("item_index")
+        if not isinstance(item_index, int):
+            item_index = None
+        key = (line_no, field_path, item_index)
+        targets.setdefault(Path(path_raw).resolve(), set()).add(key)
+    if not targets:
+        raise click.ClickException(f"Retry report has no valid entries: {report_path}")
+    return targets
+
+
+def _filter_retry_spans(
+    spans: list[Any],
+    line_offset: int,
+    field_path: str | None,
+    item_index: int | None,
+    retry_keys: set[RetryKey] | None,
+) -> list[Any]:
+    if not retry_keys:
+        return spans
+    filtered: list[Any] = []
+    for span in spans:
+        line_no = line_offset + span.line - 1
+        if (line_no, field_path, item_index) in retry_keys:
+            filtered.append(span)
+    return filtered
+
+
 def _aggregate_image_counts(paths: Iterable[Path]) -> dict[str, int]:
     totals = {"total": 0, "data": 0, "http": 0, "local": 0}
     for path in paths:
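
Note: `_load_retry_targets` keys every report entry by `(line, field_path, item_index)` under the entry's resolved `path`. A minimal sketch of the report shape it accepts, inferred from the parser above (file names and values here are illustrative, not taken from this release):

    # Illustrative retry-report entries matching what _load_retry_targets parses.
    # "path" and "line" are required; "field_path" and "item_index" are optional
    # and fall back to None when absent or of the wrong type.
    import json
    from pathlib import Path

    report = [
        {"path": "out/papers.json", "line": 1042,
         "field_path": "papers[3].summary", "item_index": 3},
        {"path": "out/notes.md", "line": 87},  # markdown entries carry no field metadata
    ]
    Path("fix-math-errors.json").write_text(
        json.dumps(report, indent=2), encoding="utf-8"
    )
    # Derived keys: (1042, "papers[3].summary", 3) and (87, None, None)
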
@@ -194,6 +253,8 @@ async def _fix_json_items(
     default_template: str | None,
     fix_level: str,
     format_enabled: bool,
+    progress: tqdm | None = None,
+    progress_lock: asyncio.Lock | None = None,
 ) -> tuple[int, int, int, int]:
     items_total = 0
     items_updated = 0
@@ -218,6 +279,9 @@ async def _fix_json_items(
                 item_updated = True
         if item_updated:
            items_updated += 1
+        if progress and progress_lock:
+            async with progress_lock:
+                progress.update(1)
     return items_total, items_updated, fields_total, fields_updated
 
 
@@ -350,7 +414,12 @@ async def _run_fix_json(
     async def handler(path: Path) -> tuple[int, int, int, int, int]:
         items, payload, template_tag = _load_json_payload(path)
         items_total, items_updated, fields_total, fields_updated = await _fix_json_items(
-            items, template_tag, fix_level, format_enabled
+            items,
+            template_tag,
+            fix_level,
+            format_enabled,
+            progress,
+            progress_lock,
         )
         output_data: Any
         if payload is None:
@@ -367,9 +436,6 @@ async def _run_fix_json(
         async with semaphore:
             result = await handler(path)
             results.append(result)
-            if progress and progress_lock:
-                async with progress_lock:
-                    progress.update(1)
 
     await asyncio.gather(*(runner(path) for path in paths))
     return results
@@ -787,7 +853,16 @@ def recognize_fix(
         _print_summary("recognize fix (dry-run)", rows)
         return
 
-    progress = tqdm(total=len(paths), desc="fix", unit="file")
+    progress_total = len(paths)
+    progress_unit = "file"
+    if json_mode:
+        json_items_total = 0
+        for path in paths:
+            items, _, _ = _load_json_payload(path)
+            json_items_total += sum(1 for item in items if isinstance(item, dict))
+        progress_total = json_items_total
+        progress_unit = "item"
+    progress = tqdm(total=progress_total, desc="fix", unit=progress_unit)
     try:
         if json_mode:
             results = asyncio.run(
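
Note: together with the `_fix_json_items` changes above, the fix progress bar now advances once per JSON item instead of once per file, with the total pre-counted before the run starts. A condensed, self-contained sketch of the pattern (names here are illustrative, not the package's):

    # Pre-count work items so tqdm's total reflects items rather than files;
    # an asyncio.Lock serializes updates from concurrent workers.
    import asyncio
    from tqdm import tqdm

    async def fix_items(items: list[dict], progress: tqdm, lock: asyncio.Lock) -> None:
        for _ in items:
            await asyncio.sleep(0)  # stand-in for the real per-item fix work
            async with lock:
                progress.update(1)

    async def main(batches: list[list[dict]]) -> None:
        total = sum(len(batch) for batch in batches)
        progress = tqdm(total=total, desc="fix", unit="item")
        lock = asyncio.Lock()
        await asyncio.gather(*(fix_items(batch, progress, lock) for batch in batches))
        progress.close()

    asyncio.run(main([[{}] * 3, [{}] * 2]))
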
@@ -870,6 +945,7 @@ def recognize_fix(
 @click.option("--max-retries", "max_retries", default=3, show_default=True, type=int)
 @click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
 @click.option("--timeout", "timeout", default=120.0, show_default=True, type=float)
+@click.option("--retry-failed", "retry_failed", is_flag=True, help="Retry only failed formulas")
 @click.option(
     "--only-show-error",
     "only_show_error",
@@ -892,6 +968,7 @@ def recognize_fix_math(
     max_retries: int,
     workers: int,
     timeout: float,
+    retry_failed: bool,
     only_show_error: bool,
     report_path: str | None,
     dry_run: bool,
@@ -911,6 +988,8 @@ def recognize_fix_math(
         raise click.ClickException("--max-retries must be non-negative")
     if workers <= 0:
         raise click.ClickException("--workers must be positive")
+    if retry_failed and only_show_error:
+        raise click.ClickException("--retry-failed cannot be used with --only-show-error")
     try:
         require_pylatexenc()
     except RuntimeError as exc:
@@ -954,6 +1033,24 @@ def recognize_fix_math(
         return
 
     output_path = Path(output_dir) if output_dir else None
+    report_target = None
+    if report_path:
+        report_target = Path(report_path)
+    elif not only_show_error:
+        if output_path:
+            report_target = output_path / "fix-math-errors.json"
+        elif in_place:
+            report_target = Path.cwd() / "fix-math-errors.json"
+
+    retry_targets: dict[Path, set[RetryKey]] | None = None
+    if retry_failed:
+        if report_target is None:
+            raise click.ClickException("--retry-failed requires an error report path")
+        retry_targets = _load_retry_targets(report_target)
+        paths = [path for path in paths if path.resolve() in retry_targets]
+        if not paths:
+            raise click.ClickException("No failed formulas matched the provided inputs")
+
     if output_path and not dry_run and not only_show_error:
         output_path = _ensure_output_dir(output_dir)
         _warn_if_not_empty(output_path)
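
Note: the report-path resolution moves ahead of the new retry gate because `--retry-failed` reads the report written by a previous run. The precedence, condensed into a hypothetical helper (not part of the package):

    # Hypothetical condensation of the resolution order used above: an explicit
    # report path wins, then <output_dir>/fix-math-errors.json, then
    # ./fix-math-errors.json when editing in place; no report is produced in
    # --only-show-error mode unless one was named explicitly.
    from pathlib import Path

    def resolve_report_target(
        report_path: str | None,
        output_path: Path | None,
        in_place: bool,
        only_show_error: bool,
    ) -> Path | None:
        if report_path:
            return Path(report_path)
        if only_show_error:
            return None
        if output_path:
            return output_path / "fix-math-errors.json"
        if in_place:
            return Path.cwd() / "fix-math-errors.json"
        return None
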
@@ -969,15 +1066,6 @@ def recognize_fix_math(
     else:
         output_map = {path: path for path in paths}
 
-    report_target = None
-    if report_path:
-        report_target = Path(report_path)
-    elif not only_show_error:
-        if output_path:
-            report_target = output_path / "fix-math-errors.json"
-        elif in_place:
-            report_target = Path.cwd() / "fix-math-errors.json"
-
     if dry_run and not only_show_error:
         rows = [
             ("Mode", "json" if json_mode else "markdown"),
@@ -988,6 +1076,7 @@ def recognize_fix_math(
             ("Max retries", str(max_retries)),
             ("Workers", str(workers)),
             ("Timeout", f"{timeout:.1f}s"),
+            ("Retry failed", "yes" if retry_failed else "no"),
             ("Only show error", "yes" if only_show_error else "no"),
             ("In place", "yes" if in_place else "no"),
             ("Output dir", _relative_path(output_path) if output_path else "-"),
@@ -1021,12 +1110,24 @@ def recognize_fix_math(
                             value = item.get(field)
                             if not isinstance(value, str):
                                 continue
-                            spans = extract_math_spans(value, context_chars)
-                            if spans:
-                                formula_progress.total += len(spans)
-                                formula_progress.refresh()
                             line_start, cursor = locate_json_field_start(raw_text, value, cursor)
                             field_path = f"papers[{item_index}].{field}"
+                            spans = extract_math_spans(value, context_chars)
+                            retry_keys = None
+                            if retry_targets is not None:
+                                retry_keys = retry_targets.get(path.resolve(), set())
+                                retry_keys = {
+                                    key
+                                    for key in retry_keys
+                                    if key[1] == field_path and key[2] == item_index
+                                }
+                            spans = _filter_retry_spans(
+                                spans, line_start, field_path, item_index, retry_keys
+                            )
+                            if not spans:
+                                continue
+                            formula_progress.total += len(spans)
+                            formula_progress.refresh()
                             updated, errors = await fix_math_text(
                                 value,
                                 str(path),
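
Note: span positions are relative to the JSON string field they live in, so `_filter_retry_spans` maps them back to absolute file lines before matching the report keys. The arithmetic, as a worked example (values illustrative):

    # A field value starting at file line 1040 (line_start) with a span on
    # line 3 of that value sits at absolute line 1040 + 3 - 1 = 1042, so the
    # key below must be in the retry set for the span to be kept.
    line_start, span_line = 1040, 3
    key = (line_start + span_line - 1, "papers[3].summary", 3)
    assert key == (1042, "papers[3].summary", 3)
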
@@ -1044,6 +1145,7 @@ def recognize_fix_math(
                                 stats,
                                 repair_enabled=not only_show_error,
                                 spans=spans,
+                                allowed_keys=retry_keys,
                                 progress_cb=lambda: formula_progress.update(1),
                             )
                             if not only_show_error and updated != value:
@@ -1057,6 +1159,12 @@ def recognize_fix_math(
                 else:
                     content = await asyncio.to_thread(read_text, path)
                     spans = extract_math_spans(content, context_chars)
+                    retry_keys = None
+                    if retry_targets is not None:
+                        retry_keys = retry_targets.get(path.resolve(), set())
+                    spans = _filter_retry_spans(spans, 1, None, None, retry_keys)
+                    if not spans:
+                        return stats
                     if spans:
                         formula_progress.total += len(spans)
                         formula_progress.refresh()
@@ -1077,6 +1185,7 @@ def recognize_fix_math(
                         stats,
                         repair_enabled=not only_show_error,
                         spans=spans,
+                        allowed_keys=retry_keys,
                         progress_cb=lambda: formula_progress.update(1),
                     )
                     if not only_show_error:
@@ -1121,6 +1230,7 @@ def recognize_fix_math(
         ("Cleaned", str(stats.formulas_cleaned)),
         ("Repaired", str(stats.formulas_repaired)),
         ("Failed", str(stats.formulas_failed)),
+        ("Retry failed", "yes" if retry_failed else "no"),
         ("Only show error", "yes" if only_show_error else "no"),
         ("Report", _relative_path(report_target) if report_target else "-"),
     ]
@@ -1147,6 +1257,7 @@ def recognize_fix_math(
 @click.option("--max-retries", "max_retries", default=3, show_default=True, type=int)
 @click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
 @click.option("--timeout", "timeout", default=120.0, show_default=True, type=float)
+@click.option("--retry-failed", "retry_failed", is_flag=True, help="Retry only failed diagrams")
 @click.option(
     "--only-show-error",
     "only_show_error",
@@ -1169,6 +1280,7 @@ def recognize_fix_mermaid(
     max_retries: int,
     workers: int,
     timeout: float,
+    retry_failed: bool,
     only_show_error: bool,
     report_path: str | None,
     dry_run: bool,
@@ -1188,6 +1300,8 @@ def recognize_fix_mermaid(
         raise click.ClickException("--max-retries must be non-negative")
     if workers <= 0:
         raise click.ClickException("--workers must be positive")
+    if retry_failed and only_show_error:
+        raise click.ClickException("--retry-failed cannot be used with --only-show-error")
     try:
         require_mmdc()
     except RuntimeError as exc:
@@ -1231,6 +1345,24 @@ def recognize_fix_mermaid(
         return
 
     output_path = Path(output_dir) if output_dir else None
+    report_target = None
+    if report_path:
+        report_target = Path(report_path)
+    elif not only_show_error:
+        if output_path:
+            report_target = output_path / "fix-mermaid-errors.json"
+        elif in_place:
+            report_target = Path.cwd() / "fix-mermaid-errors.json"
+
+    retry_targets: dict[Path, set[RetryKey]] | None = None
+    if retry_failed:
+        if report_target is None:
+            raise click.ClickException("--retry-failed requires an error report path")
+        retry_targets = _load_retry_targets(report_target)
+        paths = [path for path in paths if path.resolve() in retry_targets]
+        if not paths:
+            raise click.ClickException("No failed diagrams matched the provided inputs")
+
     if output_path and not dry_run and not only_show_error:
         output_path = _ensure_output_dir(output_dir)
         _warn_if_not_empty(output_path)
@@ -1246,15 +1378,6 @@ def recognize_fix_mermaid(
     else:
         output_map = {path: path for path in paths}
 
-    report_target = None
-    if report_path:
-        report_target = Path(report_path)
-    elif not only_show_error:
-        if output_path:
-            report_target = output_path / "fix-mermaid-errors.json"
-        elif in_place:
-            report_target = Path.cwd() / "fix-mermaid-errors.json"
-
     if dry_run and not only_show_error:
         rows = [
             ("Mode", "json" if json_mode else "markdown"),
@@ -1265,6 +1388,7 @@ def recognize_fix_mermaid(
             ("Max retries", str(max_retries)),
             ("Workers", str(workers)),
             ("Timeout", f"{timeout:.1f}s"),
+            ("Retry failed", "yes" if retry_failed else "no"),
             ("Only show error", "yes" if only_show_error else "no"),
             ("In place", "yes" if in_place else "no"),
             ("Output dir", _relative_path(output_path) if output_path else "-"),
@@ -1273,112 +1397,260 @@ def recognize_fix_mermaid(
         _print_summary("recognize fix-mermaid (dry-run)", rows)
         return
 
-    progress = tqdm(total=len(paths), desc="fix-mermaid", unit="file")
-    diagram_progress = tqdm(total=0, desc="diagrams", unit="diagram")
+    progress = tqdm(total=len(paths), desc="extract", unit="file")
+    field_progress = tqdm(total=0, desc="extract-field", unit="field", disable=not json_mode, leave=False)
+    diagram_progress = tqdm(total=0, desc="repair", unit="diagram")
     error_records: list[dict[str, Any]] = []
+
+    # Performance metrics
+    extract_start_time = time.monotonic()
+    repair_start_time = 0.0
+    extract_duration = 0.0
+    repair_duration = 0.0
 
     async def run() -> MermaidFixStats:
-        semaphore = asyncio.Semaphore(workers)
-        progress_lock = asyncio.Lock()
         stats_total = MermaidFixStats()
 
         async with httpx.AsyncClient() as client:
-            async def handle_path(path: Path) -> MermaidFixStats:
-                stats = MermaidFixStats()
+            # Phase 1: Extract all diagrams from all files in parallel (flatten to 1D)
+            progress_lock = asyncio.Lock()
+            field_progress_lock = asyncio.Lock()
+
+            async def extract_from_file(path: Path) -> list[DiagramTask]:
+                tasks: list[DiagramTask] = []
+
                 if json_mode:
-                    raw_text = read_text(path)
+                    raw_text = await asyncio.to_thread(read_text, path)
                     items, payload, template_tag = _load_json_payload(path)
+
+                    logger.info("Extracting from JSON: %s (%d papers)", _relative_path(path), len(items))
+
+                    # Pre-calculate all field positions for parallel extraction
+                    field_locations: list[tuple[int, str, str, str | None, int]] = []
                     cursor = 0
+
                     for item_index, item in enumerate(items):
                         if not isinstance(item, dict):
                             continue
                         template = _resolve_item_template(item, template_tag)
                         fields = _template_markdown_fields(template)
+
                         for field in fields:
                             value = item.get(field)
                             if not isinstance(value, str):
                                 continue
-                            spans = extract_mermaid_spans(value, context_chars)
-                            if spans:
-                                diagram_progress.total += len(spans)
-                                diagram_progress.refresh()
                             line_start, cursor = locate_json_field_start(raw_text, value, cursor)
                             field_path = f"papers[{item_index}].{field}"
-                            updated, errors = await fix_mermaid_text(
-                                value,
-                                str(path),
-                                line_start,
-                                field_path,
-                                item_index,
-                                provider,
-                                model_name,
-                                api_key,
-                                timeout,
-                                max_retries,
-                                batch_size,
-                                context_chars,
-                                client,
-                                stats,
-                                repair_enabled=not only_show_error,
-                                spans=spans,
-                                progress_cb=lambda: diagram_progress.update(1),
+                            field_locations.append((line_start, value, field_path, None, item_index))
+
+                    logger.info("Pre-calculated %d field locations from %s", len(field_locations), _relative_path(path))
+
+                    # Apply retry filter to field locations if needed
+                    if retry_targets is not None:
+                        retry_keys = retry_targets.get(path.resolve(), set())
+                        # Prefer filtering by (field_path, item_index) to avoid expensive validation / mmdc calls.
+                        retry_fields = {
+                            (field_path, item_index)
+                            for _, field_path, item_index in retry_keys
+                            if field_path is not None and item_index is not None
+                        }
+                        if retry_fields:
+                            before = len(field_locations)
+                            field_locations = [
+                                loc for loc in field_locations if (loc[2], loc[4]) in retry_fields
+                            ]
+                            logger.info(
+                                "Retry filter: %d/%d fields match (by field_path)",
+                                len(field_locations),
+                                before,
                             )
-                            if not only_show_error and updated != value:
-                                item[field] = updated
-                            error_records.extend(errors)
-                    if not only_show_error:
-                        output_data: Any = items if payload is None else {**payload, "papers": items}
-                        output_path = output_map[path]
-                        serialized = json.dumps(output_data, ensure_ascii=False, indent=2)
-                        await asyncio.to_thread(output_path.write_text, f"{serialized}\n", encoding="utf-8")
+                        else:
+                            # Fallback: filter by line numbers using fast span extraction (no validation).
+                            filtered_locations: list[tuple[int, str, str, str | None, int]] = []
+                            for line_start, value, field_path, _, item_index in field_locations:
+                                spans = extract_mermaid_spans(value, context_chars)
+                                if any(
+                                    (line_start + span.line - 1, field_path, item_index) in retry_keys
+                                    for span in spans
+                                ):
+                                    filtered_locations.append((line_start, value, field_path, None, item_index))
+                            field_locations = filtered_locations
+                            logger.info("Retry filter: %d fields match (by line)", len(field_locations))
+
+                    # Parallel extraction from all fields
+                    async def extract_from_field(loc: tuple[int, str, str, str | None, int]) -> list[DiagramTask]:
+                        line_start, value, field_path, _, item_index = loc
+                        field_tasks = extract_diagrams_from_text(
+                            value, path, line_start, field_path, item_index, context_chars,
+                            skip_validation=not only_show_error  # Skip validation unless validating only
+                        )
+
+                        # Apply retry filter to individual tasks
+                        if retry_targets is not None:
+                            retry_keys = retry_targets.get(path.resolve(), set())
+                            field_tasks = [
+                                task for task in field_tasks
+                                if (task.file_line_offset + task.span.line - 1, task.field_path, task.item_index) in retry_keys
+                            ]
+
+                        return field_tasks
+
+                    if field_locations:
+                        logger.info("Extracting diagrams from %d fields in parallel...", len(field_locations))
+
+                        async with field_progress_lock:
+                            field_progress.total += len(field_locations)
+                            field_progress.refresh()
+
+                        # Bounded worker pool (avoid scheduling thousands of coroutines at once).
+                        max_field_workers = 50
+                        field_workers = min(max_field_workers, len(field_locations))
+                        field_queue: asyncio.Queue[tuple[int, str, str, str | None, int] | None] = asyncio.Queue()
+                        for loc in field_locations:
+                            field_queue.put_nowait(loc)
+                        for _ in range(field_workers):
+                            field_queue.put_nowait(None)
+
+                        async def field_worker() -> list[DiagramTask]:
+                            out: list[DiagramTask] = []
+                            while True:
+                                loc = await field_queue.get()
+                                if loc is None:
+                                    break
+                                out.extend(await extract_from_field(loc))
+                                async with field_progress_lock:
+                                    field_progress.update(1)
+                            return out
+
+                        worker_results = await asyncio.gather(*[field_worker() for _ in range(field_workers)])
+                        for batch in worker_results:
+                            tasks.extend(batch)
+
+                    logger.info("Extracted %d diagrams from %s", len(tasks), _relative_path(path))
                 else:
                     content = await asyncio.to_thread(read_text, path)
-                    spans = extract_mermaid_spans(content, context_chars)
-                    if spans:
-                        diagram_progress.total += len(spans)
-                        diagram_progress.refresh()
-                    updated, errors = await fix_mermaid_text(
-                        content,
-                        str(path),
-                        1,
-                        None,
-                        None,
-                        provider,
-                        model_name,
-                        api_key,
-                        timeout,
-                        max_retries,
-                        batch_size,
-                        context_chars,
-                        client,
-                        stats,
-                        repair_enabled=not only_show_error,
-                        spans=spans,
-                        progress_cb=lambda: diagram_progress.update(1),
+
+                    logger.info("Extracting from markdown: %s", _relative_path(path))
+
+                    # Extract diagrams from markdown
+                    file_tasks = extract_diagrams_from_text(
+                        content, path, 1, None, None, context_chars,
+                        skip_validation=not only_show_error  # Skip validation unless validating only
                     )
-                    if not only_show_error:
-                        output_path = output_map[path]
+
+                    # Apply retry filter if needed
+                    if retry_targets is not None:
+                        retry_keys = retry_targets.get(path.resolve(), set())
+                        file_tasks = [
+                            task for task in file_tasks
+                            if (task.file_line_offset + task.span.line - 1, task.field_path, task.item_index) in retry_keys
+                        ]
+
+                    tasks.extend(file_tasks)
+                    logger.info("Extracted %d diagrams from %s", len(tasks), _relative_path(path))
+
+                async with progress_lock:
+                    progress.update(1)
+                return tasks
+
+            # Parallel extraction with progress
+            file_task_lists = await asyncio.gather(*[extract_from_file(path) for path in paths])
+            all_tasks = [task for tasks in file_task_lists for task in tasks]
+
+            progress.close()
+            field_progress.close()
+            nonlocal extract_duration, repair_start_time
+            extract_duration = time.monotonic() - extract_start_time
+
+            # Update diagram progress total
+            diagram_progress.total = len(all_tasks)
+            diagram_progress.refresh()
+
+            if not all_tasks:
+                return stats_total
+
+            # Phase 2: Global parallel repair (flatten all batches)
+            repair_start_time = time.monotonic()
+            file_replacements, errors = await repair_all_diagrams_global(
+                all_tasks,
+                batch_size,
+                workers,  # Use workers for global batch concurrency
+                provider,
+                model_name,
+                api_key,
+                timeout,
+                max_retries,
+                client,
+                stats_total,
+                progress_cb=lambda: diagram_progress.update(1) if not only_show_error else None,
+            )
+
+            error_records.extend(errors)
+            diagram_progress.close()
+            nonlocal repair_duration
+            repair_duration = time.monotonic() - repair_start_time
+
+            # Phase 3: Write back to files
+            if not only_show_error:
+                write_progress = tqdm(total=len(paths), desc="write", unit="file")
+
+                for path in paths:
+                    replacements = file_replacements.get(path, [])
+                    output_path = output_map[path]
+
+                    if json_mode:
+                        # For JSON, apply replacements to fields
+                        raw_text = await asyncio.to_thread(read_text, path)
+                        items, payload, template_tag = _load_json_payload(path)
+                        cursor = 0
+
+                        for item_index, item in enumerate(items):
+                            if not isinstance(item, dict):
+                                continue
+                            template = _resolve_item_template(item, template_tag)
+                            fields = _template_markdown_fields(template)
+
+                            for field in fields:
+                                value = item.get(field)
+                                if not isinstance(value, str):
+                                    continue
+                                field_path = f"papers[{item_index}].{field}"
+
+                                # Find replacements for this specific field
+                                field_replacements = [
+                                    (start, end, repl)
+                                    for start, end, repl in replacements
+                                    if any(
+                                        t.field_path == field_path and t.item_index == item_index and t.span.start == start
+                                        for t in all_tasks
+                                        if t.file_path == path
+                                    )
+                                ]
+
+                                if field_replacements:
+                                    updated_value = apply_replacements(value, field_replacements)
+                                    item[field] = updated_value
+
+                        output_data: Any = items if payload is None else {**payload, "papers": items}
+                        serialized = json.dumps(output_data, ensure_ascii=False, indent=2)
+                        await asyncio.to_thread(output_path.write_text, f"{serialized}\n", encoding="utf-8")
+                    else:
+                        # For markdown, apply replacements directly
+                        content = await asyncio.to_thread(read_text, path)
+                        updated = apply_replacements(content, replacements)
                         await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
-                    error_records.extend(errors)
-                return stats
-
-            async def runner(path: Path) -> None:
-                async with semaphore:
-                    stats = await handle_path(path)
-                    stats_total.diagrams_total += stats.diagrams_total
-                    stats_total.diagrams_invalid += stats.diagrams_invalid
-                    stats_total.diagrams_repaired += stats.diagrams_repaired
-                    stats_total.diagrams_failed += stats.diagrams_failed
-                    async with progress_lock:
-                        progress.update(1)
-
-            await asyncio.gather(*(runner(path) for path in paths))
+
+                    write_progress.update(1)
+
+                write_progress.close()
+
         return stats_total
 
     try:
         stats = asyncio.run(run())
     finally:
         progress.close()
+        field_progress.close()
         diagram_progress.close()
 
     if report_target and error_records:
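
Note: Phase 1 bounds extraction concurrency with a sentinel-terminated queue rather than spawning one coroutine per field. The same pattern in isolation, as a minimal sketch with generic payloads (nothing here is package API):

    # N workers drain a shared queue; one None sentinel per worker shuts the
    # pool down cleanly once the real items are exhausted.
    import asyncio

    async def process(item: int) -> int:
        await asyncio.sleep(0)  # stand-in for extract_from_field
        return item * 2

    async def run_pool(items: list[int], max_workers: int = 50) -> list[int]:
        workers = max(1, min(max_workers, len(items)))
        queue: asyncio.Queue[int | None] = asyncio.Queue()
        for item in items:
            queue.put_nowait(item)
        for _ in range(workers):
            queue.put_nowait(None)

        async def worker() -> list[int]:
            out: list[int] = []
            while True:
                item = await queue.get()
                if item is None:
                    break
                out.append(await process(item))
            return out

        batches = await asyncio.gather(*(worker() for _ in range(workers)))
        return [x for batch in batches for x in batch]

    print(asyncio.run(run_pool(list(range(10)), max_workers=4)))
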
@@ -1396,6 +1668,11 @@ def recognize_fix_mermaid(
         ("Invalid", str(stats.diagrams_invalid)),
         ("Repaired", str(stats.diagrams_repaired)),
         ("Failed", str(stats.diagrams_failed)),
+        ("Extract time", _format_duration(extract_duration)),
+        ("Extract avg", f"{extract_duration / stats.diagrams_total:.3f}s/diagram" if stats.diagrams_total > 0 else "-"),
+        ("Repair time", _format_duration(repair_duration)),
+        ("Repair avg", f"{repair_duration / stats.diagrams_invalid:.3f}s/diagram" if stats.diagrams_invalid > 0 else "-"),
+        ("Retry failed", "yes" if retry_failed else "no"),
         ("Only show error", "yes" if only_show_error else "no"),
         ("Report", _relative_path(report_target) if report_target else "-"),
     ]
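
Note: the new timing rows guard their per-diagram averages against zero counts. `_format_duration` itself is not shown in this diff; a hypothetical stand-in with the same role (the real helper lives elsewhere in recognize/cli.py and may format differently):

    # Hypothetical stand-in for _format_duration, for illustration only.
    def format_duration(seconds: float) -> str:
        minutes, secs = divmod(seconds, 60.0)
        return f"{int(minutes)}m{secs:04.1f}s" if minutes else f"{secs:.1f}s"

    assert format_duration(7.25) == "7.2s"
    assert format_duration(83.0) == "1m23.0s"
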