deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. deepresearch_flow/paper/db.py +184 -0
  2. deepresearch_flow/paper/db_ops.py +1939 -0
  3. deepresearch_flow/paper/web/app.py +38 -3705
  4. deepresearch_flow/paper/web/constants.py +23 -0
  5. deepresearch_flow/paper/web/filters.py +255 -0
  6. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  7. deepresearch_flow/paper/web/handlers/api.py +217 -0
  8. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  9. deepresearch_flow/paper/web/markdown.py +549 -0
  10. deepresearch_flow/paper/web/static/css/main.css +857 -0
  11. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  12. deepresearch_flow/paper/web/static/js/index.js +266 -0
  13. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  14. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  15. deepresearch_flow/paper/web/templates/base.html +43 -0
  16. deepresearch_flow/paper/web/templates/detail.html +332 -0
  17. deepresearch_flow/paper/web/templates/index.html +114 -0
  18. deepresearch_flow/paper/web/templates/stats.html +29 -0
  19. deepresearch_flow/paper/web/templates.py +85 -0
  20. deepresearch_flow/paper/web/text.py +68 -0
  21. deepresearch_flow/recognize/cli.py +805 -26
  22. deepresearch_flow/recognize/katex_check.js +29 -0
  23. deepresearch_flow/recognize/math.py +719 -0
  24. deepresearch_flow/recognize/mermaid.py +690 -0
  25. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/METADATA +78 -4
  26. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/RECORD +30 -9
  27. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/WHEEL +0 -0
  28. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/entry_points.txt +0 -0
  29. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/licenses/LICENSE +0 -0
  30. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/top_level.txt +0 -0
@@ -3,10 +3,11 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ import json
6
7
  import logging
7
8
  import time
8
9
  from pathlib import Path
9
- from typing import Awaitable, Callable, Iterable
10
+ from typing import Any, Awaitable, Callable, Iterable
10
11
 
11
12
  import click
12
13
  import coloredlogs
@@ -15,6 +16,9 @@ from rich.console import Console
15
16
  from rich.table import Table
16
17
  from tqdm import tqdm
17
18
 
19
+ from deepresearch_flow.paper.config import load_config, resolve_api_keys
20
+ from deepresearch_flow.paper.extract import parse_model_ref
21
+ from deepresearch_flow.paper.template_registry import get_stage_definitions
18
22
  from deepresearch_flow.paper.utils import discover_markdown
19
23
  from deepresearch_flow.recognize.markdown import (
20
24
  DEFAULT_USER_AGENT,
@@ -26,6 +30,19 @@ from deepresearch_flow.recognize.markdown import (
26
30
  sanitize_filename,
27
31
  unpack_markdown_images,
28
32
  )
33
+ from deepresearch_flow.recognize.math import (
34
+ MathFixStats,
35
+ extract_math_spans,
36
+ fix_math_text,
37
+ locate_json_field_start,
38
+ require_pylatexenc,
39
+ )
40
+ from deepresearch_flow.recognize.mermaid import (
41
+ MermaidFixStats,
42
+ extract_mermaid_spans,
43
+ fix_mermaid_text,
44
+ require_mmdc,
45
+ )
29
46
  from deepresearch_flow.recognize.organize import (
30
47
  discover_mineru_dirs,
31
48
  fix_markdown_text,
def _unique_output_filename(
    base: str,
    output_dirs: Iterable[Path],
    used: set[str],
    ext: str,
) -> str:
    """Return a filename derived from *base* that collides with nothing.

    A name counts as taken when it was already handed out (*used*) or when a
    file with that name exists in any of *output_dirs*.  The chosen name is
    recorded in *used* before being returned.
    """
    stem = sanitize_filename(base) or "document"

    def taken(name: str) -> bool:
        return name in used or any((folder / name).exists() for folder in output_dirs)

    candidate = f"{stem}{ext}"
    serial = 0
    while taken(candidate):
        serial += 1
        candidate = f"{stem}_{serial}{ext}"
    used.add(candidate)
    return candidate
84
102
 
85
103
 
86
def _map_output_files(
    paths: Iterable[Path],
    output_dirs: list[Path],
    ext: str = ".md",
) -> dict[Path, str]:
    """Assign a collision-free output filename (carrying *ext*) to each path.

    Names are derived from each path's stem; uniqueness is enforced across
    this batch and against files already present in *output_dirs*.
    """
    handed_out: set[str] = set()
    return {
        path: _unique_output_filename(path.stem, output_dirs, handed_out, ext)
        for path in paths
    }
93
115
 
94
116
 
@@ -112,6 +134,93 @@ def _format_duration(seconds: float) -> str:
112
134
  return f"{int(hours)}h {int(minutes)}m {remainder:.1f}s"
113
135
 
114
136
 
137
+ def _resolve_item_template(item: dict[str, Any], default_template: str | None) -> str | None:
138
+ raw = item.get("template_tag") or item.get("prompt_template") or default_template
139
+ if isinstance(raw, str) and raw:
140
+ return raw
141
+ return None
142
+
143
+
144
+ def _template_markdown_fields(template: str | None) -> list[str]:
145
+ if template:
146
+ stages = get_stage_definitions(template)
147
+ if stages:
148
+ return [field for stage in stages for field in stage.fields]
149
+ return ["summary", "abstract"]
150
+
151
+
152
def discover_json(inputs: Iterable[str], recursive: bool) -> list[Path]:
    """Collect ``.json`` files from the given file/directory inputs.

    Explicit file inputs must carry a ``.json`` suffix; directory inputs are
    scanned (recursively when *recursive* is set).  Results are resolved,
    de-duplicated, and returned sorted.

    Raises ValueError for a non-JSON input file and FileNotFoundError for a
    path that is neither a file nor a directory.
    """
    found: set[Path] = set()
    for raw in inputs:
        candidate = Path(raw)
        if candidate.is_file():
            if candidate.suffix.lower() != ".json":
                raise ValueError(f"Input file is not a json file: {candidate}")
            found.add(candidate.resolve())
        elif candidate.is_dir():
            walker = candidate.rglob("*.json") if recursive else candidate.glob("*.json")
            found.update(entry.resolve() for entry in walker if entry.is_file())
        else:
            raise FileNotFoundError(f"Input path not found: {candidate}")
    return sorted(found)
172
+
173
+
174
def _load_json_payload(path: Path) -> tuple[list[Any], dict[str, Any] | None, str | None]:
    """Parse *path* and return ``(items, wrapper, template_tag)``.

    Two shapes are accepted: a bare JSON list (wrapper and tag are None), or
    a JSON object holding a ``papers`` list, in which case the original
    object and its string ``template_tag`` (if any) are returned as well.

    Raises click.ClickException for malformed JSON or unsupported shapes.
    """
    try:
        parsed = json.loads(read_text(path))
    except json.JSONDecodeError as exc:
        raise click.ClickException(f"Invalid JSON in {path}: {exc}") from exc

    if isinstance(parsed, list):
        return parsed, None, None

    if not isinstance(parsed, dict):
        raise click.ClickException(f"Unsupported JSON structure in {path}")

    papers = parsed.get("papers")
    if not isinstance(papers, list):
        raise click.ClickException(f"JSON object missing 'papers' list: {path}")

    tag = parsed.get("template_tag")
    return papers, parsed, tag if isinstance(tag, str) else None
190
+
191
+
192
async def _fix_json_items(
    items: list[Any],
    default_template: str | None,
    fix_level: str,
    format_enabled: bool,
) -> tuple[int, int, int, int]:
    """Run the markdown fixer over every string field of every dict item.

    Field names come from each item's resolved template.  Changed values are
    written back into the item in place.  Non-dict entries and non-string
    field values are skipped.

    Returns (items seen, items changed, fields processed, fields changed).
    """
    seen = changed = field_count = field_changed = 0
    for entry in items:
        if not isinstance(entry, dict):
            continue
        seen += 1
        template = _resolve_item_template(entry, default_template)
        touched = False
        for name in _template_markdown_fields(template):
            original = entry.get(name)
            if not isinstance(original, str):
                continue
            field_count += 1
            fixed = await fix_markdown_text(original, fix_level, format_enabled)
            if fixed != original:
                entry[name] = fixed
                field_changed += 1
                touched = True
        if touched:
            changed += 1
    return seen, changed, field_count, field_changed
222
+
223
+
115
224
  async def _run_with_workers(
116
225
  items: Iterable[Path],
117
226
  workers: int,
@@ -226,6 +335,46 @@ async def _run_fix(
226
335
  await _run_with_workers(paths, workers, handler, progress=progress)
227
336
 
228
337
 
338
async def _run_fix_json(
    paths: list[Path],
    output_map: dict[Path, Path],
    fix_level: str,
    format_enabled: bool,
    workers: int,
    progress: tqdm | None,
) -> list[tuple[int, int, int, int, int]]:
    """Fix markdown fields of each JSON file concurrently and write results.

    At most *workers* files are processed at once.  Each file's tuple is
    (raw item count, dict items seen, items updated, fields seen, fields
    updated); tuples are collected in completion order.
    """
    gate = asyncio.Semaphore(workers)
    tick_lock = asyncio.Lock() if progress else None
    collected: list[tuple[int, int, int, int, int]] = []

    async def process(path: Path) -> tuple[int, int, int, int, int]:
        # Load, fix in place, then serialize either the bare list or the
        # original wrapper object with its "papers" list replaced.
        items, payload, template_tag = _load_json_payload(path)
        seen, updated, fields_seen, fields_updated = await _fix_json_items(
            items, template_tag, fix_level, format_enabled
        )
        if payload is None:
            document: Any = items
        else:
            payload["papers"] = items
            document = payload
        target = output_map[path]
        text = json.dumps(document, ensure_ascii=False, indent=2)
        await asyncio.to_thread(target.write_text, f"{text}\n", encoding="utf-8")
        return len(items), seen, updated, fields_seen, fields_updated

    async def worker(path: Path) -> None:
        async with gate:
            collected.append(await process(path))
            if progress and tick_lock:
                async with tick_lock:
                    progress.update(1)

    await asyncio.gather(*(worker(path) for path in paths))
    return collected
376
+
377
+
229
378
  @click.group()
230
379
  def recognize() -> None:
231
380
  """OCR recognition and Markdown post-processing commands."""
@@ -530,11 +679,12 @@ def organize(
530
679
  "inputs",
531
680
  multiple=True,
532
681
  required=True,
533
- help="Input markdown file or directory (repeatable)",
682
+ help="Input markdown or JSON file/directory (repeatable)",
534
683
  )
535
684
  @click.option("-o", "--output", "output_dir", default=None, help="Output directory")
536
685
  @click.option("--in-place", "in_place", is_flag=True, help="Fix markdown files in place")
537
- @click.option("-r", "--recursive", is_flag=True, help="Recursively discover markdown files")
686
+ @click.option("-r", "--recursive", is_flag=True, help="Recursively discover files")
687
+ @click.option("--json", "json_mode", is_flag=True, help="Fix markdown fields inside JSON outputs")
538
688
  @click.option(
539
689
  "--fix-level",
540
690
  "fix_level",
@@ -552,13 +702,14 @@ def recognize_fix(
552
702
  output_dir: str | None,
553
703
  in_place: bool,
554
704
  recursive: bool,
705
+ json_mode: bool,
555
706
  fix_level: str,
556
707
  no_format: bool,
557
708
  workers: int,
558
709
  dry_run: bool,
559
710
  verbose: bool,
560
711
  ) -> None:
561
- """Fix and format OCR markdown outputs."""
712
+ """Fix and format OCR markdown outputs (markdown or JSON)."""
562
713
  configure_logging(verbose)
563
714
  start_time = time.monotonic()
564
715
  if workers <= 0:
@@ -573,19 +724,58 @@ def recognize_fix(
573
724
  output_path = _ensure_output_dir(output_dir)
574
725
  _warn_if_not_empty(output_path)
575
726
 
576
- paths = discover_markdown(inputs, None, recursive=recursive)
727
+ if json_mode:
728
+ paths = discover_json(inputs, recursive=recursive)
729
+ else:
730
+ json_inputs: list[str] = []
731
+ md_inputs: list[str] = []
732
+ for raw in inputs:
733
+ path = Path(raw)
734
+ if path.is_file():
735
+ suffix = path.suffix.lower()
736
+ if suffix == ".json":
737
+ json_inputs.append(raw)
738
+ continue
739
+ if suffix == ".md":
740
+ md_inputs.append(raw)
741
+ continue
742
+ raise click.ClickException(f"Input file must be .md or .json: {path}")
743
+ if path.is_dir():
744
+ json_inputs.append(raw)
745
+ md_inputs.append(raw)
746
+ continue
747
+ raise click.ClickException(f"Input path not found: {path}")
748
+ json_paths = discover_json(json_inputs, recursive=recursive) if json_inputs else []
749
+ md_paths = discover_markdown(md_inputs, None, recursive=recursive) if md_inputs else []
750
+ if json_paths and not md_paths:
751
+ json_mode = True
752
+ paths = json_paths
753
+ click.echo("Detected JSON inputs; enabling --json mode")
754
+ elif md_paths and not json_paths:
755
+ paths = md_paths
756
+ elif json_paths and md_paths:
757
+ raise click.ClickException(
758
+ "Found both markdown and JSON inputs; split inputs or pass --json explicitly"
759
+ )
760
+ else:
761
+ paths = []
577
762
  if not paths:
578
- click.echo("No markdown files discovered")
763
+ click.echo("No files discovered")
579
764
  return
580
765
 
581
766
  format_enabled = not no_format
582
767
  if in_place:
583
768
  output_map = {path: path for path in paths}
584
769
  else:
585
- output_map = {path: (output_path / name) for path, name in _map_output_files(paths, [output_path]).items()}
770
+ ext = ".json" if json_mode else ".md"
771
+ output_map = {
772
+ path: (output_path / name)
773
+ for path, name in _map_output_files(paths, [output_path], ext=ext).items()
774
+ }
586
775
 
587
776
  if dry_run:
588
777
  rows = [
778
+ ("Mode", "json" if json_mode else "markdown"),
589
779
  ("Inputs", str(len(paths))),
590
780
  ("Outputs", str(len(output_map))),
591
781
  ("Fix level", fix_level),
@@ -599,25 +789,614 @@ def recognize_fix(
599
789
 
600
790
  progress = tqdm(total=len(paths), desc="fix", unit="file")
601
791
  try:
602
- asyncio.run(
603
- _run_fix(
604
- paths,
605
- output_map,
606
- fix_level,
607
- format_enabled,
608
- workers,
609
- progress,
792
+ if json_mode:
793
+ results = asyncio.run(
794
+ _run_fix_json(
795
+ paths,
796
+ output_map,
797
+ fix_level,
798
+ format_enabled,
799
+ workers,
800
+ progress,
801
+ )
802
+ )
803
+ else:
804
+ asyncio.run(
805
+ _run_fix(
806
+ paths,
807
+ output_map,
808
+ fix_level,
809
+ format_enabled,
810
+ workers,
811
+ progress,
812
+ )
813
+ )
814
+ finally:
815
+ progress.close()
816
+ if json_mode:
817
+ total_items = sum(result[0] for result in results)
818
+ items_processed = sum(result[1] for result in results)
819
+ items_updated = sum(result[2] for result in results)
820
+ fields_total = sum(result[3] for result in results)
821
+ fields_updated = sum(result[4] for result in results)
822
+ items_skipped = total_items - items_processed
823
+ rows = [
824
+ ("Mode", "json"),
825
+ ("Inputs", str(len(paths))),
826
+ ("Outputs", str(len(output_map))),
827
+ ("Items", str(total_items)),
828
+ ("Items processed", str(items_processed)),
829
+ ("Items skipped", str(items_skipped)),
830
+ ("Items updated", str(items_updated)),
831
+ ("Fields processed", str(fields_total)),
832
+ ("Fields updated", str(fields_updated)),
833
+ ("Fix level", fix_level),
834
+ ("Format", "no" if no_format else "yes"),
835
+ ("In place", "yes" if in_place else "no"),
836
+ ("Output dir", _relative_path(output_path) if output_path else "-"),
837
+ ("Duration", _format_duration(time.monotonic() - start_time)),
838
+ ]
839
+ else:
840
+ rows = [
841
+ ("Mode", "markdown"),
842
+ ("Inputs", str(len(paths))),
843
+ ("Outputs", str(len(output_map))),
844
+ ("Fix level", fix_level),
845
+ ("Format", "no" if no_format else "yes"),
846
+ ("In place", "yes" if in_place else "no"),
847
+ ("Output dir", _relative_path(output_path) if output_path else "-"),
848
+ ("Duration", _format_duration(time.monotonic() - start_time)),
849
+ ]
850
+ _print_summary("recognize fix", rows)
851
+
852
+
853
+ @recognize.command("fix-math")
854
+ @click.option("-c", "--config", "config_path", default="config.toml", help="Path to config.toml")
855
+ @click.option(
856
+ "-i",
857
+ "--input",
858
+ "inputs",
859
+ multiple=True,
860
+ required=True,
861
+ help="Input markdown or JSON file/directory (repeatable)",
862
+ )
863
+ @click.option("-o", "--output", "output_dir", default=None, help="Output directory")
864
+ @click.option("--in-place", "in_place", is_flag=True, help="Fix formulas in place")
865
+ @click.option("-r", "--recursive", is_flag=True, help="Recursively discover files")
866
+ @click.option("--json", "json_mode", is_flag=True, help="Process JSON inputs instead of markdown")
867
+ @click.option("-m", "--model", "model_ref", required=True, help="provider/model")
868
+ @click.option("--batch-size", "batch_size", default=10, show_default=True, type=int)
869
+ @click.option("--context-chars", "context_chars", default=80, show_default=True, type=int)
870
+ @click.option("--max-retries", "max_retries", default=3, show_default=True, type=int)
871
+ @click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
872
+ @click.option("--timeout", "timeout", default=120.0, show_default=True, type=float)
873
+ @click.option(
874
+ "--only-show-error",
875
+ "only_show_error",
876
+ is_flag=True,
877
+ help="Only validate formulas and report error counts",
878
+ )
879
+ @click.option("--report", "report_path", default=None, help="Error report output path")
880
+ @click.option("--dry-run", is_flag=True, help="Report actions without writing files")
881
+ @click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
882
+ def recognize_fix_math(
883
+ config_path: str,
884
+ inputs: tuple[str, ...],
885
+ output_dir: str | None,
886
+ in_place: bool,
887
+ recursive: bool,
888
+ json_mode: bool,
889
+ model_ref: str,
890
+ batch_size: int,
891
+ context_chars: int,
892
+ max_retries: int,
893
+ workers: int,
894
+ timeout: float,
895
+ only_show_error: bool,
896
+ report_path: str | None,
897
+ dry_run: bool,
898
+ verbose: bool,
899
+ ) -> None:
900
+ """Validate and repair LaTeX formulas in markdown or JSON outputs."""
901
+ configure_logging(verbose)
902
+ if in_place and output_dir:
903
+ raise click.ClickException("--in-place cannot be used with --output")
904
+ if not only_show_error and not in_place and not output_dir:
905
+ raise click.ClickException("Either --in-place or --output is required")
906
+ if batch_size <= 0:
907
+ raise click.ClickException("--batch-size must be positive")
908
+ if context_chars < 0:
909
+ raise click.ClickException("--context-chars must be non-negative")
910
+ if max_retries < 0:
911
+ raise click.ClickException("--max-retries must be non-negative")
912
+ if workers <= 0:
913
+ raise click.ClickException("--workers must be positive")
914
+ try:
915
+ require_pylatexenc()
916
+ except RuntimeError as exc:
917
+ raise click.ClickException(str(exc)) from exc
918
+
919
+ if not json_mode:
920
+ file_types: set[str] = set()
921
+ for raw in inputs:
922
+ path = Path(raw)
923
+ if path.is_file():
924
+ suffix = path.suffix.lower()
925
+ if suffix in {".md", ".json"}:
926
+ file_types.add(suffix)
927
+ if ".md" in file_types and ".json" in file_types:
928
+ raise click.ClickException(
929
+ "Mixed markdown and JSON inputs. Use --json for JSON or split commands."
610
930
  )
931
+ if ".json" in file_types:
932
+ json_mode = True
933
+ logger.info("Detected JSON inputs; enabling --json mode")
934
+
935
+ config = load_config(config_path)
936
+ provider, model_name = parse_model_ref(model_ref, config.providers)
937
+ api_keys = resolve_api_keys(provider.api_keys)
938
+ if provider.type in {
939
+ "openai_compatible",
940
+ "dashscope",
941
+ "gemini_ai_studio",
942
+ "azure_openai",
943
+ "claude",
944
+ } and not api_keys:
945
+ raise click.ClickException(f"{provider.type} providers require api_keys")
946
+ api_key = api_keys[0] if api_keys else None
947
+
948
+ if json_mode:
949
+ paths = discover_json(inputs, recursive=recursive)
950
+ else:
951
+ paths = discover_markdown(inputs, None, recursive=recursive)
952
+ if not paths:
953
+ click.echo("No files discovered")
954
+ return
955
+
956
+ output_path = Path(output_dir) if output_dir else None
957
+ if output_path and not dry_run and not only_show_error:
958
+ output_path = _ensure_output_dir(output_dir)
959
+ _warn_if_not_empty(output_path)
960
+
961
+ if in_place:
962
+ output_map = {path: path for path in paths}
963
+ elif output_path:
964
+ ext = ".json" if json_mode else ".md"
965
+ output_map = {
966
+ path: (output_path / name)
967
+ for path, name in _map_output_files(paths, [output_path], ext=ext).items()
968
+ }
969
+ else:
970
+ output_map = {path: path for path in paths}
971
+
972
+ report_target = None
973
+ if report_path:
974
+ report_target = Path(report_path)
975
+ elif not only_show_error:
976
+ if output_path:
977
+ report_target = output_path / "fix-math-errors.json"
978
+ elif in_place:
979
+ report_target = Path.cwd() / "fix-math-errors.json"
980
+
981
+ if dry_run and not only_show_error:
982
+ rows = [
983
+ ("Mode", "json" if json_mode else "markdown"),
984
+ ("Inputs", str(len(paths))),
985
+ ("Outputs", str(len(output_map))),
986
+ ("Batch size", str(batch_size)),
987
+ ("Context chars", str(context_chars)),
988
+ ("Max retries", str(max_retries)),
989
+ ("Workers", str(workers)),
990
+ ("Timeout", f"{timeout:.1f}s"),
991
+ ("Only show error", "yes" if only_show_error else "no"),
992
+ ("In place", "yes" if in_place else "no"),
993
+ ("Output dir", _relative_path(output_path) if output_path else "-"),
994
+ ("Report", _relative_path(report_target) if report_target else "-"),
995
+ ]
996
+ _print_summary("recognize fix-math (dry-run)", rows)
997
+ return
998
+
999
+ progress = tqdm(total=len(paths), desc="fix-math", unit="file")
1000
+ formula_progress = tqdm(total=0, desc="formulas", unit="formula")
1001
+ error_records: list[dict[str, Any]] = []
1002
+
1003
+ async def run() -> MathFixStats:
1004
+ semaphore = asyncio.Semaphore(workers)
1005
+ progress_lock = asyncio.Lock()
1006
+ stats_total = MathFixStats()
1007
+
1008
+ async with httpx.AsyncClient() as client:
1009
+ async def handle_path(path: Path) -> MathFixStats:
1010
+ stats = MathFixStats()
1011
+ if json_mode:
1012
+ raw_text = read_text(path)
1013
+ items, payload, template_tag = _load_json_payload(path)
1014
+ cursor = 0
1015
+ for item_index, item in enumerate(items):
1016
+ if not isinstance(item, dict):
1017
+ continue
1018
+ template = _resolve_item_template(item, template_tag)
1019
+ fields = _template_markdown_fields(template)
1020
+ for field in fields:
1021
+ value = item.get(field)
1022
+ if not isinstance(value, str):
1023
+ continue
1024
+ spans = extract_math_spans(value, context_chars)
1025
+ if spans:
1026
+ formula_progress.total += len(spans)
1027
+ formula_progress.refresh()
1028
+ line_start, cursor = locate_json_field_start(raw_text, value, cursor)
1029
+ field_path = f"papers[{item_index}].{field}"
1030
+ updated, errors = await fix_math_text(
1031
+ value,
1032
+ str(path),
1033
+ line_start,
1034
+ field_path,
1035
+ item_index,
1036
+ provider,
1037
+ model_name,
1038
+ api_key,
1039
+ timeout,
1040
+ max_retries,
1041
+ batch_size,
1042
+ context_chars,
1043
+ client,
1044
+ stats,
1045
+ repair_enabled=not only_show_error,
1046
+ spans=spans,
1047
+ progress_cb=lambda: formula_progress.update(1),
1048
+ )
1049
+ if not only_show_error and updated != value:
1050
+ item[field] = updated
1051
+ error_records.extend(errors)
1052
+ if not only_show_error:
1053
+ output_data: Any = items if payload is None else {**payload, "papers": items}
1054
+ output_path = output_map[path]
1055
+ serialized = json.dumps(output_data, ensure_ascii=False, indent=2)
1056
+ await asyncio.to_thread(output_path.write_text, f"{serialized}\n", encoding="utf-8")
1057
+ else:
1058
+ content = await asyncio.to_thread(read_text, path)
1059
+ spans = extract_math_spans(content, context_chars)
1060
+ if spans:
1061
+ formula_progress.total += len(spans)
1062
+ formula_progress.refresh()
1063
+ updated, errors = await fix_math_text(
1064
+ content,
1065
+ str(path),
1066
+ 1,
1067
+ None,
1068
+ None,
1069
+ provider,
1070
+ model_name,
1071
+ api_key,
1072
+ timeout,
1073
+ max_retries,
1074
+ batch_size,
1075
+ context_chars,
1076
+ client,
1077
+ stats,
1078
+ repair_enabled=not only_show_error,
1079
+ spans=spans,
1080
+ progress_cb=lambda: formula_progress.update(1),
1081
+ )
1082
+ if not only_show_error:
1083
+ output_path = output_map[path]
1084
+ await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
1085
+ error_records.extend(errors)
1086
+ return stats
1087
+
1088
+ async def runner(path: Path) -> None:
1089
+ async with semaphore:
1090
+ stats = await handle_path(path)
1091
+ stats_total.formulas_total += stats.formulas_total
1092
+ stats_total.formulas_invalid += stats.formulas_invalid
1093
+ stats_total.formulas_cleaned += stats.formulas_cleaned
1094
+ stats_total.formulas_repaired += stats.formulas_repaired
1095
+ stats_total.formulas_failed += stats.formulas_failed
1096
+ async with progress_lock:
1097
+ progress.update(1)
1098
+
1099
+ await asyncio.gather(*(runner(path) for path in paths))
1100
+ return stats_total
1101
+
1102
+ try:
1103
+ stats = asyncio.run(run())
1104
+ finally:
1105
+ progress.close()
1106
+ formula_progress.close()
1107
+
1108
+ if report_target and error_records:
1109
+ report_target.parent.mkdir(parents=True, exist_ok=True)
1110
+ report_target.write_text(
1111
+ json.dumps(error_records, ensure_ascii=False, indent=2) + "\n",
1112
+ encoding="utf-8",
611
1113
  )
1114
+
1115
+ rows = [
1116
+ ("Mode", "json" if json_mode else "markdown"),
1117
+ ("Inputs", str(len(paths))),
1118
+ ("Outputs", str(len(output_map) if not only_show_error else 0)),
1119
+ ("Formulas", str(stats.formulas_total)),
1120
+ ("Invalid", str(stats.formulas_invalid)),
1121
+ ("Cleaned", str(stats.formulas_cleaned)),
1122
+ ("Repaired", str(stats.formulas_repaired)),
1123
+ ("Failed", str(stats.formulas_failed)),
1124
+ ("Only show error", "yes" if only_show_error else "no"),
1125
+ ("Report", _relative_path(report_target) if report_target else "-"),
1126
+ ]
1127
+ _print_summary("recognize fix-math", rows)
1128
+
1129
+
1130
+ @recognize.command("fix-mermaid")
1131
+ @click.option("-c", "--config", "config_path", default="config.toml", help="Path to config.toml")
1132
+ @click.option(
1133
+ "-i",
1134
+ "--input",
1135
+ "inputs",
1136
+ multiple=True,
1137
+ required=True,
1138
+ help="Input markdown or JSON file/directory (repeatable)",
1139
+ )
1140
+ @click.option("-o", "--output", "output_dir", default=None, help="Output directory")
1141
+ @click.option("--in-place", "in_place", is_flag=True, help="Fix Mermaid blocks in place")
1142
+ @click.option("-r", "--recursive", is_flag=True, help="Recursively discover files")
1143
+ @click.option("--json", "json_mode", is_flag=True, help="Process JSON inputs instead of markdown")
1144
+ @click.option("-m", "--model", "model_ref", required=True, help="provider/model")
1145
+ @click.option("--batch-size", "batch_size", default=10, show_default=True, type=int)
1146
+ @click.option("--context-chars", "context_chars", default=80, show_default=True, type=int)
1147
+ @click.option("--max-retries", "max_retries", default=3, show_default=True, type=int)
1148
+ @click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
1149
+ @click.option("--timeout", "timeout", default=120.0, show_default=True, type=float)
1150
+ @click.option(
1151
+ "--only-show-error",
1152
+ "only_show_error",
1153
+ is_flag=True,
1154
+ help="Only validate Mermaid blocks and report error counts",
1155
+ )
1156
+ @click.option("--report", "report_path", default=None, help="Error report output path")
1157
+ @click.option("--dry-run", is_flag=True, help="Report actions without writing files")
1158
+ @click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
1159
+ def recognize_fix_mermaid(
1160
+ config_path: str,
1161
+ inputs: tuple[str, ...],
1162
+ output_dir: str | None,
1163
+ in_place: bool,
1164
+ recursive: bool,
1165
+ json_mode: bool,
1166
+ model_ref: str,
1167
+ batch_size: int,
1168
+ context_chars: int,
1169
+ max_retries: int,
1170
+ workers: int,
1171
+ timeout: float,
1172
+ only_show_error: bool,
1173
+ report_path: str | None,
1174
+ dry_run: bool,
1175
+ verbose: bool,
1176
+ ) -> None:
1177
+ """Validate and repair Mermaid diagrams in markdown or JSON outputs."""
1178
+ configure_logging(verbose)
1179
+ if in_place and output_dir:
1180
+ raise click.ClickException("--in-place cannot be used with --output")
1181
+ if not only_show_error and not in_place and not output_dir:
1182
+ raise click.ClickException("Either --in-place or --output is required")
1183
+ if batch_size <= 0:
1184
+ raise click.ClickException("--batch-size must be positive")
1185
+ if context_chars < 0:
1186
+ raise click.ClickException("--context-chars must be non-negative")
1187
+ if max_retries < 0:
1188
+ raise click.ClickException("--max-retries must be non-negative")
1189
+ if workers <= 0:
1190
+ raise click.ClickException("--workers must be positive")
1191
+ try:
1192
+ require_mmdc()
1193
+ except RuntimeError as exc:
1194
+ raise click.ClickException(str(exc)) from exc
1195
+
1196
+ if not json_mode:
1197
+ file_types: set[str] = set()
1198
+ for raw in inputs:
1199
+ path = Path(raw)
1200
+ if path.is_file():
1201
+ suffix = path.suffix.lower()
1202
+ if suffix in {".md", ".json"}:
1203
+ file_types.add(suffix)
1204
+ if ".md" in file_types and ".json" in file_types:
1205
+ raise click.ClickException(
1206
+ "Mixed markdown and JSON inputs. Use --json for JSON or split commands."
1207
+ )
1208
+ if ".json" in file_types:
1209
+ json_mode = True
1210
+ logger.info("Detected JSON inputs; enabling --json mode")
1211
+
1212
+ config = load_config(config_path)
1213
+ provider, model_name = parse_model_ref(model_ref, config.providers)
1214
+ api_keys = resolve_api_keys(provider.api_keys)
1215
+ if provider.type in {
1216
+ "openai_compatible",
1217
+ "dashscope",
1218
+ "gemini_ai_studio",
1219
+ "azure_openai",
1220
+ "claude",
1221
+ } and not api_keys:
1222
+ raise click.ClickException(f"{provider.type} providers require api_keys")
1223
+ api_key = api_keys[0] if api_keys else None
1224
+
1225
+ if json_mode:
1226
+ paths = discover_json(inputs, recursive=recursive)
1227
+ else:
1228
+ paths = discover_markdown(inputs, None, recursive=recursive)
1229
+ if not paths:
1230
+ click.echo("No files discovered")
1231
+ return
1232
+
1233
+ output_path = Path(output_dir) if output_dir else None
1234
+ if output_path and not dry_run and not only_show_error:
1235
+ output_path = _ensure_output_dir(output_dir)
1236
+ _warn_if_not_empty(output_path)
1237
+
1238
+ if in_place:
1239
+ output_map = {path: path for path in paths}
1240
+ elif output_path:
1241
+ ext = ".json" if json_mode else ".md"
1242
+ output_map = {
1243
+ path: (output_path / name)
1244
+ for path, name in _map_output_files(paths, [output_path], ext=ext).items()
1245
+ }
1246
+ else:
1247
+ output_map = {path: path for path in paths}
1248
+
1249
+ report_target = None
1250
+ if report_path:
1251
+ report_target = Path(report_path)
1252
+ elif not only_show_error:
1253
+ if output_path:
1254
+ report_target = output_path / "fix-mermaid-errors.json"
1255
+ elif in_place:
1256
+ report_target = Path.cwd() / "fix-mermaid-errors.json"
1257
+
1258
+ if dry_run and not only_show_error:
1259
+ rows = [
1260
+ ("Mode", "json" if json_mode else "markdown"),
1261
+ ("Inputs", str(len(paths))),
1262
+ ("Outputs", str(len(output_map))),
1263
+ ("Batch size", str(batch_size)),
1264
+ ("Context chars", str(context_chars)),
1265
+ ("Max retries", str(max_retries)),
1266
+ ("Workers", str(workers)),
1267
+ ("Timeout", f"{timeout:.1f}s"),
1268
+ ("Only show error", "yes" if only_show_error else "no"),
1269
+ ("In place", "yes" if in_place else "no"),
1270
+ ("Output dir", _relative_path(output_path) if output_path else "-"),
1271
+ ("Report", _relative_path(report_target) if report_target else "-"),
1272
+ ]
1273
+ _print_summary("recognize fix-mermaid (dry-run)", rows)
1274
+ return
1275
+
1276
+ progress = tqdm(total=len(paths), desc="fix-mermaid", unit="file")
1277
+ diagram_progress = tqdm(total=0, desc="diagrams", unit="diagram")
1278
+ error_records: list[dict[str, Any]] = []
1279
+
1280
async def run() -> MermaidFixStats:
    """Repair mermaid diagrams in all discovered files concurrently.

    Returns the aggregated :class:`MermaidFixStats` across every file.

    NOTE(review): this is a closure over the enclosing CLI command's state —
    ``workers``, ``json_mode``, ``only_show_error``, ``output_map``, the
    provider/model/api_key/timeout/retry/batch settings, the two tqdm bars
    (``progress`` per file, ``diagram_progress`` per diagram) and the shared
    ``error_records`` list. Reconstructed formatting: indentation is inferred
    from the diff rendering, not guaranteed byte-identical to the original.
    """
    # Bound the number of files processed at once; each runner holds one slot.
    semaphore = asyncio.Semaphore(workers)
    progress_lock = asyncio.Lock()
    stats_total = MermaidFixStats()

    # One shared HTTP client is reused for every repair request.
    async with httpx.AsyncClient() as client:
        async def handle_path(path: Path) -> MermaidFixStats:
            """Fix diagrams in a single file and return that file's stats."""
            stats = MermaidFixStats()
            if json_mode:
                # JSON mode: walk the "papers" items and repair each
                # markdown-bearing field declared by the item's template.
                raw_text = read_text(path)
                items, payload, template_tag = _load_json_payload(path)
                # Forward-moving search offset into raw_text used to map each
                # field value back to its line number for error reports.
                cursor = 0
                for item_index, item in enumerate(items):
                    if not isinstance(item, dict):
                        continue
                    template = _resolve_item_template(item, template_tag)
                    fields = _template_markdown_fields(template)
                    for field in fields:
                        value = item.get(field)
                        if not isinstance(value, str):
                            continue
                        spans = extract_mermaid_spans(value, context_chars)
                        if spans:
                            # Diagram total is discovered lazily; grow the bar
                            # as new diagrams are found. Safe without a lock:
                            # all mutation happens on the single event loop.
                            diagram_progress.total += len(spans)
                            diagram_progress.refresh()
                        # Presumably fields occur in file order so the cursor
                        # only ever advances — TODO confirm against
                        # locate_json_field_start's contract.
                        line_start, cursor = locate_json_field_start(raw_text, value, cursor)
                        field_path = f"papers[{item_index}].{field}"
                        updated, errors = await fix_mermaid_text(
                            value,
                            str(path),
                            line_start,
                            field_path,
                            item_index,
                            provider,
                            model_name,
                            api_key,
                            timeout,
                            max_retries,
                            batch_size,
                            context_chars,
                            client,
                            stats,
                            # In "only show error" mode we validate/report but
                            # never rewrite diagram text.
                            repair_enabled=not only_show_error,
                            spans=spans,
                            progress_cb=lambda: diagram_progress.update(1),
                        )
                        if not only_show_error and updated != value:
                            item[field] = updated
                        error_records.extend(errors)
                if not only_show_error:
                    # Re-wrap items in the original payload envelope (if any)
                    # and write the result; file I/O is pushed off the loop.
                    output_data: Any = items if payload is None else {**payload, "papers": items}
                    output_path = output_map[path]
                    serialized = json.dumps(output_data, ensure_ascii=False, indent=2)
                    await asyncio.to_thread(output_path.write_text, f"{serialized}\n", encoding="utf-8")
            else:
                # Markdown mode: the whole file is one text to scan and fix.
                content = await asyncio.to_thread(read_text, path)
                spans = extract_mermaid_spans(content, context_chars)
                if spans:
                    diagram_progress.total += len(spans)
                    diagram_progress.refresh()
                # NOTE(review): diff rendering loses indentation — this call
                # appears to run unconditionally (fix_mermaid_text receives
                # spans and is a no-op for an empty list); confirm against the
                # original source.
                updated, errors = await fix_mermaid_text(
                    content,
                    str(path),
                    1,
                    None,
                    None,
                    provider,
                    model_name,
                    api_key,
                    timeout,
                    max_retries,
                    batch_size,
                    context_chars,
                    client,
                    stats,
                    repair_enabled=not only_show_error,
                    spans=spans,
                    progress_cb=lambda: diagram_progress.update(1),
                )
                if not only_show_error:
                    output_path = output_map[path]
                    await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
                error_records.extend(errors)
            return stats

        async def runner(path: Path) -> None:
            """Semaphore-gated wrapper: process one file, then fold its stats
            into the shared totals and tick the file progress bar."""
            async with semaphore:
                stats = await handle_path(path)
                stats_total.diagrams_total += stats.diagrams_total
                stats_total.diagrams_invalid += stats.diagrams_invalid
                stats_total.diagrams_repaired += stats.diagrams_repaired
                stats_total.diagrams_failed += stats.diagrams_failed
                async with progress_lock:
                    progress.update(1)

        await asyncio.gather(*(runner(path) for path in paths))
    return stats_total
1377
+
1378
+ try:
1379
+ stats = asyncio.run(run())
612
1380
  finally:
613
1381
  progress.close()
1382
+ diagram_progress.close()
1383
+
1384
+ if report_target and error_records:
1385
+ report_target.parent.mkdir(parents=True, exist_ok=True)
1386
+ report_target.write_text(
1387
+ json.dumps(error_records, ensure_ascii=False, indent=2) + "\n",
1388
+ encoding="utf-8",
1389
+ )
1390
+
614
1391
  rows = [
1392
+ ("Mode", "json" if json_mode else "markdown"),
615
1393
  ("Inputs", str(len(paths))),
616
- ("Outputs", str(len(output_map))),
617
- ("Fix level", fix_level),
618
- ("Format", "no" if no_format else "yes"),
619
- ("In place", "yes" if in_place else "no"),
620
- ("Output dir", _relative_path(output_path) if output_path else "-"),
621
- ("Duration", _format_duration(time.monotonic() - start_time)),
1394
+ ("Outputs", str(len(output_map) if not only_show_error else 0)),
1395
+ ("Diagrams", str(stats.diagrams_total)),
1396
+ ("Invalid", str(stats.diagrams_invalid)),
1397
+ ("Repaired", str(stats.diagrams_repaired)),
1398
+ ("Failed", str(stats.diagrams_failed)),
1399
+ ("Only show error", "yes" if only_show_error else "no"),
1400
+ ("Report", _relative_path(report_target) if report_target else "-"),
622
1401
  ]
623
- _print_summary("recognize fix", rows)
1402
+ _print_summary("recognize fix-mermaid", rows)