PyPI - deepresearch-flow - Versions diffs - 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

deepresearch-flow 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

deepresearch_flow/cli.py +2 -0
deepresearch_flow/paper/config.py +15 -0
deepresearch_flow/paper/db.py +9 -0
deepresearch_flow/paper/llm.py +2 -0
deepresearch_flow/paper/web/app.py +413 -20
deepresearch_flow/recognize/cli.py +157 -3
deepresearch_flow/recognize/organize.py +58 -0
deepresearch_flow/translator/__init__.py +1 -0
deepresearch_flow/translator/cli.py +451 -0
deepresearch_flow/translator/config.py +19 -0
deepresearch_flow/translator/engine.py +959 -0
deepresearch_flow/translator/fixers.py +451 -0
deepresearch_flow/translator/placeholder.py +62 -0
deepresearch_flow/translator/prompts.py +116 -0
deepresearch_flow/translator/protector.py +291 -0
deepresearch_flow/translator/segment.py +180 -0
deepresearch_flow-0.3.0.dist-info/METADATA +306 -0
{deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/RECORD +22 -13
deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
{deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/WHEEL +0 -0
{deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/entry_points.txt +0 -0
{deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/licenses/LICENSE +0 -0
{deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/top_level.txt +0 -0

deepresearch_flow/recognize/cli.py CHANGED Viewed

@@ -26,7 +26,11 @@ from deepresearch_flow.recognize.markdown import (
     sanitize_filename,
     unpack_markdown_images,
 )
-from deepresearch_flow.recognize.organize import discover_mineru_dirs, organize_mineru_dir
+from deepresearch_flow.recognize.organize import (
+    discover_mineru_dirs,
+    fix_markdown_text,
+    organize_mineru_dir,
+)
 logger = logging.getLogger(__name__)
@@ -180,6 +184,8 @@ async def _run_organize(
     output_base64: Path | None,
     output_map: dict[Path, str],
     workers: int,
+    fix_level: str | None,
+    format_enabled: bool,
     progress: tqdm | None,
 ) -> None:
     image_registry = None
@@ -196,11 +202,30 @@ async def _run_organize(
             output_base64,
             output_filename,
             image_registry,
+            fix_level,
+            format_enabled,
         )
     await _run_with_workers(layout_dirs, workers, handler, progress=progress)
+async def _run_fix(
+    paths: list[Path],
+    output_map: dict[Path, Path],
+    fix_level: str,
+    format_enabled: bool,
+    workers: int,
+    progress: tqdm | None,
+) -> None:
+    async def handler(path: Path) -> None:
+        content = await asyncio.to_thread(read_text, path)
+        updated = await fix_markdown_text(content, fix_level, format_enabled)
+        output_path = output_map[path]
+        await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
+    await _run_with_workers(paths, workers, handler, progress=progress)
 @click.group()
 def recognize() -> None:
     """OCR recognition and Markdown post-processing commands."""
@@ -364,7 +389,7 @@ def unpack(
     )
-@recognize.command()
+@recognize.group(invoke_without_command=True)
 @click.option(
     "--layout",
     "layout",
@@ -378,28 +403,47 @@ def unpack(
     "--input",
     "inputs",
     multiple=True,
-    required=True,
+    required=False,
     help="Input directory (repeatable)",
 )
 @click.option("-r", "--recursive", is_flag=True, help="Recursively search for layout folders")
 @click.option("--output-simple", "output_simple", default=None, help="Output directory for copied markdown")
 @click.option("--output-base64", "output_base64", default=None, help="Output directory for embedded markdown")
+@click.option("--fix", "enable_fix", is_flag=True, help="Apply OCR fix and rumdl formatting")
+@click.option(
+    "--fix-level",
+    "fix_level",
+    default="moderate",
+    type=click.Choice(["off", "moderate", "aggressive"]),
+    show_default=True,
+    help="OCR fix level",
+)
+@click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
 @click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
 @click.option("--dry-run", is_flag=True, help="Report actions without writing files")
 @click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
+@click.pass_context
 def organize(
+    ctx: click.Context,
     layout: str,
     inputs: tuple[str, ...],
     recursive: bool,
     output_simple: str | None,
     output_base64: str | None,
+    enable_fix: bool,
+    fix_level: str,
+    no_format: bool,
     workers: int,
     dry_run: bool,
     verbose: bool,
 ) -> None:
     """Organize OCR outputs into markdown files."""
+    if ctx.invoked_subcommand:
+        return
     configure_logging(verbose)
     start_time = time.monotonic()
+    if not inputs:
+        raise click.ClickException("--input is required")
     if workers <= 0:
         raise click.ClickException("--workers must be positive")
     if output_simple is None and output_base64 is None:
@@ -424,6 +468,8 @@ def organize(
     output_map = _map_output_files(layout_dirs, output_dirs)
     image_counts = _aggregate_image_counts([path / "full.md" for path in layout_dirs])
+    fix_value = fix_level if enable_fix else None
+    format_enabled = enable_fix and not no_format
     if dry_run:
         rows = [
             ("Layout", layout),
@@ -433,6 +479,9 @@ def organize(
             ("Images data", str(image_counts["data"])),
             ("Images http", str(image_counts["http"])),
             ("Images local", str(image_counts["local"])),
+            ("Fix", "yes" if enable_fix else "no"),
+            ("Fix level", fix_level if enable_fix else "-"),
+            ("Format", "no" if no_format else ("yes" if enable_fix else "-")),
             ("Output simple", _relative_path(output_simple_path) if output_simple_path else "-"),
             ("Output base64", _relative_path(output_base64_path) if output_base64_path else "-"),
             ("Duration", _format_duration(time.monotonic() - start_time)),
@@ -449,6 +498,8 @@ def organize(
                 output_base64_path,
                 output_map,
                 workers,
+                fix_value,
+                format_enabled,
                 progress,
             )
         )
@@ -462,8 +513,111 @@ def organize(
         ("Images data", str(image_counts["data"])),
         ("Images http", str(image_counts["http"])),
         ("Images local", str(image_counts["local"])),
+        ("Fix", "yes" if enable_fix else "no"),
+        ("Fix level", fix_level if enable_fix else "-"),
+        ("Format", "no" if no_format else ("yes" if enable_fix else "-")),
         ("Output simple", _relative_path(output_simple_path) if output_simple_path else "-"),
         ("Output base64", _relative_path(output_base64_path) if output_base64_path else "-"),
         ("Duration", _format_duration(time.monotonic() - start_time)),
     ]
     _print_summary("recognize organize", rows)
+@recognize.command("fix")
+@click.option(
+    "-i",
+    "--input",
+    "inputs",
+    multiple=True,
+    required=True,
+    help="Input markdown file or directory (repeatable)",
+)
+@click.option("-o", "--output", "output_dir", default=None, help="Output directory")
+@click.option("--in-place", "in_place", is_flag=True, help="Fix markdown files in place")
+@click.option("-r", "--recursive", is_flag=True, help="Recursively discover markdown files")
+@click.option(
+    "--fix-level",
+    "fix_level",
+    default="moderate",
+    type=click.Choice(["off", "moderate", "aggressive"]),
+    show_default=True,
+    help="OCR fix level",
+)
+@click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
+@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
+@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
+@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
+def recognize_fix(
+    inputs: tuple[str, ...],
+    output_dir: str | None,
+    in_place: bool,
+    recursive: bool,
+    fix_level: str,
+    no_format: bool,
+    workers: int,
+    dry_run: bool,
+    verbose: bool,
+) -> None:
+    """Fix and format OCR markdown outputs."""
+    configure_logging(verbose)
+    start_time = time.monotonic()
+    if workers <= 0:
+        raise click.ClickException("--workers must be positive")
+    if in_place and output_dir:
+        raise click.ClickException("--in-place cannot be used with --output")
+    if not in_place and not output_dir:
+        raise click.ClickException("Either --in-place or --output is required")
+    output_path = Path(output_dir) if output_dir else None
+    if output_path and not dry_run:
+        output_path = _ensure_output_dir(output_dir)
+        _warn_if_not_empty(output_path)
+    paths = discover_markdown(inputs, None, recursive=recursive)
+    if not paths:
+        click.echo("No markdown files discovered")
+        return
+    format_enabled = not no_format
+    if in_place:
+        output_map = {path: path for path in paths}
+    else:
+        output_map = {path: (output_path / name) for path, name in _map_output_files(paths, [output_path]).items()}
+    if dry_run:
+        rows = [
+            ("Inputs", str(len(paths))),
+            ("Outputs", str(len(output_map))),
+            ("Fix level", fix_level),
+            ("Format", "no" if no_format else "yes"),
+            ("In place", "yes" if in_place else "no"),
+            ("Output dir", _relative_path(output_path) if output_path else "-"),
+            ("Duration", _format_duration(time.monotonic() - start_time)),
+        ]
+        _print_summary("recognize fix (dry-run)", rows)
+        return
+    progress = tqdm(total=len(paths), desc="fix", unit="file")
+    try:
+        asyncio.run(
+            _run_fix(
+                paths,
+                output_map,
+                fix_level,
+                format_enabled,
+                workers,
+                progress,
+            )
+        )
+    finally:
+        progress.close()
+    rows = [
+        ("Inputs", str(len(paths))),
+        ("Outputs", str(len(output_map))),
+        ("Fix level", fix_level),
+        ("Format", "no" if no_format else "yes"),
+        ("In place", "yes" if in_place else "no"),
+        ("Output dir", _relative_path(output_path) if output_path else "-"),
+        ("Duration", _format_duration(time.monotonic() - start_time)),
+    ]
+    _print_summary("recognize fix", rows)

deepresearch_flow/recognize/organize.py CHANGED Viewed

@@ -5,9 +5,12 @@ from __future__ import annotations
 import asyncio
 import logging
 import shutil
+import subprocess
 from pathlib import Path
 from typing import Iterable
+from deepresearch_flow.translator.fixers import fix_markdown
 from deepresearch_flow.recognize.markdown import (
     NameRegistry,
     embed_markdown_images,
@@ -20,6 +23,53 @@ from deepresearch_flow.recognize.markdown import (
 logger = logging.getLogger(__name__)
+_RUMDL_PATH = shutil.which("rumdl")
+_RUMDL_WARNED = False
+async def _format_markdown(text: str) -> str:
+    global _RUMDL_WARNED
+    if not _RUMDL_PATH:
+        if not _RUMDL_WARNED:
+            logger.warning("rumdl not available; skip markdown formatting")
+            _RUMDL_WARNED = True
+        return text
+    def run_formatter() -> str:
+        try:
+            proc = subprocess.run(
+                [_RUMDL_PATH, "fmt", "--stdin", "--quiet"],
+                input=text,
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+        except OSError as exc:
+            logger.warning("rumdl fmt failed: %s", exc)
+            return text
+        if proc.returncode != 0:
+            logger.warning("rumdl fmt failed (%s): %s", proc.returncode, proc.stderr.strip())
+            return text
+        return proc.stdout or text
+    return await asyncio.to_thread(run_formatter)
+def _apply_fix(text: str, fix_level: str) -> str:
+    if fix_level == "off":
+        return text
+    return fix_markdown(text, fix_level)
+async def fix_markdown_text(
+    text: str,
+    fix_level: str,
+    format_enabled: bool,
+) -> str:
+    text = _apply_fix(text, fix_level)
+    if format_enabled:
+        text = await _format_markdown(text)
+    return text
 def discover_mineru_dirs(inputs: Iterable[str], recursive: bool) -> list[Path]:
@@ -61,9 +111,13 @@ async def organize_mineru_dir(
     output_base64: Path | None,
     output_filename: str,
     image_registry: NameRegistry | None,
+    fix_level: str | None,
+    format_enabled: bool,
 ) -> None:
     md_path = layout_dir / "full.md"
     content = await asyncio.to_thread(read_text, md_path)
+    if fix_level is not None:
+        content = _apply_fix(content, fix_level)
     if output_simple is not None and image_registry is not None:
         images_dir = output_simple / "images"
@@ -86,10 +140,14 @@ async def organize_mineru_dir(
             return f"images/{filename}"
         updated = await rewrite_markdown_images(content, replace_simple)
+        if format_enabled:
+            updated = await _format_markdown(updated)
         output_path = output_simple / output_filename
         await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
     if output_base64 is not None:
         updated = await embed_markdown_images(content, md_path, False, None)
+        if format_enabled:
+            updated = await _format_markdown(updated)
         output_path = output_base64 / output_filename
         await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")

deepresearch_flow/translator/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Translator package."""

deepresearch-flow 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

deepresearch-flow 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl