deepresearch-flow 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/cli.py +2 -0
- deepresearch_flow/paper/config.py +15 -0
- deepresearch_flow/paper/db.py +9 -0
- deepresearch_flow/paper/llm.py +2 -0
- deepresearch_flow/paper/web/app.py +413 -20
- deepresearch_flow/recognize/cli.py +157 -3
- deepresearch_flow/recognize/organize.py +58 -0
- deepresearch_flow/translator/__init__.py +1 -0
- deepresearch_flow/translator/cli.py +451 -0
- deepresearch_flow/translator/config.py +19 -0
- deepresearch_flow/translator/engine.py +959 -0
- deepresearch_flow/translator/fixers.py +451 -0
- deepresearch_flow/translator/placeholder.py +62 -0
- deepresearch_flow/translator/prompts.py +116 -0
- deepresearch_flow/translator/protector.py +291 -0
- deepresearch_flow/translator/segment.py +180 -0
- deepresearch_flow-0.3.0.dist-info/METADATA +306 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/RECORD +22 -13
- deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -26,7 +26,11 @@ from deepresearch_flow.recognize.markdown import (
|
|
|
26
26
|
sanitize_filename,
|
|
27
27
|
unpack_markdown_images,
|
|
28
28
|
)
|
|
29
|
-
from deepresearch_flow.recognize.organize import
|
|
29
|
+
from deepresearch_flow.recognize.organize import (
|
|
30
|
+
discover_mineru_dirs,
|
|
31
|
+
fix_markdown_text,
|
|
32
|
+
organize_mineru_dir,
|
|
33
|
+
)
|
|
30
34
|
|
|
31
35
|
|
|
32
36
|
logger = logging.getLogger(__name__)
|
|
@@ -180,6 +184,8 @@ async def _run_organize(
|
|
|
180
184
|
output_base64: Path | None,
|
|
181
185
|
output_map: dict[Path, str],
|
|
182
186
|
workers: int,
|
|
187
|
+
fix_level: str | None,
|
|
188
|
+
format_enabled: bool,
|
|
183
189
|
progress: tqdm | None,
|
|
184
190
|
) -> None:
|
|
185
191
|
image_registry = None
|
|
@@ -196,11 +202,30 @@ async def _run_organize(
|
|
|
196
202
|
output_base64,
|
|
197
203
|
output_filename,
|
|
198
204
|
image_registry,
|
|
205
|
+
fix_level,
|
|
206
|
+
format_enabled,
|
|
199
207
|
)
|
|
200
208
|
|
|
201
209
|
await _run_with_workers(layout_dirs, workers, handler, progress=progress)
|
|
202
210
|
|
|
203
211
|
|
|
212
|
+
async def _run_fix(
|
|
213
|
+
paths: list[Path],
|
|
214
|
+
output_map: dict[Path, Path],
|
|
215
|
+
fix_level: str,
|
|
216
|
+
format_enabled: bool,
|
|
217
|
+
workers: int,
|
|
218
|
+
progress: tqdm | None,
|
|
219
|
+
) -> None:
|
|
220
|
+
async def handler(path: Path) -> None:
|
|
221
|
+
content = await asyncio.to_thread(read_text, path)
|
|
222
|
+
updated = await fix_markdown_text(content, fix_level, format_enabled)
|
|
223
|
+
output_path = output_map[path]
|
|
224
|
+
await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
|
|
225
|
+
|
|
226
|
+
await _run_with_workers(paths, workers, handler, progress=progress)
|
|
227
|
+
|
|
228
|
+
|
|
204
229
|
@click.group()
|
|
205
230
|
def recognize() -> None:
|
|
206
231
|
"""OCR recognition and Markdown post-processing commands."""
|
|
@@ -364,7 +389,7 @@ def unpack(
|
|
|
364
389
|
)
|
|
365
390
|
|
|
366
391
|
|
|
367
|
-
@recognize.
|
|
392
|
+
@recognize.group(invoke_without_command=True)
|
|
368
393
|
@click.option(
|
|
369
394
|
"--layout",
|
|
370
395
|
"layout",
|
|
@@ -378,28 +403,47 @@ def unpack(
|
|
|
378
403
|
"--input",
|
|
379
404
|
"inputs",
|
|
380
405
|
multiple=True,
|
|
381
|
-
required=
|
|
406
|
+
required=False,
|
|
382
407
|
help="Input directory (repeatable)",
|
|
383
408
|
)
|
|
384
409
|
@click.option("-r", "--recursive", is_flag=True, help="Recursively search for layout folders")
|
|
385
410
|
@click.option("--output-simple", "output_simple", default=None, help="Output directory for copied markdown")
|
|
386
411
|
@click.option("--output-base64", "output_base64", default=None, help="Output directory for embedded markdown")
|
|
412
|
+
@click.option("--fix", "enable_fix", is_flag=True, help="Apply OCR fix and rumdl formatting")
|
|
413
|
+
@click.option(
|
|
414
|
+
"--fix-level",
|
|
415
|
+
"fix_level",
|
|
416
|
+
default="moderate",
|
|
417
|
+
type=click.Choice(["off", "moderate", "aggressive"]),
|
|
418
|
+
show_default=True,
|
|
419
|
+
help="OCR fix level",
|
|
420
|
+
)
|
|
421
|
+
@click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
|
|
387
422
|
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
|
|
388
423
|
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
|
|
389
424
|
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
|
|
425
|
+
@click.pass_context
|
|
390
426
|
def organize(
|
|
427
|
+
ctx: click.Context,
|
|
391
428
|
layout: str,
|
|
392
429
|
inputs: tuple[str, ...],
|
|
393
430
|
recursive: bool,
|
|
394
431
|
output_simple: str | None,
|
|
395
432
|
output_base64: str | None,
|
|
433
|
+
enable_fix: bool,
|
|
434
|
+
fix_level: str,
|
|
435
|
+
no_format: bool,
|
|
396
436
|
workers: int,
|
|
397
437
|
dry_run: bool,
|
|
398
438
|
verbose: bool,
|
|
399
439
|
) -> None:
|
|
400
440
|
"""Organize OCR outputs into markdown files."""
|
|
441
|
+
if ctx.invoked_subcommand:
|
|
442
|
+
return
|
|
401
443
|
configure_logging(verbose)
|
|
402
444
|
start_time = time.monotonic()
|
|
445
|
+
if not inputs:
|
|
446
|
+
raise click.ClickException("--input is required")
|
|
403
447
|
if workers <= 0:
|
|
404
448
|
raise click.ClickException("--workers must be positive")
|
|
405
449
|
if output_simple is None and output_base64 is None:
|
|
@@ -424,6 +468,8 @@ def organize(
|
|
|
424
468
|
|
|
425
469
|
output_map = _map_output_files(layout_dirs, output_dirs)
|
|
426
470
|
image_counts = _aggregate_image_counts([path / "full.md" for path in layout_dirs])
|
|
471
|
+
fix_value = fix_level if enable_fix else None
|
|
472
|
+
format_enabled = enable_fix and not no_format
|
|
427
473
|
if dry_run:
|
|
428
474
|
rows = [
|
|
429
475
|
("Layout", layout),
|
|
@@ -433,6 +479,9 @@ def organize(
|
|
|
433
479
|
("Images data", str(image_counts["data"])),
|
|
434
480
|
("Images http", str(image_counts["http"])),
|
|
435
481
|
("Images local", str(image_counts["local"])),
|
|
482
|
+
("Fix", "yes" if enable_fix else "no"),
|
|
483
|
+
("Fix level", fix_level if enable_fix else "-"),
|
|
484
|
+
("Format", "no" if no_format else ("yes" if enable_fix else "-")),
|
|
436
485
|
("Output simple", _relative_path(output_simple_path) if output_simple_path else "-"),
|
|
437
486
|
("Output base64", _relative_path(output_base64_path) if output_base64_path else "-"),
|
|
438
487
|
("Duration", _format_duration(time.monotonic() - start_time)),
|
|
@@ -449,6 +498,8 @@ def organize(
|
|
|
449
498
|
output_base64_path,
|
|
450
499
|
output_map,
|
|
451
500
|
workers,
|
|
501
|
+
fix_value,
|
|
502
|
+
format_enabled,
|
|
452
503
|
progress,
|
|
453
504
|
)
|
|
454
505
|
)
|
|
@@ -462,8 +513,111 @@ def organize(
|
|
|
462
513
|
("Images data", str(image_counts["data"])),
|
|
463
514
|
("Images http", str(image_counts["http"])),
|
|
464
515
|
("Images local", str(image_counts["local"])),
|
|
516
|
+
("Fix", "yes" if enable_fix else "no"),
|
|
517
|
+
("Fix level", fix_level if enable_fix else "-"),
|
|
518
|
+
("Format", "no" if no_format else ("yes" if enable_fix else "-")),
|
|
465
519
|
("Output simple", _relative_path(output_simple_path) if output_simple_path else "-"),
|
|
466
520
|
("Output base64", _relative_path(output_base64_path) if output_base64_path else "-"),
|
|
467
521
|
("Duration", _format_duration(time.monotonic() - start_time)),
|
|
468
522
|
]
|
|
469
523
|
_print_summary("recognize organize", rows)
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
@recognize.command("fix")
|
|
527
|
+
@click.option(
|
|
528
|
+
"-i",
|
|
529
|
+
"--input",
|
|
530
|
+
"inputs",
|
|
531
|
+
multiple=True,
|
|
532
|
+
required=True,
|
|
533
|
+
help="Input markdown file or directory (repeatable)",
|
|
534
|
+
)
|
|
535
|
+
@click.option("-o", "--output", "output_dir", default=None, help="Output directory")
|
|
536
|
+
@click.option("--in-place", "in_place", is_flag=True, help="Fix markdown files in place")
|
|
537
|
+
@click.option("-r", "--recursive", is_flag=True, help="Recursively discover markdown files")
|
|
538
|
+
@click.option(
|
|
539
|
+
"--fix-level",
|
|
540
|
+
"fix_level",
|
|
541
|
+
default="moderate",
|
|
542
|
+
type=click.Choice(["off", "moderate", "aggressive"]),
|
|
543
|
+
show_default=True,
|
|
544
|
+
help="OCR fix level",
|
|
545
|
+
)
|
|
546
|
+
@click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
|
|
547
|
+
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
|
|
548
|
+
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
|
|
549
|
+
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
|
|
550
|
+
def recognize_fix(
|
|
551
|
+
inputs: tuple[str, ...],
|
|
552
|
+
output_dir: str | None,
|
|
553
|
+
in_place: bool,
|
|
554
|
+
recursive: bool,
|
|
555
|
+
fix_level: str,
|
|
556
|
+
no_format: bool,
|
|
557
|
+
workers: int,
|
|
558
|
+
dry_run: bool,
|
|
559
|
+
verbose: bool,
|
|
560
|
+
) -> None:
|
|
561
|
+
"""Fix and format OCR markdown outputs."""
|
|
562
|
+
configure_logging(verbose)
|
|
563
|
+
start_time = time.monotonic()
|
|
564
|
+
if workers <= 0:
|
|
565
|
+
raise click.ClickException("--workers must be positive")
|
|
566
|
+
if in_place and output_dir:
|
|
567
|
+
raise click.ClickException("--in-place cannot be used with --output")
|
|
568
|
+
if not in_place and not output_dir:
|
|
569
|
+
raise click.ClickException("Either --in-place or --output is required")
|
|
570
|
+
|
|
571
|
+
output_path = Path(output_dir) if output_dir else None
|
|
572
|
+
if output_path and not dry_run:
|
|
573
|
+
output_path = _ensure_output_dir(output_dir)
|
|
574
|
+
_warn_if_not_empty(output_path)
|
|
575
|
+
|
|
576
|
+
paths = discover_markdown(inputs, None, recursive=recursive)
|
|
577
|
+
if not paths:
|
|
578
|
+
click.echo("No markdown files discovered")
|
|
579
|
+
return
|
|
580
|
+
|
|
581
|
+
format_enabled = not no_format
|
|
582
|
+
if in_place:
|
|
583
|
+
output_map = {path: path for path in paths}
|
|
584
|
+
else:
|
|
585
|
+
output_map = {path: (output_path / name) for path, name in _map_output_files(paths, [output_path]).items()}
|
|
586
|
+
|
|
587
|
+
if dry_run:
|
|
588
|
+
rows = [
|
|
589
|
+
("Inputs", str(len(paths))),
|
|
590
|
+
("Outputs", str(len(output_map))),
|
|
591
|
+
("Fix level", fix_level),
|
|
592
|
+
("Format", "no" if no_format else "yes"),
|
|
593
|
+
("In place", "yes" if in_place else "no"),
|
|
594
|
+
("Output dir", _relative_path(output_path) if output_path else "-"),
|
|
595
|
+
("Duration", _format_duration(time.monotonic() - start_time)),
|
|
596
|
+
]
|
|
597
|
+
_print_summary("recognize fix (dry-run)", rows)
|
|
598
|
+
return
|
|
599
|
+
|
|
600
|
+
progress = tqdm(total=len(paths), desc="fix", unit="file")
|
|
601
|
+
try:
|
|
602
|
+
asyncio.run(
|
|
603
|
+
_run_fix(
|
|
604
|
+
paths,
|
|
605
|
+
output_map,
|
|
606
|
+
fix_level,
|
|
607
|
+
format_enabled,
|
|
608
|
+
workers,
|
|
609
|
+
progress,
|
|
610
|
+
)
|
|
611
|
+
)
|
|
612
|
+
finally:
|
|
613
|
+
progress.close()
|
|
614
|
+
rows = [
|
|
615
|
+
("Inputs", str(len(paths))),
|
|
616
|
+
("Outputs", str(len(output_map))),
|
|
617
|
+
("Fix level", fix_level),
|
|
618
|
+
("Format", "no" if no_format else "yes"),
|
|
619
|
+
("In place", "yes" if in_place else "no"),
|
|
620
|
+
("Output dir", _relative_path(output_path) if output_path else "-"),
|
|
621
|
+
("Duration", _format_duration(time.monotonic() - start_time)),
|
|
622
|
+
]
|
|
623
|
+
_print_summary("recognize fix", rows)
|
|
@@ -5,9 +5,12 @@ from __future__ import annotations
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import logging
|
|
7
7
|
import shutil
|
|
8
|
+
import subprocess
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
from typing import Iterable
|
|
10
11
|
|
|
12
|
+
from deepresearch_flow.translator.fixers import fix_markdown
|
|
13
|
+
|
|
11
14
|
from deepresearch_flow.recognize.markdown import (
|
|
12
15
|
NameRegistry,
|
|
13
16
|
embed_markdown_images,
|
|
@@ -20,6 +23,53 @@ from deepresearch_flow.recognize.markdown import (
|
|
|
20
23
|
|
|
21
24
|
|
|
22
25
|
logger = logging.getLogger(__name__)
|
|
26
|
+
_RUMDL_PATH = shutil.which("rumdl")
|
|
27
|
+
_RUMDL_WARNED = False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def _format_markdown(text: str) -> str:
|
|
31
|
+
global _RUMDL_WARNED
|
|
32
|
+
if not _RUMDL_PATH:
|
|
33
|
+
if not _RUMDL_WARNED:
|
|
34
|
+
logger.warning("rumdl not available; skip markdown formatting")
|
|
35
|
+
_RUMDL_WARNED = True
|
|
36
|
+
return text
|
|
37
|
+
|
|
38
|
+
def run_formatter() -> str:
|
|
39
|
+
try:
|
|
40
|
+
proc = subprocess.run(
|
|
41
|
+
[_RUMDL_PATH, "fmt", "--stdin", "--quiet"],
|
|
42
|
+
input=text,
|
|
43
|
+
capture_output=True,
|
|
44
|
+
text=True,
|
|
45
|
+
check=False,
|
|
46
|
+
)
|
|
47
|
+
except OSError as exc:
|
|
48
|
+
logger.warning("rumdl fmt failed: %s", exc)
|
|
49
|
+
return text
|
|
50
|
+
if proc.returncode != 0:
|
|
51
|
+
logger.warning("rumdl fmt failed (%s): %s", proc.returncode, proc.stderr.strip())
|
|
52
|
+
return text
|
|
53
|
+
return proc.stdout or text
|
|
54
|
+
|
|
55
|
+
return await asyncio.to_thread(run_formatter)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _apply_fix(text: str, fix_level: str) -> str:
|
|
59
|
+
if fix_level == "off":
|
|
60
|
+
return text
|
|
61
|
+
return fix_markdown(text, fix_level)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
async def fix_markdown_text(
|
|
65
|
+
text: str,
|
|
66
|
+
fix_level: str,
|
|
67
|
+
format_enabled: bool,
|
|
68
|
+
) -> str:
|
|
69
|
+
text = _apply_fix(text, fix_level)
|
|
70
|
+
if format_enabled:
|
|
71
|
+
text = await _format_markdown(text)
|
|
72
|
+
return text
|
|
23
73
|
|
|
24
74
|
|
|
25
75
|
def discover_mineru_dirs(inputs: Iterable[str], recursive: bool) -> list[Path]:
|
|
@@ -61,9 +111,13 @@ async def organize_mineru_dir(
|
|
|
61
111
|
output_base64: Path | None,
|
|
62
112
|
output_filename: str,
|
|
63
113
|
image_registry: NameRegistry | None,
|
|
114
|
+
fix_level: str | None,
|
|
115
|
+
format_enabled: bool,
|
|
64
116
|
) -> None:
|
|
65
117
|
md_path = layout_dir / "full.md"
|
|
66
118
|
content = await asyncio.to_thread(read_text, md_path)
|
|
119
|
+
if fix_level is not None:
|
|
120
|
+
content = _apply_fix(content, fix_level)
|
|
67
121
|
|
|
68
122
|
if output_simple is not None and image_registry is not None:
|
|
69
123
|
images_dir = output_simple / "images"
|
|
@@ -86,10 +140,14 @@ async def organize_mineru_dir(
|
|
|
86
140
|
return f"images/{filename}"
|
|
87
141
|
|
|
88
142
|
updated = await rewrite_markdown_images(content, replace_simple)
|
|
143
|
+
if format_enabled:
|
|
144
|
+
updated = await _format_markdown(updated)
|
|
89
145
|
output_path = output_simple / output_filename
|
|
90
146
|
await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
|
|
91
147
|
|
|
92
148
|
if output_base64 is not None:
|
|
93
149
|
updated = await embed_markdown_images(content, md_path, False, None)
|
|
150
|
+
if format_enabled:
|
|
151
|
+
updated = await _format_markdown(updated)
|
|
94
152
|
output_path = output_base64 / output_filename
|
|
95
153
|
await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Translator package."""
|