deepresearch-flow 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. deepresearch_flow/cli.py +2 -0
  2. deepresearch_flow/paper/config.py +15 -0
  3. deepresearch_flow/paper/db.py +9 -0
  4. deepresearch_flow/paper/llm.py +2 -0
  5. deepresearch_flow/paper/web/app.py +413 -20
  6. deepresearch_flow/paper/web/pdfjs/build/pdf.js +18146 -0
  7. deepresearch_flow/paper/web/pdfjs/build/pdf.js.map +1 -0
  8. deepresearch_flow/paper/web/pdfjs/build/pdf.sandbox.js +280 -0
  9. deepresearch_flow/paper/web/pdfjs/build/pdf.sandbox.js.map +1 -0
  10. deepresearch_flow/paper/web/pdfjs/build/pdf.worker.js +58353 -0
  11. deepresearch_flow/paper/web/pdfjs/build/pdf.worker.js.map +1 -0
  12. deepresearch_flow/recognize/cli.py +157 -3
  13. deepresearch_flow/recognize/organize.py +58 -0
  14. deepresearch_flow/translator/__init__.py +1 -0
  15. deepresearch_flow/translator/cli.py +451 -0
  16. deepresearch_flow/translator/config.py +19 -0
  17. deepresearch_flow/translator/engine.py +959 -0
  18. deepresearch_flow/translator/fixers.py +451 -0
  19. deepresearch_flow/translator/placeholder.py +62 -0
  20. deepresearch_flow/translator/prompts.py +116 -0
  21. deepresearch_flow/translator/protector.py +291 -0
  22. deepresearch_flow/translator/segment.py +180 -0
  23. deepresearch_flow-0.3.0.dist-info/METADATA +306 -0
  24. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/RECORD +28 -13
  25. deepresearch_flow-0.2.0.dist-info/METADATA +0 -424
  26. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/WHEEL +0 -0
  27. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/entry_points.txt +0 -0
  28. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/licenses/LICENSE +0 -0
  29. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/top_level.txt +0 -0
@@ -26,7 +26,11 @@ from deepresearch_flow.recognize.markdown import (
26
26
  sanitize_filename,
27
27
  unpack_markdown_images,
28
28
  )
29
- from deepresearch_flow.recognize.organize import discover_mineru_dirs, organize_mineru_dir
29
+ from deepresearch_flow.recognize.organize import (
30
+ discover_mineru_dirs,
31
+ fix_markdown_text,
32
+ organize_mineru_dir,
33
+ )
30
34
 
31
35
 
32
36
  logger = logging.getLogger(__name__)
@@ -180,6 +184,8 @@ async def _run_organize(
180
184
  output_base64: Path | None,
181
185
  output_map: dict[Path, str],
182
186
  workers: int,
187
+ fix_level: str | None,
188
+ format_enabled: bool,
183
189
  progress: tqdm | None,
184
190
  ) -> None:
185
191
  image_registry = None
@@ -196,11 +202,30 @@ async def _run_organize(
196
202
  output_base64,
197
203
  output_filename,
198
204
  image_registry,
205
+ fix_level,
206
+ format_enabled,
199
207
  )
200
208
 
201
209
  await _run_with_workers(layout_dirs, workers, handler, progress=progress)
202
210
 
203
211
 
212
async def _run_fix(
    paths: list[Path],
    output_map: dict[Path, Path],
    fix_level: str,
    format_enabled: bool,
    workers: int,
    progress: tqdm | None,
) -> None:
    """Fix every markdown file in ``paths`` and write each result to its mapped output.

    Args:
        paths: Markdown files to process.
        output_map: Destination path for each entry of ``paths``.
        fix_level: Fix level forwarded to ``fix_markdown_text``.
        format_enabled: Formatting switch forwarded to ``fix_markdown_text``.
        workers: Worker count forwarded to ``_run_with_workers``.
        progress: Optional progress bar forwarded to ``_run_with_workers``.
    """

    async def fix_one(source: Path) -> None:
        # Blocking file reads/writes are pushed to a thread so the event
        # loop stays free for the other in-flight files.
        original = await asyncio.to_thread(read_text, source)
        fixed = await fix_markdown_text(original, fix_level, format_enabled)
        await asyncio.to_thread(output_map[source].write_text, fixed, encoding="utf-8")

    await _run_with_workers(paths, workers, fix_one, progress=progress)
227
+
228
+
204
229
  @click.group()
205
230
  def recognize() -> None:
206
231
  """OCR recognition and Markdown post-processing commands."""
@@ -364,7 +389,7 @@ def unpack(
364
389
  )
365
390
 
366
391
 
367
- @recognize.command()
392
+ @recognize.group(invoke_without_command=True)
368
393
  @click.option(
369
394
  "--layout",
370
395
  "layout",
@@ -378,28 +403,47 @@ def unpack(
378
403
  "--input",
379
404
  "inputs",
380
405
  multiple=True,
381
- required=True,
406
+ required=False,
382
407
  help="Input directory (repeatable)",
383
408
  )
384
409
  @click.option("-r", "--recursive", is_flag=True, help="Recursively search for layout folders")
385
410
  @click.option("--output-simple", "output_simple", default=None, help="Output directory for copied markdown")
386
411
  @click.option("--output-base64", "output_base64", default=None, help="Output directory for embedded markdown")
412
+ @click.option("--fix", "enable_fix", is_flag=True, help="Apply OCR fix and rumdl formatting")
413
+ @click.option(
414
+ "--fix-level",
415
+ "fix_level",
416
+ default="moderate",
417
+ type=click.Choice(["off", "moderate", "aggressive"]),
418
+ show_default=True,
419
+ help="OCR fix level",
420
+ )
421
+ @click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
387
422
  @click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
388
423
  @click.option("--dry-run", is_flag=True, help="Report actions without writing files")
389
424
  @click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
425
+ @click.pass_context
390
426
  def organize(
427
+ ctx: click.Context,
391
428
  layout: str,
392
429
  inputs: tuple[str, ...],
393
430
  recursive: bool,
394
431
  output_simple: str | None,
395
432
  output_base64: str | None,
433
+ enable_fix: bool,
434
+ fix_level: str,
435
+ no_format: bool,
396
436
  workers: int,
397
437
  dry_run: bool,
398
438
  verbose: bool,
399
439
  ) -> None:
400
440
  """Organize OCR outputs into markdown files."""
441
+ if ctx.invoked_subcommand:
442
+ return
401
443
  configure_logging(verbose)
402
444
  start_time = time.monotonic()
445
+ if not inputs:
446
+ raise click.ClickException("--input is required")
403
447
  if workers <= 0:
404
448
  raise click.ClickException("--workers must be positive")
405
449
  if output_simple is None and output_base64 is None:
@@ -424,6 +468,8 @@ def organize(
424
468
 
425
469
  output_map = _map_output_files(layout_dirs, output_dirs)
426
470
  image_counts = _aggregate_image_counts([path / "full.md" for path in layout_dirs])
471
+ fix_value = fix_level if enable_fix else None
472
+ format_enabled = enable_fix and not no_format
427
473
  if dry_run:
428
474
  rows = [
429
475
  ("Layout", layout),
@@ -433,6 +479,9 @@ def organize(
433
479
  ("Images data", str(image_counts["data"])),
434
480
  ("Images http", str(image_counts["http"])),
435
481
  ("Images local", str(image_counts["local"])),
482
+ ("Fix", "yes" if enable_fix else "no"),
483
+ ("Fix level", fix_level if enable_fix else "-"),
484
+ ("Format", "no" if no_format else ("yes" if enable_fix else "-")),
436
485
  ("Output simple", _relative_path(output_simple_path) if output_simple_path else "-"),
437
486
  ("Output base64", _relative_path(output_base64_path) if output_base64_path else "-"),
438
487
  ("Duration", _format_duration(time.monotonic() - start_time)),
@@ -449,6 +498,8 @@ def organize(
449
498
  output_base64_path,
450
499
  output_map,
451
500
  workers,
501
+ fix_value,
502
+ format_enabled,
452
503
  progress,
453
504
  )
454
505
  )
@@ -462,8 +513,111 @@ def organize(
462
513
  ("Images data", str(image_counts["data"])),
463
514
  ("Images http", str(image_counts["http"])),
464
515
  ("Images local", str(image_counts["local"])),
516
+ ("Fix", "yes" if enable_fix else "no"),
517
+ ("Fix level", fix_level if enable_fix else "-"),
518
+ ("Format", "no" if no_format else ("yes" if enable_fix else "-")),
465
519
  ("Output simple", _relative_path(output_simple_path) if output_simple_path else "-"),
466
520
  ("Output base64", _relative_path(output_base64_path) if output_base64_path else "-"),
467
521
  ("Duration", _format_duration(time.monotonic() - start_time)),
468
522
  ]
469
523
  _print_summary("recognize organize", rows)
524
+
525
+
526
@recognize.command("fix")
@click.option(
    "-i",
    "--input",
    "inputs",
    multiple=True,
    required=True,
    help="Input markdown file or directory (repeatable)",
)
@click.option("-o", "--output", "output_dir", default=None, help="Output directory")
@click.option("--in-place", "in_place", is_flag=True, help="Fix markdown files in place")
@click.option("-r", "--recursive", is_flag=True, help="Recursively discover markdown files")
@click.option(
    "--fix-level",
    "fix_level",
    default="moderate",
    type=click.Choice(["off", "moderate", "aggressive"]),
    show_default=True,
    help="OCR fix level",
)
@click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
def recognize_fix(
    inputs: tuple[str, ...],
    output_dir: str | None,
    in_place: bool,
    recursive: bool,
    fix_level: str,
    no_format: bool,
    workers: int,
    dry_run: bool,
    verbose: bool,
) -> None:
    """Fix and format OCR markdown outputs."""
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    configure_logging(verbose)
    started = time.monotonic()

    # Option validation. Exactly one of --in-place / --output must be given;
    # the two destination checks are mutually exclusive, so their order is
    # irrelevant.
    if workers <= 0:
        raise click.ClickException("--workers must be positive")
    if not in_place and not output_dir:
        raise click.ClickException("Either --in-place or --output is required")
    if in_place and output_dir:
        raise click.ClickException("--in-place cannot be used with --output")

    output_path = Path(output_dir) if output_dir else None
    if output_path is not None and not dry_run:
        # Only touch the filesystem when we are really going to write.
        output_path = _ensure_output_dir(output_dir)
        _warn_if_not_empty(output_path)

    paths = discover_markdown(inputs, None, recursive=recursive)
    if not paths:
        click.echo("No markdown files discovered")
        return

    if in_place:
        # Each file is overwritten with its own fixed content.
        output_map = dict(zip(paths, paths))
    else:
        names = _map_output_files(paths, [output_path])
        output_map = {source: (output_path / name) for source, name in names.items()}

    def summary_rows() -> list[tuple[str, str]]:
        # Built lazily so "Duration" reflects the moment the summary is printed.
        return [
            ("Inputs", str(len(paths))),
            ("Outputs", str(len(output_map))),
            ("Fix level", fix_level),
            ("Format", "no" if no_format else "yes"),
            ("In place", "yes" if in_place else "no"),
            ("Output dir", _relative_path(output_path) if output_path else "-"),
            ("Duration", _format_duration(time.monotonic() - started)),
        ]

    if dry_run:
        _print_summary("recognize fix (dry-run)", summary_rows())
        return

    progress = tqdm(total=len(paths), desc="fix", unit="file")
    try:
        asyncio.run(
            _run_fix(
                paths,
                output_map,
                fix_level,
                not no_format,
                workers,
                progress,
            )
        )
    finally:
        # Close the bar even when a worker raises, so the terminal is left clean.
        progress.close()
    _print_summary("recognize fix", summary_rows())
@@ -5,9 +5,12 @@ from __future__ import annotations
5
5
  import asyncio
6
6
  import logging
7
7
  import shutil
8
+ import subprocess
8
9
  from pathlib import Path
9
10
  from typing import Iterable
10
11
 
12
+ from deepresearch_flow.translator.fixers import fix_markdown
13
+
11
14
  from deepresearch_flow.recognize.markdown import (
12
15
  NameRegistry,
13
16
  embed_markdown_images,
@@ -20,6 +23,53 @@ from deepresearch_flow.recognize.markdown import (
20
23
 
21
24
 
22
25
  logger = logging.getLogger(__name__)
26
# Resolved once at import time; ``None`` when the ``rumdl`` executable is not on PATH.
_RUMDL_PATH = shutil.which("rumdl")
# Ensures the "rumdl not available" warning is logged only once per process.
_RUMDL_WARNED = False


async def _format_markdown(text: str) -> str:
    """Format markdown through the external ``rumdl`` CLI, best-effort.

    The text is piped to ``rumdl fmt --stdin --quiet`` in a worker thread.
    Any failure (executable missing, launch error, non-zero exit status,
    encoding error, empty output) is logged where applicable and the
    original ``text`` is returned unchanged, so formatting never aborts
    the surrounding pipeline.

    Args:
        text: Markdown content to format.

    Returns:
        The formatter's stdout, or ``text`` itself when formatting is
        unavailable or failed.
    """
    global _RUMDL_WARNED
    if not _RUMDL_PATH:
        if not _RUMDL_WARNED:
            logger.warning("rumdl not available; skip markdown formatting")
            _RUMDL_WARNED = True
        return text

    def run_formatter() -> str:
        try:
            proc = subprocess.run(
                [_RUMDL_PATH, "fmt", "--stdin", "--quiet"],
                input=text,
                capture_output=True,
                text=True,
                # Be explicit: without this the pipes use the locale encoding,
                # which mangles or rejects non-ASCII markdown on non-UTF-8
                # systems, while the results are written to disk as UTF-8.
                encoding="utf-8",
                check=False,
            )
        except (OSError, UnicodeError) as exc:
            # UnicodeError covers undecodable formatter output; keep the
            # best-effort contract instead of crashing the worker.
            logger.warning("rumdl fmt failed: %s", exc)
            return text
        if proc.returncode != 0:
            logger.warning("rumdl fmt failed (%s): %s", proc.returncode, proc.stderr.strip())
            return text
        # Empty stdout is treated as "nothing produced": fall back to the input.
        return proc.stdout or text

    return await asyncio.to_thread(run_formatter)
56
+
57
+
58
+ def _apply_fix(text: str, fix_level: str) -> str:
59
+ if fix_level == "off":
60
+ return text
61
+ return fix_markdown(text, fix_level)
62
+
63
+
64
async def fix_markdown_text(
    text: str,
    fix_level: str,
    format_enabled: bool,
) -> str:
    """Apply the fix step to ``text`` and, if requested, the formatting step.

    Args:
        text: Markdown content to process.
        fix_level: Level passed to ``_apply_fix``.
        format_enabled: When true, the fixed text is additionally passed
            through ``_format_markdown``.

    Returns:
        The processed markdown.
    """
    fixed = _apply_fix(text, fix_level)
    if not format_enabled:
        return fixed
    return await _format_markdown(fixed)
23
73
 
24
74
 
25
75
  def discover_mineru_dirs(inputs: Iterable[str], recursive: bool) -> list[Path]:
@@ -61,9 +111,13 @@ async def organize_mineru_dir(
61
111
  output_base64: Path | None,
62
112
  output_filename: str,
63
113
  image_registry: NameRegistry | None,
114
+ fix_level: str | None,
115
+ format_enabled: bool,
64
116
  ) -> None:
65
117
  md_path = layout_dir / "full.md"
66
118
  content = await asyncio.to_thread(read_text, md_path)
119
+ if fix_level is not None:
120
+ content = _apply_fix(content, fix_level)
67
121
 
68
122
  if output_simple is not None and image_registry is not None:
69
123
  images_dir = output_simple / "images"
@@ -86,10 +140,14 @@ async def organize_mineru_dir(
86
140
  return f"images/{filename}"
87
141
 
88
142
  updated = await rewrite_markdown_images(content, replace_simple)
143
+ if format_enabled:
144
+ updated = await _format_markdown(updated)
89
145
  output_path = output_simple / output_filename
90
146
  await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
91
147
 
92
148
  if output_base64 is not None:
93
149
  updated = await embed_markdown_images(content, md_path, False, None)
150
+ if format_enabled:
151
+ updated = await _format_markdown(updated)
94
152
  output_path = output_base64 / output_filename
95
153
  await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
@@ -0,0 +1 @@
1
+ """Translator package."""