deepresearch-flow 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. deepresearch_flow/cli.py +2 -0
  2. deepresearch_flow/paper/config.py +15 -0
  3. deepresearch_flow/paper/db.py +193 -0
  4. deepresearch_flow/paper/db_ops.py +1939 -0
  5. deepresearch_flow/paper/llm.py +2 -0
  6. deepresearch_flow/paper/web/app.py +46 -3320
  7. deepresearch_flow/paper/web/constants.py +23 -0
  8. deepresearch_flow/paper/web/filters.py +255 -0
  9. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  10. deepresearch_flow/paper/web/handlers/api.py +217 -0
  11. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  12. deepresearch_flow/paper/web/markdown.py +549 -0
  13. deepresearch_flow/paper/web/static/css/main.css +857 -0
  14. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  15. deepresearch_flow/paper/web/static/js/index.js +266 -0
  16. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  17. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  18. deepresearch_flow/paper/web/templates/base.html +43 -0
  19. deepresearch_flow/paper/web/templates/detail.html +332 -0
  20. deepresearch_flow/paper/web/templates/index.html +114 -0
  21. deepresearch_flow/paper/web/templates/stats.html +29 -0
  22. deepresearch_flow/paper/web/templates.py +85 -0
  23. deepresearch_flow/paper/web/text.py +68 -0
  24. deepresearch_flow/recognize/cli.py +157 -3
  25. deepresearch_flow/recognize/organize.py +58 -0
  26. deepresearch_flow/translator/__init__.py +1 -0
  27. deepresearch_flow/translator/cli.py +451 -0
  28. deepresearch_flow/translator/config.py +19 -0
  29. deepresearch_flow/translator/engine.py +959 -0
  30. deepresearch_flow/translator/fixers.py +451 -0
  31. deepresearch_flow/translator/placeholder.py +62 -0
  32. deepresearch_flow/translator/prompts.py +116 -0
  33. deepresearch_flow/translator/protector.py +291 -0
  34. deepresearch_flow/translator/segment.py +180 -0
  35. deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
  36. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
  37. deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
  38. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
  39. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  40. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  41. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/cli.py CHANGED
@@ -4,6 +4,7 @@ import click
4
4
 
5
5
  from deepresearch_flow.paper.cli import paper
6
6
  from deepresearch_flow.recognize.cli import recognize
7
+ from deepresearch_flow.translator.cli import translator
7
8
 
8
9
 
9
10
  @click.group()
@@ -13,6 +14,7 @@ def cli() -> None:
13
14
 
14
15
  cli.add_command(paper)
15
16
  cli.add_command(recognize)
17
+ cli.add_command(translator)
16
18
 
17
19
 
18
20
  def main() -> None:
@@ -40,6 +40,7 @@ class ProviderConfig:
40
40
  location: str | None
41
41
  credentials_path: str | None
42
42
  anthropic_version: str | None
43
+ max_tokens: int | None
43
44
  structured_mode: str
44
45
  extra_headers: dict[str, str]
45
46
  system_prompt: str | None
@@ -102,6 +103,15 @@ def _as_str(value: Any, default: str | None = None) -> str | None:
102
103
  return str(value)
103
104
 
104
105
 
106
+ def _ensure_http_scheme(base_url: str, *, default_scheme: str = "http://") -> str:
107
+ normalized = base_url.strip()
108
+ if normalized.startswith(("http://", "https://")):
109
+ scheme, rest = normalized.split("://", 1)
110
+ rest = rest.lstrip("/")
111
+ return f"{scheme}://{rest}" if rest else f"{scheme}://"
112
+ return f"{default_scheme}{normalized.lstrip('/')}"
113
+
114
+
105
115
  def load_config(path: str) -> PaperConfig:
106
116
  config_path = Path(path)
107
117
  if not config_path.exists():
@@ -158,6 +168,8 @@ def load_config(path: str) -> PaperConfig:
158
168
  raise ValueError(f"Provider '{name}' requires base_url")
159
169
  elif provider_type == "azure_openai" and endpoint:
160
170
  base_url = endpoint
171
+ if provider_type == "ollama" and base_url:
172
+ base_url = _ensure_http_scheme(base_url)
161
173
 
162
174
  api_keys = _as_list(provider.get("api_keys"))
163
175
  if not api_keys:
@@ -188,6 +200,8 @@ def load_config(path: str) -> PaperConfig:
188
200
  location = _as_str(provider.get("location"), None)
189
201
  credentials_path = _as_str(provider.get("credentials_path"), None)
190
202
  anthropic_version = _as_str(provider.get("anthropic_version"), None)
203
+ max_tokens = provider.get("max_tokens")
204
+ max_tokens_value = int(max_tokens) if max_tokens is not None else None
191
205
 
192
206
  if provider_type == "azure_openai":
193
207
  if not base_url:
@@ -221,6 +235,7 @@ def load_config(path: str) -> PaperConfig:
221
235
  location=location,
222
236
  credentials_path=credentials_path,
223
237
  anthropic_version=anthropic_version,
238
+ max_tokens=max_tokens_value,
224
239
  structured_mode=structured_mode,
225
240
  extra_headers=extra_headers,
226
241
  system_prompt=_as_str(provider.get("system_prompt"), None),
@@ -584,6 +584,13 @@ def register_db_commands(db_group: click.Group) -> None:
584
584
  default=(),
585
585
  help="Optional markdown root directory (repeatable) for source viewing",
586
586
  )
587
+ @click.option(
588
+ "--md-translated-root",
589
+ "md_translated_roots",
590
+ multiple=True,
591
+ default=(),
592
+ help="Optional markdown root directory (repeatable) for translated viewing",
593
+ )
587
594
  @click.option(
588
595
  "--pdf-root",
589
596
  "pdf_roots",
@@ -606,6 +613,7 @@ def register_db_commands(db_group: click.Group) -> None:
606
613
  input_paths: tuple[str, ...],
607
614
  bibtex_path: str | None,
608
615
  md_roots: tuple[str, ...],
616
+ md_translated_roots: tuple[str, ...],
609
617
  pdf_roots: tuple[str, ...],
610
618
  cache_dir: str | None,
611
619
  no_cache: bool,
@@ -623,6 +631,7 @@ def register_db_commands(db_group: click.Group) -> None:
623
631
  fallback_language=fallback_language,
624
632
  bibtex_path=Path(bibtex_path) if bibtex_path else None,
625
633
  md_roots=[Path(root) for root in md_roots],
634
+ md_translated_roots=[Path(root) for root in md_translated_roots],
626
635
  pdf_roots=[Path(root) for root in pdf_roots],
627
636
  cache_dir=Path(cache_dir) if cache_dir else None,
628
637
  use_cache=not no_cache,
@@ -799,3 +808,187 @@ def register_db_commands(db_group: click.Group) -> None:
799
808
  raise click.ClickException(str(exc)) from exc
800
809
  rendered = render_papers(papers, out_dir, template, output_language)
801
810
  click.echo(f"Rendered {rendered} markdown files")
811
+
812
@db_group.command("compare")
@click.option(
    "-ia", "--input-a", "input_paths_a", multiple=True, help="Input JSON files for side A (repeatable)"
)
@click.option(
    "-ib", "--input-b", "input_paths_b", multiple=True, help="Input JSON files for side B (repeatable)"
)
@click.option(
    "--pdf-root-a", "pdf_roots_a", multiple=True, help="PDF root directories for side A (repeatable)"
)
@click.option(
    "--pdf-root-b", "pdf_roots_b", multiple=True, help="PDF root directories for side B (repeatable)"
)
@click.option(
    "--md-root-a", "md_roots_a", multiple=True, help="Markdown root directories for side A (repeatable)"
)
@click.option(
    "--md-root-b", "md_roots_b", multiple=True, help="Markdown root directories for side B (repeatable)"
)
@click.option(
    "--md-translated-root-a", "md_translated_roots_a", multiple=True,
    help="Translated Markdown root directories for side A (repeatable)"
)
@click.option(
    "--md-translated-root-b", "md_translated_roots_b", multiple=True,
    help="Translated Markdown root directories for side B (repeatable)"
)
@click.option("-b", "--bibtex", "bibtex_path", default=None, help="Optional BibTeX file path")
@click.option("--lang", "lang", default=None, help="Language code for translated comparisons (e.g., zh)")
@click.option(
    "--output-csv", "output_csv", default=None, help="Path to export results as CSV"
)
@click.option(
    "--sample-limit", "sample_limit", default=5, type=int, show_default=True,
    help="Number of sample items to show in terminal output"
)
def compare(
    input_paths_a: tuple[str, ...],
    input_paths_b: tuple[str, ...],
    pdf_roots_a: tuple[str, ...],
    pdf_roots_b: tuple[str, ...],
    md_roots_a: tuple[str, ...],
    md_roots_b: tuple[str, ...],
    md_translated_roots_a: tuple[str, ...],
    md_translated_roots_b: tuple[str, ...],
    bibtex_path: str | None,
    lang: str | None,
    output_csv: str | None,
    sample_limit: int,
) -> None:
    """Compare two datasets and report matches and differences."""
    from deepresearch_flow.paper.db_ops import compare_datasets
    import csv

    # Each side needs at least one data source (JSON, PDF, Markdown, or
    # translated Markdown) before the comparison can run.
    has_input_a = bool(input_paths_a or pdf_roots_a or md_roots_a or md_translated_roots_a)
    has_input_b = bool(input_paths_b or pdf_roots_b or md_roots_b or md_translated_roots_b)

    if not has_input_a:
        raise click.ClickException(
            "Side A must have at least one input: --input-a, --pdf-root-a, --md-root-a, or --md-translated-root-a"
        )
    if not has_input_b:
        raise click.ClickException(
            "Side B must have at least one input: --input-b, --pdf-root-b, --md-root-b, or --md-translated-root-b"
        )
    # Translated-Markdown inputs are language-specific, so --lang is mandatory.
    if (md_translated_roots_a or md_translated_roots_b) and not lang:
        raise click.ClickException("--lang is required when comparing translated Markdown datasets")

    # Run comparison; compare_datasets raises ValueError on bad inputs,
    # which we surface as a CLI error.
    try:
        results = compare_datasets(
            json_paths_a=[Path(p) for p in input_paths_a],
            pdf_roots_a=[Path(p) for p in pdf_roots_a],
            md_roots_a=[Path(p) for p in md_roots_a],
            md_translated_roots_a=[Path(p) for p in md_translated_roots_a],
            json_paths_b=[Path(p) for p in input_paths_b],
            pdf_roots_b=[Path(p) for p in pdf_roots_b],
            md_roots_b=[Path(p) for p in md_roots_b],
            md_translated_roots_b=[Path(p) for p in md_translated_roots_b],
            bibtex_path=Path(bibtex_path) if bibtex_path else None,
            lang=lang,
        )
    except ValueError as exc:
        raise click.ClickException(str(exc)) from exc

    # Calculate statistics.
    # NOTE(review): matched pairs carry side == "MATCH", so "Total in A"/"Total
    # in B" count only the unmatched rows on each side — confirm this labeling
    # is intended against compare_datasets' result contract.
    total_a = sum(1 for r in results if r.side == "A")
    total_b = sum(1 for r in results if r.side == "B")
    matched = sum(1 for r in results if r.side == "MATCH")
    only_in_a = sum(1 for r in results if r.side == "A" and r.match_status == "only_in_A")
    only_in_b = sum(1 for r in results if r.side == "B" and r.match_status == "only_in_B")

    console = Console()

    # Print summary table
    summary_table = Table(title="Comparison Summary")
    summary_table.add_column("Metric", style="cyan")
    summary_table.add_column("Count", style="green", justify="right")
    summary_table.add_row("Total in A", str(total_a))
    summary_table.add_row("Total in B", str(total_b))
    summary_table.add_row("Matched", str(matched))
    summary_table.add_row("Only in A", str(only_in_a))
    summary_table.add_row("Only in B", str(only_in_b))
    console.print(summary_table)

    # Print match type breakdown (how each pair was matched), most common first.
    match_types: dict[str, int] = {}
    for r in results:
        if r.side == "MATCH" and r.match_type:
            match_types[r.match_type] = match_types.get(r.match_type, 0) + 1

    if match_types:
        type_table = Table(title="Match Types")
        type_table.add_column("Type", style="cyan")
        type_table.add_column("Count", style="green", justify="right")
        for match_type, count in sorted(match_types.items(), key=lambda x: x[1], reverse=True):
            type_table.add_row(match_type, str(count))
        console.print(type_table)

    # Print sample results
    console.print("\n[bold]Sample Results:[/bold]")

    # Sample matched items
    matched_samples = [r for r in results if r.side == "MATCH"][:sample_limit]
    if matched_samples:
        console.print("\n[green]Matched Items:[/green]")
        for r in matched_samples:
            left = (r.title or "")[:60]
            right = (r.other_title or "")[:60]
            console.print(
                f"  • {left} ↔ {right} (type: {r.match_type}, score: {r.match_score:.2f})"
            )

    # Sample only in A
    only_a_samples = [
        r for r in results if r.side == "A" and r.match_status == "only_in_A"
    ][:sample_limit]
    if only_a_samples:
        console.print("\n[yellow]Only in A:[/yellow]")
        for r in only_a_samples:
            # Guard against a missing title (consistent with the matched-samples
            # path above); a bare r.title[:60] raises TypeError when title is None.
            console.print(f"  • {(r.title or '')[:60]}...")

    # Sample only in B
    only_b_samples = [
        r for r in results if r.side == "B" and r.match_status == "only_in_B"
    ][:sample_limit]
    if only_b_samples:
        console.print("\n[yellow]Only in B:[/yellow]")
        for r in only_b_samples:
            # Same None-title guard as the "Only in A" loop.
            console.print(f"  • {(r.other_title or r.title or '')[:60]}..." if False else f"  • {(r.title or '')[:60]}...")

    # Export the full result set to CSV if requested; parent directories are
    # created as needed.
    if output_csv:
        output_path = Path(output_csv)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([
                "Side", "Source Hash", "Title", "Match Status", "Match Type",
                "Match Score", "Source Path", "Other Source Hash", "Other Title",
                "Other Source Path", "Lang"
            ])
            for r in results:
                writer.writerow([
                    r.side,
                    r.source_hash,
                    r.title,
                    r.match_status,
                    r.match_type or "",
                    f"{r.match_score:.4f}",
                    r.source_path or "",
                    r.other_source_hash or "",
                    r.other_title or "",
                    r.other_source_path or "",
                    r.lang or "",
                ])

        console.print(f"\n[green]Results exported to: {output_path}[/green]")

    # Print final counts
    console.print(f"\nTotal results: {len(results)}")