deepresearch-flow 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/cli.py +2 -0
- deepresearch_flow/paper/config.py +15 -0
- deepresearch_flow/paper/db.py +193 -0
- deepresearch_flow/paper/db_ops.py +1939 -0
- deepresearch_flow/paper/llm.py +2 -0
- deepresearch_flow/paper/web/app.py +46 -3320
- deepresearch_flow/paper/web/constants.py +23 -0
- deepresearch_flow/paper/web/filters.py +255 -0
- deepresearch_flow/paper/web/handlers/__init__.py +14 -0
- deepresearch_flow/paper/web/handlers/api.py +217 -0
- deepresearch_flow/paper/web/handlers/pages.py +334 -0
- deepresearch_flow/paper/web/markdown.py +549 -0
- deepresearch_flow/paper/web/static/css/main.css +857 -0
- deepresearch_flow/paper/web/static/js/detail.js +406 -0
- deepresearch_flow/paper/web/static/js/index.js +266 -0
- deepresearch_flow/paper/web/static/js/outline.js +58 -0
- deepresearch_flow/paper/web/static/js/stats.js +39 -0
- deepresearch_flow/paper/web/templates/base.html +43 -0
- deepresearch_flow/paper/web/templates/detail.html +332 -0
- deepresearch_flow/paper/web/templates/index.html +114 -0
- deepresearch_flow/paper/web/templates/stats.html +29 -0
- deepresearch_flow/paper/web/templates.py +85 -0
- deepresearch_flow/paper/web/text.py +68 -0
- deepresearch_flow/recognize/cli.py +157 -3
- deepresearch_flow/recognize/organize.py +58 -0
- deepresearch_flow/translator/__init__.py +1 -0
- deepresearch_flow/translator/cli.py +451 -0
- deepresearch_flow/translator/config.py +19 -0
- deepresearch_flow/translator/engine.py +959 -0
- deepresearch_flow/translator/fixers.py +451 -0
- deepresearch_flow/translator/placeholder.py +62 -0
- deepresearch_flow/translator/prompts.py +116 -0
- deepresearch_flow/translator/protector.py +291 -0
- deepresearch_flow/translator/segment.py +180 -0
- deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
- deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/cli.py
CHANGED
|
@@ -4,6 +4,7 @@ import click
|
|
|
4
4
|
|
|
5
5
|
from deepresearch_flow.paper.cli import paper
|
|
6
6
|
from deepresearch_flow.recognize.cli import recognize
|
|
7
|
+
from deepresearch_flow.translator.cli import translator
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
@click.group()
|
|
@@ -13,6 +14,7 @@ def cli() -> None:
|
|
|
13
14
|
|
|
14
15
|
cli.add_command(paper)
|
|
15
16
|
cli.add_command(recognize)
|
|
17
|
+
cli.add_command(translator)
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
def main() -> None:
|
deepresearch_flow/paper/config.py
CHANGED
|
@@ -40,6 +40,7 @@ class ProviderConfig:
|
|
|
40
40
|
location: str | None
|
|
41
41
|
credentials_path: str | None
|
|
42
42
|
anthropic_version: str | None
|
|
43
|
+
max_tokens: int | None
|
|
43
44
|
structured_mode: str
|
|
44
45
|
extra_headers: dict[str, str]
|
|
45
46
|
system_prompt: str | None
|
|
@@ -102,6 +103,15 @@ def _as_str(value: Any, default: str | None = None) -> str | None:
|
|
|
102
103
|
return str(value)
|
|
103
104
|
|
|
104
105
|
|
|
106
|
+
def _ensure_http_scheme(base_url: str, *, default_scheme: str = "http://") -> str:
|
|
107
|
+
normalized = base_url.strip()
|
|
108
|
+
if normalized.startswith(("http://", "https://")):
|
|
109
|
+
scheme, rest = normalized.split("://", 1)
|
|
110
|
+
rest = rest.lstrip("/")
|
|
111
|
+
return f"{scheme}://{rest}" if rest else f"{scheme}://"
|
|
112
|
+
return f"{default_scheme}{normalized.lstrip('/')}"
|
|
113
|
+
|
|
114
|
+
|
|
105
115
|
def load_config(path: str) -> PaperConfig:
|
|
106
116
|
config_path = Path(path)
|
|
107
117
|
if not config_path.exists():
|
|
@@ -158,6 +168,8 @@ def load_config(path: str) -> PaperConfig:
|
|
|
158
168
|
raise ValueError(f"Provider '{name}' requires base_url")
|
|
159
169
|
elif provider_type == "azure_openai" and endpoint:
|
|
160
170
|
base_url = endpoint
|
|
171
|
+
if provider_type == "ollama" and base_url:
|
|
172
|
+
base_url = _ensure_http_scheme(base_url)
|
|
161
173
|
|
|
162
174
|
api_keys = _as_list(provider.get("api_keys"))
|
|
163
175
|
if not api_keys:
|
|
@@ -188,6 +200,8 @@ def load_config(path: str) -> PaperConfig:
|
|
|
188
200
|
location = _as_str(provider.get("location"), None)
|
|
189
201
|
credentials_path = _as_str(provider.get("credentials_path"), None)
|
|
190
202
|
anthropic_version = _as_str(provider.get("anthropic_version"), None)
|
|
203
|
+
max_tokens = provider.get("max_tokens")
|
|
204
|
+
max_tokens_value = int(max_tokens) if max_tokens is not None else None
|
|
191
205
|
|
|
192
206
|
if provider_type == "azure_openai":
|
|
193
207
|
if not base_url:
|
|
@@ -221,6 +235,7 @@ def load_config(path: str) -> PaperConfig:
|
|
|
221
235
|
location=location,
|
|
222
236
|
credentials_path=credentials_path,
|
|
223
237
|
anthropic_version=anthropic_version,
|
|
238
|
+
max_tokens=max_tokens_value,
|
|
224
239
|
structured_mode=structured_mode,
|
|
225
240
|
extra_headers=extra_headers,
|
|
226
241
|
system_prompt=_as_str(provider.get("system_prompt"), None),
|
deepresearch_flow/paper/db.py
CHANGED
|
@@ -584,6 +584,13 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
584
584
|
default=(),
|
|
585
585
|
help="Optional markdown root directory (repeatable) for source viewing",
|
|
586
586
|
)
|
|
587
|
+
@click.option(
|
|
588
|
+
"--md-translated-root",
|
|
589
|
+
"md_translated_roots",
|
|
590
|
+
multiple=True,
|
|
591
|
+
default=(),
|
|
592
|
+
help="Optional markdown root directory (repeatable) for translated viewing",
|
|
593
|
+
)
|
|
587
594
|
@click.option(
|
|
588
595
|
"--pdf-root",
|
|
589
596
|
"pdf_roots",
|
|
@@ -606,6 +613,7 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
606
613
|
input_paths: tuple[str, ...],
|
|
607
614
|
bibtex_path: str | None,
|
|
608
615
|
md_roots: tuple[str, ...],
|
|
616
|
+
md_translated_roots: tuple[str, ...],
|
|
609
617
|
pdf_roots: tuple[str, ...],
|
|
610
618
|
cache_dir: str | None,
|
|
611
619
|
no_cache: bool,
|
|
@@ -623,6 +631,7 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
623
631
|
fallback_language=fallback_language,
|
|
624
632
|
bibtex_path=Path(bibtex_path) if bibtex_path else None,
|
|
625
633
|
md_roots=[Path(root) for root in md_roots],
|
|
634
|
+
md_translated_roots=[Path(root) for root in md_translated_roots],
|
|
626
635
|
pdf_roots=[Path(root) for root in pdf_roots],
|
|
627
636
|
cache_dir=Path(cache_dir) if cache_dir else None,
|
|
628
637
|
use_cache=not no_cache,
|
|
@@ -799,3 +808,187 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
799
808
|
raise click.ClickException(str(exc)) from exc
|
|
800
809
|
rendered = render_papers(papers, out_dir, template, output_language)
|
|
801
810
|
click.echo(f"Rendered {rendered} markdown files")
|
|
811
|
+
|
|
812
|
+
@db_group.command("compare")
|
|
813
|
+
@click.option(
|
|
814
|
+
"-ia", "--input-a", "input_paths_a", multiple=True, help="Input JSON files for side A (repeatable)"
|
|
815
|
+
)
|
|
816
|
+
@click.option(
|
|
817
|
+
"-ib", "--input-b", "input_paths_b", multiple=True, help="Input JSON files for side B (repeatable)"
|
|
818
|
+
)
|
|
819
|
+
@click.option(
|
|
820
|
+
"--pdf-root-a", "pdf_roots_a", multiple=True, help="PDF root directories for side A (repeatable)"
|
|
821
|
+
)
|
|
822
|
+
@click.option(
|
|
823
|
+
"--pdf-root-b", "pdf_roots_b", multiple=True, help="PDF root directories for side B (repeatable)"
|
|
824
|
+
)
|
|
825
|
+
@click.option(
|
|
826
|
+
"--md-root-a", "md_roots_a", multiple=True, help="Markdown root directories for side A (repeatable)"
|
|
827
|
+
)
|
|
828
|
+
@click.option(
|
|
829
|
+
"--md-root-b", "md_roots_b", multiple=True, help="Markdown root directories for side B (repeatable)"
|
|
830
|
+
)
|
|
831
|
+
@click.option(
|
|
832
|
+
"--md-translated-root-a", "md_translated_roots_a", multiple=True,
|
|
833
|
+
help="Translated Markdown root directories for side A (repeatable)"
|
|
834
|
+
)
|
|
835
|
+
@click.option(
|
|
836
|
+
"--md-translated-root-b", "md_translated_roots_b", multiple=True,
|
|
837
|
+
help="Translated Markdown root directories for side B (repeatable)"
|
|
838
|
+
)
|
|
839
|
+
@click.option("-b", "--bibtex", "bibtex_path", default=None, help="Optional BibTeX file path")
|
|
840
|
+
@click.option("--lang", "lang", default=None, help="Language code for translated comparisons (e.g., zh)")
|
|
841
|
+
@click.option(
|
|
842
|
+
"--output-csv", "output_csv", default=None, help="Path to export results as CSV"
|
|
843
|
+
)
|
|
844
|
+
@click.option(
|
|
845
|
+
"--sample-limit", "sample_limit", default=5, type=int, show_default=True,
|
|
846
|
+
help="Number of sample items to show in terminal output"
|
|
847
|
+
)
|
|
848
|
+
def compare(
|
|
849
|
+
input_paths_a: tuple[str, ...],
|
|
850
|
+
input_paths_b: tuple[str, ...],
|
|
851
|
+
pdf_roots_a: tuple[str, ...],
|
|
852
|
+
pdf_roots_b: tuple[str, ...],
|
|
853
|
+
md_roots_a: tuple[str, ...],
|
|
854
|
+
md_roots_b: tuple[str, ...],
|
|
855
|
+
md_translated_roots_a: tuple[str, ...],
|
|
856
|
+
md_translated_roots_b: tuple[str, ...],
|
|
857
|
+
bibtex_path: str | None,
|
|
858
|
+
lang: str | None,
|
|
859
|
+
output_csv: str | None,
|
|
860
|
+
sample_limit: int,
|
|
861
|
+
) -> None:
|
|
862
|
+
"""Compare two datasets and report matches and differences."""
|
|
863
|
+
from deepresearch_flow.paper.db_ops import compare_datasets
|
|
864
|
+
import csv
|
|
865
|
+
|
|
866
|
+
# Validate that at least one input is provided for each side
|
|
867
|
+
has_input_a = bool(input_paths_a or pdf_roots_a or md_roots_a or md_translated_roots_a)
|
|
868
|
+
has_input_b = bool(input_paths_b or pdf_roots_b or md_roots_b or md_translated_roots_b)
|
|
869
|
+
|
|
870
|
+
if not has_input_a:
|
|
871
|
+
raise click.ClickException(
|
|
872
|
+
"Side A must have at least one input: --input-a, --pdf-root-a, --md-root-a, or --md-translated-root-a"
|
|
873
|
+
)
|
|
874
|
+
if not has_input_b:
|
|
875
|
+
raise click.ClickException(
|
|
876
|
+
"Side B must have at least one input: --input-b, --pdf-root-b, --md-root-b, or --md-translated-root-b"
|
|
877
|
+
)
|
|
878
|
+
if (md_translated_roots_a or md_translated_roots_b) and not lang:
|
|
879
|
+
raise click.ClickException("--lang is required when comparing translated Markdown datasets")
|
|
880
|
+
|
|
881
|
+
# Run comparison
|
|
882
|
+
try:
|
|
883
|
+
results = compare_datasets(
|
|
884
|
+
json_paths_a=[Path(p) for p in input_paths_a],
|
|
885
|
+
pdf_roots_a=[Path(p) for p in pdf_roots_a],
|
|
886
|
+
md_roots_a=[Path(p) for p in md_roots_a],
|
|
887
|
+
md_translated_roots_a=[Path(p) for p in md_translated_roots_a],
|
|
888
|
+
json_paths_b=[Path(p) for p in input_paths_b],
|
|
889
|
+
pdf_roots_b=[Path(p) for p in pdf_roots_b],
|
|
890
|
+
md_roots_b=[Path(p) for p in md_roots_b],
|
|
891
|
+
md_translated_roots_b=[Path(p) for p in md_translated_roots_b],
|
|
892
|
+
bibtex_path=Path(bibtex_path) if bibtex_path else None,
|
|
893
|
+
lang=lang,
|
|
894
|
+
)
|
|
895
|
+
except ValueError as exc:
|
|
896
|
+
raise click.ClickException(str(exc)) from exc
|
|
897
|
+
|
|
898
|
+
# Calculate statistics
|
|
899
|
+
total_a = sum(1 for r in results if r.side == "A")
|
|
900
|
+
total_b = sum(1 for r in results if r.side == "B")
|
|
901
|
+
matched = sum(1 for r in results if r.side == "MATCH")
|
|
902
|
+
only_in_a = sum(1 for r in results if r.side == "A" and r.match_status == "only_in_A")
|
|
903
|
+
only_in_b = sum(1 for r in results if r.side == "B" and r.match_status == "only_in_B")
|
|
904
|
+
|
|
905
|
+
console = Console()
|
|
906
|
+
|
|
907
|
+
# Print summary table
|
|
908
|
+
summary_table = Table(title="Comparison Summary")
|
|
909
|
+
summary_table.add_column("Metric", style="cyan")
|
|
910
|
+
summary_table.add_column("Count", style="green", justify="right")
|
|
911
|
+
summary_table.add_row("Total in A", str(total_a))
|
|
912
|
+
summary_table.add_row("Total in B", str(total_b))
|
|
913
|
+
summary_table.add_row("Matched", str(matched))
|
|
914
|
+
summary_table.add_row("Only in A", str(only_in_a))
|
|
915
|
+
summary_table.add_row("Only in B", str(only_in_b))
|
|
916
|
+
console.print(summary_table)
|
|
917
|
+
|
|
918
|
+
# Print match type breakdown
|
|
919
|
+
match_types: dict[str, int] = {}
|
|
920
|
+
for r in results:
|
|
921
|
+
if r.side == "MATCH" and r.match_type:
|
|
922
|
+
match_types[r.match_type] = match_types.get(r.match_type, 0) + 1
|
|
923
|
+
|
|
924
|
+
if match_types:
|
|
925
|
+
type_table = Table(title="Match Types")
|
|
926
|
+
type_table.add_column("Type", style="cyan")
|
|
927
|
+
type_table.add_column("Count", style="green", justify="right")
|
|
928
|
+
for match_type, count in sorted(match_types.items(), key=lambda x: x[1], reverse=True):
|
|
929
|
+
type_table.add_row(match_type, str(count))
|
|
930
|
+
console.print(type_table)
|
|
931
|
+
|
|
932
|
+
# Print sample results
|
|
933
|
+
console.print("\n[bold]Sample Results:[/bold]")
|
|
934
|
+
|
|
935
|
+
# Sample matched items
|
|
936
|
+
matched_samples = [r for r in results if r.side == "MATCH"][:sample_limit]
|
|
937
|
+
if matched_samples:
|
|
938
|
+
console.print("\n[green]Matched Items:[/green]")
|
|
939
|
+
for r in matched_samples:
|
|
940
|
+
left = (r.title or "")[:60]
|
|
941
|
+
right = (r.other_title or "")[:60]
|
|
942
|
+
console.print(
|
|
943
|
+
f" • {left} ↔ {right} (type: {r.match_type}, score: {r.match_score:.2f})"
|
|
944
|
+
)
|
|
945
|
+
|
|
946
|
+
# Sample only in A
|
|
947
|
+
only_a_samples = [
|
|
948
|
+
r for r in results if r.side == "A" and r.match_status == "only_in_A"
|
|
949
|
+
][:sample_limit]
|
|
950
|
+
if only_a_samples:
|
|
951
|
+
console.print("\n[yellow]Only in A:[/yellow]")
|
|
952
|
+
for r in only_a_samples:
|
|
953
|
+
console.print(f" • {r.title[:60]}...")
|
|
954
|
+
|
|
955
|
+
# Sample only in B
|
|
956
|
+
only_b_samples = [
|
|
957
|
+
r for r in results if r.side == "B" and r.match_status == "only_in_B"
|
|
958
|
+
][:sample_limit]
|
|
959
|
+
if only_b_samples:
|
|
960
|
+
console.print("\n[yellow]Only in B:[/yellow]")
|
|
961
|
+
for r in only_b_samples:
|
|
962
|
+
console.print(f" • {r.title[:60]}...")
|
|
963
|
+
|
|
964
|
+
# Export to CSV if requested
|
|
965
|
+
if output_csv:
|
|
966
|
+
output_path = Path(output_csv)
|
|
967
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
968
|
+
|
|
969
|
+
with open(output_path, "w", newline="", encoding="utf-8") as f:
|
|
970
|
+
writer = csv.writer(f)
|
|
971
|
+
writer.writerow([
|
|
972
|
+
"Side", "Source Hash", "Title", "Match Status", "Match Type",
|
|
973
|
+
"Match Score", "Source Path", "Other Source Hash", "Other Title",
|
|
974
|
+
"Other Source Path", "Lang"
|
|
975
|
+
])
|
|
976
|
+
for r in results:
|
|
977
|
+
writer.writerow([
|
|
978
|
+
r.side,
|
|
979
|
+
r.source_hash,
|
|
980
|
+
r.title,
|
|
981
|
+
r.match_status,
|
|
982
|
+
r.match_type or "",
|
|
983
|
+
f"{r.match_score:.4f}",
|
|
984
|
+
r.source_path or "",
|
|
985
|
+
r.other_source_hash or "",
|
|
986
|
+
r.other_title or "",
|
|
987
|
+
r.other_source_path or "",
|
|
988
|
+
r.lang or "",
|
|
989
|
+
])
|
|
990
|
+
|
|
991
|
+
console.print(f"\n[green]Results exported to: {output_path}[/green]")
|
|
992
|
+
|
|
993
|
+
# Print final counts
|
|
994
|
+
console.print(f"\nTotal results: {len(results)}")
|