deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1041 -34
- deepresearch_flow/paper/db_ops.py +124 -19
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +1 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +29 -7
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +51 -43
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -8,6 +8,8 @@ from typing import Any
|
|
|
8
8
|
import re
|
|
9
9
|
import unicodedata
|
|
10
10
|
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
11
13
|
from deepresearch_flow.paper.utils import stable_hash
|
|
12
14
|
|
|
13
15
|
try:
|
|
@@ -232,6 +234,10 @@ _SIMILARITY_MAX_STEPS = 10
|
|
|
232
234
|
|
|
233
235
|
def _normalize_title_key(title: str) -> str:
|
|
234
236
|
value = unicodedata.normalize("NFKD", title)
|
|
237
|
+
value = re.sub(r"\$([^$]+)\$", r" \1 ", value)
|
|
238
|
+
value = re.sub(r"\\[a-zA-Z]+\\*?\s*\{([^{}]*)\}", r" \1 ", value)
|
|
239
|
+
value = re.sub(r"\\[a-zA-Z]+\\*?", " ", value)
|
|
240
|
+
value = value.replace("^", " ")
|
|
235
241
|
greek_map = {
|
|
236
242
|
"α": "alpha",
|
|
237
243
|
"β": "beta",
|
|
@@ -279,7 +285,12 @@ def _normalize_title_key(title: str) -> str:
|
|
|
279
285
|
idx = 0
|
|
280
286
|
while idx < len(tokens):
|
|
281
287
|
token = tokens[idx]
|
|
282
|
-
if
|
|
288
|
+
if (
|
|
289
|
+
len(token) == 1
|
|
290
|
+
and token.isalpha()
|
|
291
|
+
and idx + 1 < len(tokens)
|
|
292
|
+
and tokens[idx + 1].isalpha()
|
|
293
|
+
):
|
|
283
294
|
merged.append(token + tokens[idx + 1])
|
|
284
295
|
idx += 2
|
|
285
296
|
continue
|
|
@@ -300,6 +311,9 @@ def _strip_leading_numeric_tokens(title_key: str) -> str:
|
|
|
300
311
|
if token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN:
|
|
301
312
|
idx += 1
|
|
302
313
|
continue
|
|
314
|
+
if re.fullmatch(r"\d+\.\d+", token) and len(token) <= _LEADING_NUMERIC_MAX_LEN + 2:
|
|
315
|
+
idx += 1
|
|
316
|
+
continue
|
|
303
317
|
break
|
|
304
318
|
if idx == 0:
|
|
305
319
|
return title_key
|
|
@@ -370,7 +384,11 @@ def _is_pdf_like(path: Path) -> bool:
|
|
|
370
384
|
return ".pdf-" in name_lower and not name_lower.endswith(".md")
|
|
371
385
|
|
|
372
386
|
|
|
373
|
-
def _scan_pdf_roots(
|
|
387
|
+
def _scan_pdf_roots(
|
|
388
|
+
roots: list[Path],
|
|
389
|
+
*,
|
|
390
|
+
show_progress: bool = False,
|
|
391
|
+
) -> tuple[list[Path], list[dict[str, Any]]]:
|
|
374
392
|
pdf_paths: list[Path] = []
|
|
375
393
|
meta: list[dict[str, Any]] = []
|
|
376
394
|
seen: set[Path] = set()
|
|
@@ -381,7 +399,10 @@ def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]
|
|
|
381
399
|
except OSError:
|
|
382
400
|
continue
|
|
383
401
|
files: list[Path] = []
|
|
384
|
-
|
|
402
|
+
iterator = root.rglob("*")
|
|
403
|
+
if show_progress:
|
|
404
|
+
iterator = tqdm(iterator, desc=f"scan pdf {root}", unit="file")
|
|
405
|
+
for path in iterator:
|
|
385
406
|
try:
|
|
386
407
|
if not path.is_file():
|
|
387
408
|
continue
|
|
@@ -597,7 +618,12 @@ def _resolve_by_title_and_meta(
|
|
|
597
618
|
return None, None, 0.0
|
|
598
619
|
|
|
599
620
|
|
|
600
|
-
def _build_file_index(
|
|
621
|
+
def _build_file_index(
|
|
622
|
+
roots: list[Path],
|
|
623
|
+
*,
|
|
624
|
+
suffixes: set[str],
|
|
625
|
+
show_progress: bool = False,
|
|
626
|
+
) -> dict[str, list[Path]]:
|
|
601
627
|
index: dict[str, list[Path]] = {}
|
|
602
628
|
for root in roots:
|
|
603
629
|
try:
|
|
@@ -605,7 +631,10 @@ def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, lis
|
|
|
605
631
|
continue
|
|
606
632
|
except OSError:
|
|
607
633
|
continue
|
|
608
|
-
|
|
634
|
+
iterator = root.rglob("*")
|
|
635
|
+
if show_progress:
|
|
636
|
+
iterator = tqdm(iterator, desc=f"index {next(iter(suffixes))} {root}", unit="file")
|
|
637
|
+
for path in iterator:
|
|
609
638
|
try:
|
|
610
639
|
if not path.is_file():
|
|
611
640
|
continue
|
|
@@ -692,7 +721,11 @@ def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> di
|
|
|
692
721
|
return index
|
|
693
722
|
|
|
694
723
|
|
|
695
|
-
def _build_translated_index(
|
|
724
|
+
def _build_translated_index(
|
|
725
|
+
roots: list[Path],
|
|
726
|
+
*,
|
|
727
|
+
show_progress: bool = False,
|
|
728
|
+
) -> dict[str, dict[str, Path]]:
|
|
696
729
|
index: dict[str, dict[str, Path]] = {}
|
|
697
730
|
candidates: list[Path] = []
|
|
698
731
|
for root in roots:
|
|
@@ -702,7 +735,11 @@ def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
|
|
|
702
735
|
except OSError:
|
|
703
736
|
continue
|
|
704
737
|
try:
|
|
705
|
-
|
|
738
|
+
iterator = root.rglob("*.md")
|
|
739
|
+
if show_progress:
|
|
740
|
+
iterator = tqdm(iterator, desc=f"scan translated {root}", unit="file")
|
|
741
|
+
for path in iterator:
|
|
742
|
+
candidates.append(path)
|
|
706
743
|
except OSError:
|
|
707
744
|
continue
|
|
708
745
|
for path in sorted(candidates, key=lambda item: str(item)):
|
|
@@ -1422,7 +1459,7 @@ class CompareDataset:
|
|
|
1422
1459
|
paper_id_to_index: dict[int, int]
|
|
1423
1460
|
|
|
1424
1461
|
|
|
1425
|
-
def _scan_md_roots(roots: list[Path]) -> list[Path]:
|
|
1462
|
+
def _scan_md_roots(roots: list[Path], *, show_progress: bool = False) -> list[Path]:
|
|
1426
1463
|
paths: list[Path] = []
|
|
1427
1464
|
for root in roots:
|
|
1428
1465
|
try:
|
|
@@ -1431,7 +1468,10 @@ def _scan_md_roots(roots: list[Path]) -> list[Path]:
|
|
|
1431
1468
|
except OSError:
|
|
1432
1469
|
continue
|
|
1433
1470
|
try:
|
|
1434
|
-
|
|
1471
|
+
iterator = root.rglob("*.md")
|
|
1472
|
+
if show_progress:
|
|
1473
|
+
iterator = tqdm(iterator, desc=f"scan md {root}", unit="file")
|
|
1474
|
+
for path in iterator:
|
|
1435
1475
|
try:
|
|
1436
1476
|
if not path.is_file():
|
|
1437
1477
|
continue
|
|
@@ -1688,12 +1728,13 @@ def _get_paper_identifier(paper: dict[str, Any]) -> str:
|
|
|
1688
1728
|
return str(paper.get("source_hash") or paper.get("source_path", ""))
|
|
1689
1729
|
|
|
1690
1730
|
|
|
1691
|
-
def
|
|
1731
|
+
def _match_datasets_with_pairs(
|
|
1692
1732
|
dataset_a: CompareDataset,
|
|
1693
1733
|
dataset_b: CompareDataset,
|
|
1694
1734
|
*,
|
|
1695
1735
|
lang: str | None = None,
|
|
1696
|
-
|
|
1736
|
+
show_progress: bool = False,
|
|
1737
|
+
) -> tuple[list[CompareResult], list[tuple[int, int, str | None, float]]]:
|
|
1697
1738
|
"""Match papers between two datasets using db_ops parity."""
|
|
1698
1739
|
results: list[CompareResult] = []
|
|
1699
1740
|
matched_a: set[int] = set()
|
|
@@ -1703,7 +1744,11 @@ def _match_datasets(
|
|
|
1703
1744
|
|
|
1704
1745
|
file_index_b = _merge_file_indexes(dataset_b.md_index, dataset_b.pdf_index)
|
|
1705
1746
|
|
|
1706
|
-
|
|
1747
|
+
papers_a_iter = dataset_a.papers
|
|
1748
|
+
if show_progress:
|
|
1749
|
+
papers_a_iter = tqdm(dataset_a.papers, desc="match A", unit="paper")
|
|
1750
|
+
|
|
1751
|
+
for idx_a, paper in enumerate(papers_a_iter):
|
|
1707
1752
|
_prepare_paper_matching_fields(paper)
|
|
1708
1753
|
source_hash = str(paper.get("source_hash") or "")
|
|
1709
1754
|
title = str(paper.get("paper_title") or "")
|
|
@@ -1777,7 +1822,11 @@ def _match_datasets(
|
|
|
1777
1822
|
)
|
|
1778
1823
|
)
|
|
1779
1824
|
|
|
1780
|
-
|
|
1825
|
+
papers_b_iter = dataset_b.papers
|
|
1826
|
+
if show_progress:
|
|
1827
|
+
papers_b_iter = tqdm(dataset_b.papers, desc="match B", unit="paper")
|
|
1828
|
+
|
|
1829
|
+
for idx_b, paper in enumerate(papers_b_iter):
|
|
1781
1830
|
_prepare_paper_matching_fields(paper)
|
|
1782
1831
|
source_hash = str(paper.get("source_hash") or "")
|
|
1783
1832
|
title = str(paper.get("paper_title") or "")
|
|
@@ -1832,6 +1881,19 @@ def _match_datasets(
|
|
|
1832
1881
|
)
|
|
1833
1882
|
)
|
|
1834
1883
|
|
|
1884
|
+
return results, match_pairs
|
|
1885
|
+
|
|
1886
|
+
|
|
1887
|
+
def _match_datasets(
|
|
1888
|
+
dataset_a: CompareDataset,
|
|
1889
|
+
dataset_b: CompareDataset,
|
|
1890
|
+
*,
|
|
1891
|
+
lang: str | None = None,
|
|
1892
|
+
show_progress: bool = False,
|
|
1893
|
+
) -> list[CompareResult]:
|
|
1894
|
+
results, _ = _match_datasets_with_pairs(
|
|
1895
|
+
dataset_a, dataset_b, lang=lang, show_progress=show_progress
|
|
1896
|
+
)
|
|
1835
1897
|
return results
|
|
1836
1898
|
|
|
1837
1899
|
|
|
@@ -1843,6 +1905,7 @@ def build_compare_dataset(
|
|
|
1843
1905
|
md_translated_roots: list[Path] | None = None,
|
|
1844
1906
|
bibtex_path: Path | None = None,
|
|
1845
1907
|
lang: str | None = None,
|
|
1908
|
+
show_progress: bool = False,
|
|
1846
1909
|
) -> CompareDataset:
|
|
1847
1910
|
"""Load and index a dataset from various sources."""
|
|
1848
1911
|
papers: list[dict[str, Any]] = []
|
|
@@ -1868,11 +1931,17 @@ def build_compare_dataset(
|
|
|
1868
1931
|
for paper in papers:
|
|
1869
1932
|
_prepare_paper_matching_fields(paper)
|
|
1870
1933
|
|
|
1871
|
-
md_paths = _scan_md_roots(md_roots or [])
|
|
1872
|
-
pdf_paths, _ = _scan_pdf_roots(pdf_roots or [])
|
|
1873
|
-
md_index = _build_file_index(
|
|
1874
|
-
|
|
1875
|
-
|
|
1934
|
+
md_paths = _scan_md_roots(md_roots or [], show_progress=show_progress)
|
|
1935
|
+
pdf_paths, _ = _scan_pdf_roots(pdf_roots or [], show_progress=show_progress)
|
|
1936
|
+
md_index = _build_file_index(
|
|
1937
|
+
md_roots or [], suffixes={".md"}, show_progress=show_progress
|
|
1938
|
+
)
|
|
1939
|
+
pdf_index = _build_file_index(
|
|
1940
|
+
pdf_roots or [], suffixes={".pdf"}, show_progress=show_progress
|
|
1941
|
+
)
|
|
1942
|
+
translated_index = _build_translated_index(
|
|
1943
|
+
md_translated_roots or [], show_progress=show_progress
|
|
1944
|
+
)
|
|
1876
1945
|
|
|
1877
1946
|
if pdf_paths:
|
|
1878
1947
|
papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
|
|
@@ -1921,8 +1990,39 @@ def compare_datasets(
|
|
|
1921
1990
|
md_translated_roots_b: list[Path] | None = None,
|
|
1922
1991
|
bibtex_path: Path | None = None,
|
|
1923
1992
|
lang: str | None = None,
|
|
1993
|
+
show_progress: bool = False,
|
|
1924
1994
|
) -> list[CompareResult]:
|
|
1925
1995
|
"""Compare two datasets and return comparison results."""
|
|
1996
|
+
results, _, _, _ = compare_datasets_with_pairs(
|
|
1997
|
+
json_paths_a=json_paths_a,
|
|
1998
|
+
pdf_roots_a=pdf_roots_a,
|
|
1999
|
+
md_roots_a=md_roots_a,
|
|
2000
|
+
md_translated_roots_a=md_translated_roots_a,
|
|
2001
|
+
json_paths_b=json_paths_b,
|
|
2002
|
+
pdf_roots_b=pdf_roots_b,
|
|
2003
|
+
md_roots_b=md_roots_b,
|
|
2004
|
+
md_translated_roots_b=md_translated_roots_b,
|
|
2005
|
+
bibtex_path=bibtex_path,
|
|
2006
|
+
lang=lang,
|
|
2007
|
+
show_progress=show_progress,
|
|
2008
|
+
)
|
|
2009
|
+
return results
|
|
2010
|
+
|
|
2011
|
+
|
|
2012
|
+
def compare_datasets_with_pairs(
|
|
2013
|
+
*,
|
|
2014
|
+
json_paths_a: list[Path] | None = None,
|
|
2015
|
+
pdf_roots_a: list[Path] | None = None,
|
|
2016
|
+
md_roots_a: list[Path] | None = None,
|
|
2017
|
+
md_translated_roots_a: list[Path] | None = None,
|
|
2018
|
+
json_paths_b: list[Path] | None = None,
|
|
2019
|
+
pdf_roots_b: list[Path] | None = None,
|
|
2020
|
+
md_roots_b: list[Path] | None = None,
|
|
2021
|
+
md_translated_roots_b: list[Path] | None = None,
|
|
2022
|
+
bibtex_path: Path | None = None,
|
|
2023
|
+
lang: str | None = None,
|
|
2024
|
+
show_progress: bool = False,
|
|
2025
|
+
) -> tuple[list[CompareResult], list[tuple[int, int, str | None, float]], CompareDataset, CompareDataset]:
|
|
1926
2026
|
# Validate language requirement for translated inputs
|
|
1927
2027
|
has_translated_a = md_translated_roots_a is not None and len(md_translated_roots_a) > 0
|
|
1928
2028
|
has_translated_b = md_translated_roots_b is not None and len(md_translated_roots_b) > 0
|
|
@@ -1939,6 +2039,7 @@ def compare_datasets(
|
|
|
1939
2039
|
md_translated_roots=md_translated_roots_a,
|
|
1940
2040
|
bibtex_path=bibtex_path,
|
|
1941
2041
|
lang=lang,
|
|
2042
|
+
show_progress=show_progress,
|
|
1942
2043
|
)
|
|
1943
2044
|
|
|
1944
2045
|
dataset_b = build_compare_dataset(
|
|
@@ -1948,6 +2049,10 @@ def compare_datasets(
|
|
|
1948
2049
|
md_translated_roots=md_translated_roots_b,
|
|
1949
2050
|
bibtex_path=bibtex_path,
|
|
1950
2051
|
lang=lang,
|
|
2052
|
+
show_progress=show_progress,
|
|
1951
2053
|
)
|
|
1952
2054
|
|
|
1953
|
-
|
|
2055
|
+
results, match_pairs = _match_datasets_with_pairs(
|
|
2056
|
+
dataset_a, dataset_b, lang=lang, show_progress=show_progress
|
|
2057
|
+
)
|
|
2058
|
+
return results, match_pairs, dataset_a, dataset_b
|