deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1154 -35
  4. deepresearch_flow/paper/db_ops.py +124 -19
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
  15. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  16. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  17. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  18. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  19. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
  20. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  21. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  22. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  23. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  24. deepresearch_flow/paper/snapshot/api.py +941 -0
  25. deepresearch_flow/paper/snapshot/builder.py +965 -0
  26. deepresearch_flow/paper/snapshot/identity.py +239 -0
  27. deepresearch_flow/paper/snapshot/schema.py +245 -0
  28. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  29. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  30. deepresearch_flow/paper/snapshot/text.py +154 -0
  31. deepresearch_flow/paper/template_registry.py +1 -0
  32. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  33. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  35. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  36. deepresearch_flow/paper/web/app.py +10 -3
  37. deepresearch_flow/recognize/cli.py +380 -103
  38. deepresearch_flow/recognize/markdown.py +31 -7
  39. deepresearch_flow/recognize/math.py +47 -12
  40. deepresearch_flow/recognize/mermaid.py +320 -10
  41. deepresearch_flow/recognize/organize.py +29 -7
  42. deepresearch_flow/translator/cli.py +71 -20
  43. deepresearch_flow/translator/engine.py +220 -81
  44. deepresearch_flow/translator/prompts.py +19 -2
  45. deepresearch_flow/translator/protector.py +15 -3
  46. deepresearch_flow-0.6.1.dist-info/METADATA +849 -0
  47. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/RECORD +51 -43
  48. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/WHEEL +1 -1
  49. deepresearch_flow-0.5.1.dist-info/METADATA +0 -440
  50. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/entry_points.txt +0 -0
  51. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/licenses/LICENSE +0 -0
  52. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,8 @@ from typing import Any
8
8
  import re
9
9
  import unicodedata
10
10
 
11
+ from tqdm import tqdm
12
+
11
13
  from deepresearch_flow.paper.utils import stable_hash
12
14
 
13
15
  try:
@@ -232,6 +234,10 @@ _SIMILARITY_MAX_STEPS = 10
232
234
 
233
235
  def _normalize_title_key(title: str) -> str:
234
236
  value = unicodedata.normalize("NFKD", title)
237
+ value = re.sub(r"\$([^$]+)\$", r" \1 ", value)
238
+ value = re.sub(r"\\[a-zA-Z]+\\*?\s*\{([^{}]*)\}", r" \1 ", value)
239
+ value = re.sub(r"\\[a-zA-Z]+\\*?", " ", value)
240
+ value = value.replace("^", " ")
235
241
  greek_map = {
236
242
  "α": "alpha",
237
243
  "β": "beta",
@@ -279,7 +285,12 @@ def _normalize_title_key(title: str) -> str:
279
285
  idx = 0
280
286
  while idx < len(tokens):
281
287
  token = tokens[idx]
282
- if len(token) == 1 and idx + 1 < len(tokens):
288
+ if (
289
+ len(token) == 1
290
+ and token.isalpha()
291
+ and idx + 1 < len(tokens)
292
+ and tokens[idx + 1].isalpha()
293
+ ):
283
294
  merged.append(token + tokens[idx + 1])
284
295
  idx += 2
285
296
  continue
@@ -300,6 +311,9 @@ def _strip_leading_numeric_tokens(title_key: str) -> str:
300
311
  if token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN:
301
312
  idx += 1
302
313
  continue
314
+ if re.fullmatch(r"\d+\.\d+", token) and len(token) <= _LEADING_NUMERIC_MAX_LEN + 2:
315
+ idx += 1
316
+ continue
303
317
  break
304
318
  if idx == 0:
305
319
  return title_key
@@ -370,7 +384,11 @@ def _is_pdf_like(path: Path) -> bool:
370
384
  return ".pdf-" in name_lower and not name_lower.endswith(".md")
371
385
 
372
386
 
373
- def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]]:
387
+ def _scan_pdf_roots(
388
+ roots: list[Path],
389
+ *,
390
+ show_progress: bool = False,
391
+ ) -> tuple[list[Path], list[dict[str, Any]]]:
374
392
  pdf_paths: list[Path] = []
375
393
  meta: list[dict[str, Any]] = []
376
394
  seen: set[Path] = set()
@@ -381,7 +399,10 @@ def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]
381
399
  except OSError:
382
400
  continue
383
401
  files: list[Path] = []
384
- for path in root.rglob("*"):
402
+ iterator = root.rglob("*")
403
+ if show_progress:
404
+ iterator = tqdm(iterator, desc=f"scan pdf {root}", unit="file")
405
+ for path in iterator:
385
406
  try:
386
407
  if not path.is_file():
387
408
  continue
@@ -597,7 +618,12 @@ def _resolve_by_title_and_meta(
597
618
  return None, None, 0.0
598
619
 
599
620
 
600
- def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
621
+ def _build_file_index(
622
+ roots: list[Path],
623
+ *,
624
+ suffixes: set[str],
625
+ show_progress: bool = False,
626
+ ) -> dict[str, list[Path]]:
601
627
  index: dict[str, list[Path]] = {}
602
628
  for root in roots:
603
629
  try:
@@ -605,7 +631,10 @@ def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, lis
605
631
  continue
606
632
  except OSError:
607
633
  continue
608
- for path in root.rglob("*"):
634
+ iterator = root.rglob("*")
635
+ if show_progress:
636
+ iterator = tqdm(iterator, desc=f"index {next(iter(suffixes))} {root}", unit="file")
637
+ for path in iterator:
609
638
  try:
610
639
  if not path.is_file():
611
640
  continue
@@ -692,7 +721,11 @@ def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> di
692
721
  return index
693
722
 
694
723
 
695
- def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
724
+ def _build_translated_index(
725
+ roots: list[Path],
726
+ *,
727
+ show_progress: bool = False,
728
+ ) -> dict[str, dict[str, Path]]:
696
729
  index: dict[str, dict[str, Path]] = {}
697
730
  candidates: list[Path] = []
698
731
  for root in roots:
@@ -702,7 +735,11 @@ def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
702
735
  except OSError:
703
736
  continue
704
737
  try:
705
- candidates.extend(root.rglob("*.md"))
738
+ iterator = root.rglob("*.md")
739
+ if show_progress:
740
+ iterator = tqdm(iterator, desc=f"scan translated {root}", unit="file")
741
+ for path in iterator:
742
+ candidates.append(path)
706
743
  except OSError:
707
744
  continue
708
745
  for path in sorted(candidates, key=lambda item: str(item)):
@@ -1422,7 +1459,7 @@ class CompareDataset:
1422
1459
  paper_id_to_index: dict[int, int]
1423
1460
 
1424
1461
 
1425
- def _scan_md_roots(roots: list[Path]) -> list[Path]:
1462
+ def _scan_md_roots(roots: list[Path], *, show_progress: bool = False) -> list[Path]:
1426
1463
  paths: list[Path] = []
1427
1464
  for root in roots:
1428
1465
  try:
@@ -1431,7 +1468,10 @@ def _scan_md_roots(roots: list[Path]) -> list[Path]:
1431
1468
  except OSError:
1432
1469
  continue
1433
1470
  try:
1434
- for path in root.rglob("*.md"):
1471
+ iterator = root.rglob("*.md")
1472
+ if show_progress:
1473
+ iterator = tqdm(iterator, desc=f"scan md {root}", unit="file")
1474
+ for path in iterator:
1435
1475
  try:
1436
1476
  if not path.is_file():
1437
1477
  continue
@@ -1688,12 +1728,13 @@ def _get_paper_identifier(paper: dict[str, Any]) -> str:
1688
1728
  return str(paper.get("source_hash") or paper.get("source_path", ""))
1689
1729
 
1690
1730
 
1691
- def _match_datasets(
1731
+ def _match_datasets_with_pairs(
1692
1732
  dataset_a: CompareDataset,
1693
1733
  dataset_b: CompareDataset,
1694
1734
  *,
1695
1735
  lang: str | None = None,
1696
- ) -> list[CompareResult]:
1736
+ show_progress: bool = False,
1737
+ ) -> tuple[list[CompareResult], list[tuple[int, int, str | None, float]]]:
1697
1738
  """Match papers between two datasets using db_ops parity."""
1698
1739
  results: list[CompareResult] = []
1699
1740
  matched_a: set[int] = set()
@@ -1703,7 +1744,11 @@ def _match_datasets(
1703
1744
 
1704
1745
  file_index_b = _merge_file_indexes(dataset_b.md_index, dataset_b.pdf_index)
1705
1746
 
1706
- for idx_a, paper in enumerate(dataset_a.papers):
1747
+ papers_a_iter = dataset_a.papers
1748
+ if show_progress:
1749
+ papers_a_iter = tqdm(dataset_a.papers, desc="match A", unit="paper")
1750
+
1751
+ for idx_a, paper in enumerate(papers_a_iter):
1707
1752
  _prepare_paper_matching_fields(paper)
1708
1753
  source_hash = str(paper.get("source_hash") or "")
1709
1754
  title = str(paper.get("paper_title") or "")
@@ -1777,7 +1822,11 @@ def _match_datasets(
1777
1822
  )
1778
1823
  )
1779
1824
 
1780
- for idx_b, paper in enumerate(dataset_b.papers):
1825
+ papers_b_iter = dataset_b.papers
1826
+ if show_progress:
1827
+ papers_b_iter = tqdm(dataset_b.papers, desc="match B", unit="paper")
1828
+
1829
+ for idx_b, paper in enumerate(papers_b_iter):
1781
1830
  _prepare_paper_matching_fields(paper)
1782
1831
  source_hash = str(paper.get("source_hash") or "")
1783
1832
  title = str(paper.get("paper_title") or "")
@@ -1832,6 +1881,19 @@ def _match_datasets(
1832
1881
  )
1833
1882
  )
1834
1883
 
1884
+ return results, match_pairs
1885
+
1886
+
1887
+ def _match_datasets(
1888
+ dataset_a: CompareDataset,
1889
+ dataset_b: CompareDataset,
1890
+ *,
1891
+ lang: str | None = None,
1892
+ show_progress: bool = False,
1893
+ ) -> list[CompareResult]:
1894
+ results, _ = _match_datasets_with_pairs(
1895
+ dataset_a, dataset_b, lang=lang, show_progress=show_progress
1896
+ )
1835
1897
  return results
1836
1898
 
1837
1899
 
@@ -1843,6 +1905,7 @@ def build_compare_dataset(
1843
1905
  md_translated_roots: list[Path] | None = None,
1844
1906
  bibtex_path: Path | None = None,
1845
1907
  lang: str | None = None,
1908
+ show_progress: bool = False,
1846
1909
  ) -> CompareDataset:
1847
1910
  """Load and index a dataset from various sources."""
1848
1911
  papers: list[dict[str, Any]] = []
@@ -1868,11 +1931,17 @@ def build_compare_dataset(
1868
1931
  for paper in papers:
1869
1932
  _prepare_paper_matching_fields(paper)
1870
1933
 
1871
- md_paths = _scan_md_roots(md_roots or [])
1872
- pdf_paths, _ = _scan_pdf_roots(pdf_roots or [])
1873
- md_index = _build_file_index(md_roots or [], suffixes={".md"})
1874
- pdf_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})
1875
- translated_index = _build_translated_index(md_translated_roots or [])
1934
+ md_paths = _scan_md_roots(md_roots or [], show_progress=show_progress)
1935
+ pdf_paths, _ = _scan_pdf_roots(pdf_roots or [], show_progress=show_progress)
1936
+ md_index = _build_file_index(
1937
+ md_roots or [], suffixes={".md"}, show_progress=show_progress
1938
+ )
1939
+ pdf_index = _build_file_index(
1940
+ pdf_roots or [], suffixes={".pdf"}, show_progress=show_progress
1941
+ )
1942
+ translated_index = _build_translated_index(
1943
+ md_translated_roots or [], show_progress=show_progress
1944
+ )
1876
1945
 
1877
1946
  if pdf_paths:
1878
1947
  papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
@@ -1921,8 +1990,39 @@ def compare_datasets(
1921
1990
  md_translated_roots_b: list[Path] | None = None,
1922
1991
  bibtex_path: Path | None = None,
1923
1992
  lang: str | None = None,
1993
+ show_progress: bool = False,
1924
1994
  ) -> list[CompareResult]:
1925
1995
  """Compare two datasets and return comparison results."""
1996
+ results, _, _, _ = compare_datasets_with_pairs(
1997
+ json_paths_a=json_paths_a,
1998
+ pdf_roots_a=pdf_roots_a,
1999
+ md_roots_a=md_roots_a,
2000
+ md_translated_roots_a=md_translated_roots_a,
2001
+ json_paths_b=json_paths_b,
2002
+ pdf_roots_b=pdf_roots_b,
2003
+ md_roots_b=md_roots_b,
2004
+ md_translated_roots_b=md_translated_roots_b,
2005
+ bibtex_path=bibtex_path,
2006
+ lang=lang,
2007
+ show_progress=show_progress,
2008
+ )
2009
+ return results
2010
+
2011
+
2012
+ def compare_datasets_with_pairs(
2013
+ *,
2014
+ json_paths_a: list[Path] | None = None,
2015
+ pdf_roots_a: list[Path] | None = None,
2016
+ md_roots_a: list[Path] | None = None,
2017
+ md_translated_roots_a: list[Path] | None = None,
2018
+ json_paths_b: list[Path] | None = None,
2019
+ pdf_roots_b: list[Path] | None = None,
2020
+ md_roots_b: list[Path] | None = None,
2021
+ md_translated_roots_b: list[Path] | None = None,
2022
+ bibtex_path: Path | None = None,
2023
+ lang: str | None = None,
2024
+ show_progress: bool = False,
2025
+ ) -> tuple[list[CompareResult], list[tuple[int, int, str | None, float]], CompareDataset, CompareDataset]:
1926
2026
  # Validate language requirement for translated inputs
1927
2027
  has_translated_a = md_translated_roots_a is not None and len(md_translated_roots_a) > 0
1928
2028
  has_translated_b = md_translated_roots_b is not None and len(md_translated_roots_b) > 0
@@ -1939,6 +2039,7 @@ def compare_datasets(
1939
2039
  md_translated_roots=md_translated_roots_a,
1940
2040
  bibtex_path=bibtex_path,
1941
2041
  lang=lang,
2042
+ show_progress=show_progress,
1942
2043
  )
1943
2044
 
1944
2045
  dataset_b = build_compare_dataset(
@@ -1948,6 +2049,10 @@ def compare_datasets(
1948
2049
  md_translated_roots=md_translated_roots_b,
1949
2050
  bibtex_path=bibtex_path,
1950
2051
  lang=lang,
2052
+ show_progress=show_progress,
1951
2053
  )
1952
2054
 
1953
- return _match_datasets(dataset_a, dataset_b, lang=lang)
2055
+ results, match_pairs = _match_datasets_with_pairs(
2056
+ dataset_a, dataset_b, lang=lang, show_progress=show_progress
2057
+ )
2058
+ return results, match_pairs, dataset_a, dataset_b