deepresearch-flow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (58)
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1041 -34
  4. deepresearch_flow/paper/db_ops.py +145 -26
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
  15. deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
  16. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  17. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  18. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  19. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  20. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
  21. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  22. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  23. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  24. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  25. deepresearch_flow/paper/snapshot/api.py +941 -0
  26. deepresearch_flow/paper/snapshot/builder.py +965 -0
  27. deepresearch_flow/paper/snapshot/identity.py +239 -0
  28. deepresearch_flow/paper/snapshot/schema.py +245 -0
  29. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  30. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  31. deepresearch_flow/paper/snapshot/text.py +154 -0
  32. deepresearch_flow/paper/template_registry.py +40 -0
  33. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
  35. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  36. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  37. deepresearch_flow/paper/web/app.py +10 -3
  38. deepresearch_flow/paper/web/markdown.py +174 -8
  39. deepresearch_flow/paper/web/static/css/main.css +8 -1
  40. deepresearch_flow/paper/web/static/js/detail.js +46 -12
  41. deepresearch_flow/paper/web/templates/detail.html +9 -0
  42. deepresearch_flow/paper/web/text.py +8 -4
  43. deepresearch_flow/recognize/cli.py +380 -103
  44. deepresearch_flow/recognize/markdown.py +31 -7
  45. deepresearch_flow/recognize/math.py +47 -12
  46. deepresearch_flow/recognize/mermaid.py +320 -10
  47. deepresearch_flow/recognize/organize.py +35 -16
  48. deepresearch_flow/translator/cli.py +71 -20
  49. deepresearch_flow/translator/engine.py +220 -81
  50. deepresearch_flow/translator/fixers.py +15 -0
  51. deepresearch_flow/translator/prompts.py +19 -2
  52. deepresearch_flow/translator/protector.py +15 -3
  53. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
  54. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
  55. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
  56. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
  57. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
  58. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,8 @@ from typing import Any
 import re
 import unicodedata
 
+from tqdm import tqdm
+
 from deepresearch_flow.paper.utils import stable_hash
 
 try:
@@ -232,6 +234,10 @@ _SIMILARITY_MAX_STEPS = 10
 
 def _normalize_title_key(title: str) -> str:
     value = unicodedata.normalize("NFKD", title)
+    value = re.sub(r"\$([^$]+)\$", r" \1 ", value)
+    value = re.sub(r"\\[a-zA-Z]+\\*?\s*\{([^{}]*)\}", r" \1 ", value)
+    value = re.sub(r"\\[a-zA-Z]+\\*?", " ", value)
+    value = value.replace("^", " ")
     greek_map = {
         "α": "alpha",
         "β": "beta",
@@ -279,7 +285,12 @@ def _normalize_title_key(title: str) -> str:
     idx = 0
     while idx < len(tokens):
         token = tokens[idx]
-        if len(token) == 1 and idx + 1 < len(tokens):
+        if (
+            len(token) == 1
+            and token.isalpha()
+            and idx + 1 < len(tokens)
+            and tokens[idx + 1].isalpha()
+        ):
             merged.append(token + tokens[idx + 1])
             idx += 2
             continue
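The widened condition guards the single-character merge: a lone letter is now only glued to a following alphabetic token, so split words like "k means" re-join while numeric tokens such as "2" stay put (previously "2 d" would have merged too). A self-contained sketch, with the fallthrough tail assumed from the surrounding context:

```python
def merge_single_letters(tokens: list[str]) -> list[str]:
    merged: list[str] = []
    idx = 0
    while idx < len(tokens):
        token = tokens[idx]
        if (
            len(token) == 1
            and token.isalpha()
            and idx + 1 < len(tokens)
            and tokens[idx + 1].isalpha()
        ):
            merged.append(token + tokens[idx + 1])
            idx += 2
            continue
        merged.append(token)  # fallthrough assumed from context
        idx += 1
    return merged

print(merge_single_letters(["k", "means", "on", "2", "d", "grids"]))
# ['kmeans', 'on', '2', 'dgrids'] -- '2' no longer swallows 'd'
```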
@@ -300,6 +311,9 @@ def _strip_leading_numeric_tokens(title_key: str) -> str:
         if token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN:
             idx += 1
             continue
+        if re.fullmatch(r"\d+\.\d+", token) and len(token) <= _LEADING_NUMERIC_MAX_LEN + 2:
+            idx += 1
+            continue
         break
     if idx == 0:
         return title_key
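`_strip_leading_numeric_tokens` already dropped bare numbers from the front of a title key; the new branch also drops dotted section numbers like "3.2". A sketch, assuming `_LEADING_NUMERIC_MAX_LEN = 4` and a whitespace-token representation (neither is shown in this hunk):

```python
import re

_LEADING_NUMERIC_MAX_LEN = 4  # assumed; the real constant is defined elsewhere

def strip_leading_numeric_tokens(title_key: str) -> str:
    tokens = title_key.split()
    idx = 0
    while idx < len(tokens):
        token = tokens[idx]
        if token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN:
            idx += 1
            continue
        if re.fullmatch(r"\d+\.\d+", token) and len(token) <= _LEADING_NUMERIC_MAX_LEN + 2:
            idx += 1
            continue
        break
    if idx == 0:
        return title_key
    return " ".join(tokens[idx:])  # tail assumed from context

print(strip_leading_numeric_tokens("3.2 flash attention revisited"))
# 'flash attention revisited'
```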
@@ -370,7 +384,11 @@ def _is_pdf_like(path: Path) -> bool:
     return ".pdf-" in name_lower and not name_lower.endswith(".md")
 
 
-def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]]:
+def _scan_pdf_roots(
+    roots: list[Path],
+    *,
+    show_progress: bool = False,
+) -> tuple[list[Path], list[dict[str, Any]]]:
     pdf_paths: list[Path] = []
     meta: list[dict[str, Any]] = []
     seen: set[Path] = set()
@@ -381,7 +399,10 @@ def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]
         except OSError:
             continue
         files: list[Path] = []
-        for path in root.rglob("*"):
+        iterator = root.rglob("*")
+        if show_progress:
+            iterator = tqdm(iterator, desc=f"scan pdf {root}", unit="file")
+        for path in iterator:
             try:
                 if not path.is_file():
                     continue
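This is the progress-reporting pattern the release applies to every directory walk: the `rglob` generator is wrapped in tqdm only when the caller opts in, so library use stays silent by default. The pattern in isolation (paths invented):

```python
from pathlib import Path

from tqdm import tqdm

def iter_files(root: Path, pattern: str = "*", show_progress: bool = False):
    iterator = root.rglob(pattern)
    if show_progress:
        # A generator has no len(), so tqdm shows a running count, not a total.
        iterator = tqdm(iterator, desc=f"scan pdf {root}", unit="file")
    for path in iterator:
        try:
            if path.is_file():
                yield path
        except OSError:
            continue  # unreadable entries are skipped, as in the hunk above

for pdf in iter_files(Path("papers"), "*.pdf", show_progress=True):
    pass
```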
@@ -597,7 +618,12 @@ def _resolve_by_title_and_meta(
     return None, None, 0.0
 
 
-def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
+def _build_file_index(
+    roots: list[Path],
+    *,
+    suffixes: set[str],
+    show_progress: bool = False,
+) -> dict[str, list[Path]]:
     index: dict[str, list[Path]] = {}
     for root in roots:
         try:
@@ -605,7 +631,10 @@ def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, lis
             continue
         except OSError:
             continue
-        for path in root.rglob("*"):
+        iterator = root.rglob("*")
+        if show_progress:
+            iterator = tqdm(iterator, desc=f"index {next(iter(suffixes))} {root}", unit="file")
+        for path in iterator:
             try:
                 if not path.is_file():
                     continue
@@ -692,7 +721,11 @@ def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> di
     return index
 
 
-def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
+def _build_translated_index(
+    roots: list[Path],
+    *,
+    show_progress: bool = False,
+) -> dict[str, dict[str, Path]]:
     index: dict[str, dict[str, Path]] = {}
     candidates: list[Path] = []
     for root in roots:
@@ -702,7 +735,11 @@ def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
         except OSError:
             continue
         try:
-            candidates.extend(root.rglob("*.md"))
+            iterator = root.rglob("*.md")
+            if show_progress:
+                iterator = tqdm(iterator, desc=f"scan translated {root}", unit="file")
+            for path in iterator:
+                candidates.append(path)
         except OSError:
             continue
     for path in sorted(candidates, key=lambda item: str(item)):
@@ -1193,6 +1230,7 @@ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
     merged: list[dict[str, Any]] = []
     threshold = 0.95
     prefix_len = 5
+    source_hash_index: dict[str, int] = {}
     bibtex_exact: dict[str, set[int]] = {}
     bibtex_prefix: dict[str, set[int]] = {}
     paper_exact: dict[str, set[int]] = {}
@@ -1226,16 +1264,22 @@ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
     for paper in papers:
         if not isinstance(paper, dict):
             raise ValueError("Input papers must be objects")
+        source_hash = paper.get("source_hash")
+        source_hash_key = str(source_hash) if source_hash else None
         bib_title = _extract_bibtex_title(paper)
         paper_title = _extract_paper_title(paper)
         match = None
         match_idx = None
-        for idx in candidate_ids(bib_title, paper_title):
-            candidate = merged[idx]
-            if _titles_match(candidate, paper, threshold=threshold):
-                match = candidate
-                match_idx = idx
-                break
+        if source_hash_key and source_hash_key in source_hash_index:
+            match_idx = source_hash_index[source_hash_key]
+            match = merged[match_idx]
+        else:
+            for idx in candidate_ids(bib_title, paper_title):
+                candidate = merged[idx]
+                if _titles_match(candidate, paper, threshold=threshold):
+                    match = candidate
+                    match_idx = idx
+                    break
         if match is None:
             group = {
                 "templates": {template_tag: paper},
@@ -1244,6 +1288,8 @@ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
             _add_merge_titles(group, paper)
             merged.append(group)
             group_idx = len(merged) - 1
+            if source_hash_key:
+                source_hash_index[source_hash_key] = group_idx
             if bib_title:
                 add_index(bib_title, bibtex_exact, bibtex_prefix, group_idx)
             if paper_title:
@@ -1256,15 +1302,20 @@ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
                 order.append(template_tag)
             _add_merge_titles(match, paper)
             if match_idx is not None:
+                if source_hash_key:
+                    source_hash_index[source_hash_key] = match_idx
                 if bib_title:
                     add_index(bib_title, bibtex_exact, bibtex_prefix, match_idx)
                 if paper_title:
                     add_index(paper_title, paper_exact, paper_prefix, match_idx)
 
+    preferred_defaults = ("simple", "simple_phi")
     for group in merged:
         templates = group.get("templates") or {}
         order = group.get("template_order") or list(templates.keys())
-        default_tag = "simple" if "simple" in order else (order[0] if order else None)
+        default_tag = next((tag for tag in preferred_defaults if tag in order), None)
+        if default_tag is None:
+            default_tag = order[0] if order else None
         group["default_template"] = default_tag
         if default_tag and default_tag in templates:
             base = templates[default_tag]
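The default-template choice becomes a preference list instead of a hard-coded "simple" check, so the new `*_phi` outputs can serve as a fallback default. The selection idiom on its own, with an example `template_order`:

```python
preferred_defaults = ("simple", "simple_phi")
order = ["deep_read", "simple_phi", "eight_questions"]  # example template_order

default_tag = next((tag for tag in preferred_defaults if tag in order), None)
if default_tag is None:
    default_tag = order[0] if order else None

print(default_tag)  # 'simple_phi' -- 'simple' is absent, so the next preference wins
```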
@@ -1408,7 +1459,7 @@ class CompareDataset:
     paper_id_to_index: dict[int, int]
 
 
-def _scan_md_roots(roots: list[Path]) -> list[Path]:
+def _scan_md_roots(roots: list[Path], *, show_progress: bool = False) -> list[Path]:
     paths: list[Path] = []
     for root in roots:
         try:
@@ -1417,7 +1468,10 @@ def _scan_md_roots(roots: list[Path]) -> list[Path]:
         except OSError:
             continue
         try:
-            for path in root.rglob("*.md"):
+            iterator = root.rglob("*.md")
+            if show_progress:
+                iterator = tqdm(iterator, desc=f"scan md {root}", unit="file")
+            for path in iterator:
                 try:
                     if not path.is_file():
                         continue
@@ -1674,12 +1728,13 @@ def _get_paper_identifier(paper: dict[str, Any]) -> str:
     return str(paper.get("source_hash") or paper.get("source_path", ""))
 
 
-def _match_datasets(
+def _match_datasets_with_pairs(
     dataset_a: CompareDataset,
     dataset_b: CompareDataset,
     *,
     lang: str | None = None,
-) -> list[CompareResult]:
+    show_progress: bool = False,
+) -> tuple[list[CompareResult], list[tuple[int, int, str | None, float]]]:
     """Match papers between two datasets using db_ops parity."""
     results: list[CompareResult] = []
     matched_a: set[int] = set()
@@ -1689,7 +1744,11 @@ def _match_datasets(
 
     file_index_b = _merge_file_indexes(dataset_b.md_index, dataset_b.pdf_index)
 
-    for idx_a, paper in enumerate(dataset_a.papers):
+    papers_a_iter = dataset_a.papers
+    if show_progress:
+        papers_a_iter = tqdm(dataset_a.papers, desc="match A", unit="paper")
+
+    for idx_a, paper in enumerate(papers_a_iter):
         _prepare_paper_matching_fields(paper)
         source_hash = str(paper.get("source_hash") or "")
         title = str(paper.get("paper_title") or "")
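Unlike the `rglob` generators above, `dataset_a.papers` is an already-loaded list, so tqdm can call `len()` on it and render a total with an ETA rather than just a running count. Toy illustration:

```python
from tqdm import tqdm

papers = [{"paper_title": f"paper {i}"} for i in range(1000)]
for idx_a, paper in enumerate(tqdm(papers, desc="match A", unit="paper")):
    _ = paper["paper_title"]  # stand-in for the real per-paper matching work
```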
@@ -1763,7 +1822,11 @@ def _match_datasets(
             )
         )
 
-    for idx_b, paper in enumerate(dataset_b.papers):
+    papers_b_iter = dataset_b.papers
+    if show_progress:
+        papers_b_iter = tqdm(dataset_b.papers, desc="match B", unit="paper")
+
+    for idx_b, paper in enumerate(papers_b_iter):
         _prepare_paper_matching_fields(paper)
         source_hash = str(paper.get("source_hash") or "")
         title = str(paper.get("paper_title") or "")
@@ -1818,6 +1881,19 @@ def _match_datasets(
             )
         )
 
+    return results, match_pairs
+
+
+def _match_datasets(
+    dataset_a: CompareDataset,
+    dataset_b: CompareDataset,
+    *,
+    lang: str | None = None,
+    show_progress: bool = False,
+) -> list[CompareResult]:
+    results, _ = _match_datasets_with_pairs(
+        dataset_a, dataset_b, lang=lang, show_progress=show_progress
+    )
     return results
 
 
@@ -1829,6 +1905,7 @@ def build_compare_dataset(
     md_translated_roots: list[Path] | None = None,
     bibtex_path: Path | None = None,
     lang: str | None = None,
+    show_progress: bool = False,
 ) -> CompareDataset:
     """Load and index a dataset from various sources."""
     papers: list[dict[str, Any]] = []
@@ -1854,11 +1931,17 @@ def build_compare_dataset(
     for paper in papers:
         _prepare_paper_matching_fields(paper)
 
-    md_paths = _scan_md_roots(md_roots or [])
-    pdf_paths, _ = _scan_pdf_roots(pdf_roots or [])
-    md_index = _build_file_index(md_roots or [], suffixes={".md"})
-    pdf_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})
-    translated_index = _build_translated_index(md_translated_roots or [])
+    md_paths = _scan_md_roots(md_roots or [], show_progress=show_progress)
+    pdf_paths, _ = _scan_pdf_roots(pdf_roots or [], show_progress=show_progress)
+    md_index = _build_file_index(
+        md_roots or [], suffixes={".md"}, show_progress=show_progress
+    )
+    pdf_index = _build_file_index(
+        pdf_roots or [], suffixes={".pdf"}, show_progress=show_progress
+    )
+    translated_index = _build_translated_index(
+        md_translated_roots or [], show_progress=show_progress
+    )
 
     if pdf_paths:
         papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
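`build_compare_dataset` now threads the flag into all five scan/index helpers. A hedged usage sketch: `md_roots`, `pdf_roots`, and `show_progress` are visible in this diff, while the `json_paths` parameter name, the import path, and the file paths are assumptions for illustration.

```python
from pathlib import Path

from deepresearch_flow.paper.extract import build_compare_dataset  # module path assumed

dataset = build_compare_dataset(
    json_paths=[Path("runs/0.6/papers.json")],  # parameter name assumed
    md_roots=[Path("markdown")],
    pdf_roots=[Path("pdfs")],
    show_progress=True,  # one tqdm bar per scan/index pass
)
print(len(dataset.papers))
```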
@@ -1907,8 +1990,39 @@ def compare_datasets(
     md_translated_roots_b: list[Path] | None = None,
     bibtex_path: Path | None = None,
     lang: str | None = None,
+    show_progress: bool = False,
 ) -> list[CompareResult]:
     """Compare two datasets and return comparison results."""
+    results, _, _, _ = compare_datasets_with_pairs(
+        json_paths_a=json_paths_a,
+        pdf_roots_a=pdf_roots_a,
+        md_roots_a=md_roots_a,
+        md_translated_roots_a=md_translated_roots_a,
+        json_paths_b=json_paths_b,
+        pdf_roots_b=pdf_roots_b,
+        md_roots_b=md_roots_b,
+        md_translated_roots_b=md_translated_roots_b,
+        bibtex_path=bibtex_path,
+        lang=lang,
+        show_progress=show_progress,
+    )
+    return results
+
+
+def compare_datasets_with_pairs(
+    *,
+    json_paths_a: list[Path] | None = None,
+    pdf_roots_a: list[Path] | None = None,
+    md_roots_a: list[Path] | None = None,
+    md_translated_roots_a: list[Path] | None = None,
+    json_paths_b: list[Path] | None = None,
+    pdf_roots_b: list[Path] | None = None,
+    md_roots_b: list[Path] | None = None,
+    md_translated_roots_b: list[Path] | None = None,
+    bibtex_path: Path | None = None,
+    lang: str | None = None,
+    show_progress: bool = False,
+) -> tuple[list[CompareResult], list[tuple[int, int, str | None, float]], CompareDataset, CompareDataset]:
     # Validate language requirement for translated inputs
     has_translated_a = md_translated_roots_a is not None and len(md_translated_roots_a) > 0
     has_translated_b = md_translated_roots_b is not None and len(md_translated_roots_b) > 0
@@ -1925,6 +2039,7 @@ def compare_datasets(
         md_translated_roots=md_translated_roots_a,
         bibtex_path=bibtex_path,
         lang=lang,
+        show_progress=show_progress,
     )
 
     dataset_b = build_compare_dataset(
@@ -1934,6 +2049,10 @@ def compare_datasets(
         md_translated_roots=md_translated_roots_b,
         bibtex_path=bibtex_path,
         lang=lang,
+        show_progress=show_progress,
     )
 
-    return _match_datasets(dataset_a, dataset_b, lang=lang)
+    results, match_pairs = _match_datasets_with_pairs(
+        dataset_a, dataset_b, lang=lang, show_progress=show_progress
+    )
+    return results, match_pairs, dataset_a, dataset_b
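A hedged usage sketch of the richer entry point added in 0.6.0. The defining module is not named in this excerpt, so the import path is an assumption, the file paths are invented, and the meaning of each pair tuple (index into A, index into B, match kind, score) is inferred from the return annotation rather than stated in the diff.

```python
from pathlib import Path

from deepresearch_flow.paper.extract import compare_datasets_with_pairs  # path assumed

results, match_pairs, dataset_a, dataset_b = compare_datasets_with_pairs(
    json_paths_a=[Path("runs/0.5/papers.json")],
    json_paths_b=[Path("runs/0.6/papers.json")],
    pdf_roots_b=[Path("pdfs")],
    show_progress=True,  # enables the tqdm bars added throughout this diff
)
for idx_a, idx_b, kind, score in match_pairs:
    print(idx_a, idx_b, kind, f"{score:.2f}")
```

The old `compare_datasets` keeps its signature and simply discards the extra tuple members, so existing callers are unaffected.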