deepresearch-flow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1041 -34
- deepresearch_flow/paper/db_ops.py +145 -26
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +40 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/paper/web/markdown.py +174 -8
- deepresearch_flow/paper/web/static/css/main.css +8 -1
- deepresearch_flow/paper/web/static/js/detail.js +46 -12
- deepresearch_flow/paper/web/templates/detail.html +9 -0
- deepresearch_flow/paper/web/text.py +8 -4
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +35 -16
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/fixers.py +15 -0
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -8,6 +8,8 @@ from typing import Any
|
|
|
8
8
|
import re
|
|
9
9
|
import unicodedata
|
|
10
10
|
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
11
13
|
from deepresearch_flow.paper.utils import stable_hash
|
|
12
14
|
|
|
13
15
|
try:
|
|
@@ -232,6 +234,10 @@ _SIMILARITY_MAX_STEPS = 10
|
|
|
232
234
|
|
|
233
235
|
def _normalize_title_key(title: str) -> str:
|
|
234
236
|
value = unicodedata.normalize("NFKD", title)
|
|
237
|
+
value = re.sub(r"\$([^$]+)\$", r" \1 ", value)
|
|
238
|
+
value = re.sub(r"\\[a-zA-Z]+\\*?\s*\{([^{}]*)\}", r" \1 ", value)
|
|
239
|
+
value = re.sub(r"\\[a-zA-Z]+\\*?", " ", value)
|
|
240
|
+
value = value.replace("^", " ")
|
|
235
241
|
greek_map = {
|
|
236
242
|
"α": "alpha",
|
|
237
243
|
"β": "beta",
|
|
@@ -279,7 +285,12 @@ def _normalize_title_key(title: str) -> str:
|
|
|
279
285
|
idx = 0
|
|
280
286
|
while idx < len(tokens):
|
|
281
287
|
token = tokens[idx]
|
|
282
|
-
if
|
|
288
|
+
if (
|
|
289
|
+
len(token) == 1
|
|
290
|
+
and token.isalpha()
|
|
291
|
+
and idx + 1 < len(tokens)
|
|
292
|
+
and tokens[idx + 1].isalpha()
|
|
293
|
+
):
|
|
283
294
|
merged.append(token + tokens[idx + 1])
|
|
284
295
|
idx += 2
|
|
285
296
|
continue
|
|
@@ -300,6 +311,9 @@ def _strip_leading_numeric_tokens(title_key: str) -> str:
|
|
|
300
311
|
if token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN:
|
|
301
312
|
idx += 1
|
|
302
313
|
continue
|
|
314
|
+
if re.fullmatch(r"\d+\.\d+", token) and len(token) <= _LEADING_NUMERIC_MAX_LEN + 2:
|
|
315
|
+
idx += 1
|
|
316
|
+
continue
|
|
303
317
|
break
|
|
304
318
|
if idx == 0:
|
|
305
319
|
return title_key
|
|
@@ -370,7 +384,11 @@ def _is_pdf_like(path: Path) -> bool:
|
|
|
370
384
|
return ".pdf-" in name_lower and not name_lower.endswith(".md")
|
|
371
385
|
|
|
372
386
|
|
|
373
|
-
def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]]:
|
|
387
|
+
def _scan_pdf_roots(
|
|
388
|
+
roots: list[Path],
|
|
389
|
+
*,
|
|
390
|
+
show_progress: bool = False,
|
|
391
|
+
) -> tuple[list[Path], list[dict[str, Any]]]:
|
|
374
392
|
pdf_paths: list[Path] = []
|
|
375
393
|
meta: list[dict[str, Any]] = []
|
|
376
394
|
seen: set[Path] = set()
|
|
@@ -381,7 +399,10 @@ def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]
|
|
|
381
399
|
except OSError:
|
|
382
400
|
continue
|
|
383
401
|
files: list[Path] = []
|
|
384
|
-
|
|
402
|
+
iterator = root.rglob("*")
|
|
403
|
+
if show_progress:
|
|
404
|
+
iterator = tqdm(iterator, desc=f"scan pdf {root}", unit="file")
|
|
405
|
+
for path in iterator:
|
|
385
406
|
try:
|
|
386
407
|
if not path.is_file():
|
|
387
408
|
continue
|
|
@@ -597,7 +618,12 @@ def _resolve_by_title_and_meta(
|
|
|
597
618
|
return None, None, 0.0
|
|
598
619
|
|
|
599
620
|
|
|
600
|
-
def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
|
|
621
|
+
def _build_file_index(
|
|
622
|
+
roots: list[Path],
|
|
623
|
+
*,
|
|
624
|
+
suffixes: set[str],
|
|
625
|
+
show_progress: bool = False,
|
|
626
|
+
) -> dict[str, list[Path]]:
|
|
601
627
|
index: dict[str, list[Path]] = {}
|
|
602
628
|
for root in roots:
|
|
603
629
|
try:
|
|
@@ -605,7 +631,10 @@ def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, lis
|
|
|
605
631
|
continue
|
|
606
632
|
except OSError:
|
|
607
633
|
continue
|
|
608
|
-
|
|
634
|
+
iterator = root.rglob("*")
|
|
635
|
+
if show_progress:
|
|
636
|
+
iterator = tqdm(iterator, desc=f"index {next(iter(suffixes))} {root}", unit="file")
|
|
637
|
+
for path in iterator:
|
|
609
638
|
try:
|
|
610
639
|
if not path.is_file():
|
|
611
640
|
continue
|
|
@@ -692,7 +721,11 @@ def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> di
|
|
|
692
721
|
return index
|
|
693
722
|
|
|
694
723
|
|
|
695
|
-
def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
|
|
724
|
+
def _build_translated_index(
|
|
725
|
+
roots: list[Path],
|
|
726
|
+
*,
|
|
727
|
+
show_progress: bool = False,
|
|
728
|
+
) -> dict[str, dict[str, Path]]:
|
|
696
729
|
index: dict[str, dict[str, Path]] = {}
|
|
697
730
|
candidates: list[Path] = []
|
|
698
731
|
for root in roots:
|
|
@@ -702,7 +735,11 @@ def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
|
|
|
702
735
|
except OSError:
|
|
703
736
|
continue
|
|
704
737
|
try:
|
|
705
|
-
|
|
738
|
+
iterator = root.rglob("*.md")
|
|
739
|
+
if show_progress:
|
|
740
|
+
iterator = tqdm(iterator, desc=f"scan translated {root}", unit="file")
|
|
741
|
+
for path in iterator:
|
|
742
|
+
candidates.append(path)
|
|
706
743
|
except OSError:
|
|
707
744
|
continue
|
|
708
745
|
for path in sorted(candidates, key=lambda item: str(item)):
|
|
@@ -1193,6 +1230,7 @@ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
|
1193
1230
|
merged: list[dict[str, Any]] = []
|
|
1194
1231
|
threshold = 0.95
|
|
1195
1232
|
prefix_len = 5
|
|
1233
|
+
source_hash_index: dict[str, int] = {}
|
|
1196
1234
|
bibtex_exact: dict[str, set[int]] = {}
|
|
1197
1235
|
bibtex_prefix: dict[str, set[int]] = {}
|
|
1198
1236
|
paper_exact: dict[str, set[int]] = {}
|
|
@@ -1226,16 +1264,22 @@ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
|
1226
1264
|
for paper in papers:
|
|
1227
1265
|
if not isinstance(paper, dict):
|
|
1228
1266
|
raise ValueError("Input papers must be objects")
|
|
1267
|
+
source_hash = paper.get("source_hash")
|
|
1268
|
+
source_hash_key = str(source_hash) if source_hash else None
|
|
1229
1269
|
bib_title = _extract_bibtex_title(paper)
|
|
1230
1270
|
paper_title = _extract_paper_title(paper)
|
|
1231
1271
|
match = None
|
|
1232
1272
|
match_idx = None
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1273
|
+
if source_hash_key and source_hash_key in source_hash_index:
|
|
1274
|
+
match_idx = source_hash_index[source_hash_key]
|
|
1275
|
+
match = merged[match_idx]
|
|
1276
|
+
else:
|
|
1277
|
+
for idx in candidate_ids(bib_title, paper_title):
|
|
1278
|
+
candidate = merged[idx]
|
|
1279
|
+
if _titles_match(candidate, paper, threshold=threshold):
|
|
1280
|
+
match = candidate
|
|
1281
|
+
match_idx = idx
|
|
1282
|
+
break
|
|
1239
1283
|
if match is None:
|
|
1240
1284
|
group = {
|
|
1241
1285
|
"templates": {template_tag: paper},
|
|
@@ -1244,6 +1288,8 @@ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
|
1244
1288
|
_add_merge_titles(group, paper)
|
|
1245
1289
|
merged.append(group)
|
|
1246
1290
|
group_idx = len(merged) - 1
|
|
1291
|
+
if source_hash_key:
|
|
1292
|
+
source_hash_index[source_hash_key] = group_idx
|
|
1247
1293
|
if bib_title:
|
|
1248
1294
|
add_index(bib_title, bibtex_exact, bibtex_prefix, group_idx)
|
|
1249
1295
|
if paper_title:
|
|
@@ -1256,15 +1302,20 @@ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
|
1256
1302
|
order.append(template_tag)
|
|
1257
1303
|
_add_merge_titles(match, paper)
|
|
1258
1304
|
if match_idx is not None:
|
|
1305
|
+
if source_hash_key:
|
|
1306
|
+
source_hash_index[source_hash_key] = match_idx
|
|
1259
1307
|
if bib_title:
|
|
1260
1308
|
add_index(bib_title, bibtex_exact, bibtex_prefix, match_idx)
|
|
1261
1309
|
if paper_title:
|
|
1262
1310
|
add_index(paper_title, paper_exact, paper_prefix, match_idx)
|
|
1263
1311
|
|
|
1312
|
+
preferred_defaults = ("simple", "simple_phi")
|
|
1264
1313
|
for group in merged:
|
|
1265
1314
|
templates = group.get("templates") or {}
|
|
1266
1315
|
order = group.get("template_order") or list(templates.keys())
|
|
1267
|
-
default_tag =
|
|
1316
|
+
default_tag = next((tag for tag in preferred_defaults if tag in order), None)
|
|
1317
|
+
if default_tag is None:
|
|
1318
|
+
default_tag = order[0] if order else None
|
|
1268
1319
|
group["default_template"] = default_tag
|
|
1269
1320
|
if default_tag and default_tag in templates:
|
|
1270
1321
|
base = templates[default_tag]
|
|
@@ -1408,7 +1459,7 @@ class CompareDataset:
|
|
|
1408
1459
|
paper_id_to_index: dict[int, int]
|
|
1409
1460
|
|
|
1410
1461
|
|
|
1411
|
-
def _scan_md_roots(roots: list[Path]) -> list[Path]:
|
|
1462
|
+
def _scan_md_roots(roots: list[Path], *, show_progress: bool = False) -> list[Path]:
|
|
1412
1463
|
paths: list[Path] = []
|
|
1413
1464
|
for root in roots:
|
|
1414
1465
|
try:
|
|
@@ -1417,7 +1468,10 @@ def _scan_md_roots(roots: list[Path]) -> list[Path]:
|
|
|
1417
1468
|
except OSError:
|
|
1418
1469
|
continue
|
|
1419
1470
|
try:
|
|
1420
|
-
|
|
1471
|
+
iterator = root.rglob("*.md")
|
|
1472
|
+
if show_progress:
|
|
1473
|
+
iterator = tqdm(iterator, desc=f"scan md {root}", unit="file")
|
|
1474
|
+
for path in iterator:
|
|
1421
1475
|
try:
|
|
1422
1476
|
if not path.is_file():
|
|
1423
1477
|
continue
|
|
@@ -1674,12 +1728,13 @@ def _get_paper_identifier(paper: dict[str, Any]) -> str:
|
|
|
1674
1728
|
return str(paper.get("source_hash") or paper.get("source_path", ""))
|
|
1675
1729
|
|
|
1676
1730
|
|
|
1677
|
-
def _match_datasets(
|
|
1731
|
+
def _match_datasets_with_pairs(
|
|
1678
1732
|
dataset_a: CompareDataset,
|
|
1679
1733
|
dataset_b: CompareDataset,
|
|
1680
1734
|
*,
|
|
1681
1735
|
lang: str | None = None,
|
|
1682
|
-
|
|
1736
|
+
show_progress: bool = False,
|
|
1737
|
+
) -> tuple[list[CompareResult], list[tuple[int, int, str | None, float]]]:
|
|
1683
1738
|
"""Match papers between two datasets using db_ops parity."""
|
|
1684
1739
|
results: list[CompareResult] = []
|
|
1685
1740
|
matched_a: set[int] = set()
|
|
@@ -1689,7 +1744,11 @@ def _match_datasets(
|
|
|
1689
1744
|
|
|
1690
1745
|
file_index_b = _merge_file_indexes(dataset_b.md_index, dataset_b.pdf_index)
|
|
1691
1746
|
|
|
1692
|
-
|
|
1747
|
+
papers_a_iter = dataset_a.papers
|
|
1748
|
+
if show_progress:
|
|
1749
|
+
papers_a_iter = tqdm(dataset_a.papers, desc="match A", unit="paper")
|
|
1750
|
+
|
|
1751
|
+
for idx_a, paper in enumerate(papers_a_iter):
|
|
1693
1752
|
_prepare_paper_matching_fields(paper)
|
|
1694
1753
|
source_hash = str(paper.get("source_hash") or "")
|
|
1695
1754
|
title = str(paper.get("paper_title") or "")
|
|
@@ -1763,7 +1822,11 @@ def _match_datasets(
|
|
|
1763
1822
|
)
|
|
1764
1823
|
)
|
|
1765
1824
|
|
|
1766
|
-
|
|
1825
|
+
papers_b_iter = dataset_b.papers
|
|
1826
|
+
if show_progress:
|
|
1827
|
+
papers_b_iter = tqdm(dataset_b.papers, desc="match B", unit="paper")
|
|
1828
|
+
|
|
1829
|
+
for idx_b, paper in enumerate(papers_b_iter):
|
|
1767
1830
|
_prepare_paper_matching_fields(paper)
|
|
1768
1831
|
source_hash = str(paper.get("source_hash") or "")
|
|
1769
1832
|
title = str(paper.get("paper_title") or "")
|
|
@@ -1818,6 +1881,19 @@ def _match_datasets(
|
|
|
1818
1881
|
)
|
|
1819
1882
|
)
|
|
1820
1883
|
|
|
1884
|
+
return results, match_pairs
|
|
1885
|
+
|
|
1886
|
+
|
|
1887
|
+
def _match_datasets(
|
|
1888
|
+
dataset_a: CompareDataset,
|
|
1889
|
+
dataset_b: CompareDataset,
|
|
1890
|
+
*,
|
|
1891
|
+
lang: str | None = None,
|
|
1892
|
+
show_progress: bool = False,
|
|
1893
|
+
) -> list[CompareResult]:
|
|
1894
|
+
results, _ = _match_datasets_with_pairs(
|
|
1895
|
+
dataset_a, dataset_b, lang=lang, show_progress=show_progress
|
|
1896
|
+
)
|
|
1821
1897
|
return results
|
|
1822
1898
|
|
|
1823
1899
|
|
|
@@ -1829,6 +1905,7 @@ def build_compare_dataset(
|
|
|
1829
1905
|
md_translated_roots: list[Path] | None = None,
|
|
1830
1906
|
bibtex_path: Path | None = None,
|
|
1831
1907
|
lang: str | None = None,
|
|
1908
|
+
show_progress: bool = False,
|
|
1832
1909
|
) -> CompareDataset:
|
|
1833
1910
|
"""Load and index a dataset from various sources."""
|
|
1834
1911
|
papers: list[dict[str, Any]] = []
|
|
@@ -1854,11 +1931,17 @@ def build_compare_dataset(
|
|
|
1854
1931
|
for paper in papers:
|
|
1855
1932
|
_prepare_paper_matching_fields(paper)
|
|
1856
1933
|
|
|
1857
|
-
md_paths = _scan_md_roots(md_roots or [])
|
|
1858
|
-
pdf_paths, _ = _scan_pdf_roots(pdf_roots or [])
|
|
1859
|
-
md_index = _build_file_index(
|
|
1860
|
-
|
|
1861
|
-
|
|
1934
|
+
md_paths = _scan_md_roots(md_roots or [], show_progress=show_progress)
|
|
1935
|
+
pdf_paths, _ = _scan_pdf_roots(pdf_roots or [], show_progress=show_progress)
|
|
1936
|
+
md_index = _build_file_index(
|
|
1937
|
+
md_roots or [], suffixes={".md"}, show_progress=show_progress
|
|
1938
|
+
)
|
|
1939
|
+
pdf_index = _build_file_index(
|
|
1940
|
+
pdf_roots or [], suffixes={".pdf"}, show_progress=show_progress
|
|
1941
|
+
)
|
|
1942
|
+
translated_index = _build_translated_index(
|
|
1943
|
+
md_translated_roots or [], show_progress=show_progress
|
|
1944
|
+
)
|
|
1862
1945
|
|
|
1863
1946
|
if pdf_paths:
|
|
1864
1947
|
papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
|
|
@@ -1907,8 +1990,39 @@ def compare_datasets(
|
|
|
1907
1990
|
md_translated_roots_b: list[Path] | None = None,
|
|
1908
1991
|
bibtex_path: Path | None = None,
|
|
1909
1992
|
lang: str | None = None,
|
|
1993
|
+
show_progress: bool = False,
|
|
1910
1994
|
) -> list[CompareResult]:
|
|
1911
1995
|
"""Compare two datasets and return comparison results."""
|
|
1996
|
+
results, _, _, _ = compare_datasets_with_pairs(
|
|
1997
|
+
json_paths_a=json_paths_a,
|
|
1998
|
+
pdf_roots_a=pdf_roots_a,
|
|
1999
|
+
md_roots_a=md_roots_a,
|
|
2000
|
+
md_translated_roots_a=md_translated_roots_a,
|
|
2001
|
+
json_paths_b=json_paths_b,
|
|
2002
|
+
pdf_roots_b=pdf_roots_b,
|
|
2003
|
+
md_roots_b=md_roots_b,
|
|
2004
|
+
md_translated_roots_b=md_translated_roots_b,
|
|
2005
|
+
bibtex_path=bibtex_path,
|
|
2006
|
+
lang=lang,
|
|
2007
|
+
show_progress=show_progress,
|
|
2008
|
+
)
|
|
2009
|
+
return results
|
|
2010
|
+
|
|
2011
|
+
|
|
2012
|
+
def compare_datasets_with_pairs(
|
|
2013
|
+
*,
|
|
2014
|
+
json_paths_a: list[Path] | None = None,
|
|
2015
|
+
pdf_roots_a: list[Path] | None = None,
|
|
2016
|
+
md_roots_a: list[Path] | None = None,
|
|
2017
|
+
md_translated_roots_a: list[Path] | None = None,
|
|
2018
|
+
json_paths_b: list[Path] | None = None,
|
|
2019
|
+
pdf_roots_b: list[Path] | None = None,
|
|
2020
|
+
md_roots_b: list[Path] | None = None,
|
|
2021
|
+
md_translated_roots_b: list[Path] | None = None,
|
|
2022
|
+
bibtex_path: Path | None = None,
|
|
2023
|
+
lang: str | None = None,
|
|
2024
|
+
show_progress: bool = False,
|
|
2025
|
+
) -> tuple[list[CompareResult], list[tuple[int, int, str | None, float]], CompareDataset, CompareDataset]:
|
|
1912
2026
|
# Validate language requirement for translated inputs
|
|
1913
2027
|
has_translated_a = md_translated_roots_a is not None and len(md_translated_roots_a) > 0
|
|
1914
2028
|
has_translated_b = md_translated_roots_b is not None and len(md_translated_roots_b) > 0
|
|
@@ -1925,6 +2039,7 @@ def compare_datasets(
|
|
|
1925
2039
|
md_translated_roots=md_translated_roots_a,
|
|
1926
2040
|
bibtex_path=bibtex_path,
|
|
1927
2041
|
lang=lang,
|
|
2042
|
+
show_progress=show_progress,
|
|
1928
2043
|
)
|
|
1929
2044
|
|
|
1930
2045
|
dataset_b = build_compare_dataset(
|
|
@@ -1934,6 +2049,10 @@ def compare_datasets(
|
|
|
1934
2049
|
md_translated_roots=md_translated_roots_b,
|
|
1935
2050
|
bibtex_path=bibtex_path,
|
|
1936
2051
|
lang=lang,
|
|
2052
|
+
show_progress=show_progress,
|
|
1937
2053
|
)
|
|
1938
2054
|
|
|
1939
|
-
|
|
2055
|
+
results, match_pairs = _match_datasets_with_pairs(
|
|
2056
|
+
dataset_a, dataset_b, lang=lang, show_progress=show_progress
|
|
2057
|
+
)
|
|
2058
|
+
return results, match_pairs, dataset_a, dataset_b
|