subset2evaluate 1.0.23.tar.gz → 1.0.24.tar.gz
This diff compares publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/PKG-INFO +1 -1
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/pyproject.toml +1 -1
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate/utils.py +29 -21
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/PKG-INFO +1 -1
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/README.md +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/setup.cfg +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate/__init__.py +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate/evaluate.py +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate/methods.py +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate/methods_old.py +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate/select_subset.py +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate/test.py +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/SOURCES.txt +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/dependency_links.txt +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/entry_points.txt +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/requires.txt +0 -0
- {subset2evaluate-1.0.23 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/top_level.txt +0 -0
subset2evaluate/utils.py

```diff
@@ -712,23 +712,33 @@ def load_data_wmt(  # noqa: C901
             human_refs.add(parts[1])
 
     # do not consider canary line
-    contain_canary_line = lines_src[0].lower().startswith("canary")
+    # for WMT24++ it's part of the translation dataset
+    contain_canary_line = (
+        lines_src[0].lower().startswith("canary") and year != "wmt24pp"
+    )
     if contain_canary_line:
         lines_src.pop(0)
         lines_doc.pop(0)
         lines_ref.pop(0)
 
     line_model = {}
-    for f in glob.glob(
-        f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.txt"
+    for f in list(
+        glob.glob(f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.txt")
+    ) + list(
+        glob.glob(f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.jsonl")
     ):
-        model = f.split("/")[-1].removesuffix(".txt")
+        if f.endswith(".jsonl"):
+            model = f.split("/")[-1].removesuffix(".jsonl")
+            lines = [json.loads(line)["hypothesis"] for line in open(f, "r")]
+        else:
+            model = f.split("/")[-1].removesuffix(".txt")
+            lines = open(f, "r").readlines()
         if model in {"synthetic_ref", "chrf_bestmbr"}:
             continue
        if model in human_refs and not include_ref:
             continue
 
-        line_model[model] = open(f, "r").readlines()
+        line_model[model] = lines
         if contain_canary_line:
             line_model[model].pop(0)
 
```
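This hunk does two things: the canary-line check now spares WMT24++ (where the canary line is a legitimate part of the translation dataset), and the system-output loader accepts `.jsonl` files alongside `.txt`. Below is a minimal sketch of the new loading path, pulled out into a standalone helper for illustration; the `load_system_outputs` name is hypothetical (the real logic is inline in `load_data_wmt`), and the `"hypothesis"` key is taken from the diff:

```python
import glob
import json

def load_system_outputs(year: str, langs: str) -> dict[str, list[str]]:
    # hypothetical helper mirroring the new inline logic in load_data_wmt
    line_model = {}
    base = f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}"
    for f in glob.glob(f"{base}/*.txt") + glob.glob(f"{base}/*.jsonl"):
        if f.endswith(".jsonl"):
            # one JSON object per line; the translation is under "hypothesis"
            model = f.split("/")[-1].removesuffix(".jsonl")
            with open(f, "r") as fh:
                lines = [json.loads(line)["hypothesis"] for line in fh]
        else:
            model = f.split("/")[-1].removesuffix(".txt")
            with open(f, "r") as fh:
                lines = fh.readlines()
        line_model[model] = lines
    return line_model
```

One detail worth noticing: the `.txt` branch keeps trailing newlines via `readlines()` while the JSONL branch yields bare strings, so downstream code appears to tolerate both forms.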
```diff
@@ -736,22 +746,18 @@ def load_data_wmt(  # noqa: C901
 
     lines_score = collections.defaultdict(list)
 
-
-
-
-
-
-
-
-
-
-
-
-
-        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.appraise.seg.score",
-        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-raw.seg.score",
-        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-appraise.seg.score",
-    ]
+    f_protocols = [
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.{file_protocol}.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa-merged.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.da-sqm.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.appraise.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-raw.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-appraise.seg.score",
+    ]
     for fname in [*f_protocols, False]:
         if fname and os.path.exists(fname):
             break
```
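The second hunk rebuilds the list of human-score files so that a caller-selected `{file_protocol}` is tried first, followed by the remaining annotation protocols (MQM, ESA, DA-SQM, WMT, Appraise variants) as fallbacks; the loop then keeps the first file that exists, or `False` if none do. A compact sketch of that selection follows, with a hypothetical `pick_score_file` helper name (the diff's list also contains `.mqm.seg.score` twice, which is harmless under first-match selection and is deduplicated here):

```python
import os

def pick_score_file(year: str, langs: str, file_protocol: str):
    # hypothetical helper; mirrors the first-existing-file fallback in the diff
    protocols = [file_protocol, "mqm", "esa-merged", "esa",
                 "da-sqm", "wmt", "appraise", "wmt-raw", "wmt-appraise"]
    f_protocols = [
        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.{p}.seg.score"
        for p in protocols
    ]
    for fname in [*f_protocols, False]:
        if fname and os.path.exists(fname):
            break
    return fname  # an existing path, or False when no human scores are on disk
```

The `False` sentinel at the end of the candidate list lets the `for`/`break` pattern double as a not-found signal without a separate flag variable.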
```diff
@@ -808,6 +814,8 @@ def load_data_wmt(  # noqa: C901
 
             model_line_i -= 1
 
+        if model_line_i >= len(lines_score[model]):
+            print(year, langs, model_line_i, metric, model, contain_canary_line)
         lines_score[model][model_line_i][metric] = float(score)
 
     # filter out lines that have no human score
```
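The last hunk adds a diagnostic guard before indexing into the per-model score list: if a segment index from a human-score file runs past the hypotheses collected for that model, the surrounding context is printed before the subsequent `IndexError`, which makes misaligned score files easy to identify. A toy, self-contained reproduction of the guard's purpose (all names mirror the diff, but the data here is made up):

```python
import collections

lines_score = collections.defaultdict(list)
lines_score["modelA"] = [{}, {}]          # only two segments were collected
year, langs, metric, contain_canary_line = "wmt24", "en-de", "mqm", False

for model, model_line_i, score in [("modelA", 1, "0.5"), ("modelA", 2, "0.7")]:
    if model_line_i >= len(lines_score[model]):
        # prints: wmt24 en-de 2 mqm modelA False -- pinpointing the mismatch
        print(year, langs, model_line_i, metric, model, contain_canary_line)
        continue  # the real code still indexes (and raises); we skip for the demo
    lines_score[model][model_line_i][metric] = float(score)
```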