subset2evaluate 1.0.21__tar.gz → 1.0.23__tar.gz
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/PKG-INFO +1 -1
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/pyproject.toml +1 -1
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/utils.py +114 -14
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/PKG-INFO +1 -1
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/SOURCES.txt +0 -1
- subset2evaluate-1.0.21/subset2evaluate/reference_info.py +0 -147
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/README.md +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/setup.cfg +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/__init__.py +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/evaluate.py +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/methods.py +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/methods_old.py +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/select_subset.py +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/test.py +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/dependency_links.txt +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/entry_points.txt +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/requires.txt +0 -0
- {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from typing import Any, Callable, Dict, List, Optional, Union, Literal
|
|
2
2
|
import numpy as np
|
|
3
|
-
from subset2evaluate.reference_info import year2std_refs
|
|
4
3
|
|
|
5
4
|
PROPS = np.linspace(0.05, 0.5, 10)
|
|
6
5
|
|
|
@@ -16,6 +15,8 @@ def _data_minmax_normalize(data):
|
|
|
16
15
|
for line in data:
|
|
17
16
|
for met_all in line["scores"].values():
|
|
18
17
|
for met_k, met_v in met_all.items():
|
|
18
|
+
if met_v is None:
|
|
19
|
+
continue
|
|
19
20
|
data_flat[met_k].append(met_v)
|
|
20
21
|
|
|
21
22
|
# normalize
|
|
@@ -24,10 +25,15 @@ def _data_minmax_normalize(data):
|
|
|
24
25
|
for line in data:
|
|
25
26
|
for model, met_all in line["scores"].items():
|
|
26
27
|
for met_k, met_v in met_all.items():
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
if met_v is None:
|
|
29
|
+
continue
|
|
30
|
+
if data_flat[met_k][1] - data_flat[met_k][0] == 0:
|
|
31
|
+
line["scores"][model][met_k] = 0
|
|
32
|
+
else:
|
|
33
|
+
# (x-min)/(max-min) normalize
|
|
34
|
+
line["scores"][model][met_k] = (met_v - data_flat[met_k][0]) / (
|
|
35
|
+
data_flat[met_k][1] - data_flat[met_k][0]
|
|
36
|
+
)
|
|
31
37
|
|
|
32
38
|
|
|
33
39
|
def confidence_interval(data, confidence=0.95):
|
|
@@ -616,6 +622,7 @@ def load_data_wmt( # noqa: C901
|
|
|
616
622
|
import pickle
|
|
617
623
|
import contextlib
|
|
618
624
|
import importlib.metadata
|
|
625
|
+
import json
|
|
619
626
|
|
|
620
627
|
# temporarily change to the root directory, this requires Python 3.11
|
|
621
628
|
with contextlib.chdir(os.path.dirname(os.path.realpath(__file__)) + "/../"):
|
|
@@ -637,6 +644,44 @@ def load_data_wmt( # noqa: C901
|
|
|
637
644
|
):
|
|
638
645
|
return cache["data"]
|
|
639
646
|
|
|
647
|
+
# special handling for WMT24++
|
|
648
|
+
if os.path.exists(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl"):
|
|
649
|
+
with open(
|
|
650
|
+
f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl", "r"
|
|
651
|
+
) as f:
|
|
652
|
+
data_tmp = f.readlines()
|
|
653
|
+
lines_src = [json.loads(line)["source"] for line in data_tmp]
|
|
654
|
+
with open(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "w") as f:
|
|
655
|
+
for line in lines_src:
|
|
656
|
+
f.write(line + "\n")
|
|
657
|
+
if os.path.exists(
|
|
658
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl"
|
|
659
|
+
):
|
|
660
|
+
with open(
|
|
661
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl", "r"
|
|
662
|
+
) as f:
|
|
663
|
+
data_tmp = f.readlines()
|
|
664
|
+
lines_ref = [json.loads(line)["target"] for line in data_tmp]
|
|
665
|
+
with open(
|
|
666
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.txt", "w"
|
|
667
|
+
) as f:
|
|
668
|
+
for line in lines_ref:
|
|
669
|
+
f.write(line + "\n")
|
|
670
|
+
if os.path.exists(
|
|
671
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl"
|
|
672
|
+
):
|
|
673
|
+
with open(
|
|
674
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl",
|
|
675
|
+
"r",
|
|
676
|
+
) as f:
|
|
677
|
+
data_tmp = f.readlines()
|
|
678
|
+
lines_ref = [json.loads(line)["target"] for line in data_tmp]
|
|
679
|
+
with open(
|
|
680
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.txt", "w"
|
|
681
|
+
) as f:
|
|
682
|
+
for line in lines_ref:
|
|
683
|
+
f.write(line + "\n")
|
|
684
|
+
|
|
640
685
|
lines_src = open(
|
|
641
686
|
f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "r"
|
|
642
687
|
).readlines()
|
|
@@ -646,14 +691,13 @@ def load_data_wmt( # noqa: C901
|
|
|
646
691
|
lines_ref = None
|
|
647
692
|
|
|
648
693
|
refs_dir = f"data/mt-metrics-eval-v2/{year}/references"
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
if not os.path.exists(file_reference_path):
|
|
694
|
+
for file_reference in [file_reference, "refA", "refB", "refC", "ref"]:
|
|
695
|
+
if os.path.exists(f"{refs_dir}/{langs}.{file_reference}.txt"):
|
|
696
|
+
break
|
|
697
|
+
else:
|
|
655
698
|
# did not find reference
|
|
656
699
|
return []
|
|
700
|
+
file_reference_path = f"{refs_dir}/{langs}.{file_reference}.txt"
|
|
657
701
|
|
|
658
702
|
lines_ref = open(file_reference_path, "r").readlines()
|
|
659
703
|
|
|
@@ -736,14 +780,14 @@ def load_data_wmt( # noqa: C901
|
|
|
736
780
|
f"data/mt-metrics-eval-v2/{year}/metric-scores/{langs}/*.seg.score"
|
|
737
781
|
):
|
|
738
782
|
# among ref-based metrics, load only the scores for the selected human ref
|
|
739
|
-
if not f.endswith(f"-{
|
|
783
|
+
if not f.endswith(f"-{file_reference}.seg.score") and not f.endswith(
|
|
740
784
|
"-src.seg.score"
|
|
741
785
|
):
|
|
742
786
|
continue
|
|
743
787
|
# remove suffix for both ref-based and ref-less metrics
|
|
744
788
|
metric = (
|
|
745
789
|
f.split("/")[-1]
|
|
746
|
-
.removesuffix(f"-{
|
|
790
|
+
.removesuffix(f"-{file_reference}.seg.score")
|
|
747
791
|
.removesuffix("-src.seg.score")
|
|
748
792
|
)
|
|
749
793
|
for line_i, line_raw in enumerate(open(f, "r").readlines()):
|
|
@@ -898,7 +942,7 @@ def load_data_wmt_test(**kwargs):
|
|
|
898
942
|
return data
|
|
899
943
|
|
|
900
944
|
|
|
901
|
-
def load_data_wmt_all(min_items=100, **kwargs):
|
|
945
|
+
def load_data_wmt_all(min_items=100, name_filter=lambda x: True, **kwargs):
|
|
902
946
|
data = {
|
|
903
947
|
args: load_data_wmt(*args, **kwargs)
|
|
904
948
|
for args in [
|
|
@@ -918,6 +962,61 @@ def load_data_wmt_all(min_items=100, **kwargs):
|
|
|
918
962
|
("wmt25", "en-uk_UA"),
|
|
919
963
|
("wmt25", "en-zh_CN"),
|
|
920
964
|
("wmt25", "ja-zh_CN"),
|
|
965
|
+
("wmt24pp", "en-ar_EG"),
|
|
966
|
+
("wmt24pp", "en-bn_IN"),
|
|
967
|
+
("wmt24pp", "en-da_DK"),
|
|
968
|
+
("wmt24pp", "en-es_MX"),
|
|
969
|
+
("wmt24pp", "en-fi_FI"),
|
|
970
|
+
("wmt24pp", "en-fr_FR"),
|
|
971
|
+
("wmt24pp", "en-hi_IN"),
|
|
972
|
+
("wmt24pp", "en-id_ID"),
|
|
973
|
+
("wmt24pp", "en-ja_JP"),
|
|
974
|
+
("wmt24pp", "en-lt_LT"),
|
|
975
|
+
("wmt24pp", "en-mr_IN"),
|
|
976
|
+
("wmt24pp", "en-pa_IN"),
|
|
977
|
+
("wmt24pp", "en-pt_PT"),
|
|
978
|
+
("wmt24pp", "en-sk_SK"),
|
|
979
|
+
("wmt24pp", "en-sv_SE"),
|
|
980
|
+
("wmt24pp", "en-ta_IN"),
|
|
981
|
+
("wmt24pp", "en-tr_TR"),
|
|
982
|
+
("wmt24pp", "en-vi_VN"),
|
|
983
|
+
("wmt24pp", "en-zu_ZA"),
|
|
984
|
+
("wmt24pp", "en-ar_SA"),
|
|
985
|
+
("wmt24pp", "en-ca_ES"),
|
|
986
|
+
("wmt24pp", "en-de_DE"),
|
|
987
|
+
("wmt24pp", "en-et_EE"),
|
|
988
|
+
("wmt24pp", "en-fil_PH"),
|
|
989
|
+
("wmt24pp", "en-gu_IN"),
|
|
990
|
+
("wmt24pp", "en-hr_HR"),
|
|
991
|
+
("wmt24pp", "en-is_IS"),
|
|
992
|
+
("wmt24pp", "en-kn_IN"),
|
|
993
|
+
("wmt24pp", "en-lv_LV"),
|
|
994
|
+
("wmt24pp", "en-nl_NL"),
|
|
995
|
+
("wmt24pp", "en-pl_PL"),
|
|
996
|
+
("wmt24pp", "en-ro_RO"),
|
|
997
|
+
("wmt24pp", "en-sl_SI"),
|
|
998
|
+
("wmt24pp", "en-sw_KE"),
|
|
999
|
+
("wmt24pp", "en-te_IN"),
|
|
1000
|
+
("wmt24pp", "en-uk_UA"),
|
|
1001
|
+
("wmt24pp", "en-zh_CN"),
|
|
1002
|
+
("wmt24pp", "en-bg_BG"),
|
|
1003
|
+
("wmt24pp", "en-cs_CZ"),
|
|
1004
|
+
("wmt24pp", "en-el_GR"),
|
|
1005
|
+
("wmt24pp", "en-fa_IR"),
|
|
1006
|
+
("wmt24pp", "en-fr_CA"),
|
|
1007
|
+
("wmt24pp", "en-he_IL"),
|
|
1008
|
+
("wmt24pp", "en-hu_HU"),
|
|
1009
|
+
("wmt24pp", "en-it_IT"),
|
|
1010
|
+
("wmt24pp", "en-ko_KR"),
|
|
1011
|
+
("wmt24pp", "en-ml_IN"),
|
|
1012
|
+
("wmt24pp", "en-no_NO"),
|
|
1013
|
+
("wmt24pp", "en-pt_BR"),
|
|
1014
|
+
("wmt24pp", "en-ru_RU"),
|
|
1015
|
+
("wmt24pp", "en-sr_RS"),
|
|
1016
|
+
("wmt24pp", "en-sw_TZ"),
|
|
1017
|
+
("wmt24pp", "en-th_TH"),
|
|
1018
|
+
("wmt24pp", "en-ur_PK"),
|
|
1019
|
+
("wmt24pp", "en-zh_TW"),
|
|
921
1020
|
("wmt24", "cs-uk"),
|
|
922
1021
|
("wmt24", "en-cs"),
|
|
923
1022
|
("wmt24", "en-de"),
|
|
@@ -1025,6 +1124,7 @@ def load_data_wmt_all(min_items=100, **kwargs):
|
|
|
1025
1124
|
("wmt19", "en-de"),
|
|
1026
1125
|
("wmt19", "de-cs"),
|
|
1027
1126
|
]
|
|
1127
|
+
if name_filter(args)
|
|
1028
1128
|
}
|
|
1029
1129
|
# filter out empty datasets
|
|
1030
1130
|
# some years/langs have issues with human annotations coverage
|
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
"""Information about the standard human references selected for various WMT test sets."""
|
|
2
|
-
|
|
3
|
-
year2std_refs = {
|
|
4
|
-
|
|
5
|
-
"wmt25": {
|
|
6
|
-
"cs-de_DE": "refA",
|
|
7
|
-
"cs-uk_UA": "refA",
|
|
8
|
-
"en-ar_EG": "refA",
|
|
9
|
-
"en-bho_IN": "refA",
|
|
10
|
-
"en-cs_CZ": "refA",
|
|
11
|
-
"en-et_EE": "refA",
|
|
12
|
-
"en-is_IS": "refA",
|
|
13
|
-
"en-it_IT": "refA",
|
|
14
|
-
"en-ja_JP": "refA",
|
|
15
|
-
"en-ko_KR": "refA",
|
|
16
|
-
"en-mas_KE": "refA",
|
|
17
|
-
"en-ru_RU": "refA",
|
|
18
|
-
"en-sr_Cyrl_RS": "refA",
|
|
19
|
-
"en-uk_UA": "refA",
|
|
20
|
-
"en-zh_CN": "refA",
|
|
21
|
-
"ja-zh_CN": "refA",
|
|
22
|
-
},
|
|
23
|
-
"wmt24": {
|
|
24
|
-
"en-de": "refB",
|
|
25
|
-
"en-es": "refA",
|
|
26
|
-
"ja-zh": "refA",
|
|
27
|
-
"cs-uk": "refA",
|
|
28
|
-
"en-cs": "refA",
|
|
29
|
-
"en-hi": "refA",
|
|
30
|
-
"en-is": "refA",
|
|
31
|
-
"en-ja": "refA",
|
|
32
|
-
"en-ru": "refA",
|
|
33
|
-
"en-uk": "refA",
|
|
34
|
-
"en-zh": "refA"
|
|
35
|
-
},
|
|
36
|
-
"wmt23.sent": {
|
|
37
|
-
"en-de": "refA"
|
|
38
|
-
},
|
|
39
|
-
"wmt23": {
|
|
40
|
-
"en-de": "refA",
|
|
41
|
-
"he-en": "refB",
|
|
42
|
-
"zh-en": "refA",
|
|
43
|
-
"cs-uk": "refA",
|
|
44
|
-
"de-en": "refA",
|
|
45
|
-
"en-cs": "refA",
|
|
46
|
-
"en-he": "refB",
|
|
47
|
-
"en-ja": "refA",
|
|
48
|
-
"en-ru": "refA",
|
|
49
|
-
"en-uk": "refA",
|
|
50
|
-
"en-zh": "refA",
|
|
51
|
-
"ja-en": "refA",
|
|
52
|
-
"ru-en": "refA",
|
|
53
|
-
"uk-en": "refA"
|
|
54
|
-
},
|
|
55
|
-
"wmt22": {
|
|
56
|
-
"en-de": "refA",
|
|
57
|
-
"en-ru": "refA",
|
|
58
|
-
"zh-en": "refA",
|
|
59
|
-
"cs-en": "refB",
|
|
60
|
-
"cs-uk": "refA",
|
|
61
|
-
"de-en": "refA",
|
|
62
|
-
"de-fr": "refA",
|
|
63
|
-
"en-cs": "refB",
|
|
64
|
-
"en-hr": "refA",
|
|
65
|
-
"en-ja": "refA",
|
|
66
|
-
"en-liv": "refA",
|
|
67
|
-
"en-uk": "refA",
|
|
68
|
-
"en-zh": "refA",
|
|
69
|
-
"fr-de": "refA",
|
|
70
|
-
"ja-en": "refA",
|
|
71
|
-
"liv-en": "refA",
|
|
72
|
-
"ru-en": "refA",
|
|
73
|
-
"ru-sah": "refA",
|
|
74
|
-
"sah-ru": "refA",
|
|
75
|
-
"uk-cs": "refA",
|
|
76
|
-
"uk-en": "refA"
|
|
77
|
-
},
|
|
78
|
-
"wmt21.news": {
|
|
79
|
-
"en-cs": "refA",
|
|
80
|
-
"en-de": "refC",
|
|
81
|
-
"en-ha": "refA",
|
|
82
|
-
"en-is": "refA",
|
|
83
|
-
"en-ja": "refA",
|
|
84
|
-
"en-ru": "refA",
|
|
85
|
-
"en-zh": "refA",
|
|
86
|
-
"cs-en": "refA",
|
|
87
|
-
"de-en": "refA",
|
|
88
|
-
"de-fr": "refA",
|
|
89
|
-
"fr-de": "refA",
|
|
90
|
-
"ha-en": "refA",
|
|
91
|
-
"is-en": "refA",
|
|
92
|
-
"ja-en": "refA",
|
|
93
|
-
"ru-en": "refA",
|
|
94
|
-
"zh-en": "refB"
|
|
95
|
-
},
|
|
96
|
-
"wmt21.tedtalks": {
|
|
97
|
-
"en-de": "refA",
|
|
98
|
-
"en-ru": "refA",
|
|
99
|
-
"zh-en": "refB"
|
|
100
|
-
},
|
|
101
|
-
"wmt21.flores": {
|
|
102
|
-
"bn-hi": "refA",
|
|
103
|
-
"hi-bn": "refA",
|
|
104
|
-
"xh-zu": "refA",
|
|
105
|
-
"zu-xh": "refA"
|
|
106
|
-
},
|
|
107
|
-
"wmt20": {
|
|
108
|
-
"cs-en": "ref",
|
|
109
|
-
"de-en": "ref",
|
|
110
|
-
"en-cs": "ref",
|
|
111
|
-
"en-de": "ref",
|
|
112
|
-
"en-iu": "ref",
|
|
113
|
-
"en-ja": "ref",
|
|
114
|
-
"en-pl": "ref",
|
|
115
|
-
"en-ru": "ref",
|
|
116
|
-
"en-ta": "ref",
|
|
117
|
-
"en-zh": "ref",
|
|
118
|
-
"iu-en": "ref",
|
|
119
|
-
"ja-en": "ref",
|
|
120
|
-
"km-en": "ref",
|
|
121
|
-
"pl-en": "ref",
|
|
122
|
-
"ps-en": "ref",
|
|
123
|
-
"ru-en": "ref",
|
|
124
|
-
"ta-en": "ref",
|
|
125
|
-
"zh-en": "ref"
|
|
126
|
-
},
|
|
127
|
-
"wmt19": {
|
|
128
|
-
"de-cs": "ref",
|
|
129
|
-
"de-en": "ref",
|
|
130
|
-
"de-fr": "ref",
|
|
131
|
-
"en-cs": "ref",
|
|
132
|
-
"en-de": "ref",
|
|
133
|
-
"en-fi": "ref",
|
|
134
|
-
"en-gu": "ref",
|
|
135
|
-
"en-kk": "ref",
|
|
136
|
-
"en-lt": "ref",
|
|
137
|
-
"en-ru": "ref",
|
|
138
|
-
"en-zh": "ref",
|
|
139
|
-
"fi-en": "ref",
|
|
140
|
-
"fr-de": "ref",
|
|
141
|
-
"gu-en": "ref",
|
|
142
|
-
"kk-en": "ref",
|
|
143
|
-
"lt-en": "ref",
|
|
144
|
-
"ru-en": "ref",
|
|
145
|
-
"zh-en": "ref"
|
|
146
|
-
}
|
|
147
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|