subset2evaluate 1.0.22__tar.gz → 1.0.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/PKG-INFO +1 -1
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/pyproject.toml +1 -1
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/utils.py +103 -10
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/PKG-INFO +1 -1
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/SOURCES.txt +0 -1
- subset2evaluate-1.0.22/subset2evaluate/reference_info.py +0 -147
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/README.md +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/setup.cfg +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/__init__.py +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/evaluate.py +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/methods.py +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/methods_old.py +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/select_subset.py +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/test.py +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/dependency_links.txt +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/entry_points.txt +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/requires.txt +0 -0
- {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from typing import Any, Callable, Dict, List, Optional, Union, Literal
|
|
2
2
|
import numpy as np
|
|
3
|
-
from subset2evaluate.reference_info import year2std_refs
|
|
4
3
|
|
|
5
4
|
PROPS = np.linspace(0.05, 0.5, 10)
|
|
6
5
|
|
|
@@ -623,6 +622,7 @@ def load_data_wmt( # noqa: C901
|
|
|
623
622
|
import pickle
|
|
624
623
|
import contextlib
|
|
625
624
|
import importlib.metadata
|
|
625
|
+
import json
|
|
626
626
|
|
|
627
627
|
# temporarily change to the root directory, this requires Python 3.11
|
|
628
628
|
with contextlib.chdir(os.path.dirname(os.path.realpath(__file__)) + "/../"):
|
|
@@ -644,6 +644,44 @@ def load_data_wmt( # noqa: C901
|
|
|
644
644
|
):
|
|
645
645
|
return cache["data"]
|
|
646
646
|
|
|
647
|
+
# special handling for WMT24++
|
|
648
|
+
if os.path.exists(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl"):
|
|
649
|
+
with open(
|
|
650
|
+
f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl", "r"
|
|
651
|
+
) as f:
|
|
652
|
+
data_tmp = f.readlines()
|
|
653
|
+
lines_src = [json.loads(line)["source"] for line in data_tmp]
|
|
654
|
+
with open(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "w") as f:
|
|
655
|
+
for line in lines_src:
|
|
656
|
+
f.write(line + "\n")
|
|
657
|
+
if os.path.exists(
|
|
658
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl"
|
|
659
|
+
):
|
|
660
|
+
with open(
|
|
661
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl", "r"
|
|
662
|
+
) as f:
|
|
663
|
+
data_tmp = f.readlines()
|
|
664
|
+
lines_ref = [json.loads(line)["target"] for line in data_tmp]
|
|
665
|
+
with open(
|
|
666
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.txt", "w"
|
|
667
|
+
) as f:
|
|
668
|
+
for line in lines_ref:
|
|
669
|
+
f.write(line + "\n")
|
|
670
|
+
if os.path.exists(
|
|
671
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl"
|
|
672
|
+
):
|
|
673
|
+
with open(
|
|
674
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl",
|
|
675
|
+
"r",
|
|
676
|
+
) as f:
|
|
677
|
+
data_tmp = f.readlines()
|
|
678
|
+
lines_ref = [json.loads(line)["target"] for line in data_tmp]
|
|
679
|
+
with open(
|
|
680
|
+
f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.txt", "w"
|
|
681
|
+
) as f:
|
|
682
|
+
for line in lines_ref:
|
|
683
|
+
f.write(line + "\n")
|
|
684
|
+
|
|
647
685
|
lines_src = open(
|
|
648
686
|
f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "r"
|
|
649
687
|
).readlines()
|
|
@@ -653,14 +691,13 @@ def load_data_wmt( # noqa: C901
|
|
|
653
691
|
lines_ref = None
|
|
654
692
|
|
|
655
693
|
refs_dir = f"data/mt-metrics-eval-v2/{year}/references"
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
if not os.path.exists(file_reference_path):
|
|
694
|
+
for file_reference in [file_reference, "refA", "refB", "refC", "ref"]:
|
|
695
|
+
if os.path.exists(f"{refs_dir}/{langs}.{file_reference}.txt"):
|
|
696
|
+
break
|
|
697
|
+
else:
|
|
662
698
|
# did not find reference
|
|
663
699
|
return []
|
|
700
|
+
file_reference_path = f"{refs_dir}/{langs}.{file_reference}.txt"
|
|
664
701
|
|
|
665
702
|
lines_ref = open(file_reference_path, "r").readlines()
|
|
666
703
|
|
|
@@ -743,14 +780,14 @@ def load_data_wmt( # noqa: C901
|
|
|
743
780
|
f"data/mt-metrics-eval-v2/{year}/metric-scores/{langs}/*.seg.score"
|
|
744
781
|
):
|
|
745
782
|
# among ref-based metrics, load only the scores for the selected human ref
|
|
746
|
-
if not f.endswith(f"-{
|
|
783
|
+
if not f.endswith(f"-{file_reference}.seg.score") and not f.endswith(
|
|
747
784
|
"-src.seg.score"
|
|
748
785
|
):
|
|
749
786
|
continue
|
|
750
787
|
# remove suffix for both ref-based and ref-less metrics
|
|
751
788
|
metric = (
|
|
752
789
|
f.split("/")[-1]
|
|
753
|
-
.removesuffix(f"-{
|
|
790
|
+
.removesuffix(f"-{file_reference}.seg.score")
|
|
754
791
|
.removesuffix("-src.seg.score")
|
|
755
792
|
)
|
|
756
793
|
for line_i, line_raw in enumerate(open(f, "r").readlines()):
|
|
@@ -905,7 +942,7 @@ def load_data_wmt_test(**kwargs):
|
|
|
905
942
|
return data
|
|
906
943
|
|
|
907
944
|
|
|
908
|
-
def load_data_wmt_all(min_items=100, **kwargs):
|
|
945
|
+
def load_data_wmt_all(min_items=100, name_filter=lambda x: True, **kwargs):
|
|
909
946
|
data = {
|
|
910
947
|
args: load_data_wmt(*args, **kwargs)
|
|
911
948
|
for args in [
|
|
@@ -925,6 +962,61 @@ def load_data_wmt_all(min_items=100, **kwargs):
|
|
|
925
962
|
("wmt25", "en-uk_UA"),
|
|
926
963
|
("wmt25", "en-zh_CN"),
|
|
927
964
|
("wmt25", "ja-zh_CN"),
|
|
965
|
+
("wmt24pp", "en-ar_EG"),
|
|
966
|
+
("wmt24pp", "en-bn_IN"),
|
|
967
|
+
("wmt24pp", "en-da_DK"),
|
|
968
|
+
("wmt24pp", "en-es_MX"),
|
|
969
|
+
("wmt24pp", "en-fi_FI"),
|
|
970
|
+
("wmt24pp", "en-fr_FR"),
|
|
971
|
+
("wmt24pp", "en-hi_IN"),
|
|
972
|
+
("wmt24pp", "en-id_ID"),
|
|
973
|
+
("wmt24pp", "en-ja_JP"),
|
|
974
|
+
("wmt24pp", "en-lt_LT"),
|
|
975
|
+
("wmt24pp", "en-mr_IN"),
|
|
976
|
+
("wmt24pp", "en-pa_IN"),
|
|
977
|
+
("wmt24pp", "en-pt_PT"),
|
|
978
|
+
("wmt24pp", "en-sk_SK"),
|
|
979
|
+
("wmt24pp", "en-sv_SE"),
|
|
980
|
+
("wmt24pp", "en-ta_IN"),
|
|
981
|
+
("wmt24pp", "en-tr_TR"),
|
|
982
|
+
("wmt24pp", "en-vi_VN"),
|
|
983
|
+
("wmt24pp", "en-zu_ZA"),
|
|
984
|
+
("wmt24pp", "en-ar_SA"),
|
|
985
|
+
("wmt24pp", "en-ca_ES"),
|
|
986
|
+
("wmt24pp", "en-de_DE"),
|
|
987
|
+
("wmt24pp", "en-et_EE"),
|
|
988
|
+
("wmt24pp", "en-fil_PH"),
|
|
989
|
+
("wmt24pp", "en-gu_IN"),
|
|
990
|
+
("wmt24pp", "en-hr_HR"),
|
|
991
|
+
("wmt24pp", "en-is_IS"),
|
|
992
|
+
("wmt24pp", "en-kn_IN"),
|
|
993
|
+
("wmt24pp", "en-lv_LV"),
|
|
994
|
+
("wmt24pp", "en-nl_NL"),
|
|
995
|
+
("wmt24pp", "en-pl_PL"),
|
|
996
|
+
("wmt24pp", "en-ro_RO"),
|
|
997
|
+
("wmt24pp", "en-sl_SI"),
|
|
998
|
+
("wmt24pp", "en-sw_KE"),
|
|
999
|
+
("wmt24pp", "en-te_IN"),
|
|
1000
|
+
("wmt24pp", "en-uk_UA"),
|
|
1001
|
+
("wmt24pp", "en-zh_CN"),
|
|
1002
|
+
("wmt24pp", "en-bg_BG"),
|
|
1003
|
+
("wmt24pp", "en-cs_CZ"),
|
|
1004
|
+
("wmt24pp", "en-el_GR"),
|
|
1005
|
+
("wmt24pp", "en-fa_IR"),
|
|
1006
|
+
("wmt24pp", "en-fr_CA"),
|
|
1007
|
+
("wmt24pp", "en-he_IL"),
|
|
1008
|
+
("wmt24pp", "en-hu_HU"),
|
|
1009
|
+
("wmt24pp", "en-it_IT"),
|
|
1010
|
+
("wmt24pp", "en-ko_KR"),
|
|
1011
|
+
("wmt24pp", "en-ml_IN"),
|
|
1012
|
+
("wmt24pp", "en-no_NO"),
|
|
1013
|
+
("wmt24pp", "en-pt_BR"),
|
|
1014
|
+
("wmt24pp", "en-ru_RU"),
|
|
1015
|
+
("wmt24pp", "en-sr_RS"),
|
|
1016
|
+
("wmt24pp", "en-sw_TZ"),
|
|
1017
|
+
("wmt24pp", "en-th_TH"),
|
|
1018
|
+
("wmt24pp", "en-ur_PK"),
|
|
1019
|
+
("wmt24pp", "en-zh_TW"),
|
|
928
1020
|
("wmt24", "cs-uk"),
|
|
929
1021
|
("wmt24", "en-cs"),
|
|
930
1022
|
("wmt24", "en-de"),
|
|
@@ -1032,6 +1124,7 @@ def load_data_wmt_all(min_items=100, **kwargs):
|
|
|
1032
1124
|
("wmt19", "en-de"),
|
|
1033
1125
|
("wmt19", "de-cs"),
|
|
1034
1126
|
]
|
|
1127
|
+
if name_filter(args)
|
|
1035
1128
|
}
|
|
1036
1129
|
# filter out empty datasets
|
|
1037
1130
|
# some years/langs have issues with human annotations coverage
|
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
"""Information about the standard human references selected for various WMT test sets."""
|
|
2
|
-
|
|
3
|
-
year2std_refs = {
|
|
4
|
-
|
|
5
|
-
"wmt25": {
|
|
6
|
-
"cs-de_DE": "refA",
|
|
7
|
-
"cs-uk_UA": "refA",
|
|
8
|
-
"en-ar_EG": "refA",
|
|
9
|
-
"en-bho_IN": "refA",
|
|
10
|
-
"en-cs_CZ": "refA",
|
|
11
|
-
"en-et_EE": "refA",
|
|
12
|
-
"en-is_IS": "refA",
|
|
13
|
-
"en-it_IT": "refA",
|
|
14
|
-
"en-ja_JP": "refA",
|
|
15
|
-
"en-ko_KR": "refA",
|
|
16
|
-
"en-mas_KE": "refA",
|
|
17
|
-
"en-ru_RU": "refA",
|
|
18
|
-
"en-sr_Cyrl_RS": "refA",
|
|
19
|
-
"en-uk_UA": "refA",
|
|
20
|
-
"en-zh_CN": "refA",
|
|
21
|
-
"ja-zh_CN": "refA",
|
|
22
|
-
},
|
|
23
|
-
"wmt24": {
|
|
24
|
-
"en-de": "refB",
|
|
25
|
-
"en-es": "refA",
|
|
26
|
-
"ja-zh": "refA",
|
|
27
|
-
"cs-uk": "refA",
|
|
28
|
-
"en-cs": "refA",
|
|
29
|
-
"en-hi": "refA",
|
|
30
|
-
"en-is": "refA",
|
|
31
|
-
"en-ja": "refA",
|
|
32
|
-
"en-ru": "refA",
|
|
33
|
-
"en-uk": "refA",
|
|
34
|
-
"en-zh": "refA"
|
|
35
|
-
},
|
|
36
|
-
"wmt23.sent": {
|
|
37
|
-
"en-de": "refA"
|
|
38
|
-
},
|
|
39
|
-
"wmt23": {
|
|
40
|
-
"en-de": "refA",
|
|
41
|
-
"he-en": "refB",
|
|
42
|
-
"zh-en": "refA",
|
|
43
|
-
"cs-uk": "refA",
|
|
44
|
-
"de-en": "refA",
|
|
45
|
-
"en-cs": "refA",
|
|
46
|
-
"en-he": "refB",
|
|
47
|
-
"en-ja": "refA",
|
|
48
|
-
"en-ru": "refA",
|
|
49
|
-
"en-uk": "refA",
|
|
50
|
-
"en-zh": "refA",
|
|
51
|
-
"ja-en": "refA",
|
|
52
|
-
"ru-en": "refA",
|
|
53
|
-
"uk-en": "refA"
|
|
54
|
-
},
|
|
55
|
-
"wmt22": {
|
|
56
|
-
"en-de": "refA",
|
|
57
|
-
"en-ru": "refA",
|
|
58
|
-
"zh-en": "refA",
|
|
59
|
-
"cs-en": "refB",
|
|
60
|
-
"cs-uk": "refA",
|
|
61
|
-
"de-en": "refA",
|
|
62
|
-
"de-fr": "refA",
|
|
63
|
-
"en-cs": "refB",
|
|
64
|
-
"en-hr": "refA",
|
|
65
|
-
"en-ja": "refA",
|
|
66
|
-
"en-liv": "refA",
|
|
67
|
-
"en-uk": "refA",
|
|
68
|
-
"en-zh": "refA",
|
|
69
|
-
"fr-de": "refA",
|
|
70
|
-
"ja-en": "refA",
|
|
71
|
-
"liv-en": "refA",
|
|
72
|
-
"ru-en": "refA",
|
|
73
|
-
"ru-sah": "refA",
|
|
74
|
-
"sah-ru": "refA",
|
|
75
|
-
"uk-cs": "refA",
|
|
76
|
-
"uk-en": "refA"
|
|
77
|
-
},
|
|
78
|
-
"wmt21.news": {
|
|
79
|
-
"en-cs": "refA",
|
|
80
|
-
"en-de": "refC",
|
|
81
|
-
"en-ha": "refA",
|
|
82
|
-
"en-is": "refA",
|
|
83
|
-
"en-ja": "refA",
|
|
84
|
-
"en-ru": "refA",
|
|
85
|
-
"en-zh": "refA",
|
|
86
|
-
"cs-en": "refA",
|
|
87
|
-
"de-en": "refA",
|
|
88
|
-
"de-fr": "refA",
|
|
89
|
-
"fr-de": "refA",
|
|
90
|
-
"ha-en": "refA",
|
|
91
|
-
"is-en": "refA",
|
|
92
|
-
"ja-en": "refA",
|
|
93
|
-
"ru-en": "refA",
|
|
94
|
-
"zh-en": "refB"
|
|
95
|
-
},
|
|
96
|
-
"wmt21.tedtalks": {
|
|
97
|
-
"en-de": "refA",
|
|
98
|
-
"en-ru": "refA",
|
|
99
|
-
"zh-en": "refB"
|
|
100
|
-
},
|
|
101
|
-
"wmt21.flores": {
|
|
102
|
-
"bn-hi": "refA",
|
|
103
|
-
"hi-bn": "refA",
|
|
104
|
-
"xh-zu": "refA",
|
|
105
|
-
"zu-xh": "refA"
|
|
106
|
-
},
|
|
107
|
-
"wmt20": {
|
|
108
|
-
"cs-en": "ref",
|
|
109
|
-
"de-en": "ref",
|
|
110
|
-
"en-cs": "ref",
|
|
111
|
-
"en-de": "ref",
|
|
112
|
-
"en-iu": "ref",
|
|
113
|
-
"en-ja": "ref",
|
|
114
|
-
"en-pl": "ref",
|
|
115
|
-
"en-ru": "ref",
|
|
116
|
-
"en-ta": "ref",
|
|
117
|
-
"en-zh": "ref",
|
|
118
|
-
"iu-en": "ref",
|
|
119
|
-
"ja-en": "ref",
|
|
120
|
-
"km-en": "ref",
|
|
121
|
-
"pl-en": "ref",
|
|
122
|
-
"ps-en": "ref",
|
|
123
|
-
"ru-en": "ref",
|
|
124
|
-
"ta-en": "ref",
|
|
125
|
-
"zh-en": "ref"
|
|
126
|
-
},
|
|
127
|
-
"wmt19": {
|
|
128
|
-
"de-cs": "ref",
|
|
129
|
-
"de-en": "ref",
|
|
130
|
-
"de-fr": "ref",
|
|
131
|
-
"en-cs": "ref",
|
|
132
|
-
"en-de": "ref",
|
|
133
|
-
"en-fi": "ref",
|
|
134
|
-
"en-gu": "ref",
|
|
135
|
-
"en-kk": "ref",
|
|
136
|
-
"en-lt": "ref",
|
|
137
|
-
"en-ru": "ref",
|
|
138
|
-
"en-zh": "ref",
|
|
139
|
-
"fi-en": "ref",
|
|
140
|
-
"fr-de": "ref",
|
|
141
|
-
"gu-en": "ref",
|
|
142
|
-
"kk-en": "ref",
|
|
143
|
-
"lt-en": "ref",
|
|
144
|
-
"ru-en": "ref",
|
|
145
|
-
"zh-en": "ref"
|
|
146
|
-
}
|
|
147
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|