subset2evaluate 1.0.21.tar.gz → 1.0.23.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/PKG-INFO +1 -1
  2. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/pyproject.toml +1 -1
  3. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/utils.py +114 -14
  4. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/PKG-INFO +1 -1
  5. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/SOURCES.txt +0 -1
  6. subset2evaluate-1.0.21/subset2evaluate/reference_info.py +0 -147
  7. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/README.md +0 -0
  8. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/setup.cfg +0 -0
  9. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/__init__.py +0 -0
  10. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/evaluate.py +0 -0
  11. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/methods.py +0 -0
  12. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/methods_old.py +0 -0
  13. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/select_subset.py +0 -0
  14. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate/test.py +0 -0
  15. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/dependency_links.txt +0 -0
  16. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/entry_points.txt +0 -0
  17. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/requires.txt +0 -0
  18. {subset2evaluate-1.0.21 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/top_level.txt +0 -0
--- subset2evaluate-1.0.21/PKG-INFO
+++ subset2evaluate-1.0.23/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: subset2evaluate
-Version: 1.0.21
+Version: 1.0.23
 Summary: Find informative examples to efficiently (human-)evaluate NLG models.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
--- subset2evaluate-1.0.21/pyproject.toml
+++ subset2evaluate-1.0.23/pyproject.toml
@@ -11,7 +11,7 @@ packages = ["subset2evaluate"]
 
 [project]
 name = "subset2evaluate"
-version = "1.0.21"
+version = "1.0.23"
 description = "Find informative examples to efficiently (human-)evaluate NLG models."
 license = {text = "MIT"}
 readme = "README.md"
--- subset2evaluate-1.0.21/subset2evaluate/utils.py
+++ subset2evaluate-1.0.23/subset2evaluate/utils.py
@@ -1,6 +1,5 @@
 from typing import Any, Callable, Dict, List, Optional, Union, Literal
 import numpy as np
-from subset2evaluate.reference_info import year2std_refs
 
 PROPS = np.linspace(0.05, 0.5, 10)
 
@@ -16,6 +15,8 @@ def _data_minmax_normalize(data):
     for line in data:
         for met_all in line["scores"].values():
             for met_k, met_v in met_all.items():
+                if met_v is None:
+                    continue
                 data_flat[met_k].append(met_v)
 
     # normalize
@@ -24,10 +25,15 @@ def _data_minmax_normalize(data):
     for line in data:
         for model, met_all in line["scores"].items():
             for met_k, met_v in met_all.items():
-                # (x-min)/(max-min) normalize
-                line["scores"][model][met_k] = (met_v - data_flat[met_k][0]) / (
-                    data_flat[met_k][1] - data_flat[met_k][0]
-                )
+                if met_v is None:
+                    continue
+                if data_flat[met_k][1] - data_flat[met_k][0] == 0:
+                    line["scores"][model][met_k] = 0
+                else:
+                    # (x-min)/(max-min) normalize
+                    line["scores"][model][met_k] = (met_v - data_flat[met_k][0]) / (
+                        data_flat[met_k][1] - data_flat[met_k][0]
+                    )
 
 
 def confidence_interval(data, confidence=0.95):
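Note: the normalization change is easiest to see in isolation. `None` scores are now skipped in both passes (collection and normalization), and a metric whose min and max coincide maps to 0 instead of dividing by zero. A minimal sketch of the guarded formula with hypothetical values; `lo`/`hi` stand for the per-metric `data_flat[met_k][0]`/`[1]` from the diff:

```python
# Guarded min-max normalization, mirroring the patched _data_minmax_normalize.
def normalize(met_v, lo, hi):
    if met_v is None:
        return None  # the patch skips missing scores with `continue`
    if hi - lo == 0:
        return 0  # constant metric column: avoid division by zero
    return (met_v - lo) / (hi - lo)

assert normalize(None, 0.0, 1.0) is None
assert normalize(0.7, 0.5, 0.5) == 0      # zero-range guard
assert normalize(0.75, 0.5, 1.0) == 0.5   # ordinary (x-min)/(max-min)
```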
@@ -616,6 +622,7 @@ def load_data_wmt( # noqa: C901
     import pickle
     import contextlib
     import importlib.metadata
+    import json
 
     # temporarily change to the root directory, this requires Python 3.11
     with contextlib.chdir(os.path.dirname(os.path.realpath(__file__)) + "/../"):
@@ -637,6 +644,44 @@ def load_data_wmt( # noqa: C901
         ):
             return cache["data"]
 
+        # special handling for WMT24++
+        if os.path.exists(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl"):
+            with open(
+                f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl", "r"
+            ) as f:
+                data_tmp = f.readlines()
+            lines_src = [json.loads(line)["source"] for line in data_tmp]
+            with open(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "w") as f:
+                for line in lines_src:
+                    f.write(line + "\n")
+        if os.path.exists(
+            f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl"
+        ):
+            with open(
+                f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl", "r"
+            ) as f:
+                data_tmp = f.readlines()
+            lines_ref = [json.loads(line)["target"] for line in data_tmp]
+            with open(
+                f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.txt", "w"
+            ) as f:
+                for line in lines_ref:
+                    f.write(line + "\n")
+        if os.path.exists(
+            f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl"
+        ):
+            with open(
+                f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl",
+                "r",
+            ) as f:
+                data_tmp = f.readlines()
+            lines_ref = [json.loads(line)["target"] for line in data_tmp]
+            with open(
+                f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.txt", "w"
+            ) as f:
+                for line in lines_ref:
+                    f.write(line + "\n")
+
         lines_src = open(
             f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "r"
         ).readlines()
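Note: the WMT24++ conversion above relies on exactly two JSONL keys, `source` for sources and `target` for references and post-edits; any other fields in the records are ignored. A minimal sketch of the assumed record shape (the sentences and extra field are hypothetical):

```python
import json

src_record = '{"source": "Hello world.", "document_id": "doc1"}'
ref_record = '{"target": "Hallo Welt.", "document_id": "doc1"}'

assert json.loads(src_record)["source"] == "Hello world."
assert json.loads(ref_record)["target"] == "Hallo Welt."
# The patch writes these fields out line by line as plain .txt files,
# so the existing .txt-based loading below works unchanged.
```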
@@ -646,14 +691,13 @@ def load_data_wmt( # noqa: C901
         lines_ref = None
 
         refs_dir = f"data/mt-metrics-eval-v2/{year}/references"
-        selected_human_ref = (
-            file_reference if file_reference is not None else year2std_refs[year][langs]
-        )
-        file_reference_path = f"{refs_dir}/{langs}.{selected_human_ref}.txt"
-
-        if not os.path.exists(file_reference_path):
+        for file_reference in [file_reference, "refA", "refB", "refC", "ref"]:
+            if os.path.exists(f"{refs_dir}/{langs}.{file_reference}.txt"):
+                break
+        else:
             # did not find reference
             return []
+        file_reference_path = f"{refs_dir}/{langs}.{file_reference}.txt"
 
         lines_ref = open(file_reference_path, "r").readlines()
 
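Note: the replacement for `year2std_refs` is a `for`/`else` fallback: the `else` branch runs only if the loop never hits `break`, i.e. no candidate reference file exists. The loop also rebinds the `file_reference` parameter, so a caller-supplied reference name is tried first (a `None` default is tried too, but `{langs}.None.txt` will simply not exist). A minimal sketch of the pattern with hypothetical candidates:

```python
# for/else fallback: pick the first candidate that "exists".
existing = {"refB"}  # hypothetical stand-in for the os.path.exists checks

for candidate in [None, "refA", "refB", "refC", "ref"]:
    if candidate in existing:
        break
else:
    candidate = None  # loop ran to completion without break: nothing found

assert candidate == "refB"
```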
@@ -736,14 +780,14 @@
             f"data/mt-metrics-eval-v2/{year}/metric-scores/{langs}/*.seg.score"
         ):
             # among ref-based metrics, load only the scores for the selected human ref
-            if not f.endswith(f"-{selected_human_ref}.seg.score") and not f.endswith(
+            if not f.endswith(f"-{file_reference}.seg.score") and not f.endswith(
                 "-src.seg.score"
             ):
                 continue
             # remove suffix for both ref-based and ref-less metrics
             metric = (
                 f.split("/")[-1]
-                .removesuffix(f"-{selected_human_ref}.seg.score")
+                .removesuffix(f"-{file_reference}.seg.score")
                 .removesuffix("-src.seg.score")
             )
             for line_i, line_raw in enumerate(open(f, "r").readlines()):
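Note: the rename from `selected_human_ref` to `file_reference` follows from the fallback loop above; the filtering logic itself is unchanged. Hypothetical score filenames and what the suffix stripping yields, assuming `file_reference == "refA"`:

```python
# metric-scores/<langs>/ files are named <metric>-<ref>.seg.score
# (reference-based) or <metric>-src.seg.score (reference-free).
for f in ["BLEU-refA.seg.score", "COMET-src.seg.score"]:
    metric = f.removesuffix("-refA.seg.score").removesuffix("-src.seg.score")
    print(metric)  # -> BLEU, then COMET
# A hypothetical BLEU-refB.seg.score would be skipped by the endswith() check.
```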
@@ -898,7 +942,7 @@ def load_data_wmt_test(**kwargs):
     return data
 
 
-def load_data_wmt_all(min_items=100, **kwargs):
+def load_data_wmt_all(min_items=100, name_filter=lambda x: True, **kwargs):
     data = {
         args: load_data_wmt(*args, **kwargs)
         for args in [
@@ -918,6 +962,61 @@ def load_data_wmt_all(min_items=100, **kwargs):
             ("wmt25", "en-uk_UA"),
             ("wmt25", "en-zh_CN"),
             ("wmt25", "ja-zh_CN"),
+            ("wmt24pp", "en-ar_EG"),
+            ("wmt24pp", "en-bn_IN"),
+            ("wmt24pp", "en-da_DK"),
+            ("wmt24pp", "en-es_MX"),
+            ("wmt24pp", "en-fi_FI"),
+            ("wmt24pp", "en-fr_FR"),
+            ("wmt24pp", "en-hi_IN"),
+            ("wmt24pp", "en-id_ID"),
+            ("wmt24pp", "en-ja_JP"),
+            ("wmt24pp", "en-lt_LT"),
+            ("wmt24pp", "en-mr_IN"),
+            ("wmt24pp", "en-pa_IN"),
+            ("wmt24pp", "en-pt_PT"),
+            ("wmt24pp", "en-sk_SK"),
+            ("wmt24pp", "en-sv_SE"),
+            ("wmt24pp", "en-ta_IN"),
+            ("wmt24pp", "en-tr_TR"),
+            ("wmt24pp", "en-vi_VN"),
+            ("wmt24pp", "en-zu_ZA"),
+            ("wmt24pp", "en-ar_SA"),
+            ("wmt24pp", "en-ca_ES"),
+            ("wmt24pp", "en-de_DE"),
+            ("wmt24pp", "en-et_EE"),
+            ("wmt24pp", "en-fil_PH"),
+            ("wmt24pp", "en-gu_IN"),
+            ("wmt24pp", "en-hr_HR"),
+            ("wmt24pp", "en-is_IS"),
+            ("wmt24pp", "en-kn_IN"),
+            ("wmt24pp", "en-lv_LV"),
+            ("wmt24pp", "en-nl_NL"),
+            ("wmt24pp", "en-pl_PL"),
+            ("wmt24pp", "en-ro_RO"),
+            ("wmt24pp", "en-sl_SI"),
+            ("wmt24pp", "en-sw_KE"),
+            ("wmt24pp", "en-te_IN"),
+            ("wmt24pp", "en-uk_UA"),
+            ("wmt24pp", "en-zh_CN"),
+            ("wmt24pp", "en-bg_BG"),
+            ("wmt24pp", "en-cs_CZ"),
+            ("wmt24pp", "en-el_GR"),
+            ("wmt24pp", "en-fa_IR"),
+            ("wmt24pp", "en-fr_CA"),
+            ("wmt24pp", "en-he_IL"),
+            ("wmt24pp", "en-hu_HU"),
+            ("wmt24pp", "en-it_IT"),
+            ("wmt24pp", "en-ko_KR"),
+            ("wmt24pp", "en-ml_IN"),
+            ("wmt24pp", "en-no_NO"),
+            ("wmt24pp", "en-pt_BR"),
+            ("wmt24pp", "en-ru_RU"),
+            ("wmt24pp", "en-sr_RS"),
+            ("wmt24pp", "en-sw_TZ"),
+            ("wmt24pp", "en-th_TH"),
+            ("wmt24pp", "en-ur_PK"),
+            ("wmt24pp", "en-zh_TW"),
             ("wmt24", "cs-uk"),
             ("wmt24", "en-cs"),
             ("wmt24", "en-de"),
@@ -1025,6 +1124,7 @@ def load_data_wmt_all(min_items=100, **kwargs):
             ("wmt19", "en-de"),
             ("wmt19", "de-cs"),
         ]
+        if name_filter(args)
     }
     # filter out empty datasets
     # some years/langs have issues with human annotations coverage
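Note: the new `name_filter` parameter is applied inside the dict comprehension, so filtered-out `(year, langs)` pairs are never loaded at all. A small usage sketch:

```python
import subset2evaluate.utils as utils

# load only the WMT24++ datasets; name_filter receives each (year, langs) tuple
data = utils.load_data_wmt_all(
    name_filter=lambda args: args[0] == "wmt24pp",
)
```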
--- subset2evaluate-1.0.21/subset2evaluate.egg-info/PKG-INFO
+++ subset2evaluate-1.0.23/subset2evaluate.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: subset2evaluate
-Version: 1.0.21
+Version: 1.0.23
 Summary: Find informative examples to efficiently (human-)evaluate NLG models.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
--- subset2evaluate-1.0.21/subset2evaluate.egg-info/SOURCES.txt
+++ subset2evaluate-1.0.23/subset2evaluate.egg-info/SOURCES.txt
@@ -4,7 +4,6 @@ subset2evaluate/__init__.py
 subset2evaluate/evaluate.py
 subset2evaluate/methods.py
 subset2evaluate/methods_old.py
-subset2evaluate/reference_info.py
 subset2evaluate/select_subset.py
 subset2evaluate/test.py
 subset2evaluate/utils.py
--- subset2evaluate-1.0.21/subset2evaluate/reference_info.py
+++ /dev/null
@@ -1,147 +0,0 @@
-"""Information about the standard human references selected for various WMT test sets."""
-
-year2std_refs = {
-
-    "wmt25": {
-        "cs-de_DE": "refA",
-        "cs-uk_UA": "refA",
-        "en-ar_EG": "refA",
-        "en-bho_IN": "refA",
-        "en-cs_CZ": "refA",
-        "en-et_EE": "refA",
-        "en-is_IS": "refA",
-        "en-it_IT": "refA",
-        "en-ja_JP": "refA",
-        "en-ko_KR": "refA",
-        "en-mas_KE": "refA",
-        "en-ru_RU": "refA",
-        "en-sr_Cyrl_RS": "refA",
-        "en-uk_UA": "refA",
-        "en-zh_CN": "refA",
-        "ja-zh_CN": "refA",
-    },
-    "wmt24": {
-        "en-de": "refB",
-        "en-es": "refA",
-        "ja-zh": "refA",
-        "cs-uk": "refA",
-        "en-cs": "refA",
-        "en-hi": "refA",
-        "en-is": "refA",
-        "en-ja": "refA",
-        "en-ru": "refA",
-        "en-uk": "refA",
-        "en-zh": "refA"
-    },
-    "wmt23.sent": {
-        "en-de": "refA"
-    },
-    "wmt23": {
-        "en-de": "refA",
-        "he-en": "refB",
-        "zh-en": "refA",
-        "cs-uk": "refA",
-        "de-en": "refA",
-        "en-cs": "refA",
-        "en-he": "refB",
-        "en-ja": "refA",
-        "en-ru": "refA",
-        "en-uk": "refA",
-        "en-zh": "refA",
-        "ja-en": "refA",
-        "ru-en": "refA",
-        "uk-en": "refA"
-    },
-    "wmt22": {
-        "en-de": "refA",
-        "en-ru": "refA",
-        "zh-en": "refA",
-        "cs-en": "refB",
-        "cs-uk": "refA",
-        "de-en": "refA",
-        "de-fr": "refA",
-        "en-cs": "refB",
-        "en-hr": "refA",
-        "en-ja": "refA",
-        "en-liv": "refA",
-        "en-uk": "refA",
-        "en-zh": "refA",
-        "fr-de": "refA",
-        "ja-en": "refA",
-        "liv-en": "refA",
-        "ru-en": "refA",
-        "ru-sah": "refA",
-        "sah-ru": "refA",
-        "uk-cs": "refA",
-        "uk-en": "refA"
-    },
-    "wmt21.news": {
-        "en-cs": "refA",
-        "en-de": "refC",
-        "en-ha": "refA",
-        "en-is": "refA",
-        "en-ja": "refA",
-        "en-ru": "refA",
-        "en-zh": "refA",
-        "cs-en": "refA",
-        "de-en": "refA",
-        "de-fr": "refA",
-        "fr-de": "refA",
-        "ha-en": "refA",
-        "is-en": "refA",
-        "ja-en": "refA",
-        "ru-en": "refA",
-        "zh-en": "refB"
-    },
-    "wmt21.tedtalks": {
-        "en-de": "refA",
-        "en-ru": "refA",
-        "zh-en": "refB"
-    },
-    "wmt21.flores": {
-        "bn-hi": "refA",
-        "hi-bn": "refA",
-        "xh-zu": "refA",
-        "zu-xh": "refA"
-    },
-    "wmt20": {
-        "cs-en": "ref",
-        "de-en": "ref",
-        "en-cs": "ref",
-        "en-de": "ref",
-        "en-iu": "ref",
-        "en-ja": "ref",
-        "en-pl": "ref",
-        "en-ru": "ref",
-        "en-ta": "ref",
-        "en-zh": "ref",
-        "iu-en": "ref",
-        "ja-en": "ref",
-        "km-en": "ref",
-        "pl-en": "ref",
-        "ps-en": "ref",
-        "ru-en": "ref",
-        "ta-en": "ref",
-        "zh-en": "ref"
-    },
-    "wmt19": {
-        "de-cs": "ref",
-        "de-en": "ref",
-        "de-fr": "ref",
-        "en-cs": "ref",
-        "en-de": "ref",
-        "en-fi": "ref",
-        "en-gu": "ref",
-        "en-kk": "ref",
-        "en-lt": "ref",
-        "en-ru": "ref",
-        "en-zh": "ref",
-        "fi-en": "ref",
-        "fr-de": "ref",
-        "gu-en": "ref",
-        "kk-en": "ref",
-        "lt-en": "ref",
-        "ru-en": "ref",
-        "zh-en": "ref"
-    }
-}