subset2evaluate 1.0.22__tar.gz → 1.0.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/PKG-INFO +1 -1
  2. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/pyproject.toml +1 -1
  3. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/utils.py +103 -10
  4. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/PKG-INFO +1 -1
  5. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/SOURCES.txt +0 -1
  6. subset2evaluate-1.0.22/subset2evaluate/reference_info.py +0 -147
  7. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/README.md +0 -0
  8. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/setup.cfg +0 -0
  9. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/__init__.py +0 -0
  10. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/evaluate.py +0 -0
  11. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/methods.py +0 -0
  12. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/methods_old.py +0 -0
  13. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/select_subset.py +0 -0
  14. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate/test.py +0 -0
  15. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/dependency_links.txt +0 -0
  16. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/entry_points.txt +0 -0
  17. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/requires.txt +0 -0
  18. {subset2evaluate-1.0.22 → subset2evaluate-1.0.23}/subset2evaluate.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: subset2evaluate
3
- Version: 1.0.22
3
+ Version: 1.0.23
4
4
  Summary: Find informative examples to efficiently (human-)evaluate NLG models.
5
5
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
6
6
  License: MIT
@@ -11,7 +11,7 @@ packages = ["subset2evaluate"]
11
11
 
12
12
  [project]
13
13
  name = "subset2evaluate"
14
- version = "1.0.22"
14
+ version = "1.0.23"
15
15
  description = "Find informative examples to efficiently (human-)evaluate NLG models."
16
16
  license = {text = "MIT"}
17
17
  readme = "README.md"
@@ -1,6 +1,5 @@
1
1
  from typing import Any, Callable, Dict, List, Optional, Union, Literal
2
2
  import numpy as np
3
- from subset2evaluate.reference_info import year2std_refs
4
3
 
5
4
  PROPS = np.linspace(0.05, 0.5, 10)
6
5
 
@@ -623,6 +622,7 @@ def load_data_wmt( # noqa: C901
623
622
  import pickle
624
623
  import contextlib
625
624
  import importlib.metadata
625
+ import json
626
626
 
627
627
  # temporarily change to the root directory, this requires Python 3.11
628
628
  with contextlib.chdir(os.path.dirname(os.path.realpath(__file__)) + "/../"):
@@ -644,6 +644,44 @@ def load_data_wmt( # noqa: C901
644
644
  ):
645
645
  return cache["data"]
646
646
 
647
+ # special handling for WMT24++
648
+ if os.path.exists(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl"):
649
+ with open(
650
+ f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl", "r"
651
+ ) as f:
652
+ data_tmp = f.readlines()
653
+ lines_src = [json.loads(line)["source"] for line in data_tmp]
654
+ with open(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "w") as f:
655
+ for line in lines_src:
656
+ f.write(line + "\n")
657
+ if os.path.exists(
658
+ f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl"
659
+ ):
660
+ with open(
661
+ f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl", "r"
662
+ ) as f:
663
+ data_tmp = f.readlines()
664
+ lines_ref = [json.loads(line)["target"] for line in data_tmp]
665
+ with open(
666
+ f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.txt", "w"
667
+ ) as f:
668
+ for line in lines_ref:
669
+ f.write(line + "\n")
670
+ if os.path.exists(
671
+ f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl"
672
+ ):
673
+ with open(
674
+ f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl",
675
+ "r",
676
+ ) as f:
677
+ data_tmp = f.readlines()
678
+ lines_ref = [json.loads(line)["target"] for line in data_tmp]
679
+ with open(
680
+ f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.txt", "w"
681
+ ) as f:
682
+ for line in lines_ref:
683
+ f.write(line + "\n")
684
+
647
685
  lines_src = open(
648
686
  f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "r"
649
687
  ).readlines()
@@ -653,14 +691,13 @@ def load_data_wmt( # noqa: C901
653
691
  lines_ref = None
654
692
 
655
693
  refs_dir = f"data/mt-metrics-eval-v2/{year}/references"
656
- selected_human_ref = (
657
- file_reference if file_reference is not None else year2std_refs[year][langs]
658
- )
659
- file_reference_path = f"{refs_dir}/{langs}.{selected_human_ref}.txt"
660
-
661
- if not os.path.exists(file_reference_path):
694
+ for file_reference in [file_reference, "refA", "refB", "refC", "ref"]:
695
+ if os.path.exists(f"{refs_dir}/{langs}.{file_reference}.txt"):
696
+ break
697
+ else:
662
698
  # did not find reference
663
699
  return []
700
+ file_reference_path = f"{refs_dir}/{langs}.{file_reference}.txt"
664
701
 
665
702
  lines_ref = open(file_reference_path, "r").readlines()
666
703
 
@@ -743,14 +780,14 @@ def load_data_wmt( # noqa: C901
743
780
  f"data/mt-metrics-eval-v2/{year}/metric-scores/{langs}/*.seg.score"
744
781
  ):
745
782
  # among ref-based metrics, load only the scores for the selected human ref
746
- if not f.endswith(f"-{selected_human_ref}.seg.score") and not f.endswith(
783
+ if not f.endswith(f"-{file_reference}.seg.score") and not f.endswith(
747
784
  "-src.seg.score"
748
785
  ):
749
786
  continue
750
787
  # remove suffix for both ref-based and ref-less metrics
751
788
  metric = (
752
789
  f.split("/")[-1]
753
- .removesuffix(f"-{selected_human_ref}.seg.score")
790
+ .removesuffix(f"-{file_reference}.seg.score")
754
791
  .removesuffix("-src.seg.score")
755
792
  )
756
793
  for line_i, line_raw in enumerate(open(f, "r").readlines()):
@@ -905,7 +942,7 @@ def load_data_wmt_test(**kwargs):
905
942
  return data
906
943
 
907
944
 
908
- def load_data_wmt_all(min_items=100, **kwargs):
945
+ def load_data_wmt_all(min_items=100, name_filter=lambda x: True, **kwargs):
909
946
  data = {
910
947
  args: load_data_wmt(*args, **kwargs)
911
948
  for args in [
@@ -925,6 +962,61 @@ def load_data_wmt_all(min_items=100, **kwargs):
925
962
  ("wmt25", "en-uk_UA"),
926
963
  ("wmt25", "en-zh_CN"),
927
964
  ("wmt25", "ja-zh_CN"),
965
+ ("wmt24pp", "en-ar_EG"),
966
+ ("wmt24pp", "en-bn_IN"),
967
+ ("wmt24pp", "en-da_DK"),
968
+ ("wmt24pp", "en-es_MX"),
969
+ ("wmt24pp", "en-fi_FI"),
970
+ ("wmt24pp", "en-fr_FR"),
971
+ ("wmt24pp", "en-hi_IN"),
972
+ ("wmt24pp", "en-id_ID"),
973
+ ("wmt24pp", "en-ja_JP"),
974
+ ("wmt24pp", "en-lt_LT"),
975
+ ("wmt24pp", "en-mr_IN"),
976
+ ("wmt24pp", "en-pa_IN"),
977
+ ("wmt24pp", "en-pt_PT"),
978
+ ("wmt24pp", "en-sk_SK"),
979
+ ("wmt24pp", "en-sv_SE"),
980
+ ("wmt24pp", "en-ta_IN"),
981
+ ("wmt24pp", "en-tr_TR"),
982
+ ("wmt24pp", "en-vi_VN"),
983
+ ("wmt24pp", "en-zu_ZA"),
984
+ ("wmt24pp", "en-ar_SA"),
985
+ ("wmt24pp", "en-ca_ES"),
986
+ ("wmt24pp", "en-de_DE"),
987
+ ("wmt24pp", "en-et_EE"),
988
+ ("wmt24pp", "en-fil_PH"),
989
+ ("wmt24pp", "en-gu_IN"),
990
+ ("wmt24pp", "en-hr_HR"),
991
+ ("wmt24pp", "en-is_IS"),
992
+ ("wmt24pp", "en-kn_IN"),
993
+ ("wmt24pp", "en-lv_LV"),
994
+ ("wmt24pp", "en-nl_NL"),
995
+ ("wmt24pp", "en-pl_PL"),
996
+ ("wmt24pp", "en-ro_RO"),
997
+ ("wmt24pp", "en-sl_SI"),
998
+ ("wmt24pp", "en-sw_KE"),
999
+ ("wmt24pp", "en-te_IN"),
1000
+ ("wmt24pp", "en-uk_UA"),
1001
+ ("wmt24pp", "en-zh_CN"),
1002
+ ("wmt24pp", "en-bg_BG"),
1003
+ ("wmt24pp", "en-cs_CZ"),
1004
+ ("wmt24pp", "en-el_GR"),
1005
+ ("wmt24pp", "en-fa_IR"),
1006
+ ("wmt24pp", "en-fr_CA"),
1007
+ ("wmt24pp", "en-he_IL"),
1008
+ ("wmt24pp", "en-hu_HU"),
1009
+ ("wmt24pp", "en-it_IT"),
1010
+ ("wmt24pp", "en-ko_KR"),
1011
+ ("wmt24pp", "en-ml_IN"),
1012
+ ("wmt24pp", "en-no_NO"),
1013
+ ("wmt24pp", "en-pt_BR"),
1014
+ ("wmt24pp", "en-ru_RU"),
1015
+ ("wmt24pp", "en-sr_RS"),
1016
+ ("wmt24pp", "en-sw_TZ"),
1017
+ ("wmt24pp", "en-th_TH"),
1018
+ ("wmt24pp", "en-ur_PK"),
1019
+ ("wmt24pp", "en-zh_TW"),
928
1020
  ("wmt24", "cs-uk"),
929
1021
  ("wmt24", "en-cs"),
930
1022
  ("wmt24", "en-de"),
@@ -1032,6 +1124,7 @@ def load_data_wmt_all(min_items=100, **kwargs):
1032
1124
  ("wmt19", "en-de"),
1033
1125
  ("wmt19", "de-cs"),
1034
1126
  ]
1127
+ if name_filter(args)
1035
1128
  }
1036
1129
  # filter out empty datasets
1037
1130
  # some years/langs have issues with human annotations coverage
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: subset2evaluate
3
- Version: 1.0.22
3
+ Version: 1.0.23
4
4
  Summary: Find informative examples to efficiently (human-)evaluate NLG models.
5
5
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
6
6
  License: MIT
@@ -4,7 +4,6 @@ subset2evaluate/__init__.py
4
4
  subset2evaluate/evaluate.py
5
5
  subset2evaluate/methods.py
6
6
  subset2evaluate/methods_old.py
7
- subset2evaluate/reference_info.py
8
7
  subset2evaluate/select_subset.py
9
8
  subset2evaluate/test.py
10
9
  subset2evaluate/utils.py
@@ -1,147 +0,0 @@
1
- """Information about the standard human references selected for various WMT test sets."""
2
-
3
- year2std_refs = {
4
-
5
- "wmt25": {
6
- "cs-de_DE": "refA",
7
- "cs-uk_UA": "refA",
8
- "en-ar_EG": "refA",
9
- "en-bho_IN": "refA",
10
- "en-cs_CZ": "refA",
11
- "en-et_EE": "refA",
12
- "en-is_IS": "refA",
13
- "en-it_IT": "refA",
14
- "en-ja_JP": "refA",
15
- "en-ko_KR": "refA",
16
- "en-mas_KE": "refA",
17
- "en-ru_RU": "refA",
18
- "en-sr_Cyrl_RS": "refA",
19
- "en-uk_UA": "refA",
20
- "en-zh_CN": "refA",
21
- "ja-zh_CN": "refA",
22
- },
23
- "wmt24": {
24
- "en-de": "refB",
25
- "en-es": "refA",
26
- "ja-zh": "refA",
27
- "cs-uk": "refA",
28
- "en-cs": "refA",
29
- "en-hi": "refA",
30
- "en-is": "refA",
31
- "en-ja": "refA",
32
- "en-ru": "refA",
33
- "en-uk": "refA",
34
- "en-zh": "refA"
35
- },
36
- "wmt23.sent": {
37
- "en-de": "refA"
38
- },
39
- "wmt23": {
40
- "en-de": "refA",
41
- "he-en": "refB",
42
- "zh-en": "refA",
43
- "cs-uk": "refA",
44
- "de-en": "refA",
45
- "en-cs": "refA",
46
- "en-he": "refB",
47
- "en-ja": "refA",
48
- "en-ru": "refA",
49
- "en-uk": "refA",
50
- "en-zh": "refA",
51
- "ja-en": "refA",
52
- "ru-en": "refA",
53
- "uk-en": "refA"
54
- },
55
- "wmt22": {
56
- "en-de": "refA",
57
- "en-ru": "refA",
58
- "zh-en": "refA",
59
- "cs-en": "refB",
60
- "cs-uk": "refA",
61
- "de-en": "refA",
62
- "de-fr": "refA",
63
- "en-cs": "refB",
64
- "en-hr": "refA",
65
- "en-ja": "refA",
66
- "en-liv": "refA",
67
- "en-uk": "refA",
68
- "en-zh": "refA",
69
- "fr-de": "refA",
70
- "ja-en": "refA",
71
- "liv-en": "refA",
72
- "ru-en": "refA",
73
- "ru-sah": "refA",
74
- "sah-ru": "refA",
75
- "uk-cs": "refA",
76
- "uk-en": "refA"
77
- },
78
- "wmt21.news": {
79
- "en-cs": "refA",
80
- "en-de": "refC",
81
- "en-ha": "refA",
82
- "en-is": "refA",
83
- "en-ja": "refA",
84
- "en-ru": "refA",
85
- "en-zh": "refA",
86
- "cs-en": "refA",
87
- "de-en": "refA",
88
- "de-fr": "refA",
89
- "fr-de": "refA",
90
- "ha-en": "refA",
91
- "is-en": "refA",
92
- "ja-en": "refA",
93
- "ru-en": "refA",
94
- "zh-en": "refB"
95
- },
96
- "wmt21.tedtalks": {
97
- "en-de": "refA",
98
- "en-ru": "refA",
99
- "zh-en": "refB"
100
- },
101
- "wmt21.flores": {
102
- "bn-hi": "refA",
103
- "hi-bn": "refA",
104
- "xh-zu": "refA",
105
- "zu-xh": "refA"
106
- },
107
- "wmt20": {
108
- "cs-en": "ref",
109
- "de-en": "ref",
110
- "en-cs": "ref",
111
- "en-de": "ref",
112
- "en-iu": "ref",
113
- "en-ja": "ref",
114
- "en-pl": "ref",
115
- "en-ru": "ref",
116
- "en-ta": "ref",
117
- "en-zh": "ref",
118
- "iu-en": "ref",
119
- "ja-en": "ref",
120
- "km-en": "ref",
121
- "pl-en": "ref",
122
- "ps-en": "ref",
123
- "ru-en": "ref",
124
- "ta-en": "ref",
125
- "zh-en": "ref"
126
- },
127
- "wmt19": {
128
- "de-cs": "ref",
129
- "de-en": "ref",
130
- "de-fr": "ref",
131
- "en-cs": "ref",
132
- "en-de": "ref",
133
- "en-fi": "ref",
134
- "en-gu": "ref",
135
- "en-kk": "ref",
136
- "en-lt": "ref",
137
- "en-ru": "ref",
138
- "en-zh": "ref",
139
- "fi-en": "ref",
140
- "fr-de": "ref",
141
- "gu-en": "ref",
142
- "kk-en": "ref",
143
- "lt-en": "ref",
144
- "ru-en": "ref",
145
- "zh-en": "ref"
146
- }
147
- }