subset2evaluate 1.0.22.tar.gz → 1.0.24.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/PKG-INFO +1 -1
  2. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/pyproject.toml +1 -1
  3. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate/utils.py +132 -31
  4. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/PKG-INFO +1 -1
  5. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/SOURCES.txt +0 -1
  6. subset2evaluate-1.0.22/subset2evaluate/reference_info.py +0 -147
  7. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/README.md +0 -0
  8. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/setup.cfg +0 -0
  9. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate/__init__.py +0 -0
  10. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate/evaluate.py +0 -0
  11. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate/methods.py +0 -0
  12. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate/methods_old.py +0 -0
  13. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate/select_subset.py +0 -0
  14. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate/test.py +0 -0
  15. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/dependency_links.txt +0 -0
  16. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/entry_points.txt +0 -0
  17. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/requires.txt +0 -0
  18. {subset2evaluate-1.0.22 → subset2evaluate-1.0.24}/subset2evaluate.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: subset2evaluate
- Version: 1.0.22
+ Version: 1.0.24
  Summary: Find informative examples to efficiently (human-)evaluate NLG models.
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
  License: MIT
pyproject.toml
@@ -11,7 +11,7 @@ packages = ["subset2evaluate"]
 
  [project]
  name = "subset2evaluate"
- version = "1.0.22"
+ version = "1.0.24"
  description = "Find informative examples to efficiently (human-)evaluate NLG models."
  license = {text = "MIT"}
  readme = "README.md"
subset2evaluate/utils.py
@@ -1,6 +1,5 @@
  from typing import Any, Callable, Dict, List, Optional, Union, Literal
  import numpy as np
- from subset2evaluate.reference_info import year2std_refs
 
  PROPS = np.linspace(0.05, 0.5, 10)
 
@@ -623,6 +622,7 @@ def load_data_wmt( # noqa: C901
      import pickle
      import contextlib
      import importlib.metadata
+     import json
 
      # temporarily change to the root directory, this requires Python 3.11
      with contextlib.chdir(os.path.dirname(os.path.realpath(__file__)) + "/../"):
@@ -644,6 +644,44 @@ def load_data_wmt( # noqa: C901
          ):
              return cache["data"]
 
+         # special handling for WMT24++
+         if os.path.exists(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl"):
+             with open(
+                 f"data/mt-metrics-eval-v2/{year}/sources/{langs}.jsonl", "r"
+             ) as f:
+                 data_tmp = f.readlines()
+             lines_src = [json.loads(line)["source"] for line in data_tmp]
+             with open(f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "w") as f:
+                 for line in lines_src:
+                     f.write(line + "\n")
+         if os.path.exists(
+             f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl"
+         ):
+             with open(
+                 f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.jsonl", "r"
+             ) as f:
+                 data_tmp = f.readlines()
+             lines_ref = [json.loads(line)["target"] for line in data_tmp]
+             with open(
+                 f"data/mt-metrics-eval-v2/{year}/references/{langs}.refA.txt", "w"
+             ) as f:
+                 for line in lines_ref:
+                     f.write(line + "\n")
+         if os.path.exists(
+             f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl"
+         ):
+             with open(
+                 f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.jsonl",
+                 "r",
+             ) as f:
+                 data_tmp = f.readlines()
+             lines_ref = [json.loads(line)["target"] for line in data_tmp]
+             with open(
+                 f"data/mt-metrics-eval-v2/{year}/references/{langs}.posteditA.txt", "w"
+             ) as f:
+                 for line in lines_ref:
+                     f.write(line + "\n")
+
          lines_src = open(
              f"data/mt-metrics-eval-v2/{year}/sources/{langs}.txt", "r"
          ).readlines()
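
Note: the block above converts the WMT24++ JSONL sources and references into the plain-text layout the rest of the loader expects. A minimal standalone sketch of that conversion, with a hypothetical helper name (the "source" and "target" field names come from the diff itself):

    import json
    import os

    def jsonl_to_txt(src_path: str, dst_path: str, field: str) -> None:
        # Hypothetical helper mirroring the hunk above: pull one field per
        # JSONL record ("source" for sources, "target" for references)
        # and write it out as plain lines.
        if not os.path.exists(src_path):
            return
        with open(src_path, "r") as f:
            records = [json.loads(line)[field] for line in f]
        with open(dst_path, "w") as f:
            for text in records:
                f.write(text + "\n")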
@@ -653,14 +691,13 @@ def load_data_wmt( # noqa: C901
          lines_ref = None
 
          refs_dir = f"data/mt-metrics-eval-v2/{year}/references"
-         selected_human_ref = (
-             file_reference if file_reference is not None else year2std_refs[year][langs]
-         )
-         file_reference_path = f"{refs_dir}/{langs}.{selected_human_ref}.txt"
-
-         if not os.path.exists(file_reference_path):
+         for file_reference in [file_reference, "refA", "refB", "refC", "ref"]:
+             if os.path.exists(f"{refs_dir}/{langs}.{file_reference}.txt"):
+                 break
+         else:
              # did not find reference
              return []
+         file_reference_path = f"{refs_dir}/{langs}.{file_reference}.txt"
 
          lines_ref = open(file_reference_path, "r").readlines()
 
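Note: this hunk retires the year2std_refs lookup in favor of probing candidate references in order, using Python's for/else: the else branch runs only when the loop finishes without break. When the caller leaves file_reference as None, the first candidate path contains the literal string "None", fails the existence check, and the probe falls through to refA and onward. A standalone illustration of the idiom (hypothetical paths):

    import os

    refs_dir = "data/mt-metrics-eval-v2/wmt24/references"  # example path
    langs = "en-de"
    file_reference = None  # a caller-pinned reference, or None

    for file_reference in [file_reference, "refA", "refB", "refC", "ref"]:
        if os.path.exists(f"{refs_dir}/{langs}.{file_reference}.txt"):
            break  # keeps the first candidate that exists on disk
    else:
        # reached only when no candidate file was found
        raise FileNotFoundError(f"no reference for {langs}")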
@@ -675,23 +712,33 @@ def load_data_wmt( # noqa: C901
              human_refs.add(parts[1])
 
          # do not consider canary line
-         contain_canary_line = lines_src[0].lower().startswith("canary")
+         # for WMT24++ it's part of the translation dataset
+         contain_canary_line = (
+             lines_src[0].lower().startswith("canary") and year != "wmt24pp"
+         )
          if contain_canary_line:
              lines_src.pop(0)
              lines_doc.pop(0)
              lines_ref.pop(0)
 
          line_model = {}
-         for f in glob.glob(
-             f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.txt"
+         for f in list(
+             glob.glob(f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.txt")
+         ) + list(
+             glob.glob(f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.jsonl")
          ):
-             model = f.split("/")[-1].removesuffix(".txt")
+             if f.endswith(".jsonl"):
+                 model = f.split("/")[-1].removesuffix(".jsonl")
+                 lines = [json.loads(line)["hypothesis"] for line in open(f, "r")]
+             else:
+                 model = f.split("/")[-1].removesuffix(".txt")
+                 lines = open(f, "r").readlines()
              if model in {"synthetic_ref", "chrf_bestmbr"}:
                  continue
              if model in human_refs and not include_ref:
                  continue
 
-             line_model[model] = open(f, "r").readlines()
+             line_model[model] = lines
              if contain_canary_line:
                  line_model[model].pop(0)
 
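Note: system outputs may now also be JSONL files; per the hunk above, the translation lives under the "hypothesis" key, one JSON object per line. For example (hypothetical file name):

    import json

    # one hypothesis per JSONL line
    lines = [json.loads(line)["hypothesis"] for line in open("model.jsonl", "r")]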
@@ -699,22 +746,18 @@ def load_data_wmt( # noqa: C901
 
          lines_score = collections.defaultdict(list)
 
-         if file_protocol is not None:
-             f_protocols = [
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.{file_protocol}.seg.score",
-             ]
-         else:
-             f_protocols = [
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa-merged.seg.score",
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa.seg.score",
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.da-sqm.seg.score",
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt.seg.score",
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.appraise.seg.score",
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-raw.seg.score",
-                 f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-appraise.seg.score",
-             ]
+         f_protocols = [
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.{file_protocol}.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa-merged.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.da-sqm.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.appraise.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-raw.seg.score",
+             f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-appraise.seg.score",
+         ]
          for fname in [*f_protocols, False]:
              if fname and os.path.exists(fname):
                  break
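
Note: the protocol list is now unconditional, with file_protocol always probed first; when it is None the path contains the literal "None" and simply fails os.path.exists, so the fallback protocols take over. The trailing False is a sentinel that leaves fname falsy when nothing matched. A minimal sketch of the pattern (hypothetical file names):

    import os

    candidates = ["scores.mqm.seg.score", "scores.esa.seg.score"]  # hypothetical
    for fname in [*candidates, False]:
        if fname and os.path.exists(fname):
            break  # fname is the first existing file
    if not fname:
        print("no human scores found")  # the sentinel was reached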
@@ -743,14 +786,14 @@ def load_data_wmt( # noqa: C901
              f"data/mt-metrics-eval-v2/{year}/metric-scores/{langs}/*.seg.score"
          ):
              # among ref-based metrics, load only the scores for the selected human ref
-             if not f.endswith(f"-{selected_human_ref}.seg.score") and not f.endswith(
+             if not f.endswith(f"-{file_reference}.seg.score") and not f.endswith(
                  "-src.seg.score"
              ):
                  continue
              # remove suffix for both ref-based and ref-less metrics
              metric = (
                  f.split("/")[-1]
-                 .removesuffix(f"-{selected_human_ref}.seg.score")
+                 .removesuffix(f"-{file_reference}.seg.score")
                  .removesuffix("-src.seg.score")
              )
              for line_i, line_raw in enumerate(open(f, "r").readlines()):
@@ -771,6 +814,8 @@ def load_data_wmt( # noqa: C901
 
                  model_line_i -= 1
 
+             if model_line_i >= len(lines_score[model]):
+                 print(year, langs, model_line_i, metric, model, contain_canary_line)
              lines_score[model][model_line_i][metric] = float(score)
 
          # filter out lines that have no human score
@@ -905,7 +950,7 @@ def load_data_wmt_test(**kwargs):
      return data
 
 
- def load_data_wmt_all(min_items=100, **kwargs):
+ def load_data_wmt_all(min_items=100, name_filter=lambda x: True, **kwargs):
      data = {
          args: load_data_wmt(*args, **kwargs)
          for args in [
@@ -925,6 +970,61 @@ def load_data_wmt_all(min_items=100, **kwargs):
              ("wmt25", "en-uk_UA"),
              ("wmt25", "en-zh_CN"),
              ("wmt25", "ja-zh_CN"),
+             ("wmt24pp", "en-ar_EG"),
+             ("wmt24pp", "en-bn_IN"),
+             ("wmt24pp", "en-da_DK"),
+             ("wmt24pp", "en-es_MX"),
+             ("wmt24pp", "en-fi_FI"),
+             ("wmt24pp", "en-fr_FR"),
+             ("wmt24pp", "en-hi_IN"),
+             ("wmt24pp", "en-id_ID"),
+             ("wmt24pp", "en-ja_JP"),
+             ("wmt24pp", "en-lt_LT"),
+             ("wmt24pp", "en-mr_IN"),
+             ("wmt24pp", "en-pa_IN"),
+             ("wmt24pp", "en-pt_PT"),
+             ("wmt24pp", "en-sk_SK"),
+             ("wmt24pp", "en-sv_SE"),
+             ("wmt24pp", "en-ta_IN"),
+             ("wmt24pp", "en-tr_TR"),
+             ("wmt24pp", "en-vi_VN"),
+             ("wmt24pp", "en-zu_ZA"),
+             ("wmt24pp", "en-ar_SA"),
+             ("wmt24pp", "en-ca_ES"),
+             ("wmt24pp", "en-de_DE"),
+             ("wmt24pp", "en-et_EE"),
+             ("wmt24pp", "en-fil_PH"),
+             ("wmt24pp", "en-gu_IN"),
+             ("wmt24pp", "en-hr_HR"),
+             ("wmt24pp", "en-is_IS"),
+             ("wmt24pp", "en-kn_IN"),
+             ("wmt24pp", "en-lv_LV"),
+             ("wmt24pp", "en-nl_NL"),
+             ("wmt24pp", "en-pl_PL"),
+             ("wmt24pp", "en-ro_RO"),
+             ("wmt24pp", "en-sl_SI"),
+             ("wmt24pp", "en-sw_KE"),
+             ("wmt24pp", "en-te_IN"),
+             ("wmt24pp", "en-uk_UA"),
+             ("wmt24pp", "en-zh_CN"),
+             ("wmt24pp", "en-bg_BG"),
+             ("wmt24pp", "en-cs_CZ"),
+             ("wmt24pp", "en-el_GR"),
+             ("wmt24pp", "en-fa_IR"),
+             ("wmt24pp", "en-fr_CA"),
+             ("wmt24pp", "en-he_IL"),
+             ("wmt24pp", "en-hu_HU"),
+             ("wmt24pp", "en-it_IT"),
+             ("wmt24pp", "en-ko_KR"),
+             ("wmt24pp", "en-ml_IN"),
+             ("wmt24pp", "en-no_NO"),
+             ("wmt24pp", "en-pt_BR"),
+             ("wmt24pp", "en-ru_RU"),
+             ("wmt24pp", "en-sr_RS"),
+             ("wmt24pp", "en-sw_TZ"),
+             ("wmt24pp", "en-th_TH"),
+             ("wmt24pp", "en-ur_PK"),
+             ("wmt24pp", "en-zh_TW"),
              ("wmt24", "cs-uk"),
              ("wmt24", "en-cs"),
              ("wmt24", "en-de"),
@@ -1032,6 +1132,7 @@ def load_data_wmt_all(min_items=100, **kwargs):
              ("wmt19", "en-de"),
              ("wmt19", "de-cs"),
          ]
+         if name_filter(args)
      }
      # filter out empty datasets
      # some years/langs have issues with human annotations coverage
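
Note: load_data_wmt_all now takes a name_filter predicate that is applied to each (year, langs) tuple before anything is loaded, so callers can restrict the sweep to a subset of campaigns. A hedged usage sketch (assumes the WMT data files are already downloaded where the loader expects them):

    import subset2evaluate.utils as utils

    # load only the WMT24++ English-to-German portion
    data = utils.load_data_wmt_all(
        name_filter=lambda args: args == ("wmt24pp", "en-de_DE"),
    )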
subset2evaluate.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: subset2evaluate
- Version: 1.0.22
+ Version: 1.0.24
  Summary: Find informative examples to efficiently (human-)evaluate NLG models.
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
  License: MIT
subset2evaluate.egg-info/SOURCES.txt
@@ -4,7 +4,6 @@ subset2evaluate/__init__.py
  subset2evaluate/evaluate.py
  subset2evaluate/methods.py
  subset2evaluate/methods_old.py
- subset2evaluate/reference_info.py
  subset2evaluate/select_subset.py
  subset2evaluate/test.py
  subset2evaluate/utils.py
subset2evaluate/reference_info.py (deleted)
@@ -1,147 +0,0 @@
- """Information about the standard human references selected for various WMT test sets."""
-
- year2std_refs = {
-
-     "wmt25": {
-         "cs-de_DE": "refA",
-         "cs-uk_UA": "refA",
-         "en-ar_EG": "refA",
-         "en-bho_IN": "refA",
-         "en-cs_CZ": "refA",
-         "en-et_EE": "refA",
-         "en-is_IS": "refA",
-         "en-it_IT": "refA",
-         "en-ja_JP": "refA",
-         "en-ko_KR": "refA",
-         "en-mas_KE": "refA",
-         "en-ru_RU": "refA",
-         "en-sr_Cyrl_RS": "refA",
-         "en-uk_UA": "refA",
-         "en-zh_CN": "refA",
-         "ja-zh_CN": "refA",
-     },
-     "wmt24": {
-         "en-de": "refB",
-         "en-es": "refA",
-         "ja-zh": "refA",
-         "cs-uk": "refA",
-         "en-cs": "refA",
-         "en-hi": "refA",
-         "en-is": "refA",
-         "en-ja": "refA",
-         "en-ru": "refA",
-         "en-uk": "refA",
-         "en-zh": "refA"
-     },
-     "wmt23.sent": {
-         "en-de": "refA"
-     },
-     "wmt23": {
-         "en-de": "refA",
-         "he-en": "refB",
-         "zh-en": "refA",
-         "cs-uk": "refA",
-         "de-en": "refA",
-         "en-cs": "refA",
-         "en-he": "refB",
-         "en-ja": "refA",
-         "en-ru": "refA",
-         "en-uk": "refA",
-         "en-zh": "refA",
-         "ja-en": "refA",
-         "ru-en": "refA",
-         "uk-en": "refA"
-     },
-     "wmt22": {
-         "en-de": "refA",
-         "en-ru": "refA",
-         "zh-en": "refA",
-         "cs-en": "refB",
-         "cs-uk": "refA",
-         "de-en": "refA",
-         "de-fr": "refA",
-         "en-cs": "refB",
-         "en-hr": "refA",
-         "en-ja": "refA",
-         "en-liv": "refA",
-         "en-uk": "refA",
-         "en-zh": "refA",
-         "fr-de": "refA",
-         "ja-en": "refA",
-         "liv-en": "refA",
-         "ru-en": "refA",
-         "ru-sah": "refA",
-         "sah-ru": "refA",
-         "uk-cs": "refA",
-         "uk-en": "refA"
-     },
-     "wmt21.news": {
-         "en-cs": "refA",
-         "en-de": "refC",
-         "en-ha": "refA",
-         "en-is": "refA",
-         "en-ja": "refA",
-         "en-ru": "refA",
-         "en-zh": "refA",
-         "cs-en": "refA",
-         "de-en": "refA",
-         "de-fr": "refA",
-         "fr-de": "refA",
-         "ha-en": "refA",
-         "is-en": "refA",
-         "ja-en": "refA",
-         "ru-en": "refA",
-         "zh-en": "refB"
-     },
-     "wmt21.tedtalks": {
-         "en-de": "refA",
-         "en-ru": "refA",
-         "zh-en": "refB"
-     },
-     "wmt21.flores": {
-         "bn-hi": "refA",
-         "hi-bn": "refA",
-         "xh-zu": "refA",
-         "zu-xh": "refA"
-     },
-     "wmt20": {
-         "cs-en": "ref",
-         "de-en": "ref",
-         "en-cs": "ref",
-         "en-de": "ref",
-         "en-iu": "ref",
-         "en-ja": "ref",
-         "en-pl": "ref",
-         "en-ru": "ref",
-         "en-ta": "ref",
-         "en-zh": "ref",
-         "iu-en": "ref",
-         "ja-en": "ref",
-         "km-en": "ref",
-         "pl-en": "ref",
-         "ps-en": "ref",
-         "ru-en": "ref",
-         "ta-en": "ref",
-         "zh-en": "ref"
-     },
-     "wmt19": {
-         "de-cs": "ref",
-         "de-en": "ref",
-         "de-fr": "ref",
-         "en-cs": "ref",
-         "en-de": "ref",
-         "en-fi": "ref",
-         "en-gu": "ref",
-         "en-kk": "ref",
-         "en-lt": "ref",
-         "en-ru": "ref",
-         "en-zh": "ref",
-         "fi-en": "ref",
-         "fr-de": "ref",
-         "gu-en": "ref",
-         "kk-en": "ref",
-         "lt-en": "ref",
-         "ru-en": "ref",
-         "zh-en": "ref"
-     }
- }