subset2evaluate 1.0.23.tar.gz → 1.0.24.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: subset2evaluate
-Version: 1.0.23
+Version: 1.0.24
 Summary: Find informative examples to efficiently (human-)evaluate NLG models.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
@@ -11,7 +11,7 @@ packages = ["subset2evaluate"]
 
 [project]
 name = "subset2evaluate"
-version = "1.0.23"
+version = "1.0.24"
 description = "Find informative examples to efficiently (human-)evaluate NLG models."
 license = {text = "MIT"}
 readme = "README.md"
@@ -712,23 +712,33 @@ def load_data_wmt( # noqa: C901
             human_refs.add(parts[1])
 
     # do not consider canary line
-    contain_canary_line = lines_src[0].lower().startswith("canary")
+    # for WMT24++ it's part of the translation dataset
+    contain_canary_line = (
+        lines_src[0].lower().startswith("canary") and year != "wmt24pp"
+    )
     if contain_canary_line:
         lines_src.pop(0)
         lines_doc.pop(0)
         lines_ref.pop(0)
 
     line_model = {}
-    for f in glob.glob(
-        f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.txt"
+    for f in list(
+        glob.glob(f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.txt")
+    ) + list(
+        glob.glob(f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*.jsonl")
     ):
-        model = f.split("/")[-1].removesuffix(".txt")
+        if f.endswith(".jsonl"):
+            model = f.split("/")[-1].removesuffix(".jsonl")
+            lines = [json.loads(line)["hypothesis"] for line in open(f, "r")]
+        else:
+            model = f.split("/")[-1].removesuffix(".txt")
+            lines = open(f, "r").readlines()
         if model in {"synthetic_ref", "chrf_bestmbr"}:
             continue
         if model in human_refs and not include_ref:
             continue
 
-        line_model[model] = open(f, "r").readlines()
+        line_model[model] = lines
         if contain_canary_line:
             line_model[model].pop(0)
 
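This hunk makes two changes to load_data_wmt: the canary-line skip is suppressed for wmt24pp, where the canary string is a legitimate part of the translation dataset, and system outputs may now be .jsonl files whose records carry the output under a "hypothesis" key, alongside the existing plain .txt files. A minimal standalone sketch of the new loading behavior (the directory layout and the "hypothesis" key come from the diff; the function name and everything else is illustrative):

    import glob
    import json

    def load_system_outputs(year: str, langs: str) -> dict:
        # .txt files contribute one output line per segment;
        # .jsonl files contribute one "hypothesis" string per record
        line_model = {}
        pattern = f"data/mt-metrics-eval-v2/{year}/system-outputs/{langs}/*"
        for f in glob.glob(pattern + ".txt") + glob.glob(pattern + ".jsonl"):
            model = f.split("/")[-1]
            if f.endswith(".jsonl"):
                model = model.removesuffix(".jsonl")
                with open(f) as fh:
                    lines = [json.loads(line)["hypothesis"] for line in fh]
            else:
                model = model.removesuffix(".txt")
                with open(f) as fh:
                    lines = fh.readlines()
            line_model[model] = lines
        return line_model

Note that the new branch relies on json.loads, so utils.py must import json; that import is not visible in this hunk.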
@@ -736,22 +746,18 @@ def load_data_wmt( # noqa: C901
 
     lines_score = collections.defaultdict(list)
 
-    if file_protocol is not None:
-        f_protocols = [
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.{file_protocol}.seg.score",
-        ]
-    else:
-        f_protocols = [
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa-merged.seg.score",
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa.seg.score",
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.da-sqm.seg.score",
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt.seg.score",
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.appraise.seg.score",
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-raw.seg.score",
-            f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-appraise.seg.score",
-        ]
+    f_protocols = [
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.{file_protocol}.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa-merged.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.esa.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.da-sqm.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.mqm.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.appraise.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-raw.seg.score",
+        f"data/mt-metrics-eval-v2/{year}/human-scores/{langs}.wmt-appraise.seg.score",
+    ]
     for fname in [*f_protocols, False]:
         if fname and os.path.exists(fname):
             break
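The rewritten block drops the if/else: the caller-supplied file_protocol (if any) is simply tried first, and the existing loop below the list picks the first score file that exists on disk. When file_protocol is None, the first entry expands to a path containing ".None.seg.score", which never exists and so falls through to the defaults. A minimal sketch of this first-existing-file pattern (names are illustrative):

    import os

    def first_existing(candidates: list) -> str | None:
        # mirrors `for fname in [*f_protocols, False]`:
        # return the first existing path, else None
        for path in candidates:
            if path and os.path.exists(path):
                return path
        return None

    # usage: first_existing([user_choice_path, *default_protocol_paths])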
@@ -808,6 +814,8 @@ def load_data_wmt( # noqa: C901
 
             model_line_i -= 1
 
+        if model_line_i >= len(lines_score[model]):
+            print(year, langs, model_line_i, metric, model, contain_canary_line)
         lines_score[model][model_line_i][metric] = float(score)
 
     # filter out lines that have no human score
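The added guard is purely diagnostic: when a segment index from the score file points past the loaded output lines, it prints the context (year, language pair, index, metric, model, canary flag) before the assignment on the next line raises IndexError, making the mismatch traceable to a specific file. A hedged sketch of the same idea in isolation (helper name and signature are illustrative):

    def set_score(lines_score, model, i, metric, score, context):
        # log context before a write that may be out of range,
        # so the resulting IndexError is easy to trace
        if i >= len(lines_score[model]):
            print(*context)
        lines_score[model][i][metric] = float(score)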