gemba-0.1.0-py3-none-any.whl
- gemba/__init__.py +3 -0
- gemba/gemba_da.py +62 -0
- gemba/gemba_esa.py +84 -0
- gemba/gemba_mqm_utils.py +234 -0
- gemba/gpt_api.py +174 -0
- gemba/mtme_tools.py +99 -0
- gemba/prompt.py +139 -0
- gemba/scores.py +103 -0
- gemba/testset.py +58 -0
- gemba/utils.py +78 -0
- gemba-0.1.0.dist-info/METADATA +136 -0
- gemba-0.1.0.dist-info/RECORD +14 -0
- gemba-0.1.0.dist-info/WHEEL +4 -0
- gemba-0.1.0.dist-info/licenses/LICENSE.md +427 -0
gemba/__init__.py
ADDED
gemba/gemba_da.py
ADDED
@@ -0,0 +1,62 @@
import diskcache as dc
from gemba.prompt import prompts, language_codes
from gemba.gpt_api import GptApi
from gemba.testset import Testset
from gemba.scores import Scores


def main():
    scenarios = [
        ["text-davinci-003", "GEMBA-DA", [["wmt22", "en-de"], ["wmt22", "zh-en"], ["wmt22", "en-ru"]], ],
        ["text-davinci-003", "GEMBA-DA_ref", [["wmt22", "en-de"], ["wmt22", "zh-en"], ["wmt22", "en-ru"]], ],
    ]

    gptapi = GptApi()
    for scenario in scenarios:
        use_model = scenario[0]
        annotation = scenario[1]
        cache = dc.Cache(f'cache/{use_model}_{annotation}', expire=None, size_limit=int(10e10), cull_limit=0, eviction_policy='none')

        scoring_name = f"{annotation}_{use_model}"

        if use_model not in credentials["deployments"].keys():
            print(f"Model {use_model} not supported by credentials")
            continue

        for dataset, lp in scenario[2]:
            testset = Testset("mt-metrics-eval-v2", dataset, lp)
            if prompts[annotation]["use_ref"]:
                refname = testset.main_ref
            else:
                refname = None

            scores = Scores(scoring_name, testset, refname)

            # starts with -1 as it is incremented before the first request
            hypothesis_index = -1
            total = testset.segments_count()
            for src, hyp, ref, system in testset.iterate_over_all(refname):
                hypothesis_index += 1

                if scores.get_score(system, hypothesis_index) != 'None':
                    continue

                print(f"Processing hypothesis {hypothesis_index}/{total} for {scoring_name} on {dataset}/{lp}")

                data = {
                    "source_seg": src,
                    "target_seg": hyp,
                    "reference_seg": ref,
                    "source_lang": language_codes[lp.split("-")[0]],
                    "target_lang": language_codes[lp.split("-")[1]],
                }
                prompt = prompts[annotation]["prompt"].format(**data)
                parsed_answers = gptapi.request(prompt, use_model, prompts[annotation]["validate_answer"], cache=cache)

                scores.assign_score(system, hypothesis_index, parsed_answers[0]['answer'], parsed_answers[0]['temperature'])

            scores.save()


if __name__ == '__main__':
    main()
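Note that main() looks up credentials["deployments"], but this module neither imports nor defines credentials, so the script presumably relies on a credentials object provided elsewhere (it is not part of this diff). A minimal, purely hypothetical sketch of the shape implied by that lookup:

# Hypothetical sketch, not shipped with the package: the shape is inferred
# from the credentials["deployments"] lookup in main() above.
credentials = {
    "deployments": {
        # model name used in `scenarios` -> deployment identifier (assumed)
        "text-davinci-003": "text-davinci-003",
    },
}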
gemba/gemba_esa.py
ADDED
@@ -0,0 +1,84 @@
import ipdb
import json
import re
from collections import defaultdict


def esa_fewshot(few_shots):
    prompts = [
        {
            "role": "system",
            "content": f"Your task is to identify machine translation errors and assess the quality of the translation."
        }
    ]

    template = """{source_lang} source:
```{source_seg}```
{target_lang} translation:
```{target_seg}```

Based on the source segment and machine translation surrounded with triple backticks, identify error types in the translation and classify them. The categories of errors are: accuracy (addition, mistranslation, omission, untranslated text), fluency (character encoding, grammar, inconsistency, punctuation, register, spelling), style (awkward), terminology (inappropriate for context, inconsistent use), non-translation, other, or no-error.\nEach error is classified as one of two categories: major or minor. Major errors disrupt the flow and make the understandability of text difficult or impossible. Minor errors are errors that do not disrupt the flow significantly and what the text is trying to say is still understandable."""

    for shot in few_shots:
        prompts.append({
            "role": "user",
            "content": template.format(**shot)
        })
        answer = shot['answer']

        prompts.append({
            "role": "assistant",
            "content": answer
        })

    prompts.append({
        "role": "user",
        "content": template
    })

    return prompts


esa_few_shots = {
    "ende": {
        "source_lang": "English",
        "source_seg": "I do apologise about this, we must gain permission from the account holder to discuss an order with another person, I apologise if this was done previously, however, I would not be able to discuss this with yourself without the account holders permission.",
        "target_lang": "German",
        "target_seg": "Ich entschuldige mich dafür, wir müssen die Erlaubnis einholen, um eine Bestellung mit einer anderen Person zu besprechen. Ich entschuldige mich, falls dies zuvor geschehen wäre, aber ohne die Erlaubnis des Kontoinhabers wäre ich nicht in der Lage, dies mit dir involvement.",
        "answer": """Major:
accuracy/mistranslation - "involvement"
accuracy/omission - "the account holder"
Minor:
fluency/grammar - "wäre"
fluency/register - "dir"
""",
    },
    "encs": {
        "source_lang": "English",
        "source_seg": "Talks have resumed in Vienna to try to revive the nuclear pact, with both sides trying to gauge the prospects of success after the latest exchanges in the stop-start negotiations.",
        "target_lang": "Czech",
        "target_seg": "Ve Vídni se ve Vídni obnovily rozhovory o oživení jaderného paktu, přičemž obě partaje se snaží posoudit vyhlídky na úspěch po posledních výměnách v jednáních.",
        "answer": """Major:
accuracy/addition - "ve Vídni"
accuracy/omission - "the stop-start"
Minor:
terminology/inappropriate for context - "partaje"
""",
    },
    "zhen": {
        "source_lang": "Chinese",
        "source_seg": "大众点评乌鲁木齐家居卖场频道为您提供高铁居然之家地址,电话,营业时间等最新商户信息,找装修公司,就上大众点评",
        "target_lang": "English",
        "target_seg": "Urumqi Home Furnishing Store Channel provides you with the latest business information such as the address, telephone number, business hours, etc., of high-speed rail, and find a decoration company, and go to the reviews.",
        "answer": """Major:
accuracy/addition - "of high-speed rail"
accuracy/mistranslation - "go to the reviews"
Minor:
style/awkward - "etc.,"
""",
    },
}

TEMPLATE_GEMBA_ESA_ERROR_SPANS = esa_fewshot([esa_few_shots['ende'], esa_few_shots['encs'], esa_few_shots['zhen']])

TEMPLATE_GEMBA_ESA_RANKING = 'Given the translation from {source_lang} to {target_lang} and the annotated error spans, assign a score on a continuous scale from 0 to 100. The scale has following reference points: 0="No meaning preserved", 33="Some meaning preserved", 66="Most meaning preserved and few grammar mistakes", up to 100="Perfect meaning and grammar".\n\nScore the following translation from {source_lang} source:\n```{source_seg}```\n{target_lang} translation:\n```{target_seg}```\nAnnotated error spans:\n```{error_spans}```\nScore (0-100): '
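The two objects exported above suggest a two-step ESA flow: the few-shot chat prompt (whose final user turn is the unformatted template) elicits annotated error spans, and the ranking prompt then converts those spans into a 0-100 score. A minimal sketch of how they might be wired together with GptApi and apply_template (defined in gemba/gpt_api.py and gemba/gemba_mqm_utils.py below); the model name, cache path, segments, and pass-through parser are placeholders, and the package's own entry points may differ:

# Hypothetical sketch, not part of the package.
import diskcache as dc
from gemba.gpt_api import GptApi
from gemba.gemba_mqm_utils import apply_template
from gemba.gemba_esa import TEMPLATE_GEMBA_ESA_ERROR_SPANS, TEMPLATE_GEMBA_ESA_RANKING

cache = dc.Cache("cache/esa_example")  # placeholder cache location
gptapi = GptApi()
data = {
    "source_lang": "English",
    "target_lang": "German",
    "source_seg": "Talks have resumed in Vienna.",                        # placeholder segment
    "target_seg": "Die Gespräche wurden in Wien wieder aufgenommen.",     # placeholder segment
}

# Step 1: fill every turn of the few-shot chat prompt and ask for error spans.
prompt = apply_template(TEMPLATE_GEMBA_ESA_ERROR_SPANS, data)
error_spans = gptapi.request(prompt, "gpt-4", lambda x: x, cache=cache)[0]["answer"]

# Step 2: feed the annotated spans into the ranking prompt to get a 0-100 score.
data["error_spans"] = error_spans
ranking_prompt = apply_template(TEMPLATE_GEMBA_ESA_RANKING, data)
score = gptapi.request(ranking_prompt, "gpt-4", lambda x: x, cache=cache)[0]["answer"]
print(score)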
gemba/gemba_mqm_utils.py
ADDED
@@ -0,0 +1,234 @@
import ipdb
import json
import re
from collections import defaultdict

def apply_template(template, data):
    if isinstance(template, str):
        return template.format(**data)
    elif isinstance(template, list):
        prompt = []
        for conversation_turn in template:
            p = conversation_turn.copy()
            p['content'] = p['content'].format(**data)
            prompt.append(p)
        return prompt
    else:
        raise ValueError(f"Unknown template type {type(template)}")

def parse_broken_json(x):
    improved_translation = ""
    errors = defaultdict(list)
    if '"errors": ' in x and "improved translation" in x:
        data = x.split('", "errors": ')
        if len(data) != 2:
            return {"improved translation": improved_translation, "errors": errors}
        # from data[0] parse improved translation
        improved_translation = data[0].split('"improved translation": "')[1]
        # remove last character from data[1]
        data[1] = data[1][:-1]

        try:
            errors = json.loads(data[1])
        except:
            # just try to get error count
            words = re.findall(r'\b\w+\b', data[1].lower())
            keywords = ['critical', 'major', 'minor']

            last_key = None
            for word in words:
                if word in keywords:
                    last_key = word
                elif last_key is not None and word == "class":
                    errors[last_key].append({"class": "other"})

    return {"improved translation": improved_translation, "errors": errors}


def parse_error_class(error):
    # parse error from error description, errors are ['accuracy', 'fluency', 'locale convention', 'style', 'terminology', 'non-translation', 'other']
    # locale convention (currency, date, name, telephone, or time format), style (awkward), terminology (inappropriate for context, inconsistent use),
    class_name = "unknown"
    if "accuracy" in error:
        class_name = "accuracy"
        for subclass in ["addition", "mistranslation", "omission", "untranslated text"]:
            if subclass in error:
                class_name = f"accuracy-{subclass}"
    elif "fluency" in error:
        class_name = "fluency"
        for subclass in ["character encoding", "grammar", "inconsistency", "punctuation", "register", "spelling"]:
            if subclass in error:
                class_name = f"fluency-{subclass}"
    elif "locale convention" in error:
        class_name = "locale convention"
        for subclass in ["currency", "date", "name", "telephone", "time"]:
            if subclass in error:
                class_name = f"locale convention-{subclass}"
    elif "style" in error:
        class_name = "style"
    elif "terminology" in error:
        class_name = "terminology"
        for subclass in ["inappropriate", "inconsistent"]:
            if subclass in error:
                class_name = f"terminology-{subclass}"
    elif "non-translation" in error:
        class_name = "non-translation"
    elif "other" in error:
        class_name = "other"

    return class_name


def parse_mqm_answer(x, list_mqm_errors=False, full_desc=True, normalize=True):
    if x is None:
        return None

    x = str(x)
    if x.startswith('{"improved translation"'):
        try:
            x = json.loads(x)
        except:
            x = parse_broken_json(x)
        errors = x["errors"]

    else:
        x = x.lower()
        errors = {'critical': [], 'major': [], 'minor': []}
        error_level = None
        for line in x.split('\n'):
            line = line.strip()
            if "no-error" in line or "no error" in line or "" == line:
                continue
            if "critical:" == line:
                error_level = "critical"
                continue
            elif "major:" == line:
                error_level = "major"
                continue
            elif "minor:" == line:
                error_level = "minor"
                continue

            if "critical" in line or "major" in line or "minor" in line:
                if not any([line.startswith(x) for x in ['accuracy', 'fluency', 'locale convention', 'style', 'terminology', 'non-translation', 'other']]):
                    print(line)

            if error_level is None:
                print(f"No error level for {line}")
                continue

            if "non-translation" in line:
                errors["critical"].append(line)
            else:
                errors[error_level].append(line)

    error_classes = defaultdict(list)
    final_score = 0
    error_counter = 0
    for error_level in ['critical', 'major', 'minor']:
        if error_level not in errors:
            continue
        for error in errors[error_level]:
            if error_counter < 5:
                final_score += 25 if error_level == 'critical' else 5 if error_level == 'major' else 1
                error_counter += 1

            if full_desc:
                error_classes[error_level].append(error)
            else:
                class_name = parse_error_class(error)
                error_classes[error_level].append(class_name)
    if final_score > 25:
        final_score = 25

    # negative score is to normalize that higher score is better
    return_score = (-final_score * 4 + 100) if normalize else -final_score
    if list_mqm_errors:
        return return_score, error_classes
    else:
        return return_score


def mqm_fewshot(few_shots):
    prompts = [
        {
            "role": "system",
            "content": f"You are an annotator for the quality of machine translation. Your task is to identify errors and assess the quality of the translation."
        }
    ]

    template = """{source_lang} source:
```{source_seg}```
{target_lang} translation:
```{target_seg}```

Based on the source segment and machine translation surrounded with triple backticks, identify error types in the translation and classify them. The categories of errors are: accuracy (addition, mistranslation, omission, untranslated text), fluency (character encoding, grammar, inconsistency, punctuation, register, spelling), style (awkward), terminology (inappropriate for context, inconsistent use), non-translation, other, or no-error.\nEach error is classified as one of three categories: critical, major, and minor. Critical errors inhibit comprehension of the text. Major errors disrupt the flow, but what the text is trying to say is still understandable. Minor errors are technically errors, but do not disrupt the flow or hinder comprehension."""

    for shot in few_shots:
        prompts.append({
            "role": "user",
            "content": template.format(**shot)
        })
        answer = shot['answer']

        prompts.append({
            "role": "assistant",
            "content": answer
        })

    prompts.append({
        "role": "user",
        "content": template
    })

    return prompts


few_shots = {
    "ende": {
        "source_lang": "English",
        "source_seg": "I do apologise about this, we must gain permission from the account holder to discuss an order with another person, I apologise if this was done previously, however, I would not be able to discuss this with yourself without the account holders permission.",
        "target_lang": "German",
        "target_seg": "Ich entschuldige mich dafür, wir müssen die Erlaubnis einholen, um eine Bestellung mit einer anderen Person zu besprechen. Ich entschuldige mich, falls dies zuvor geschehen wäre, aber ohne die Erlaubnis des Kontoinhabers wäre ich nicht in der Lage, dies mit dir involvement.",
        "answer": """Critical:
no-error
Major:
accuracy/mistranslation - "involvement"
accuracy/omission - "the account holder"
Minor:
fluency/grammar - "wäre"
fluency/register - "dir"
""",
    },
    "encs": {
        "source_lang": "English",
        "source_seg": "Talks have resumed in Vienna to try to revive the nuclear pact, with both sides trying to gauge the prospects of success after the latest exchanges in the stop-start negotiations.",
        "target_lang": "Czech",
        "target_seg": "Ve Vídni se ve Vídni obnovily rozhovory o oživení jaderného paktu, přičemž obě partaje se snaží posoudit vyhlídky na úspěch po posledních výměnách v jednáních.",
        "answer": """Critical:
no-error
Major:
accuracy/addition - "ve Vídni"
accuracy/omission - "the stop-start"
Minor:
terminology/inappropriate for context - "partaje"
""",
    },
    "zhen": {
        "source_lang": "Chinese",
        "source_seg": "大众点评乌鲁木齐家居卖场频道为您提供高铁居然之家地址,电话,营业时间等最新商户信息,找装修公司,就上大众点评",
        "target_lang": "English",
        "target_seg": "Urumqi Home Furnishing Store Channel provides you with the latest business information such as the address, telephone number, business hours, etc., of high-speed rail, and find a decoration company, and go to the reviews.",
        "answer": """Critical:
accuracy/addition - "of high-speed rail"
Major:
accuracy/mistranslation - "go to the reviews"
Minor:
style/awkward - "etc.,"
""",
    },
}

TEMPLATE_GEMBA_MQM = mqm_fewshot([few_shots['ende'], few_shots['encs'], few_shots['zhen']])
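parse_mqm_answer converts a textual MQM annotation into a penalty: each counted error costs 25 points if critical, 5 if major, 1 if minor; only the first five errors are counted and the total is capped at 25; the penalty is then negated (so higher is better) and, with normalize=True, mapped onto a 0-100 scale via -penalty * 4 + 100. A small worked example based directly on the code above, using the ende few-shot answer (two major and two minor errors):

# Worked example derived from the code above.
from gemba.gemba_mqm_utils import few_shots, parse_mqm_answer

answer = few_shots["ende"]["answer"]              # 2 major (5 pts each) + 2 minor (1 pt each) = 12
print(parse_mqm_answer(answer, normalize=False))  # -12
print(parse_mqm_answer(answer))                   # -12 * 4 + 100 = 52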
gemba/gpt_api.py
ADDED
@@ -0,0 +1,174 @@
import os
import sys
import time
import ipdb
import logging
from termcolor import colored
from datetime import datetime
import openai
import tqdm


# class for calling OpenAI API and handling cache
class GptApi:
    def __init__(self, verbose=False):
        self.verbose = verbose

        if "OPENAI_AZURE_ENDPOINT" in os.environ:
            assert "OPENAI_AZURE_KEY" in os.environ, "OPENAI_AZURE_KEY not found in environment"

            # Azure API access
            self.client = openai.AzureOpenAI(
                api_key=os.environ["OPENAI_AZURE_KEY"],
                azure_endpoint=os.environ["OPENAI_AZURE_ENDPOINT"],
                api_version="2023-07-01-preview"
            )
        elif "OPENAI_API_KEY" in os.environ:
            # OpenAI API access
            self.client = openai.OpenAI(
                api_key=os.environ["OPENAI_API_KEY"]
            )
        else:
            raise Exception("OPENAI_API_KEY or OPENAI_AZURE_KEY not found in environment")

        logging.getLogger().setLevel(logging.CRITICAL)  # in order to suppress all these HTTP INFO log messages

    # answer_id is used for determining if it was the top answer or how deep in the list it was
    def request(self, prompt, model, parse_response, temperature=0, answer_id=-1, cache=None, max_tokens=None):
        request = {"model": model, "temperature": temperature, "prompt": prompt}

        if request in cache and cache[request] is not None and len(cache[request]) > 0:
            answers = cache[request]
        else:
            answers = self.request_api(prompt, model, temperature, max_tokens)
            cache[request] = answers

        # there is no valid answer
        if len(answers) == 0:
            return [{
                "temperature": temperature,
                "answer_id": answer_id,
                "answer": None,
                "prompt": prompt,
                "finish_reason": None,
                "model": model,
            }]

        parsed_answers = []
        for full_answer in answers:
            finish_reason = full_answer["finish_reason"]
            full_answer = full_answer["answer"]
            answer_id += 1
            answer = parse_response(full_answer)
            if isinstance(answer, tuple):
                answer, errors = answer
            else:
                errors = None
            if self.verbose or temperature > 0:
                print(f"Answer (t={temperature}): " + colored(answer, "yellow") + " (" + colored(full_answer, "blue") + ")", file=sys.stderr)
            if answer is None:
                continue
            parsed_answers.append(
                {
                    "temperature": temperature,
                    "answer_id": answer_id,
                    "answer": answer,
                    "errors": errors,
                    "prompt": prompt,
                    "finish_reason": finish_reason,
                    "model": model,
                }
            )

        # there was no valid answer, increase temperature and try again
        if len(parsed_answers) == 0:
            return self.request(prompt, model, parse_response, temperature=temperature + 1, answer_id=answer_id, cache=cache)

        return parsed_answers

    def request_api(self, prompt, model, temperature=0, max_tokens=None):
        if temperature > 10:
            return []

        while True:
            try:
                response = self.call_api(prompt, model, temperature, max_tokens)
                break
            except Exception as e:
                # response was filtered
                if hasattr(e, 'code'):
                    if e.code == 'content_filter':
                        return []
                    print(e.code, file=sys.stderr)
                if hasattr(e, 'error') and e.error['code'] == 'invalid_model_output':
                    return []

                # frequent error is reaching the API limit
                print(colored("Error, retrying...", "red"), file=sys.stderr)
                print(e, file=sys.stderr)
                time.sleep(1)

        answers = []
        for choice in response.choices:
            if choice.message.content is None:
                return []
            if hasattr(choice, "message"):
                answer = choice.message.content.strip()
            else:
                answer = choice.text.strip()

            # one of the responses didn't finish, we need to request more tokens
            if choice.finish_reason != "stop":
                if self.verbose:
                    print(colored(f"Increasing max tokens to fit answers.", "red") + colored(answer, "blue"), file=sys.stderr)
                    print(f"Finish reason: {choice.finish_reason}", file=sys.stderr)
                if max_tokens is None:
                    return []
                return self.request_api(prompt, model, temperature=temperature, max_tokens=max_tokens + 200)

            answers.append({
                "answer": answer,
                "finish_reason": choice.finish_reason,
            })

        if len(answers) > 1:
            # remove duplicate answers
            answers = [dict(t) for t in {tuple(d.items()) for d in answers}]

        return answers

    def call_api(self, prompt, model, temperature, max_tokens):
        parameters = {
            "temperature": temperature/10,
            "top_p": 1,
            "n": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
            "stop": None,
            "model": model
        }

        if max_tokens is not None:
            parameters["max_tokens"] = max_tokens

        if isinstance(prompt, list):
            # check that prompt contain list of dictionaries with role and content
            assert all(isinstance(p, dict) for p in prompt), "Prompts must be a list of dictionaries."
            assert all("role" in p and "content" in p for p in prompt), "Prompts must be a list of dictionaries with role and content."

            parameters["messages"] = prompt
        else:
            parameters["messages"] = [{
                "role": "user",
                "content": prompt,
            }]

        return self.client.chat.completions.create(**parameters)

    def bulk_request(self, df, model, parse_mqm_answer, cache, max_tokens=None):
        answers = []
        for i, row in tqdm.tqdm(df.iterrows(), total=len(df), file=sys.stderr):
            prompt = row["prompt"]
            parsed_answers = self.request(prompt, model, parse_mqm_answer, cache=cache, max_tokens=max_tokens)
            answers += parsed_answers
        return answers
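GptApi.request caches responses keyed by model, temperature, and prompt, and runs every raw answer through the supplied parse_response callback; if no answer parses successfully it retries with the temperature counter increased by one (call_api divides that counter by ten), giving up once it exceeds ten. A minimal usage sketch, assuming OPENAI_API_KEY (or the Azure variables) is set and a model named "gpt-4" is available to the account; the cache path, prompt, and toy parser are placeholders:

# Hypothetical sketch, not part of the package.
import diskcache as dc
from gemba.gpt_api import GptApi

cache = dc.Cache("cache/example", expire=None, size_limit=int(10e10),
                 cull_limit=0, eviction_policy="none")
gptapi = GptApi(verbose=True)

def parse_score(answer):
    # Toy parser: accept only numeric answers; returning None makes
    # request() retry the prompt at a higher temperature.
    try:
        return float(answer.strip())
    except ValueError:
        return None

prompt = "Score the translation 'Hallo Welt' of 'Hello world' from 0 to 100. Answer with a number only."
answers = gptapi.request(prompt, "gpt-4", parse_score, cache=cache)
print(answers[0]["answer"], answers[0]["temperature"])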
gemba/mtme_tools.py
ADDED
@@ -0,0 +1,99 @@
from mt_metrics_eval import data
import scipy

######
# Functions in this script are copied from mt-metrics-eval/wmt22_metrics.ipynb
######


def eval_metrics(eval_sets, langs, levels, primary_only, k, gold_name='std',
                 include_domains=True, seg_level_no_avg=False,
                 include_human_with_acc=False):
  """Evaluate all metrics for eval sets, across multiple task settings.

  Args:
    eval_sets: Map from lang-pair to eval_set objects.
    langs: List of language pairs (eg 'en-de') for which to compute results.
    levels: List of levels for which to compute results, allowed elements are
      'sys' and 'seg'.
    primary_only: Include only primary metrics.
    k: Number of boostrap draws. If 0, no significance tests for metric-score
      differences are run, and execution is much faster.
    gold_name: Name of gold scores to use, standard scores if 'std'.
    include_domains: Generate domain-specific results in addition to global
      results.
    seg_level_no_avg: If True, use only the average_by=None setting for segment-
      level correlations
    include_human_with_acc: If True, include human outputs in accuracy tasks.

  Returns:
    Map from task names to metric -> (rank, corr, sig_string) stats.
  """
  results = {}

  # First task is global accuracy, iff more than one language is given.
  if len(langs) > 0:
    evs_list = [eval_sets[lp] for lp in langs]
    main_refs = [{evs.std_ref} for evs in evs_list]
    close_refs = [set() for evs in evs_list]
    if gold_name == 'std':
      gold = evs_list[0].StdHumanScoreName('sys')
    else:
      gold = gold_name
    humans = [True, False] if include_human_with_acc else [False]
    for human in humans:
      taskname = data.MakeTaskName(
          'wmt22', langs, None, 'sys', human, 'none', 'accuracy', k, gold,
          main_refs, close_refs, False, primary_only)
      print(taskname)
      res = data.CompareMetricsWithGlobalAccuracy(
          evs_list, main_refs, close_refs, include_human=human,
          include_outliers=False, gold_name=gold,
          primary_metrics=primary_only,
          domain=None, k=k, pval=0.05)
      results[taskname] = reformat(res)

  # Remaining tasks are specific to language, domain, etc.
  for lp in langs:
    evs = eval_sets[lp]
    main_refs = {evs.std_ref}
    close_refs = set()
    for domain in [None] + (list(evs.domain_names) if include_domains else []):
      for level in levels:
        gold = evs.StdHumanScoreName(level) if gold_name == 'std' else gold_name
        for avg in 'none', 'sys', 'item':
          if (level == 'sys' or seg_level_no_avg) and avg != 'none':
            continue
          for human in True, False:
            if human == True and len(evs.ref_names) == 1:
              continue  # Single ref
            for corr in 'pearson', 'kendall':
              corr_fcn = {'pearson': scipy.stats.pearsonr,
                          'kendall': scipy.stats.kendalltau}[corr]
              taskname = data.MakeTaskName(
                  'wmt22', lp, domain, level, human, avg, corr, k, gold,
                  main_refs, close_refs, False, primary=primary_only)
              print(taskname)
              corrs = data.GetCorrelations(
                  evs=evs, level=level, main_refs={evs.std_ref},
                  close_refs=close_refs, include_human=human,
                  include_outliers=False, gold_name=gold_name,
                  primary_metrics=primary_only, domain=domain)
              metrics, sig_matrix = data.CompareMetrics(
                  corrs, corr_fcn, average_by=avg, k=k, pval=0.05)
              # Make compatible with accuracy results.
              metrics = {evs.DisplayName(m): v for m, v in metrics.items()}
              results[taskname] = reformat((metrics, sig_matrix))

  return results


def reformat(results):
  """Reformat CompareMetrics() results to match mtme's format."""
  metrics, sig_matrix = results
  res = {}
  for i, (m, (corr, rank)) in enumerate(metrics.items()):
    sigs = ['1' if p < 0.05 else '0' for p in sig_matrix[i]]
    sigs = ['x'] * (i + 1) + sigs[i + 1:]
    res[m] = (rank, corr, ' '.join(sigs))
  return res
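eval_metrics expects a map from language pair to mt-metrics-eval EvalSet objects and returns one dictionary of metric stats per task name. A minimal sketch of how it might be called, assuming the mt-metrics-eval database has already been downloaded (see that project's documentation); the language pairs and settings below are placeholders:

# Hypothetical sketch, not part of the package.
from mt_metrics_eval import data
from gemba.mtme_tools import eval_metrics

langs = ["en-de", "zh-en"]
eval_sets = {lp: data.EvalSet("wmt22", lp) for lp in langs}

# k=0 skips the bootstrap significance tests, which is much faster.
results = eval_metrics(eval_sets, langs, levels=["sys"], primary_only=True, k=0)
for taskname, metric_stats in results.items():
    print(taskname, list(metric_stats.items())[:3])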