pearmut 0.2.11__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pearmut/utils.py CHANGED
@@ -7,37 +7,6 @@ ROOT = "."
  RESET_MARKER = "__RESET__"


- def highlight_differences(a, b):
-     """
-     Compares two strings and wraps their differences in HTML span tags.
-
-     Args:
-         a: The first string.
-         b: The second string.
-
-     Returns:
-         A tuple containing the two strings with their differences highlighted.
-     """
-     import difflib
-     # TODO: maybe on the level of words?
-     s = difflib.SequenceMatcher(None, a, b)
-     res_a, res_b = [], []
-     span_open = '<span class="difference">'
-     span_close = '</span>'
-
-     for tag, i1, i2, j1, j2 in s.get_opcodes():
-         if tag == 'equal' or (i2-i1 <= 2 and j2-j1 <= 2):
-             res_a.append(a[i1:i2])
-             res_b.append(b[j1:j2])
-         else:
-             if tag in ('replace', 'delete'):
-                 res_a.append(f"{span_open}{a[i1:i2]}{span_close}")
-             if tag in ('replace', 'insert'):
-                 res_b.append(f"{span_open}{b[j1:j2]}{span_close}")
-
-     return "".join(res_a), "".join(res_b)
-
-
  def load_progress_data(warn: str | None = None):
      if not os.path.exists(f"{ROOT}/data/progress.json"):
          if warn is not None:
@@ -94,7 +63,7 @@ def get_db_log_item(campaign_id: str, user_id: str | None, item_i: int | None) -
      # Find the last reset marker for this user (if any)
      last_reset_idx = -1
      for i, entry in enumerate(matching):
-         if entry.get("annotations") == RESET_MARKER:
+         if entry.get("annotation") == RESET_MARKER:
              last_reset_idx = i

      # Return only entries after the last reset
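The only change in this hunk is the key checked against `RESET_MARKER`: `annotations` becomes `annotation`. Below is a minimal, self-contained sketch of the surrounding filter logic; the helper name and log structure are illustrative, not pearmut's actual code.
```python
# Illustrative sketch only, not the pearmut implementation: entries logged
# before the user's last "__RESET__" marker are dropped.
RESET_MARKER = "__RESET__"

def entries_after_last_reset(matching: list[dict]) -> list[dict]:
    # Find the last reset marker for this user (if any)
    last_reset_idx = -1
    for i, entry in enumerate(matching):
        if entry.get("annotation") == RESET_MARKER:
            last_reset_idx = i
    # Return only entries after the last reset
    return matching[last_reset_idx + 1:]

log = [
    {"annotation": {"score": 80}},
    {"annotation": RESET_MARKER},
    {"annotation": {"score": 65}},
]
assert entries_after_last_reset(log) == [{"annotation": {"score": 65}}]
```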
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pearmut
- Version: 0.2.11
+ Version: 0.3.1
  Summary: A tool for evaluation of model outputs, primarily MT.
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
  License: MIT
@@ -20,7 +20,7 @@ Dynamic: license-file

  # Pearmut 🍐

- **Platform for Evaluation and Reviewing of Multilingual Tasks** Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
+ **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

  [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
  &nbsp;
@@ -38,7 +38,6 @@ Dynamic: license-file
  - [Campaign Configuration](#campaign-configuration)
  - [Basic Structure](#basic-structure)
  - [Assignment Types](#assignment-types)
- - [Protocol Templates](#protocol-templates)
  - [Advanced Features](#advanced-features)
  - [Pre-filled Error Spans (ESA<sup>AI</sup>)](#pre-filled-error-spans-esaai)
  - [Tutorial and Attention Checks](#tutorial-and-attention-checks)
@@ -51,19 +50,16 @@ Dynamic: license-file
  - [Development](#development)
  - [Citation](#citation)

-
- **Error Span** — A highlighted segment of text marked as containing an error, with optional severity (`minor`, `major`, `neutral`) and MQM category labels.
-
  ## Quick Start

  Install and run locally without cloning:
  ```bash
  pip install pearmut
  # Download example campaigns
- wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/esa_encs.json
- wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/da_enuk.json
+ wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/esa.json
+ wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/da.json
  # Load and start
- pearmut add esa_encs.json da_enuk.json
+ pearmut add esa.json da.json
  pearmut run
  ```

@@ -76,10 +72,10 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
  {
      "info": {
          "assignment": "task-based",
-         "template": "pointwise",
-         "protocol_score": true, # we want scores [0...100] for each segment
-         "protocol_error_spans": true, # we want error spans
-         "protocol_error_categories": false, # we do not want error span categories
+         # DA: scores
+         # ESA: error spans and scores
+         # MQM: error spans, categories, and scores
+         "protocol": "ESA",
      },
      "campaign_id": "wmt25_#_en-cs_CZ",
      "data": [
@@ -90,11 +86,11 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
          {
              "instructions": "Evaluate translation from en to cs_CZ", # message to show to users above the first item
              "src": "This will be the year that Guinness loses its cool. Cheers to that!",
-             "tgt": "Nevím přesně, kdy jsem to poprvé zaznamenal. Možná to bylo ve chvíli, ..."
+             "tgt": {"modelA": "Nevím přesně, kdy jsem to poprvé zaznamenal. Možná to bylo ve chvíli, ..."}
          },
          {
              "src": "I'm not sure I can remember exactly when I sensed it. Maybe it was when some...",
-             "tgt": "Tohle bude rok, kdy Guinness přijde o svůj „cool“ faktor. Na zdraví!"
+             "tgt": {"modelA": "Tohle bude rok, kdy Guinness přijde o svůj „cool“ faktor. Na zdraví!"}
          }
          ...
      ],
@@ -114,11 +110,11 @@ Task items are protocol-specific. For ESA/DA/MQM protocols, each item is a dicti
  [
      {
          "src": "A najednou se všechna tato voda naplnila dalšími lidmi a dalšími věcmi.", # required
-         "tgt": "And suddenly all the water became full of other people and other people." # required
+         "tgt": {"modelA": "And suddenly all the water became full of other people and other people."} # required (dict)
      },
      {
          "src": "toto je pokračování stejného dokumentu",
-         "tgt": "this is a continuation of the same document"
+         "tgt": {"modelA": "this is a continuation of the same document"}
          # Additional keys stored for analysis
      }
  ]
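The hunks above change the campaign format in tandem: the `info` block now takes a single `protocol` name (DA, ESA, or MQM) instead of boolean flags, and each item's `tgt` becomes a dict keyed by model name. The rough migration sketch below converts a 0.2.x-style campaign dict to the new shape; the function name, the fallback key `"modelA"`, and the handling of `data` shapes are assumptions, and it does not migrate `error_spans` or `validation`.
```python
# Illustrative migration sketch, not an official pearmut tool.
def migrate_campaign(campaign: dict, model_name: str = "modelA") -> dict:
    info = campaign["info"]

    # Old boolean flags -> a single protocol name, per the comments in the diff:
    # DA = scores only, ESA = spans + scores, MQM = spans + categories + scores.
    if info.pop("protocol_error_categories", False):
        info["protocol"] = "MQM"
    elif info.pop("protocol_error_spans", False):
        info["protocol"] = "ESA"
    else:
        info["protocol"] = "DA"
    info.pop("protocol_score", None)
    info.pop("template", None)

    # Old "tgt" values (string or list) -> dict keyed by model name.
    def fix_item(item: dict) -> None:
        tgt = item.get("tgt")
        if isinstance(tgt, str):
            item["tgt"] = {model_name: tgt}
        elif isinstance(tgt, list):
            item["tgt"] = {f"model{i}": t for i, t in enumerate(tgt, start=1)}

    for entry in campaign["data"]:
        # "data" may hold items directly or tasks (lists of items).
        for item in (entry if isinstance(entry, list) else [entry]):
            fix_item(item)
    return campaign
```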
@@ -136,16 +132,23 @@ pearmut run
  - **`single-stream`**: All users draw from a shared pool (random assignment)
  - **`dynamic`**: work in progress ⚠️

- ### Protocol Templates
+ ## Advanced Features

- - **Pointwise**: Evaluate single output against single input
-   - `protocol_score`: Collect scores [0-100]
-   - `protocol_error_spans`: Collect error span highlights
-   - `protocol_error_categories`: Collect MQM category labels
- - **Listwise**: Evaluate multiple outputs simultaneously
-   - Same protocol options as pointwise
+ ### Shuffling Model Translations

- ## Advanced Features
+ By default, Pearmut randomly shuffles the order in which models are shown per each item in order to avoid positional bias.
+ The `shuffle` parameter in campaign `info` controls this behavior:
+ ```python
+ {
+     "info": {
+         "assignment": "task-based",
+         "protocol": "ESA",
+         "shuffle": true # Default: true. Set to false to disable shuffling.
+     },
+     "campaign_id": "my_campaign",
+     "data": [...]
+ }
+ ```

  ### Pre-filled Error Spans (ESA<sup>AI</sup>)

@@ -154,25 +157,27 @@ Include `error_spans` to pre-fill annotations that users can review, modify, or
  ```python
  {
      "src": "The quick brown fox jumps over the lazy dog.",
-     "tgt": "Rychlá hnědá liška skáče přes líného psa.",
-     "error_spans": [
-         {
-             "start_i": 0, # character index start (inclusive)
-             "end_i": 5, # character index end (inclusive)
-             "severity": "minor", # "minor", "major", "neutral", or null
-             "category": null # MQM category string or null
-         },
-         {
-             "start_i": 27,
-             "end_i": 32,
-             "severity": "major",
-             "category": null
-         }
-     ]
+     "tgt": {"modelA": "Rychlá hnědá liška skáče přes líného psa."},
+     "error_spans": {
+         "modelA": [
+             {
+                 "start_i": 0, # character index start (inclusive)
+                 "end_i": 5, # character index end (inclusive)
+                 "severity": "minor", # "minor", "major", "neutral", or null
+                 "category": null # MQM category string or null
+             },
+             {
+                 "start_i": 27,
+                 "end_i": 32,
+                 "severity": "major",
+                 "category": null
+             }
+         ]
+     }
  }
  ```

- For **listwise** template, `error_spans` is a 2D array (one per candidate). See [examples/esaai_prefilled.json](examples/esaai_prefilled.json).
+ The `error_spans` field is a 2D array (one per candidate). See [examples/esaai_prefilled.json](examples/esaai_prefilled.json).

  ### Tutorial and Attention Checks

@@ -181,12 +186,16 @@ Add `validation` rules for tutorials or attention checks:
  ```python
  {
      "src": "The quick brown fox jumps.",
-     "tgt": "Rychlá hnědá liška skáče.",
+     "tgt": {"modelA": "Rychlá hnědá liška skáče."},
      "validation": {
-         "warning": "Please set score between 70-80.", # shown on failure (omit for silent logging)
-         "score": [70, 80], # required score range [min, max]
-         "error_spans": [{"start_i": [0, 2], "end_i": [4, 8], "severity": "minor"}], # expected spans
-         "allow_skip": true # show "skip tutorial" button
+         "modelA": [
+             {
+                 "warning": "Please set score between 70-80.", # shown on failure (omit for silent logging)
+                 "score": [70, 80], # required score range [min, max]
+                 "error_spans": [{"start_i": [0, 2], "end_i": [4, 8], "severity": "minor"}], # expected spans
+                 "allow_skip": true # show "skip tutorial" button
+             }
+         ]
      }
  }
  ```
@@ -196,22 +205,25 @@ Add `validation` rules for tutorials or attention checks:
  - **Loud attention checks**: Include `warning` without `allow_skip` to force retry
  - **Silent attention checks**: Omit `warning` to log failures without notification (quality control)

- For listwise, `validation` is an array (one per candidate). Dashboard shows ✅/❌ based on `validation_threshold` in `info` (integer for max failed count, float \[0,1\) for max proportion, default 0).
+ The `validation` field is an array (one per candidate). Dashboard shows ✅/❌ based on `validation_threshold` in `info` (integer for max failed count, float \[0,1\) for max proportion, default 0).

- **Listwise score comparison:** Use `score_greaterthan` to ensure one candidate scores higher than another:
+ **Score comparison:** Use `score_greaterthan` to ensure one candidate scores higher than another:
  ```python
  {
      "src": "AI transforms industries.",
-     "tgt": ["UI transformuje průmysly.", "Umělá inteligence mění obory."],
-     "validation": [
-         {"warning": "A has error, score 20-40.", "score": [20, 40]},
-         {"warning": "B is correct and must score higher than A.", "score": [70, 90], "score_greaterthan": 0}
-     ]
+     "tgt": {"A": "UI transformuje průmysly.", "B": "Umělá inteligence mění obory."},
+     "validation": {
+         "A": [
+             {"warning": "A has error, score 20-40.", "score": [20, 40]}
+         ],
+         "B": [
+             {"warning": "B is correct and must score higher than A.", "score": [70, 90], "score_greaterthan": "A"}
+         ]
+     }
  }
  ```
  The `score_greaterthan` field specifies the index of the candidate that must have a lower score than the current candidate.
-
- See [examples/tutorial_pointwise.json](examples/tutorial_pointwise.json), [examples/tutorial_listwise.json](examples/tutorial_listwise.json), and [examples/tutorial_listwise_score_greaterthan.json](examples/tutorial_listwise_score_greaterthan.json).
+ See [examples/tutorial_kway.json](examples/tutorial_kway.json).

  ### Single-stream Assignment

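In the new format, `validation` keys its rules by the model names used in `tgt`, `score` is an inclusive `[min, max]` range, and `score_greaterthan` names another model whose score must be lower. A minimal sketch of how such rules could be checked against collected scores, assuming a hypothetical helper and a plain `{model: score}` dict; pearmut's actual validator may differ.
```python
# Illustrative sketch (assumed helper, not pearmut's validator): return the
# warnings of rules that the given per-model scores do not satisfy.
def failed_validation_warnings(validation: dict, scores: dict[str, float]) -> list[str]:
    warnings = []
    for model, rules in validation.items():
        for rule in rules:
            ok = True
            if "score" in rule:
                lo, hi = rule["score"]
                ok = ok and lo <= scores[model] <= hi
            if "score_greaterthan" in rule:
                ok = ok and scores[model] > scores[rule["score_greaterthan"]]
            if not ok and "warning" in rule:
                warnings.append(rule["warning"])
    return warnings

validation = {
    "A": [{"warning": "A has error, score 20-40.", "score": [20, 40]}],
    "B": [{"warning": "B is correct and must score higher than A.", "score": [70, 90], "score_greaterthan": "A"}],
}
print(failed_validation_warnings(validation, {"A": 35, "B": 80}))  # []
print(failed_validation_warnings(validation, {"A": 90, "B": 80}))  # both warnings
```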
@@ -221,10 +233,10 @@ All annotators draw from a shared pool with random assignment:
      "campaign_id": "my campaign 6",
      "info": {
          "assignment": "single-stream",
-         "template": "pointwise",
-         "protocol_score": True, # collect scores
-         "protocol_error_spans": True, # collect error spans
-         "protocol_error_categories": False, # do not collect MQM categories, so ESA
+         # DA: scores
+         # MQM: error spans and categories
+         # ESA: error spans and scores
+         "protocol": "ESA",
          "users": 50, # number of annotators (can also be a list, see below)
      },
      "data": [...], # list of all items (shared among all annotators)
@@ -302,30 +314,21 @@ Completion tokens are shown at annotation end for verification (download correct

  <img width="500" alt="Token on completion" src="https://github.com/user-attachments/assets/40eb904c-f47a-4011-aa63-9a4f1c501549" />

- ### Model Results Display
-
- Add `&results` to dashboard URL to show model rankings (requires valid token).
- Items need `model` field (pointwise) or `models` field (listwise) and the `protocol_score` needs to be enable such that the `score` can be used for the ranking:
- ```python
- {"doc_id": "1", "model": "CommandA", "src": "...", "tgt": "..."}
- {"doc_id": "2", "models": ["CommandA", "Claude"], "src": "...", "tgt": ["...", "..."]}
- ```
- See an example in [Campaign Management](#campaign-management)
-
+ When tokens are supplied, the dashboard will try to show model rankings based on the names in the dictionaries.

  ## Terminology
  - **Campaign**: An annotation project that contains configuration, data, and user assignments. Each campaign has a unique identifier and is defined in a JSON file.
  - **Campaign File**: A JSON file that defines the campaign configuration, including the campaign ID, assignment type, protocol settings, and annotation data.
- - **Campaign ID**: A unique identifier for a campaign (e.g., `"wmt25_#_en-cs_CZ"`). Used to reference and manage specific campaigns.
+ - **Campaign ID**: A unique identifier for a campaign (e.g., `"wmt25_#_en-cs_CZ"`). Used to reference and manage specific campaigns. Typically a campaign is created for a specific language and domain.
  - **Task**: A unit of work assigned to a user. In task-based assignment, each task consists of a predefined set of items for a specific user.
- - **Item** A single annotation unit within a task. For translation evaluation, an item typically represents a document (source text and target translation). Items can contain text, images, audio, or video.
- - **Document** A collection of one or more segments (sentence pairs or text units) that are evaluated together as a single item.
+ - **Item**: A single annotation unit within a task. For translation evaluation, an item typically represents a document (source text and target translation). Items can contain text, images, audio, or video.
+ - **Document**: A collection of one or more segments (sentence pairs or text units) that are evaluated together as a single item.
  - **User** / **Annotator**: A person who performs annotations in a campaign. Each user is identified by a unique user ID and accesses the campaign through a unique URL.
- - **Attention Check** A validation item with known correct answers used to ensure annotator quality. Can be:
+ - **Attention Check**: A validation item with known correct answers used to ensure annotator quality. Can be:
    - **Loud**: Shows warning message and forces retry on failure
    - **Silent**: Logs failures without notifying the user (for quality control analysis)
- - **Token** A completion code shown to users when they finish their annotations. Tokens verify the completion and whether the user passed quality control checks:
+ - **Token**: A completion code shown to users when they finish their annotations. Tokens verify the completion and whether the user passed quality control checks:
    - **Pass Token** (`token_pass`): Shown when user meets validation thresholds
    - **Fail Token** (`token_fail`): Shown when user fails to meet validation requirements
  - **Tutorial**: An instructional validation item that teaches users how to annotate. Includes `allow_skip: true` to let users skip if they have seen it before.
@@ -334,11 +337,9 @@ See an example in [Campaign Management](#campaign-management)
  - **Dashboard**: The management interface that shows campaign progress, annotator statistics, access links, and allows downloading annotations. Accessed via a special management URL with token authentication.
  - **Protocol**: The annotation scheme defining what data is collected:
    - **Score**: Numeric quality rating (0-100)
-   - **Error Spans**: Text highlights marking errors
+   - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
    - **Error Categories**: MQM taxonomy labels for errors
- - **Template**: The annotation interface type:
-   - **Pointwise**: Evaluate one output at a time
-   - **Listwise**: Compare multiple outputs simultaneously
+ - **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
  - **Assignment**: The method for distributing items to users:
    - **Task-based**: Each user has predefined items
    - **Single-stream**: Users draw from a shared pool with random assignment
@@ -369,7 +370,7 @@ pearmut run
  2. Add build rule to `webpack.config.js`
  3. Reference as `info->template` in campaign JSON

- See [web/src/pointwise.ts](web/src/pointwise.ts) for example.
+ See [web/src/basic.ts](web/src/basic.ts) for example.

  ### Deployment

@@ -0,0 +1,17 @@
+ pearmut/app.py,sha256=IZNmeKTAuLcf9FggvlHktWDbIGxfykjSRM-sI8Byfik,10179
+ pearmut/assignment.py,sha256=_0hNXtA-Mgn6bRyRVjgeGxERKRvBezR3NmEwx2uME38,11685
+ pearmut/cli.py,sha256=tYzCs7bTuKpt8pIbv8L5SpFHjIVteYyo12KWdrWT1U0,20642
+ pearmut/utils.py,sha256=Rl_i-WCaJN3p_VG5iVL0fSeI481jcJUUEZO6HKx62PE,4347
+ pearmut/static/basic.bundle.js,sha256=9cz_5Jq0KgnWTwkuGqRT2eAY3FHQJM2f2OP1RnNi0s4,110582
+ pearmut/static/basic.html,sha256=Nm0t3uGsbUUso_lFpIpMMEe9iBEDS_Og4tz5vdWhJGo,5473
+ pearmut/static/dashboard.bundle.js,sha256=djacPNoKpxtSP0CzAdEmgPocDyBO0ihFUriCw_RJOhQ,100630
+ pearmut/static/dashboard.html,sha256=HXZzoz44f7LYtAfuP7uQioxTkNmo2_fAN0v2C2s1lAs,2680
+ pearmut/static/favicon.svg,sha256=gVPxdBlyfyJVkiMfh8WLaiSyH4lpwmKZs8UiOeX8YW4,7347
+ pearmut/static/index.html,sha256=yMttallApd0T7sxngUrdwCDrtTQpRIFF0-4W0jfXejU,835
+ pearmut/static/style.css,sha256=hI_Mbvq6BbXfsp-WMpx73tsOL_6QflgrSV1um-3c-hU,4101
+ pearmut-0.3.1.dist-info/licenses/LICENSE,sha256=GtR6RcTdRn-P23h5pKFuWSLZrLPD0ytHAwSOBt7aLpI,1071
+ pearmut-0.3.1.dist-info/METADATA,sha256=_8Wp8dbCNV9glYKPfqrAN_AV9G3WeytqcgTzjoMeDnU,15606
+ pearmut-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ pearmut-0.3.1.dist-info/entry_points.txt,sha256=eEA9LVWsS3neQbMvL_nMvEw8I0oFudw8nQa1iqxOiWM,45
+ pearmut-0.3.1.dist-info/top_level.txt,sha256=CdgtUM-SKQDt6o5g0QreO-_7XTBP9_wnHMS1P-Rl5Go,8
+ pearmut-0.3.1.dist-info/RECORD,,
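Each row of the new RECORD lists a file path, an sha256 digest (urlsafe base64 without padding), and a size in bytes, following the standard wheel RECORD format. Below is a small sketch for verifying an installed file against such a row; the helper is illustrative and not part of pearmut.
```python
# Illustrative sketch: verify one file against a wheel RECORD entry.
# RECORD rows are "path,sha256=<urlsafe-b64 digest without padding>,<size>".
import base64
import hashlib
from pathlib import Path

def check_record_entry(root: Path, row: str) -> bool:
    path, hash_field, size = row.rsplit(",", 2)
    if not hash_field:  # the RECORD file itself has an empty hash field
        return True
    data = (root / path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return hash_field == f"sha256={digest}" and (not size or int(size) == len(data))

# Example (paths are relative to the installed site-packages directory):
row = "pearmut/utils.py,sha256=Rl_i-WCaJN3p_VG5iVL0fSeI481jcJUUEZO6HKx62PE,4347"
print(check_record_entry(Path("site-packages"), row))
```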