pearmut 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pearmut/app.py +46 -27
- pearmut/assignment.py +256 -46
- pearmut/cli.py +45 -8
- pearmut/results_export.py +210 -0
- pearmut/static/basic.bundle.js +1 -1
- pearmut/static/basic.html +1 -1
- pearmut/static/dashboard.bundle.js +1 -1
- pearmut/static/dashboard.html +27 -12
- pearmut/static/index.bundle.js +1 -1
- pearmut/static/index.html +1 -1
- pearmut/utils.py +16 -2
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/METADATA +54 -26
- pearmut-1.0.0.dist-info/RECORD +19 -0
- pearmut-0.3.3.dist-info/RECORD +0 -18
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/WHEEL +0 -0
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/entry_points.txt +0 -0
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/top_level.txt +0 -0
pearmut/app.py
CHANGED
@@ -1,20 +1,23 @@
-import collections
 import json
 import os
-import statistics
 from typing import Any
 
 from fastapi import FastAPI, Query
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, Response
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 
 from .assignment import get_i_item, get_next_item, reset_task, update_progress
+from .results_export import (
+    compute_model_scores,
+    generate_latex_table,
+    generate_pdf,
+    generate_typst_table,
+)
 from .utils import (
     ROOT,
     check_validation_threshold,
-    get_db_log,
     load_progress_data,
     save_db_payload,
     save_progress_data,
@@ -159,7 +162,7 @@ async def _dashboard_data(request: DashboardDataRequest):
 
     progress_new = {}
     assignment = tasks_data[campaign_id]["info"]["assignment"]
-    if assignment not in ["task-based", "single-stream"]:
+    if assignment not in ["task-based", "single-stream", "dynamic"]:
         return JSONResponse(
             content="Unsupported campaign assignment type", status_code=400
         )
@@ -211,31 +214,47 @@ async def _dashboard_results(request: DashboardResultsRequest):
     if token != tasks_data[campaign_id]["token"]:
         return JSONResponse(content="Invalid token", status_code=400)
 
-
-    model_scores = collections.defaultdict(dict)
-
-    # Iterate through all tasks to find items with 'models' field (basic template)
-    log = get_db_log(campaign_id)
-    for entry in log:
-        if "item" not in entry or "annotation" not in entry:
-            continue
-        for item, annotation in zip(entry["item"], entry["annotation"]):
-            for model, annotation in annotation.items():
-                if "score" in annotation and annotation["score"] is not None:
-                    model_scores[model][json.dumps(item)] = annotation["score"]
-
-    results = [
-        {
-            "model": model,
-            "score": statistics.mean(scores.values()),
-            "count": len(scores),
-        }
-        for model, scores in model_scores.items()
-    ]
-    results.sort(key=lambda x: x["score"], reverse=True)
+    results = compute_model_scores(campaign_id)
     return JSONResponse(content=results, status_code=200)
 
 
+@app.get("/export-results")
+async def _export_results(
+    campaign_id: str = Query(),
+    token: str = Query(),
+    format: str = Query(),
+):
+    if campaign_id not in progress_data:
+        return JSONResponse(content="Unknown campaign ID", status_code=400)
+
+    # Check if token is valid
+    if token != tasks_data[campaign_id]["token"]:
+        return JSONResponse(content="Invalid token", status_code=400)
+
+    results = compute_model_scores(campaign_id)
+
+    if format == "typst":
+        content = generate_typst_table(results)
+        return Response(
+            content=content,
+            media_type="text/plain",
+        )
+    elif format == "latex":
+        content = generate_latex_table(results)
+        return Response(
+            content=content,
+            media_type="text/plain",
+        )
+    elif format == "pdf":
+        pdf_bytes = generate_pdf(results, campaign_id)
+        return Response(
+            content=pdf_bytes,
+            media_type="application/pdf",
+        )
+    else:
+        return JSONResponse(content="Invalid export format", status_code=400)
+
+
 class ResetTaskRequest(BaseModel):
     campaign_id: str
     user_id: str
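The new /export-results endpoint serves the aggregated model scores as Typst source, LaTeX source, or a rendered PDF, selected by the format query parameter. A minimal client sketch in Python (the host/port and the placeholder campaign ID and token are assumptions, not part of the release):

    import requests

    # format="latex" returns text/plain LaTeX source; "typst" is also text/plain,
    # "pdf" returns application/pdf bytes. Host/port are assumed values.
    resp = requests.get(
        "http://localhost:8000/export-results",
        params={"campaign_id": "my-campaign", "token": "DASHBOARD_TOKEN", "format": "latex"},
    )
    print(resp.text)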
pearmut/assignment.py
CHANGED
@@ -1,4 +1,6 @@
+import collections
 import random
+import statistics
 from typing import Any
 
 from fastapi.responses import JSONResponse
@@ -6,6 +8,7 @@ from fastapi.responses import JSONResponse
 from .utils import (
     RESET_MARKER,
     check_validation_threshold,
+    get_db_log,
     get_db_log_item,
     save_db_payload,
 )
@@ -20,14 +23,33 @@ def _completed_response(
     """Build a completed response with progress, time, and token."""
     user_progress = progress_data[campaign_id][user_id]
     is_ok = check_validation_threshold(tasks_data, progress_data, campaign_id, user_id)
+    token = user_progress["token_correct" if is_ok else "token_incorrect"]
+
+    # Get instructions_goodbye from campaign info, with default value
+    instructions_goodbye = tasks_data[campaign_id]["info"].get(
+        "instructions_goodbye",
+        "If someone asks you for a token of completion, show them: ${TOKEN}",
+    )
+
+    # Replace variables ${TOKEN} and ${USER_ID}
+    instructions_goodbye = instructions_goodbye.replace("${TOKEN}", token).replace(
+        "${USER_ID}", user_id
+    )
+
+    # Convert sets to lists for JSON serialization (for dynamic assignment)
+    progress = user_progress["progress"]
+    if progress and isinstance(progress[0], set):
+        progress = [list(s) for s in progress]
+
     return JSONResponse(
         content={
-            "status": "
-            "progress":
+            "status": "goodbye",
+            "progress": progress,
             "time": user_progress["time"],
-            "token":
+            "token": token,
+            "instructions_goodbye": instructions_goodbye,
         },
-        status_code=200
+        status_code=200,
     )
 
 
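The completion response now carries a configurable instructions_goodbye message in which ${TOKEN} and ${USER_ID} are filled in with plain str.replace, as the hunk above shows. A quick illustration (the token and user ID values are invented):

    template = "If someone asks you for a token of completion, show them: ${TOKEN}"
    print(template.replace("${TOKEN}", "a1b2c3").replace("${USER_ID}", "annotator0"))
    # -> If someone asks you for a token of completion, show them: a1b2c3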
@@ -44,7 +66,9 @@ def get_next_item(
     if assignment == "task-based":
         return get_next_item_taskbased(campaign_id, user_id, tasks_data, progress_data)
     elif assignment == "single-stream":
-        return get_next_item_singlestream(
+        return get_next_item_singlestream(
+            campaign_id, user_id, tasks_data, progress_data
+        )
     elif assignment == "dynamic":
         return get_next_item_dynamic(campaign_id, user_id, tasks_data, progress_data)
     else:
@@ -63,11 +87,17 @@ def get_i_item(
     """
     assignment = tasks_data[campaign_id]["info"]["assignment"]
     if assignment == "task-based":
-        return get_i_item_taskbased(
+        return get_i_item_taskbased(
+            campaign_id, user_id, tasks_data, progress_data, item_i
+        )
     elif assignment == "single-stream":
-        return get_i_item_singlestream(
+        return get_i_item_singlestream(
+            campaign_id, user_id, tasks_data, progress_data, item_i
+        )
     else:
-        return JSONResponse(
+        return JSONResponse(
+            content="Get item not supported for this assignment type", status_code=400
+        )
 
 
 def get_i_item_taskbased(
@@ -93,10 +123,7 @@ def get_i_item_taskbased(
             payload_existing["comment"] = latest_item["comment"]
 
     if item_i < 0 or item_i >= len(data_all[campaign_id]["data"][user_id]):
-        return JSONResponse(
-            content="Item index out of range",
-            status_code=400
-        )
+        return JSONResponse(content="Item index out of range", status_code=400)
 
     return JSONResponse(
         content={
@@ -105,14 +132,16 @@ def get_i_item_taskbased(
             "time": user_progress["time"],
             "info": {
                 "item_i": item_i,
-            }
+            }
+            | {
                 k: v
                 for k, v in data_all[campaign_id]["info"].items()
                 if k.startswith("protocol")
             },
-            "payload": data_all[campaign_id]["data"][user_id][item_i]
-        }
-
+            "payload": data_all[campaign_id]["data"][user_id][item_i],
+        }
+        | ({"payload_existing": payload_existing} if payload_existing else {}),
+        status_code=200,
     )
 
 
@@ -140,10 +169,7 @@ def get_i_item_singlestream(
             payload_existing["comment"] = latest_item["comment"]
 
     if item_i < 0 or item_i >= len(data_all[campaign_id]["data"]):
-        return JSONResponse(
-            content="Item index out of range",
-            status_code=400
-        )
+        return JSONResponse(content="Item index out of range", status_code=400)
 
     return JSONResponse(
         content={
@@ -152,14 +178,16 @@ def get_i_item_singlestream(
             "time": user_progress["time"],
             "info": {
                 "item_i": item_i,
-            }
+            }
+            | {
                 k: v
                 for k, v in data_all[campaign_id]["info"].items()
                 if k.startswith("protocol")
             },
-            "payload": data_all[campaign_id]["data"][item_i]
-        }
-
+            "payload": data_all[campaign_id]["data"][item_i],
+        }
+        | ({"payload_existing": payload_existing} if payload_existing else {}),
+        status_code=200,
     )
 
 
@@ -196,14 +224,16 @@ def get_next_item_taskbased(
             "time": user_progress["time"],
             "info": {
                 "item_i": item_i,
-            }
+            }
+            | {
                 k: v
                 for k, v in data_all[campaign_id]["info"].items()
                 if k.startswith("protocol")
             },
-            "payload": data_all[campaign_id]["data"][user_id][item_i]
-        }
-
+            "payload": data_all[campaign_id]["data"][user_id][item_i],
+        }
+        | ({"payload_existing": payload_existing} if payload_existing else {}),
+        status_code=200,
     )
 
 
@@ -249,21 +279,176 @@ def get_next_item_singlestream(
             "progress": progress,
             "info": {
                 "item_i": item_i,
-            }
+            }
+            | {
                 k: v
                 for k, v in data_all[campaign_id]["info"].items()
                 if k.startswith("protocol")
             },
-            "payload": data_all[campaign_id]["data"][item_i]
-        }
-
+            "payload": data_all[campaign_id]["data"][item_i],
+        }
+        | ({"payload_existing": payload_existing} if payload_existing else {}),
+        status_code=200,
     )
 
 
+def get_next_item_dynamic(
+    campaign_id: str,
+    user_id: str,
+    tasks_data: dict,
+    progress_data: dict,
+) -> JSONResponse:
+    """
+    Get the next item for dynamic assignment based on model performance.
+
+    NOTE: All items must contain all model outputs for this assignment type to work.
+
+    In this mode, items are selected based on the current performance of models:
+    1. Contrastive comparison: `dynamic_contrastive_models` models are randomly selected and shown per item
+    2. First phase: Each model gets `dynamic_first` annotations with fully random selection
+    3. After first phase: Top `dynamic_top` models are identified, K randomly selected from them
+    4. Items with least annotations for the selected models are prioritized
+    5. With probability `dynamic_backoff`, uniformly random selection is used instead
+    """
+    import random
+
+    user_progress = progress_data[campaign_id][user_id]
+    campaign_data = tasks_data[campaign_id]
 
-
-
+    # Get all unique models in the campaign (all items must have all models)
+    all_models = list(set(campaign_data["data"][0][0]["tgt"].keys()))
 
+    # Check if completed (all models completed for all items)
+    # NOTE: this will rarely trigger but we don't have a good way to know when to end anyway for now
+    if all(len(v) == len(all_models) for v in user_progress["progress"]):
+        return _completed_response(tasks_data, progress_data, campaign_id, user_id)
+
+    # Get configuration parameters
+    dynamic_top = campaign_data["info"].get("dynamic_top", 2)
+    dynamic_first = campaign_data["info"].get("dynamic_first", 5)
+    dynamic_contrastive_models = campaign_data["info"].get(
+        "dynamic_contrastive_models", 1
+    )
+    dynamic_backoff = campaign_data["info"].get("dynamic_backoff", 0)
+
+    # Count annotations per (model, item) pair to track coverage
+    annotations = get_db_log(campaign_id)
+    model_item_counts = collections.defaultdict(int)  # (model, item_i) -> count
+    model_total_counts = collections.defaultdict(int)  # model -> total count
+
+    for annotation_line in annotations:
+        if (item_i := annotation_line.get("item_i")) is not None:
+            # Count which models were annotated in this annotation
+            for annotation_item in annotation_line.get("annotation", []):
+                for model in annotation_item:
+                    model_item_counts[(model, item_i)] += 1
+                    model_total_counts[model] += 1
+
+    # Check if we're still in the first phase (collecting initial data)
+    in_first_phase = any(
+        model_total_counts.get(model, 0) < dynamic_first for model in all_models
+    )
+
+    # Select which models to show
+    if in_first_phase:
+        # First phase or backoff: select models that don't have enough annotations yet
+        selected_models = random.sample(
+            [
+                model
+                for model in all_models
+                if model_total_counts.get(model, 0) < dynamic_first
+            ],
+            k=min(dynamic_contrastive_models, len(all_models)),
+        )
+    elif random.random() < dynamic_backoff:
+        # Backoff: select K models randomly from all models
+        selected_models = random.sample(
+            all_models, k=min(dynamic_contrastive_models, len(all_models))
+        )
+    else:
+        # Calculate model scores from annotations
+        model_scores = collections.defaultdict(list)
+        for annotation_line in annotations:
+            for annotation_item in annotation_line.get("annotation", {}):
+                for model in annotation_item:
+                    if "score" in annotation_item[model]:
+                        model_scores[model].append(annotation_item[model]["score"])
+
+        # Calculate average scores
+        model_avg_scores = {
+            model: statistics.mean(scores) for model, scores in model_scores.items()
+        }
+
+        # Get top N models
+        sorted_models = sorted(
+            model_avg_scores.items(), key=lambda x: x[1], reverse=True
+        )
+        top_models = [model for model, score in sorted_models[:dynamic_top]]
+
+        # From top N, randomly select K models
+        selected_models = random.sample(
+            top_models, k=min(dynamic_contrastive_models, len(top_models))
+        )
+
+    # Find incomplete items for the selected models (items where not all selected models are done)
+    item_annotation_counts = {
+        i: sum(model in completed_models for model in selected_models)
+        for i, completed_models in enumerate(user_progress["progress"])
+    }
+
+    # Select item with minimum annotations (with random tiebreaking)
+    min_annotations = min(item_annotation_counts.values())
+    items_with_min = [
+        item_i
+        for item_i, count in item_annotation_counts.items()
+        if count == min_annotations
+    ]
+    item_i = random.choice(items_with_min)
+
+    # Prune the payload to only include selected models
+    original_item = campaign_data["data"][item_i]
+    pruned_item = []
+    for doc_segment in original_item:
+        pruned_segment = doc_segment.copy()
+        # Filter tgt to only include selected models
+        pruned_segment["tgt"] = {
+            model: doc_segment["tgt"][model]
+            for model in selected_models
+            if model in doc_segment["tgt"]
+        }
+        # Also filter error_spans if present
+        if "error_spans" in doc_segment:
+            pruned_segment["error_spans"] = {
+                model: doc_segment["error_spans"][model]
+                for model in selected_models
+                if model in doc_segment.get("error_spans", {})
+            }
+        # Also filter validation if present
+        if "validation" in doc_segment:
+            pruned_segment["validation"] = {
+                model: doc_segment["validation"][model]
+                for model in selected_models
+                if model in doc_segment.get("validation", {})
+            }
+        pruned_item.append(pruned_segment)
+
+    return JSONResponse(
+        content={
+            "status": "ok",
+            "time": user_progress["time"],
+            "progress": user_progress["progress"],
+            "info": {
+                "item_i": item_i,
+            }
+            | {
                k: v
                for k, v in campaign_data["info"].items()
                if k.startswith("protocol")
+            },
+            "payload": pruned_item,
+        },
+        status_code=200,
    )
 
 
 def _reset_user_time(progress_data: dict, campaign_id: str, user_id: str) -> None:
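The model-selection policy in get_next_item_dynamic has three branches: while any model has fewer than dynamic_first annotations, sample among the under-annotated models; otherwise, with probability dynamic_backoff, sample uniformly from all models; otherwise sample from the dynamic_top models ranked by mean score. A standalone sketch of just that policy under invented counts and scores (not the packaged function itself):

    import random, statistics

    def select_models(all_models, total_counts, scores, first=5, top=2, k=1, backoff=0.0):
        under = [m for m in all_models if total_counts.get(m, 0) < first]
        if under:  # first phase: some model still lacks its initial annotations
            return random.sample(under, k=min(k, len(under)))
        if random.random() < backoff:  # occasional uniform exploration
            return random.sample(all_models, k=min(k, len(all_models)))
        avg = {m: statistics.mean(s) for m, s in scores.items()}
        best = [m for m, _ in sorted(avg.items(), key=lambda x: x[1], reverse=True)[:top]]
        return random.sample(best, k=min(k, len(best)))

    # With every model past the first phase, only the top-2 by score ("c", "a") can be picked.
    print(select_models(["a", "b", "c"], {"a": 5, "b": 5, "c": 5},
                        {"a": [80], "b": [60], "c": [90]}))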
|
|
289
474
|
# Save reset marker for this user to mask existing annotations
|
|
290
475
|
num_items = len(tasks_data[campaign_id]["data"][user_id])
|
|
291
476
|
for item_i in range(num_items):
|
|
292
|
-
save_db_payload(
|
|
293
|
-
|
|
294
|
-
"item_i": item_i,
|
|
295
|
-
|
|
296
|
-
})
|
|
477
|
+
save_db_payload(
|
|
478
|
+
campaign_id,
|
|
479
|
+
{"user_id": user_id, "item_i": item_i, "annotation": RESET_MARKER},
|
|
480
|
+
)
|
|
297
481
|
progress_data[campaign_id][user_id]["progress"] = [False] * num_items
|
|
298
482
|
_reset_user_time(progress_data, campaign_id, user_id)
|
|
299
483
|
return JSONResponse(content="ok", status_code=200)
|
|
@@ -301,18 +485,32 @@ def reset_task(
|
|
|
301
485
|
# Save reset markers for all items (shared pool)
|
|
302
486
|
num_items = len(tasks_data[campaign_id]["data"])
|
|
303
487
|
for item_i in range(num_items):
|
|
304
|
-
save_db_payload(
|
|
305
|
-
|
|
306
|
-
"item_i": item_i,
|
|
307
|
-
|
|
308
|
-
})
|
|
488
|
+
save_db_payload(
|
|
489
|
+
campaign_id,
|
|
490
|
+
{"user_id": None, "item_i": item_i, "annotation": RESET_MARKER},
|
|
491
|
+
)
|
|
309
492
|
# for single-stream reset all progress
|
|
310
493
|
for uid in progress_data[campaign_id]:
|
|
311
494
|
progress_data[campaign_id][uid]["progress"] = [False] * num_items
|
|
312
495
|
_reset_user_time(progress_data, campaign_id, user_id)
|
|
313
496
|
return JSONResponse(content="ok", status_code=200)
|
|
497
|
+
elif assignment == "dynamic":
|
|
498
|
+
# Save reset markers for all items (shared pool like single-stream)
|
|
499
|
+
num_items = len(tasks_data[campaign_id]["data"])
|
|
500
|
+
for item_i in range(num_items):
|
|
501
|
+
save_db_payload(
|
|
502
|
+
campaign_id,
|
|
503
|
+
{"user_id": None, "item_i": item_i, "annotation": RESET_MARKER},
|
|
504
|
+
)
|
|
505
|
+
# for dynamic reset all progress (use sets to track models)
|
|
506
|
+
for uid in progress_data[campaign_id]:
|
|
507
|
+
progress_data[campaign_id][uid]["progress"] = [[] for _ in range(num_items)]
|
|
508
|
+
_reset_user_time(progress_data, campaign_id, user_id)
|
|
509
|
+
return JSONResponse(content="ok", status_code=200)
|
|
314
510
|
else:
|
|
315
|
-
return JSONResponse(
|
|
511
|
+
return JSONResponse(
|
|
512
|
+
content="Reset not supported for this assignment type", status_code=400
|
|
513
|
+
)
|
|
316
514
|
|
|
317
515
|
|
|
318
516
|
def update_progress(
|
|
@@ -337,6 +535,18 @@ def update_progress(
|
|
|
337
535
|
progress_data[campaign_id][uid]["progress"][item_i] = True
|
|
338
536
|
return JSONResponse(content="ok", status_code=200)
|
|
339
537
|
elif assignment == "dynamic":
|
|
340
|
-
|
|
538
|
+
# For dynamic, track which models were annotated
|
|
539
|
+
# Extract models from the payload annotation
|
|
540
|
+
annotated_models = []
|
|
541
|
+
if "annotation" in payload:
|
|
542
|
+
for annotation_item in payload.get("annotation", []):
|
|
543
|
+
if isinstance(annotation_item, dict):
|
|
544
|
+
annotated_models.extend(annotation_item.keys())
|
|
545
|
+
|
|
546
|
+
# Update progress for all users (shared pool)
|
|
547
|
+
for uid in progress_data[campaign_id]:
|
|
548
|
+
# Add the newly annotated models
|
|
549
|
+
progress_data[campaign_id][uid]["progress"][item_i].extend(annotated_models)
|
|
550
|
+
return JSONResponse(content="ok", status_code=200)
|
|
341
551
|
else:
|
|
342
552
|
return JSONResponse(content="Unknown campaign assignment type", status_code=400)
|
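For dynamic campaigns, progress is no longer one boolean per item but one list of annotated model names per item, shared across all users: update_progress extends the item's list and reset_task reinitializes it to empty lists. Illustrative values (invented model names):

    progress_single_stream = [False, True, False]    # one done-flag per item
    progress_dynamic = [["m1"], [], ["m1", "m2"]]    # per item: models annotated so far
    progress_dynamic[1].extend(["m2"])               # what update_progress does per annotation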
pearmut/cli.py
CHANGED
@@ -179,7 +179,7 @@ def _shuffle_campaign_data(campaign_data, rng):
         for user_id, task in campaign_data["data"].items():
             for doc in task:
                 shuffle_document(doc)
-    elif assignment
+    elif assignment in ["single-stream", "dynamic"]:
         # Shuffle each document in the shared pool
         for doc in campaign_data["data"]:
             shuffle_document(doc)
@@ -259,8 +259,46 @@ def _add_single_campaign(data_file, overwrite, server):
         else:
             raise ValueError("'users' must be an integer or a list.")
     elif assignment == "dynamic":
-
-
+        tasks = campaign_data["data"]
+        if users_spec is None:
+            raise ValueError(
+                "Dynamic campaigns must specify 'users' in info.")
+        if not isinstance(campaign_data["data"], list):
+            raise ValueError(
+                "Dynamic campaign 'data' must be a list of items.")
+        # Validate item structure for dynamic
+        for doc_i, doc in enumerate(tasks):
+            try:
+                _validate_item_structure(doc)
+            except ValueError as e:
+                raise ValueError(f"Document {doc_i}: {e}")
+        if isinstance(users_spec, int):
+            num_users = users_spec
+        elif isinstance(users_spec, list):
+            num_users = len(users_spec)
+        else:
+            raise ValueError("'users' must be an integer or a list.")
+        # Validate dynamic-specific parameters
+        if "dynamic_top" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_top"] = 2
+        if "dynamic_first" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_first"] = 5
+        if "dynamic_contrastive_models" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_contrastive_models"] = 1
+        # Validate that dynamic_first is at least 1
+        assert campaign_data["info"]["dynamic_first"] >= 1, "dynamic_first must be at least 1"
+        # Validate that dynamic_contrastive_models is at most dynamic_top
+        assert campaign_data["info"]["dynamic_contrastive_models"] <= campaign_data["info"]["dynamic_top"], \
+            "dynamic_contrastive_models must be at most dynamic_top"
+        # Validate that all items have the same models
+        all_models = set()
+        for item in campaign_data["data"]:
+            if item and len(item) > 0:
+                all_models.update(item[0]["tgt"].keys())
+        for item in campaign_data["data"]:
+            if item and len(item) > 0:
+                item_models = set(item[0]["tgt"].keys())
+                assert item_models == all_models, "All items must have the same model outputs"
     else:
         raise ValueError(f"Unknown campaign assignment type: {assignment}")
 
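Together, these checks imply the shape of a dynamic campaign file: info must name users and may tune the dynamic_* parameters (defaults 2/5/1 as set above, plus dynamic_backoff read at serving time), and data is a shared list of documents whose first segment's tgt carries one output per model, with identical model sets across items. A hypothetical minimal example (all values invented; segments may carry further fields such as per-model error_spans or validation):

    {
      "campaign_id": "example-dynamic",
      "info": {
        "assignment": "dynamic",
        "users": 3,
        "dynamic_top": 2,
        "dynamic_first": 5,
        "dynamic_contrastive_models": 1,
        "dynamic_backoff": 0.1
      },
      "data": [
        [{"tgt": {"model_a": "output for item 0", "model_b": "output for item 0"}}],
        [{"tgt": {"model_a": "output for item 1", "model_b": "output for item 1"}}]
      ]
    }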
@@ -310,13 +348,13 @@ def _add_single_campaign(data_file, overwrite, server):
         os.remove(output_file)
 
     # For task-based, data is a dict mapping user_id -> tasks
-    # For single-stream, data is a flat list (shared among all users)
+    # For single-stream and dynamic, data is a flat list (shared among all users)
     if assignment == "task-based":
         campaign_data["data"] = {
             user_id: task
             for user_id, task in zip(user_ids, tasks)
         }
-    elif assignment
+    elif assignment in ["single-stream", "dynamic"]:
         campaign_data["data"] = tasks
 
     # generate a token for dashboard access if not present
@@ -338,6 +376,7 @@ def _add_single_campaign(data_file, overwrite, server):
         "progress": (
             [False]*len(campaign_data["data"][user_id]) if assignment == "task-based"
             else [False]*len(campaign_data["data"]) if assignment == "single-stream"
+            else [list() for _ in range(len(campaign_data["data"]))] if assignment == "dynamic"
             else []
         ),
         "time_start": None,
@@ -421,9 +460,7 @@ def _add_single_campaign(data_file, overwrite, server):
         json.dump(campaign_data, f, indent=2, ensure_ascii=False)
 
     progress_data[campaign_data['campaign_id']] = user_progress
-
-    with open(f"{ROOT}/data/progress.json", "w") as f:
-        json.dump(progress_data, f, indent=2, ensure_ascii=False)
+    save_progress_data(progress_data)
 
 
     print(