pearmut 0.3.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pearmut/app.py +52 -29
- pearmut/assignment.py +256 -46
- pearmut/cli.py +104 -25
- pearmut/results_export.py +210 -0
- pearmut/static/basic.bundle.js +1 -1
- pearmut/static/basic.html +25 -2
- pearmut/static/dashboard.bundle.js +1 -1
- pearmut/static/dashboard.html +28 -13
- pearmut/static/index.bundle.js +1 -0
- pearmut/static/index.html +1 -1
- pearmut/static/style.css +1 -1
- pearmut/utils.py +16 -2
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/METADATA +56 -27
- pearmut-1.0.0.dist-info/RECORD +19 -0
- pearmut-0.3.2.dist-info/RECORD +0 -17
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/WHEEL +0 -0
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/entry_points.txt +0 -0
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {pearmut-0.3.2.dist-info → pearmut-1.0.0.dist-info}/top_level.txt +0 -0
pearmut/cli.py
CHANGED
@@ -34,21 +34,25 @@ def _run(args_unknown):
 
     # print access dashboard URL for all campaigns
     if tasks_data:
-
-
-
-
-
-
-
-
-
+        dashboard_url = args.server + "/dashboard.html?" + "&".join([
+            f"campaign_id={urllib.parse.quote_plus(campaign_id)}&token={campaign_data["token"]}"
+            for campaign_id, campaign_data in tasks_data.items()
+        ])
+        print("\033[92mNow serving Pearmut, use the following URL to access the everything-dashboard:\033[0m")
+        print("🍐", dashboard_url+"\n", flush=True)
+
+    # disable startup message
+    uvicorn.config.LOGGING_CONFIG["loggers"]["uvicorn.error"]["level"] = "WARNING"
+    # set time logging
+    uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M"
+    uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["fmt"] = (
+        '%(asctime)s %(levelprefix)s %(client_addr)s - %(request_line)s %(status_code)s'
+    )
     uvicorn.run(
         app,
         host="0.0.0.0",
         port=args.port,
         reload=False,
-        # log_level="info",
     )
 
 
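For reference, here is a sketch of the URL shape the new block produces, using hypothetical campaign data (not taken from the package):

    import urllib.parse

    tasks_data = {"demo campaign": {"token": "abc123"}}
    server = "http://localhost:8001"
    dashboard_url = server + "/dashboard.html?" + "&".join(
        f"campaign_id={urllib.parse.quote_plus(cid)}&token={cdata['token']}"
        for cid, cdata in tasks_data.items()
    )
    # http://localhost:8001/dashboard.html?campaign_id=demo+campaign&token=abc123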
@@ -108,6 +112,38 @@ def _validate_item_structure(items):
         raise ValueError(f"Validation rule for model '{model_name}' must be a dictionary")
 
 
+def _validate_document_models(doc):
+    """
+    Validate that all items in a document have the same model outputs.
+
+    Args:
+        doc: List of items in a document
+
+    Returns:
+        None if valid
+
+    Raises:
+        ValueError: If items have different model outputs
+    """
+    # Get model names from the first item
+    first_item = doc[0]
+    first_models = set(first_item['tgt'].keys())
+
+    # Check all other items have the same model names
+    for i, item in enumerate(doc[1:], start=1):
+        if 'tgt' not in item or not isinstance(item['tgt'], dict):
+            continue
+
+        item_models = set(item['tgt'].keys())
+        if item_models != first_models:
+            raise ValueError(
+                f"Document contains items with different model outputs. "
+                f"Item 0 has models {sorted(first_models)}, but item {i} has models {sorted(item_models)}. "
+                f"This is fine, but we can't shuffle (on by default). "
+                f"To fix this, set 'shuffle': false in the campaign 'info' section. "
+            )
+
+
 def _shuffle_campaign_data(campaign_data, rng):
     """
     Shuffle campaign data at the document level in-place
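To illustrate the new validator: in the hypothetical two-item document below, the second item is missing model "b", so shuffling model order would be ambiguous and a ValueError is raised.

    doc = [
        {"src": "x", "tgt": {"a": "output a1", "b": "output b1"}},
        {"src": "y", "tgt": {"a": "output a2"}},
    ]
    _validate_document_models(doc)  # ValueError: ['a', 'b'] vs ['a']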
@@ -120,14 +156,11 @@ def _shuffle_campaign_data(campaign_data, rng):
     """
     def shuffle_document(doc):
         """Shuffle a single document (list of items) by reordering models in tgt dict."""
-
-
+        # Validate that all items have the same models
+        _validate_document_models(doc)
 
         # Get all model names from the first item's tgt dict
         first_item = doc[0]
-        if 'tgt' not in first_item or not isinstance(first_item['tgt'], dict):
-            return
-
         model_names = list(first_item['tgt'].keys())
         rng.shuffle(model_names)
 
@@ -146,7 +179,7 @@ def _shuffle_campaign_data(campaign_data, rng):
         for user_id, task in campaign_data["data"].items():
             for doc in task:
                 shuffle_document(doc)
-    elif assignment == "single-stream":
+    elif assignment in ["single-stream", "dynamic"]:
         # Shuffle each document in the shared pool
         for doc in campaign_data["data"]:
             shuffle_document(doc)
@@ -226,8 +259,46 @@ def _add_single_campaign(data_file, overwrite, server):
         else:
             raise ValueError("'users' must be an integer or a list.")
     elif assignment == "dynamic":
-
-
+        tasks = campaign_data["data"]
+        if users_spec is None:
+            raise ValueError(
+                "Dynamic campaigns must specify 'users' in info.")
+        if not isinstance(campaign_data["data"], list):
+            raise ValueError(
+                "Dynamic campaign 'data' must be a list of items.")
+        # Validate item structure for dynamic
+        for doc_i, doc in enumerate(tasks):
+            try:
+                _validate_item_structure(doc)
+            except ValueError as e:
+                raise ValueError(f"Document {doc_i}: {e}")
+        if isinstance(users_spec, int):
+            num_users = users_spec
+        elif isinstance(users_spec, list):
+            num_users = len(users_spec)
+        else:
+            raise ValueError("'users' must be an integer or a list.")
+        # Validate dynamic-specific parameters
+        if "dynamic_top" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_top"] = 2
+        if "dynamic_first" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_first"] = 5
+        if "dynamic_contrastive_models" not in campaign_data["info"]:
+            campaign_data["info"]["dynamic_contrastive_models"] = 1
+        # Validate that dynamic_first is at least 1
+        assert campaign_data["info"]["dynamic_first"] >= 1, "dynamic_first must be at least 1"
+        # Validate that dynamic_contrastive_models is at most dynamic_top
+        assert campaign_data["info"]["dynamic_contrastive_models"] <= campaign_data["info"]["dynamic_top"], \
+            "dynamic_contrastive_models must be at most dynamic_top"
+        # Validate that all items have the same models
+        all_models = set()
+        for item in campaign_data["data"]:
+            if item and len(item) > 0:
+                all_models.update(item[0]["tgt"].keys())
+        for item in campaign_data["data"]:
+            if item and len(item) > 0:
+                item_models = set(item[0]["tgt"].keys())
+                assert item_models == all_models, "All items must have the same model outputs"
     else:
         raise ValueError(f"Unknown campaign assignment type: {assignment}")
 
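Taken together, a campaign definition along these lines should pass the new "dynamic" branch (a sketch; the field names follow the diff, the values are hypothetical):

    campaign_data = {
        "campaign_id": "demo-dynamic",
        "info": {
            "assignment": "dynamic",
            "users": 3,
            "dynamic_top": 2,                  # default if omitted
            "dynamic_first": 5,                # default if omitted; must be >= 1
            "dynamic_contrastive_models": 1,   # default if omitted; must be <= dynamic_top
        },
        "data": [
            [{"src": "x", "tgt": {"modelA": "...", "modelB": "..."}}],
            [{"src": "y", "tgt": {"modelA": "...", "modelB": "..."}}],
        ],
    }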
@@ -270,14 +341,20 @@ def _add_single_campaign(data_file, overwrite, server):
         campaign_data["info"]["protocol"] = "ESA"
         print("Warning: 'protocol' not specified in campaign info. Defaulting to 'ESA'.")
 
+    # Remove output file when overwriting (after all validations pass)
+    if overwrite and campaign_data['campaign_id'] in progress_data:
+        output_file = f"{ROOT}/data/outputs/{campaign_data['campaign_id']}.jsonl"
+        if os.path.exists(output_file):
+            os.remove(output_file)
+
     # For task-based, data is a dict mapping user_id -> tasks
-    # For single-stream, data is a flat list (shared among all users)
+    # For single-stream and dynamic, data is a flat list (shared among all users)
     if assignment == "task-based":
         campaign_data["data"] = {
             user_id: task
             for user_id, task in zip(user_ids, tasks)
         }
-    elif assignment == "single-stream":
+    elif assignment in ["single-stream", "dynamic"]:
         campaign_data["data"] = tasks
 
     # generate a token for dashboard access if not present
@@ -299,6 +376,7 @@ def _add_single_campaign(data_file, overwrite, server):
         "progress": (
             [False]*len(campaign_data["data"][user_id]) if assignment == "task-based"
             else [False]*len(campaign_data["data"]) if assignment == "single-stream"
+            else [list() for _ in range(len(campaign_data["data"]))] if assignment == "dynamic"
             else []
         ),
         "time_start": None,
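The initial progress entry thus takes one of three shapes depending on the assignment type; for a three-document campaign, roughly:

    [False, False, False]   # task-based / single-stream: one completion flag per document
    [[], [], []]            # dynamic: one (initially empty) list per document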
@@ -382,9 +460,7 @@ def _add_single_campaign(data_file, overwrite, server):
         json.dump(campaign_data, f, indent=2, ensure_ascii=False)
 
     progress_data[campaign_data['campaign_id']] = user_progress
-
-    with open(f"{ROOT}/data/progress.json", "w") as f:
-        json.dump(progress_data, f, indent=2, ensure_ascii=False)
+    save_progress_data(progress_data)
 
 
     print(
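The inline write is replaced by a save_progress_data helper. Judging by the removed lines and the utils.py changes in this release, it presumably wraps the same logic, roughly:

    def save_progress_data(progress_data):
        with open(f"{ROOT}/data/progress.json", "w") as f:
            json.dump(progress_data, f, indent=2, ensure_ascii=False)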
@@ -395,7 +471,7 @@ def _add_single_campaign(data_file, overwrite, server):
     )
     for user_id, user_val in user_progress.items():
         # point to the protocol URL
-        print(f'{server}/{user_val["url"]}')
+        print(f'🧑 {server}/{user_val["url"]}')
     print()
 
 
@@ -469,10 +545,14 @@ def main():
         help='Optional campaign name to purge (purges all if not specified)'
     )
     purge_args = purge_args.parse_args(args_unknown)
+    progress_data = load_progress_data()
 
     if purge_args.campaign is not None:
         # Purge specific campaign
         campaign_id = purge_args.campaign
+        if campaign_id not in progress_data:
+            print(f"Campaign '{campaign_id}' does not exist.")
+            return
         confirm = input(
             f"Are you sure you want to purge campaign '{campaign_id}'? This action cannot be undone. [y/n] "
         )
@@ -502,7 +582,6 @@ def main():
         )
         if confirm.lower() == 'y':
             # Unlink all assets first
-            progress_data = load_progress_data()
             for campaign_id in progress_data.keys():
                 _unlink_assets(campaign_id)
             shutil.rmtree(f"{ROOT}/data/tasks", ignore_errors=True)
pearmut/results_export.py
ADDED
@@ -0,0 +1,210 @@
+import collections
+import json
+import os
+import statistics
+
+from .utils import get_db_log
+
+
+def comparison_significant(
+    scores1: dict[str, float], scores2: dict[str, float]
+) -> bool:
+    """Check if the difference between two sets of scores is statistically significant.
+    Assume scores1 > scores2.
+    """
+
+    import scipy.stats
+
+    # compute intersection
+    common_items = set(scores1.keys()).intersection(set(scores2.keys()))
+    if len(common_items) < 2:
+        return False
+
+    scores1 = [scores1[k] for k in common_items]
+    scores2 = [scores2[k] for k in common_items]
+
+    return bool(
+        scipy.stats.ttest_rel(scores1, scores2, alternative="two-sided").pvalue < 0.05
+    )
+
+
+def compute_model_scores(campaign_id):
+    """
+    Compute model scores from annotations for a campaign.
+
+    Returns:
+        List of dicts with keys: model, score, count
+        Sorted by score in descending order
+    """
+    # Compute model scores from annotations
+    model_scores = collections.defaultdict(dict)
+
+    # Iterate through all tasks to find items with 'models' field (basic template)
+    log = get_db_log(campaign_id)
+    for entry in log:
+        if "item" not in entry or "annotation" not in entry:
+            continue
+        for item, annotation in zip(entry["item"], entry["annotation"]):
+            for model, annotation in annotation.items():
+                if "score" in annotation and annotation["score"] is not None:
+                    item_id = item.get("item_id") or json.dumps(item | {"tgt": None})
+                    model_scores[model][item_id] = annotation["score"]
+
+    model_scores = list(model_scores.items())
+    model_scores.sort(key=lambda x: statistics.mean(x[1].values()), reverse=True)
+
+    results = []
+    for i, (model, scores) in enumerate(model_scores):
+        avg_score = statistics.mean(scores.values())
+        sig_better = False
+        if i < len(model_scores) - 1:
+            # Compare with next model
+            scores_next = model_scores[i + 1][1]
+            sig_better = comparison_significant(scores, scores_next)
+        else:
+            sig_better = False
+        results.append(
+            {
+                "model": model,
+                "score": avg_score,
+                "count": len(scores),
+                "sig_better_than_next": sig_better,
+            }
+        )
+    return results
+
+
+def escape_typst(s: str):
+    return (
+        s.replace("\\", "\\\\")
+        .replace("#", "\\#")
+        .replace("*", "\\*")
+        .replace("_", "\\_")
+        .replace("`", "\\`")
+        .replace("[", "\\[")
+        .replace("]", "\\]")
+    )
+
+
+def generate_typst_table(results):
+    """
+    Generate Typst code for a two-column table with results.
+
+    Args:
+        results: List of dicts with keys: model, score, count
+
+    Returns:
+        String containing Typst table markup
+    """
+    if not results:
+        return "// No results available"
+
+    typst_code = """#table(
+  columns: (auto, auto),
+  align: (left, right),
+  stroke: none,
+  table.hline(),
+  [*Model*], [*Score*],
+  table.hline(),
+"""
+
+    for result in results:
+        # Escape Typst special characters
+        model = escape_typst(result["model"])
+        score = f"{result['score']:.1f}"
+        typst_code += f"  [{model}], [{score}],\n"
+        if result["sig_better_than_next"]:
+            typst_code += "  table.hline(end: 1),\n"
+
+    typst_code += "  table.hline(),\n"
+    typst_code += ")\n"
+    return typst_code
+
+
+def generate_latex_table(results):
+    """
+    Generate LaTeX code for a booktabs two-column table with results.
+
+    Args:
+        results: List of dicts with keys: model, score, count
+
+    Returns:
+        String containing LaTeX table markup
+    """
+    if not results:
+        return "% No results available"
+
+    latex_code = """\\begin{table}[h]
+\\centering
+\\begin{tabular}{lr}
+\\toprule
+\\textbf{Model} & \\textbf{Score} \\\\
+\\midrule
+"""
+
+    for result in results:
+        # Escape LaTeX special characters
+        model = result["model"]
+        model = model.replace("\\", "\\textbackslash ")
+        model = model.replace("_", "\\_")
+        model = model.replace("&", "\\&")
+        model = model.replace("%", "\\%")
+        model = model.replace("$", "\\$")
+        model = model.replace("#", "\\#")
+        model = model.replace("{", "\\{")
+        model = model.replace("}", "\\}")
+        model = model.replace("~", "\\textasciitilde ")
+        model = model.replace("^", "\\textasciicircum ")
+
+        score = f"{result['score']:.1f}"
+        latex_code += f"{model} & {score} \\\\\n"
+        if result["sig_better_than_next"]:
+            latex_code += "\\cmidrule{1-1}\n"
+
+    latex_code += """\\bottomrule
+\\end{tabular}
+\\caption{Model ranking results}
+\\label{tab:results}
+\\end{table}
+"""
+    return latex_code
+
+
+def generate_pdf(results, campaign_id):
+    """
+    Generate PDF from Typst code using typst-py.
+
+    Args:
+        results: List of dicts with keys: model, score, count
+
+    Returns:
+        bytes containing the PDF
+    """
+
+    import tempfile
+
+    import typst
+
+    if not results:
+        # Return empty PDF with message
+        typst_code = "[No results available]"
+    else:
+        typst_code = f"""
+#set page(width: auto, height: auto, margin: 1.5pt)
+== {escape_typst(campaign_id)}
+""" + generate_typst_table(
+            results
+        )
+
+    # Create a temporary file for the typst source
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".typ", delete=False) as f:
+        f.write(typst_code)
+        typst_file = f.name
+
+    try:
+        # Compile to PDF
+        pdf_bytes = typst.compile(typst_file)
+        return pdf_bytes
+    finally:
+        # Clean up
+        os.unlink(typst_file)