pearmut-1.0.1-py3-none-any.whl → pearmut-1.0.3-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- pearmut/app.py +56 -25
- pearmut/assignment.py +340 -105
- pearmut/cli.py +185 -104
- pearmut/results_export.py +1 -1
- pearmut/static/annotate.bundle.js +1 -0
- pearmut/static/annotate.html +164 -0
- pearmut/static/dashboard.bundle.js +1 -1
- pearmut/static/dashboard.html +6 -1
- pearmut/static/index.html +1 -1
- pearmut/static/style.css +46 -0
- pearmut/utils.py +40 -21
- {pearmut-1.0.1.dist-info → pearmut-1.0.3.dist-info}/METADATA +119 -65
- pearmut-1.0.3.dist-info/RECORD +20 -0
- {pearmut-1.0.1.dist-info → pearmut-1.0.3.dist-info}/WHEEL +1 -1
- pearmut/static/basic.bundle.js +0 -1
- pearmut/static/basic.html +0 -133
- pearmut-1.0.1.dist-info/RECORD +0 -20
- {pearmut-1.0.1.dist-info → pearmut-1.0.3.dist-info}/entry_points.txt +0 -0
- {pearmut-1.0.1.dist-info → pearmut-1.0.3.dist-info}/licenses/LICENSE +0 -0
- {pearmut-1.0.1.dist-info → pearmut-1.0.3.dist-info}/top_level.txt +0 -0
pearmut/cli.py
CHANGED
|
@@ -3,20 +3,49 @@ Command-line interface for managing and running the Pearmut server.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import argparse
|
|
6
|
+
import atexit
|
|
7
|
+
import fcntl
|
|
6
8
|
import hashlib
|
|
7
9
|
import json
|
|
8
10
|
import os
|
|
9
11
|
import urllib.parse
|
|
10
12
|
|
|
11
|
-
import
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
from .utils import (
|
|
14
|
+
ROOT,
|
|
15
|
+
TOKEN_MAIN,
|
|
16
|
+
is_form_document,
|
|
17
|
+
load_progress_data,
|
|
18
|
+
save_progress_data,
|
|
19
|
+
)
|
|
14
20
|
|
|
15
21
|
os.makedirs(f"{ROOT}/data/tasks", exist_ok=True)
|
|
16
22
|
load_progress_data(warn=None)
|
|
17
23
|
|
|
18
24
|
|
|
19
25
|
def _run(args_unknown):
|
|
26
|
+
# Acquire lock before starting server
|
|
27
|
+
lock_file = f"{ROOT}/data/.lock"
|
|
28
|
+
try:
|
|
29
|
+
lock_fd = open(lock_file, "a+")
|
|
30
|
+
fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
|
|
31
|
+
lock_fd.seek(0)
|
|
32
|
+
lock_fd.truncate()
|
|
33
|
+
lock_fd.write(str(os.getpid()))
|
|
34
|
+
lock_fd.flush()
|
|
35
|
+
except BlockingIOError:
|
|
36
|
+
try:
|
|
37
|
+
with open(lock_file, "r") as f:
|
|
38
|
+
pid = f.read().strip()
|
|
39
|
+
print("You can't run multiple instances of Pearmut in the same directory.")
|
|
40
|
+
if pid:
|
|
41
|
+
print(f"Another instance (PID {pid}) is holding the lock.")
|
|
42
|
+
except (FileNotFoundError, PermissionError, OSError):
|
|
43
|
+
print("You can't run multiple instances of Pearmut in the same directory.")
|
|
44
|
+
exit(1)
|
|
45
|
+
|
|
46
|
+
# Register cleanup to remove lock file on exit
|
|
47
|
+
atexit.register(lambda: os.path.exists(lock_file) and os.remove(lock_file))
|
|
48
|
+
|
|
20
49
|
import uvicorn
|
|
21
50
|
|
|
22
51
|
from .app import app, tasks_data
|
|
@@ -33,29 +62,29 @@ def _run(args_unknown):
|
|
|
33
62
|
args = args.parse_args(args_unknown)
|
|
34
63
|
|
|
35
64
|
# print access dashboard URL for all campaigns
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
)
|
|
46
|
-
)
|
|
47
|
-
print(
|
|
48
|
-
"\033[92mNow serving Pearmut, use the following URL to access the everything-dashboard:\033[0m"
|
|
65
|
+
dashboard_url = (
|
|
66
|
+
args.server
|
|
67
|
+
+ "/dashboard?"
|
|
68
|
+
+ f"token_main={TOKEN_MAIN}"
|
|
69
|
+
+ "".join(
|
|
70
|
+
[
|
|
71
|
+
f"&campaign_id={urllib.parse.quote_plus(campaign_id)}&token={campaign_data['token']}"
|
|
72
|
+
for campaign_id, campaign_data in tasks_data.items()
|
|
73
|
+
]
|
|
49
74
|
)
|
|
50
|
-
|
|
75
|
+
)
|
|
76
|
+
print(
|
|
77
|
+
"\033[92mNow serving Pearmut, use the following URL to access the everything-dashboard:\033[0m"
|
|
78
|
+
)
|
|
79
|
+
print("🍐", dashboard_url + "\n", flush=True)
|
|
51
80
|
|
|
52
81
|
# disable startup message
|
|
53
82
|
uvicorn.config.LOGGING_CONFIG["loggers"]["uvicorn.error"]["level"] = "WARNING"
|
|
54
83
|
# set time logging
|
|
55
84
|
uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M"
|
|
56
|
-
uvicorn.config.LOGGING_CONFIG["formatters"]["access"][
|
|
57
|
-
"
|
|
58
|
-
|
|
85
|
+
uvicorn.config.LOGGING_CONFIG["formatters"]["access"]["fmt"] = (
|
|
86
|
+
"%(asctime)s %(levelprefix)s %(client_addr)s - %(request_line)s %(status_code)s"
|
|
87
|
+
)
|
|
59
88
|
uvicorn.run(
|
|
60
89
|
app,
|
|
61
90
|
host="0.0.0.0",
|
|
@@ -67,8 +96,11 @@ def _run(args_unknown):
|
|
|
67
96
|
def _validate_item_structure(items):
|
|
68
97
|
"""
|
|
69
98
|
Validate that items have the correct structure.
|
|
70
|
-
Items
|
|
71
|
-
|
|
99
|
+
Items can be either:
|
|
100
|
+
1. Evaluation items: dictionaries with 'tgt' and optionally 'src' and/or 'ref' keys
|
|
101
|
+
2. Form items: dictionaries with 'text' and 'form' keys
|
|
102
|
+
|
|
103
|
+
A document must contain either all evaluation items or all form items (not mixed).
|
|
72
104
|
|
|
73
105
|
Args:
|
|
74
106
|
items: List of item dictionaries to validate
|
|
@@ -76,68 +108,100 @@ def _validate_item_structure(items):
|
|
|
76
108
|
if not isinstance(items, list):
|
|
77
109
|
raise ValueError("Items must be a list")
|
|
78
110
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
raise ValueError("Each item must be a dictionary with 'tgt' key")
|
|
82
|
-
if "tgt" not in item:
|
|
83
|
-
raise ValueError("Each item must contain 'tgt' key")
|
|
111
|
+
if not items:
|
|
112
|
+
raise ValueError("Items list cannot be empty")
|
|
84
113
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
114
|
+
# Check if first item is a form item or evaluation item
|
|
115
|
+
first_item = items[0]
|
|
116
|
+
if not isinstance(first_item, dict):
|
|
117
|
+
raise ValueError("Each item must be a dictionary")
|
|
88
118
|
|
|
89
|
-
|
|
90
|
-
if "ref" in item and not isinstance(item["ref"], str):
|
|
91
|
-
raise ValueError("Item 'ref' must be a string")
|
|
119
|
+
first_item_is_form = "text" in first_item and "form" in first_item
|
|
92
120
|
|
|
93
|
-
|
|
94
|
-
if isinstance(item
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
121
|
+
for item in items:
|
|
122
|
+
if not isinstance(item, dict):
|
|
123
|
+
raise ValueError("Each item must be a dictionary")
|
|
124
|
+
|
|
125
|
+
# Check consistency: all items must be same type (form or evaluation)
|
|
126
|
+
current_is_form = "text" in item and "form" in item
|
|
127
|
+
if current_is_form != first_item_is_form:
|
|
128
|
+
raise ValueError("Document cannot mix form items and evaluation items")
|
|
129
|
+
|
|
130
|
+
if first_item_is_form:
|
|
131
|
+
# Validate form item structure
|
|
132
|
+
if "text" not in item:
|
|
133
|
+
raise ValueError("Form item must contain 'text' key")
|
|
134
|
+
if "form" not in item:
|
|
135
|
+
raise ValueError("Form item must contain 'form' key")
|
|
136
|
+
if not isinstance(item["text"], str):
|
|
137
|
+
raise ValueError("Form item 'text' must be a string")
|
|
138
|
+
if item["form"] not in {None, "number", "string", "choices", "script"}:
|
|
139
|
+
raise ValueError(
|
|
140
|
+
"Form item 'form' must be null, 'number', 'string', 'choices', or 'script'"
|
|
141
|
+
)
|
|
113
142
|
else:
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
143
|
+
# Validate evaluation item structure
|
|
144
|
+
if "tgt" not in item:
|
|
145
|
+
raise ValueError("Each item must contain 'tgt' key")
|
|
146
|
+
|
|
147
|
+
# Validate src is a string if present
|
|
148
|
+
if "src" in item and not isinstance(item["src"], str):
|
|
149
|
+
raise ValueError("Item 'src' must be a string")
|
|
150
|
+
|
|
151
|
+
# Validate ref is a string if present
|
|
152
|
+
if "ref" in item and not isinstance(item["ref"], str):
|
|
153
|
+
raise ValueError("Item 'ref' must be a string")
|
|
117
154
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
155
|
+
# Validate tgt is a dictionary (annotate template with model names)
|
|
156
|
+
if isinstance(item["tgt"], str):
|
|
157
|
+
# String not allowed - suggest using dictionary (don't include user input to prevent injection)
|
|
121
158
|
raise ValueError(
|
|
122
|
-
|
|
159
|
+
'Item \'tgt\' must be a dictionary mapping model names to translations. For single translation, use {"default": "your_translation"}'
|
|
123
160
|
)
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
161
|
+
elif isinstance(item["tgt"], dict):
|
|
162
|
+
# Dictionary mapping model names to translations
|
|
163
|
+
# Validate that model names don't contain only numbers (JavaScript ordering issue)
|
|
164
|
+
for model_name, translation in item["tgt"].items():
|
|
165
|
+
if not isinstance(model_name, str):
|
|
166
|
+
raise ValueError(
|
|
167
|
+
"Model names in 'tgt' dictionary must be strings"
|
|
168
|
+
)
|
|
169
|
+
if model_name.isdigit():
|
|
170
|
+
raise ValueError(
|
|
171
|
+
f"Model name '{model_name}' cannot be only numeric digits (would cause issues in JS/TS)"
|
|
172
|
+
)
|
|
173
|
+
if not isinstance(translation, str):
|
|
174
|
+
raise ValueError(
|
|
175
|
+
f"Translation for model '{model_name}' must be a string"
|
|
176
|
+
)
|
|
177
|
+
else:
|
|
133
178
|
raise ValueError(
|
|
134
|
-
"'
|
|
179
|
+
"Item 'tgt' must be a dictionary mapping model names to translations"
|
|
135
180
|
)
|
|
136
|
-
|
|
137
|
-
|
|
181
|
+
|
|
182
|
+
# Validate error_spans structure if present
|
|
183
|
+
if "error_spans" in item:
|
|
184
|
+
if not isinstance(item["error_spans"], dict):
|
|
185
|
+
raise ValueError(
|
|
186
|
+
"'error_spans' must be a dictionary mapping model names to error span lists"
|
|
187
|
+
)
|
|
188
|
+
for model_name, spans in item["error_spans"].items():
|
|
189
|
+
if not isinstance(spans, list):
|
|
190
|
+
raise ValueError(
|
|
191
|
+
f"Error spans for model '{model_name}' must be a list"
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
# Validate validation structure if present
|
|
195
|
+
if "validation" in item:
|
|
196
|
+
if not isinstance(item["validation"], dict):
|
|
138
197
|
raise ValueError(
|
|
139
|
-
|
|
198
|
+
"'validation' must be a dictionary mapping model names to validation rules"
|
|
140
199
|
)
|
|
200
|
+
for model_name, val_rule in item["validation"].items():
|
|
201
|
+
if not isinstance(val_rule, dict):
|
|
202
|
+
raise ValueError(
|
|
203
|
+
f"Validation rule for model '{model_name}' must be a dictionary"
|
|
204
|
+
)
|
|
141
205
|
|
|
142
206
|
|
|
143
207
|
def _validate_document_models(doc):
|
|
@@ -185,6 +249,10 @@ def _shuffle_campaign_data(campaign_data, rng):
|
|
|
185
249
|
|
|
186
250
|
def shuffle_document(doc):
|
|
187
251
|
"""Shuffle a single document (list of items) by reordering models in tgt dict."""
|
|
252
|
+
# Skip shuffling for form documents (they don't have tgt)
|
|
253
|
+
if is_form_document(doc):
|
|
254
|
+
return # Form documents don't need shuffling
|
|
255
|
+
|
|
188
256
|
# Validate that all items have the same models
|
|
189
257
|
_validate_document_models(doc)
|
|
190
258
|
|
|
@@ -238,7 +306,7 @@ def _add_single_campaign(campaign_data, overwrite, server):
|
|
|
238
306
|
if "assignment" not in campaign_data["info"]:
|
|
239
307
|
raise ValueError("Campaign 'info' must contain 'assignment' field.")
|
|
240
308
|
|
|
241
|
-
# Template defaults to "
|
|
309
|
+
# Template defaults to "annotate" if not specified
|
|
242
310
|
assignment = campaign_data["info"]["assignment"]
|
|
243
311
|
# use random words for identifying users
|
|
244
312
|
rng = random.Random()
|
|
@@ -248,6 +316,20 @@ def _add_single_campaign(campaign_data, overwrite, server):
|
|
|
248
316
|
users_spec = campaign_data["info"].get("users")
|
|
249
317
|
user_tokens = {} # user_id -> {"pass": ..., "fail": ...}
|
|
250
318
|
|
|
319
|
+
# Validate and process data_welcome if present
|
|
320
|
+
data_welcome = campaign_data.get("data_welcome", [])
|
|
321
|
+
if data_welcome:
|
|
322
|
+
if not isinstance(data_welcome, list):
|
|
323
|
+
raise ValueError("'data_welcome' must be a list of documents.")
|
|
324
|
+
# Validate welcome documents structure - each should be a list of items
|
|
325
|
+
for doc_i, doc in enumerate(data_welcome):
|
|
326
|
+
if not isinstance(doc, list):
|
|
327
|
+
raise ValueError(f"Welcome document {doc_i} must be a list of items.")
|
|
328
|
+
try:
|
|
329
|
+
_validate_item_structure(doc)
|
|
330
|
+
except ValueError as e:
|
|
331
|
+
raise ValueError(f"Welcome document {doc_i}: {e}")
|
|
332
|
+
|
|
251
333
|
if assignment == "task-based":
|
|
252
334
|
tasks = campaign_data["data"]
|
|
253
335
|
if not isinstance(tasks, list):
|
|
@@ -303,14 +385,14 @@ def _add_single_campaign(campaign_data, overwrite, server):
|
|
|
303
385
|
# Validate dynamic-specific parameters
|
|
304
386
|
if "dynamic_top" not in campaign_data["info"]:
|
|
305
387
|
campaign_data["info"]["dynamic_top"] = 2
|
|
306
|
-
if "
|
|
307
|
-
campaign_data["info"]["
|
|
388
|
+
if "dynamic_warmup" not in campaign_data["info"]:
|
|
389
|
+
campaign_data["info"]["dynamic_warmup"] = 5
|
|
308
390
|
if "dynamic_contrastive_models" not in campaign_data["info"]:
|
|
309
391
|
campaign_data["info"]["dynamic_contrastive_models"] = 1
|
|
310
|
-
# Validate that
|
|
311
|
-
assert (
|
|
312
|
-
|
|
313
|
-
)
|
|
392
|
+
# Validate that dynamic_warmup is at least 1
|
|
393
|
+
assert campaign_data["info"]["dynamic_warmup"] >= 1, (
|
|
394
|
+
"dynamic_warmup must be at least 1"
|
|
395
|
+
)
|
|
314
396
|
# Validate that dynamic_contrastive_models is at most dynamic_top
|
|
315
397
|
assert (
|
|
316
398
|
campaign_data["info"]["dynamic_contrastive_models"]
|
|
@@ -324,9 +406,9 @@ def _add_single_campaign(campaign_data, overwrite, server):
|
|
|
324
406
|
for item in campaign_data["data"]:
|
|
325
407
|
if item and len(item) > 0:
|
|
326
408
|
item_models = set(item[0]["tgt"].keys())
|
|
327
|
-
assert (
|
|
328
|
-
|
|
329
|
-
)
|
|
409
|
+
assert item_models == all_models, (
|
|
410
|
+
"All items must have the same model outputs"
|
|
411
|
+
)
|
|
330
412
|
else:
|
|
331
413
|
raise ValueError(f"Unknown campaign assignment type: {assignment}")
|
|
332
414
|
|
|
@@ -391,13 +473,20 @@ def _add_single_campaign(campaign_data, overwrite, server):
|
|
|
391
473
|
if os.path.exists(output_file):
|
|
392
474
|
os.remove(output_file)
|
|
393
475
|
|
|
476
|
+
# Prepend data_welcome to tasks if present
|
|
477
|
+
if data_welcome:
|
|
478
|
+
if assignment == "task-based":
|
|
479
|
+
tasks = [task for task in tasks]
|
|
480
|
+
elif assignment in ["single-stream", "dynamic"]:
|
|
481
|
+
tasks = data_welcome + tasks
|
|
482
|
+
|
|
394
483
|
# For task-based, data is a dict mapping user_id -> tasks
|
|
395
484
|
# For single-stream and dynamic, data is a flat list (shared among all users)
|
|
396
485
|
if assignment == "task-based":
|
|
397
486
|
campaign_data["data"] = {
|
|
398
487
|
user_id: task for user_id, task in zip(user_ids, tasks)
|
|
399
488
|
}
|
|
400
|
-
elif assignment in
|
|
489
|
+
elif assignment in {"single-stream", "dynamic"}:
|
|
401
490
|
campaign_data["data"] = tasks
|
|
402
491
|
|
|
403
492
|
# generate a token for dashboard access if not present
|
|
@@ -413,25 +502,24 @@ def _add_single_campaign(campaign_data, overwrite, server):
|
|
|
413
502
|
|
|
414
503
|
user_progress = {
|
|
415
504
|
user_id: {
|
|
416
|
-
#
|
|
505
|
+
# Progress tracking: None | "completed" for task-based,
|
|
506
|
+
# None | "completed" | "completed_foreign" for single-stream/dynamic
|
|
417
507
|
"progress": (
|
|
418
|
-
[
|
|
508
|
+
[None] * len(campaign_data["data"][user_id])
|
|
419
509
|
if assignment == "task-based"
|
|
420
|
-
else (
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
else []
|
|
427
|
-
)
|
|
428
|
-
)
|
|
510
|
+
else [None] * len(campaign_data["data"])
|
|
511
|
+
if assignment == "single-stream"
|
|
512
|
+
else [{model: None for model in all_models}]
|
|
513
|
+
* len(campaign_data["data"])
|
|
514
|
+
if assignment == "dynamic"
|
|
515
|
+
else int(f"Invalid assignment: {assignment}")
|
|
429
516
|
),
|
|
517
|
+
"progress_welcome": [None] * len(data_welcome),
|
|
430
518
|
"time_start": None,
|
|
431
519
|
"time_end": None,
|
|
432
520
|
"time": 0,
|
|
433
521
|
"url": (
|
|
434
|
-
f"{campaign_data['info'].get(
|
|
522
|
+
f"{campaign_data['info'].get('template', 'annotate')}"
|
|
435
523
|
f"?campaign_id={urllib.parse.quote_plus(campaign_data['campaign_id'])}"
|
|
436
524
|
f"&user_id={user_id}"
|
|
437
525
|
),
|
|
@@ -527,7 +615,7 @@ def _add_single_campaign(campaign_data, overwrite, server):
|
|
|
527
615
|
)
|
|
528
616
|
for user_id, user_val in user_progress.items():
|
|
529
617
|
# point to the protocol URL
|
|
530
|
-
print(f
|
|
618
|
+
print(f"🧑 {server}/{user_val['url']}")
|
|
531
619
|
print()
|
|
532
620
|
|
|
533
621
|
|
|
@@ -578,13 +666,6 @@ def main():
|
|
|
578
666
|
)
|
|
579
667
|
args, args_unknown = args.parse_known_args()
|
|
580
668
|
|
|
581
|
-
# enforce that only one pearmut process is running
|
|
582
|
-
for p in psutil.process_iter():
|
|
583
|
-
if "pearmut" == p.name() and p.pid != os.getpid():
|
|
584
|
-
print("Exit all running pearmut processes before running more commands.")
|
|
585
|
-
print(p)
|
|
586
|
-
exit(1)
|
|
587
|
-
|
|
588
669
|
if args.command == "run":
|
|
589
670
|
_run(args_unknown)
|
|
590
671
|
elif args.command == "add":
|
pearmut/results_export.py
CHANGED
|
@@ -39,7 +39,7 @@ def compute_model_scores(campaign_id):
|
|
|
39
39
|
# Compute model scores from annotations
|
|
40
40
|
model_scores = collections.defaultdict(dict)
|
|
41
41
|
|
|
42
|
-
# Iterate through all tasks to find items with 'models' field (
|
|
42
|
+
# Iterate through all tasks to find items with 'models' field (annotate template)
|
|
43
43
|
log = get_db_log(campaign_id)
|
|
44
44
|
for entry in log:
|
|
45
45
|
if "item" not in entry or "annotation" not in entry:
|