pearmut-0.2.10-py3-none-any.whl → pearmut-0.3.0-py3-none-any.whl

This diff shows the published contents of two package versions as they appear in their public registry and is provided for informational purposes only.
pearmut/app.py CHANGED
@@ -206,33 +206,23 @@ async def _dashboard_results(request: DashboardResultsRequest):
 
     if campaign_id not in progress_data:
         return JSONResponse(content="Unknown campaign ID", status_code=400)
-
+
     # Check if token is valid
     if token != tasks_data[campaign_id]["token"]:
         return JSONResponse(content="Invalid token", status_code=400)
 
     # Compute model scores from annotations
     model_scores = collections.defaultdict(dict)
-
-    # Iterate through all tasks to find items with 'model' field
+
+    # Iterate through all tasks to find items with 'models' field (basic template)
     log = get_db_log(campaign_id)
     for entry in log:
-        if "item" not in entry or "annotations" not in entry:
+        if "item" not in entry or "annotation" not in entry:
             continue
-        for item, annotation in zip(entry["item"], entry["annotations"]):
-            if "model" in item:
-                # pointwise
+        for item, annotation in zip(entry["item"], entry["annotation"]):
+            for model, annotation in annotation.items():
                if "score" in annotation:
-                    # make sure to only keep the latest score for each item
-                    # json.dumps(item) creates a unique item key
-                    model_scores[item["model"]][json.dumps(item)] = annotation["score"]
-            elif "models" in item:
-                # listwise
-                for model, annotation_cand in zip(item["models"], annotation):
-                    if "score" in annotation_cand:
-                        model_scores[model][json.dumps(item)] = (
-                            annotation_cand["score"]
-                        )
+                    model_scores[model][json.dumps(item)] = annotation["score"]
 
     results = [
         {
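The aggregation loop above now assumes a reshaped log entry: `annotations` (a flat list parallel to the items) has become `annotation`, a per-item dict keyed by model name. A minimal sketch of the shape the new loop expects, with illustrative field values and the shadowed inner loop variable renamed for readability:

import collections
import json

# Hypothetical log entry in the 0.3.0 shape; "item" and "annotation" are
# parallel lists, and each annotation is keyed by model name.
entry = {
    "item": [{"src": "Hallo Welt", "tgt": {"modelA": "Hello world", "modelB": "Hi world"}}],
    "annotation": [{"modelA": {"score": 90}, "modelB": {"score": 72}}],
}

model_scores = collections.defaultdict(dict)
for item, item_annotation in zip(entry["item"], entry["annotation"]):
    for model, model_annotation in item_annotation.items():
        if "score" in model_annotation:
            # json.dumps(item) is a stable per-item key, so re-annotating an
            # item overwrites its previous score rather than duplicating it
            model_scores[model][json.dumps(item)] = model_annotation["score"]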
@@ -294,7 +284,7 @@ async def _download_annotations(
     return JSONResponse(
         content=output,
         status_code=200,
-        headers={"Content-Disposition": 'inline; filename="annotations.json"'}
+        headers={"Content-Disposition": 'inline; filename="annotations.json"'},
     )
 
 
@@ -322,7 +312,7 @@ async def _download_progress(
     return JSONResponse(
         content=output,
         status_code=200,
-        headers={"Content-Disposition": 'inline; filename="progress.json"'}
+        headers={"Content-Disposition": 'inline; filename="progress.json"'},
     )
 
 
@@ -332,6 +322,16 @@ if not os.path.exists(static_dir + "index.html"):
         "Static directory not found. Please build the frontend first."
     )
 
+# Mount user assets from data/assets/
+assets_dir = f"{ROOT}/data/assets"
+os.makedirs(assets_dir, exist_ok=True)
+
+app.mount(
+    "/assets",
+    StaticFiles(directory=assets_dir, follow_symlink=True),
+    name="assets",
+)
+
 app.mount(
     "/",
     StaticFiles(directory=static_dir, html=True, follow_symlink=True),
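The new mount serves user-supplied campaign assets from data/assets/ under the /assets path, separately from the built frontend. A minimal sketch of the resulting layout with illustrative directory names; registration order is the important detail, since Starlette matches mounts in the order they are added and the catch-all "/" mount would otherwise shadow /assets:

from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles

app = FastAPI()

# e.g. data/assets/my_campaign/figure.png  ->  GET /assets/my_campaign/figure.png
app.mount("/assets", StaticFiles(directory="data/assets", follow_symlink=True), name="assets")

# the built frontend keeps the catch-all mount and is registered last
app.mount("/", StaticFiles(directory="static", html=True, follow_symlink=True), name="static")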
pearmut/assignment.py CHANGED
@@ -84,9 +84,13 @@ def get_i_item_taskbased(
 
     # try to get existing annotations if any
     items_existing = get_db_log_item(campaign_id, user_id, item_i)
+    payload_existing = None
     if items_existing:
         # get the latest ones
-        payload_existing = items_existing[-1]["annotations"]
+        latest_item = items_existing[-1]
+        payload_existing = {"annotation": latest_item["annotation"]}
+        if "comment" in latest_item:
+            payload_existing["comment"] = latest_item["comment"]
 
     if item_i < 0 or item_i >= len(data_all[campaign_id]["data"][user_id]):
         return JSONResponse(
@@ -107,7 +111,7 @@ def get_i_item_taskbased(
                 if k.startswith("protocol")
             },
             "payload": data_all[campaign_id]["data"][user_id][item_i]
-        } | ({"payload_existing": payload_existing} if items_existing else {}),
+        } | ({"payload_existing": payload_existing} if payload_existing else {}),
         status_code=200
     )
 
@@ -127,9 +131,13 @@ def get_i_item_singlestream(
     # try to get existing annotations if any
     # note the None user_id since it is shared
     items_existing = get_db_log_item(campaign_id, None, item_i)
+    payload_existing = None
     if items_existing:
         # get the latest ones
-        payload_existing = items_existing[-1]["annotations"]
+        latest_item = items_existing[-1]
+        payload_existing = {"annotation": latest_item["annotation"]}
+        if "comment" in latest_item:
+            payload_existing["comment"] = latest_item["comment"]
 
     if item_i < 0 or item_i >= len(data_all[campaign_id]["data"]):
         return JSONResponse(
@@ -150,7 +158,7 @@ def get_i_item_singlestream(
                 if k.startswith("protocol")
             },
             "payload": data_all[campaign_id]["data"][item_i]
-        } | ({"payload_existing": payload_existing} if items_existing else {}),
+        } | ({"payload_existing": payload_existing} if payload_existing else {}),
         status_code=200
     )
 
@@ -173,9 +181,13 @@ def get_next_item_taskbased(
 
     # try to get existing annotations if any
     items_existing = get_db_log_item(campaign_id, user_id, item_i)
+    payload_existing = None
     if items_existing:
         # get the latest ones
-        payload_existing = items_existing[-1]["annotations"]
+        latest_item = items_existing[-1]
+        payload_existing = {"annotation": latest_item["annotation"]}
+        if "comment" in latest_item:
+            payload_existing["comment"] = latest_item["comment"]
 
     return JSONResponse(
         content={
@@ -190,7 +202,7 @@ def get_next_item_taskbased(
                 if k.startswith("protocol")
             },
             "payload": data_all[campaign_id]["data"][user_id][item_i]
-        } | ({"payload_existing": payload_existing} if items_existing else {}),
+        } | ({"payload_existing": payload_existing} if payload_existing else {}),
         status_code=200
     )
 
@@ -222,9 +234,13 @@ def get_next_item_singlestream(
     # try to get existing annotations if any
     # note the None user_id since it is shared
     items_existing = get_db_log_item(campaign_id, None, item_i)
+    payload_existing = None
     if items_existing:
         # get the latest ones
-        payload_existing = items_existing[-1]["annotations"]
+        latest_item = items_existing[-1]
+        payload_existing = {"annotation": latest_item["annotation"]}
+        if "comment" in latest_item:
+            payload_existing["comment"] = latest_item["comment"]
 
     return JSONResponse(
         content={
@@ -239,7 +255,7 @@ def get_next_item_singlestream(
                 if k.startswith("protocol")
             },
             "payload": data_all[campaign_id]["data"][item_i]
-        } | ({"payload_existing": payload_existing} if items_existing else {}),
+        } | ({"payload_existing": payload_existing} if payload_existing else {}),
         status_code=200
     )
 
@@ -276,7 +292,7 @@ def reset_task(
         save_db_payload(campaign_id, {
             "user_id": user_id,
             "item_i": item_i,
-            "annotations": RESET_MARKER
+            "annotation": RESET_MARKER
         })
         progress_data[campaign_id][user_id]["progress"] = [False] * num_items
         _reset_user_time(progress_data, campaign_id, user_id)
@@ -288,7 +304,7 @@ def reset_task(
         save_db_payload(campaign_id, {
             "user_id": None,
             "item_i": item_i,
-            "annotations": RESET_MARKER
+            "annotation": RESET_MARKER
         })
         # for single-stream reset all progress
         for uid in progress_data[campaign_id]:
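All four item-fetching endpoints in this file share the same change: `payload_existing` starts as None, is built from the latest log entry's `annotation` plus an optional `comment`, and is only attached to the response when something was actually found. A hypothetical helper equivalent to the repeated block (not part of the package, just a restatement of the pattern):

def _latest_payload(items_existing):
    """Return the latest saved annotation (and comment, if any), or None."""
    if not items_existing:
        return None
    latest_item = items_existing[-1]
    payload = {"annotation": latest_item["annotation"]}
    if "comment" in latest_item:
        payload["comment"] = latest_item["comment"]
    return payload

# usage mirrors the endpoints above:
#   payload_existing = _latest_payload(items_existing)
#   content = {...} | ({"payload_existing": payload_existing} if payload_existing else {})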
pearmut/cli.py CHANGED
@@ -12,9 +12,6 @@ import psutil
 
 from .utils import ROOT, load_progress_data, save_progress_data
 
-# Static directory path (constant for consistency)
-STATIC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static")
-
 os.makedirs(f"{ROOT}/data/tasks", exist_ok=True)
 load_progress_data(warn=None)
 
@@ -55,14 +52,14 @@ def _run(args_unknown):
     )
 
 
-def _validate_item_structure(items, template):
+def _validate_item_structure(items):
     """
     Validate that items have the correct structure.
     Items should be lists of dictionaries with 'src' and 'tgt' keys.
+    The 'tgt' field should be a dictionary mapping model names to translations.
 
     Args:
         items: List of item dictionaries to validate
-        template: Template type ('pointwise' or 'listwise') for type validation
     """
     if not isinstance(items, list):
         raise ValueError("Items must be a list")
@@ -77,16 +74,82 @@ def _validate_item_structure(items, template):
         if not isinstance(item['src'], str):
             raise ValueError("Item 'src' must be a string")
 
-        # Validate tgt type based on template
-        if template == 'listwise':
-            if not isinstance(item['tgt'], list):
-                raise ValueError("Item 'tgt' must be a list for listwise template")
-            # Check that all elements in tgt list are strings
-            if not all(isinstance(t, str) for t in item['tgt']):
-                raise ValueError("All elements in 'tgt' list must be strings for listwise template")
-        elif template == 'pointwise':
-            if not isinstance(item['tgt'], str):
-                raise ValueError("Item 'tgt' must be a string for pointwise template")
+        # Validate tgt is a dictionary (basic template with model names)
+        if isinstance(item['tgt'], str):
+            # String not allowed - suggest using dictionary (don't include user input to prevent injection)
+            raise ValueError("Item 'tgt' must be a dictionary mapping model names to translations. For single translation, use {\"default\": \"your_translation\"}")
+        elif isinstance(item['tgt'], dict):
+            # Dictionary mapping model names to translations
+            # Validate that model names don't contain only numbers (JavaScript ordering issue)
+            for model_name, translation in item['tgt'].items():
+                if not isinstance(model_name, str):
+                    raise ValueError("Model names in 'tgt' dictionary must be strings")
+                if model_name.isdigit():
+                    raise ValueError(f"Model name '{model_name}' cannot be only numeric digits (would cause issues in JS/TS)")
+                if not isinstance(translation, str):
+                    raise ValueError(f"Translation for model '{model_name}' must be a string")
+        else:
+            raise ValueError("Item 'tgt' must be a dictionary mapping model names to translations")
+
+        # Validate error_spans structure if present
+        if 'error_spans' in item:
+            if not isinstance(item['error_spans'], dict):
+                raise ValueError("'error_spans' must be a dictionary mapping model names to error span lists")
+            for model_name, spans in item['error_spans'].items():
+                if not isinstance(spans, list):
+                    raise ValueError(f"Error spans for model '{model_name}' must be a list")
+
+        # Validate validation structure if present
+        if 'validation' in item:
+            if not isinstance(item['validation'], dict):
+                raise ValueError("'validation' must be a dictionary mapping model names to validation rules")
+            for model_name, val_rule in item['validation'].items():
+                if not isinstance(val_rule, dict):
+                    raise ValueError(f"Validation rule for model '{model_name}' must be a dictionary")
+
+
+def _shuffle_campaign_data(campaign_data, rng):
+    """
+    Shuffle campaign data at the document level in-place
+
+    For each document, randomly shuffles the order of models in the tgt dictionary.
+
+    Args:
+        campaign_data: The campaign data dictionary
+        rng: Random number generator with campaign-specific seed
+    """
+    def shuffle_document(doc):
+        """Shuffle a single document (list of items) by reordering models in tgt dict."""
+        if not doc or not isinstance(doc, list):
+            return
+
+        # Get all model names from the first item's tgt dict
+        first_item = doc[0]
+        if 'tgt' not in first_item or not isinstance(first_item['tgt'], dict):
+            return
+
+        model_names = list(first_item['tgt'].keys())
+        rng.shuffle(model_names)
+
+        # Reorder tgt dict for all items in the document
+        for item in doc:
+            if 'tgt' in item and isinstance(item['tgt'], dict):
+                item["tgt"] = {
+                    model: item["tgt"][model]
+                    for model in model_names
+                }
+
+    assignment = campaign_data["info"]["assignment"]
+
+    if assignment == "task-based":
+        # After transformation, data is a dict mapping user_id -> tasks
+        for user_id, task in campaign_data["data"].items():
+            for doc in task:
+                shuffle_document(doc)
+    elif assignment == "single-stream":
+        # Shuffle each document in the shared pool
+        for doc in campaign_data["data"]:
+            shuffle_document(doc)
 
 
 def _add_single_campaign(data_file, overwrite, server):
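With the template-specific branches gone, every item must use the dictionary form of `tgt`. An item that passes the new checks; the top-level keys (`src`, `tgt`, `error_spans`, `validation`) come from the validator, while the model names and the fields inside the spans and rules are illustrative only:

item = {
    "src": "Der schnelle braune Fuchs",
    "tgt": {
        # a single translation would be {"default": "..."}; purely numeric
        # model names such as "1" are rejected
        "model_a": "The quick brown fox",
        "model_b": "The fast brown fox",
    },
    # optional, keyed by model; each value must be a list
    "error_spans": {
        "model_b": [{"start": 4, "end": 8}],
    },
    # optional, keyed by model; each value must be a dict
    "validation": {
        "model_b": {"min_score": 50},
    },
}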
@@ -115,11 +178,9 @@ def _add_single_campaign(data_file, overwrite, server):
         raise ValueError("Campaign data must contain 'data' field.")
     if "assignment" not in campaign_data["info"]:
         raise ValueError("Campaign 'info' must contain 'assignment' field.")
-    if "template" not in campaign_data["info"]:
-        raise ValueError("Campaign 'info' must contain 'template' field.")
-
+
+    # Template defaults to "basic" if not specified
     assignment = campaign_data["info"]["assignment"]
-    template = campaign_data["info"]["template"]
     # use random words for identifying users
     rng = random.Random(campaign_data["campaign_id"])
     rword = wonderwords.RandomWord(rng=rng)
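Since `template` is no longer required, a minimal campaign file only needs `campaign_id`, `info.assignment`, and `data`; the template falls back to "basic" when the per-user URL is built further down. A sketch of the smallest accepted structure, with illustrative values:

campaign_data = {
    "campaign_id": "demo-campaign",
    "info": {
        "assignment": "task-based",   # or "single-stream"
        # "template": "basic",        # optional as of 0.3.0
        # "shuffle": True,            # optional, see the shuffle step below
    },
    "data": [],                       # tasks; structure depends on the assignment mode
}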
@@ -140,7 +201,7 @@ def _add_single_campaign(data_file, overwrite, server):
         for task_i, task in enumerate(tasks):
             for doc_i, doc in enumerate(task):
                 try:
-                    _validate_item_structure(doc, template)
+                    _validate_item_structure(doc)
                 except ValueError as e:
                     raise ValueError(f"Task {task_i}, document {doc_i}: {e}")
         num_users = len(tasks)
@@ -155,7 +216,7 @@ def _add_single_campaign(data_file, overwrite, server):
         # Validate item structure for single-stream
         for doc_i, doc in enumerate(tasks):
             try:
-                _validate_item_structure(doc, template)
+                _validate_item_structure(doc)
             except ValueError as e:
                 raise ValueError(f"Document {doc_i}: {e}")
         if isinstance(users_spec, int):
@@ -240,7 +301,7 @@ def _add_single_campaign(data_file, overwrite, server):
             "time_end": None,
             "time": 0,
             "url": (
-                f"{campaign_data["info"]["template"]}.html"
+                f"{campaign_data['info'].get("template", "basic")}.html"
                 f"?campaign_id={urllib.parse.quote_plus(campaign_data['campaign_id'])}"
                 f"&user_id={user_id}"
             ),
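The per-user URL now resolves the template with a fallback, so a campaign without an explicit template lands on basic.html. Note that, if the quoting shown in this diff is faithful to the source, reusing double quotes inside a double-quoted f-string relies on PEP 701 and therefore Python 3.12+; an equivalent, version-agnostic spelling with illustrative values:

import urllib.parse

campaign_data = {"campaign_id": "demo-campaign", "info": {}}   # illustrative
user_id = "quiet-otter"                                        # illustrative

template = campaign_data["info"].get("template", "basic")
url = (
    f"{template}.html"
    f"?campaign_id={urllib.parse.quote_plus(campaign_data['campaign_id'])}"
    f"&user_id={user_id}"
)
# -> "basic.html?campaign_id=demo-campaign&user_id=quiet-otter"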
@@ -272,15 +333,10 @@ def _add_single_campaign(data_file, overwrite, server):
 
     if not os.path.isdir(assets_real_path):
         raise ValueError(f"Assets source path '{assets_real_path}' must be an existing directory.")
-
-    if not os.path.isdir(STATIC_DIR):
-        raise ValueError(
-            f"Static directory '{STATIC_DIR}' does not exist. "
-            "Please build the frontend first."
-        )
 
     # Symlink path is based on the destination, stripping the 'assets/' prefix
-    symlink_path = f"{STATIC_DIR}/{assets_destination}".rstrip("/")
+    # User assets are now stored under data/assets/ instead of static/assets/
+    symlink_path = f"{ROOT}/data/{assets_destination}".rstrip("/")
 
     # Remove existing symlink if present and we are overriding the same campaign
     if os.path.lexists(symlink_path):
@@ -312,6 +368,11 @@ def _add_single_campaign(data_file, overwrite, server):
         print(f"Assets symlinked: {symlink_path} -> {assets_real_path}")
 
 
+    # Shuffle data if shuffle parameter is true (defaults to true)
+    should_shuffle = campaign_data["info"].get("shuffle", True)
+    if should_shuffle:
+        _shuffle_campaign_data(campaign_data, rng)
+
     # commit to transaction
     with open(f"{ROOT}/data/tasks/{campaign_data['campaign_id']}.json", "w") as f:
         json.dump(campaign_data, f, indent=2, ensure_ascii=False)
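Shuffling happens just before the campaign file is written, using the RNG seeded with the campaign ID earlier in this function, so the model order in each document is randomized but reproducible whenever the same campaign is re-added; it can be disabled with "shuffle": false in the campaign's `info`. A small sketch of that determinism, with an illustrative campaign ID:

import random

rng = random.Random("demo-campaign")        # mirrors random.Random(campaign_data["campaign_id"])
models = ["model_a", "model_b", "model_c"]
rng.shuffle(models)
print(models)   # same order on every run for this campaign ID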
@@ -392,7 +453,7 @@ def main():
             campaign_data = json.load(f)
             destination = campaign_data.get("info", {}).get("assets", {}).get("destination")
             if destination:
-                symlink_path = f"{STATIC_DIR}/{destination}".rstrip("/")
+                symlink_path = f"{ROOT}/data/{destination}".rstrip("/")
                 if os.path.islink(symlink_path):
                     os.remove(symlink_path)
                     print(f"Assets symlink removed: {symlink_path}")