pearmut 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pearmut/app.py +46 -27
- pearmut/assignment.py +256 -46
- pearmut/cli.py +45 -8
- pearmut/results_export.py +210 -0
- pearmut/static/basic.bundle.js +1 -1
- pearmut/static/basic.html +1 -1
- pearmut/static/dashboard.bundle.js +1 -1
- pearmut/static/dashboard.html +27 -12
- pearmut/static/index.bundle.js +1 -1
- pearmut/static/index.html +1 -1
- pearmut/utils.py +16 -2
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/METADATA +54 -26
- pearmut-1.0.0.dist-info/RECORD +19 -0
- pearmut-0.3.3.dist-info/RECORD +0 -18
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/WHEEL +0 -0
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/entry_points.txt +0 -0
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {pearmut-0.3.3.dist-info → pearmut-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import statistics
|
|
5
|
+
|
|
6
|
+
from .utils import get_db_log
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def comparison_significant(
    scores1: dict[str, float], scores2: dict[str, float]
) -> bool:
    """Check if the difference between two sets of scores is statistically significant.

    Assume scores1 > scores2.

    Args:
        scores1: Mapping of item id to score for the first (higher-scoring) model.
        scores2: Mapping of item id to score for the second model.

    Returns:
        True when a paired two-sided t-test over the items scored by both
        models yields p < 0.05; False otherwise (including when fewer than
        two common items exist, in which case no test is possible).
    """
    # Imported lazily so the module can be used without scipy installed.
    import scipy.stats

    # The t-test is paired, so only items scored by both models count.
    common_items = set(scores1.keys()) & set(scores2.keys())
    if len(common_items) < 2:
        return False

    # Fix: do not rebind the dict-annotated parameters to lists; use
    # separate local names. Iterating common_items once for each list keeps
    # the pairs aligned (set order is arbitrary but consistent here).
    paired1 = [scores1[k] for k in common_items]
    paired2 = [scores2[k] for k in common_items]

    return bool(
        scipy.stats.ttest_rel(paired1, paired2, alternative="two-sided").pvalue < 0.05
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def compute_model_scores(campaign_id):
    """
    Compute model scores from annotations for a campaign.

    Args:
        campaign_id: Identifier of the campaign whose log is read.

    Returns:
        List of dicts with keys: model, score, count, sig_better_than_next,
        sorted by score in descending order. "sig_better_than_next" marks a
        model whose lead over the next-ranked model is statistically
        significant (see comparison_significant).
    """
    # model -> {item_id -> score}; keying by item id keeps scores pairable
    # across models for the significance test.
    model_scores = collections.defaultdict(dict)

    # Collect every scored annotation from the campaign log (items with a
    # 'models' field come from the basic template).
    log = get_db_log(campaign_id)
    for entry in log:
        if "item" not in entry or "annotation" not in entry:
            continue
        for item, item_annotations in zip(entry["item"], entry["annotation"]):
            # Fix: the inner loop previously rebound the outer "annotation"
            # variable; distinct names avoid the shadowing.
            for model, annotation in item_annotations.items():
                if "score" in annotation and annotation["score"] is not None:
                    # Prefer an explicit item_id; otherwise derive a stable
                    # key from the item itself, minus the model output.
                    item_id = item.get("item_id") or json.dumps(item | {"tgt": None})
                    model_scores[model][item_id] = annotation["score"]

    # Rank models by average score, best first.
    ranked = sorted(
        model_scores.items(),
        key=lambda x: statistics.mean(x[1].values()),
        reverse=True,
    )

    results = []
    for i, (model, scores) in enumerate(ranked):
        # Significance is only defined against the next-ranked model; the
        # last model has no successor to compare with.
        sig_better = False
        if i < len(ranked) - 1:
            sig_better = comparison_significant(scores, ranked[i + 1][1])
        results.append(
            {
                "model": model,
                "score": statistics.mean(scores.values()),
                "count": len(scores),
                "sig_better_than_next": sig_better,
            }
        )
    return results
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def escape_typst(s: str):
    """Escape characters that carry special meaning in Typst markup."""
    # Single-pass translation: each special character maps to its
    # backslash-escaped form. Equivalent to chaining .replace() with the
    # backslash handled first, since no escape sequence re-triggers another.
    table = str.maketrans(
        {
            "\\": "\\\\",
            "#": "\\#",
            "*": "\\*",
            "_": "\\_",
            "`": "\\`",
            "[": "\\[",
            "]": "\\]",
        }
    )
    return s.translate(table)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def generate_typst_table(results):
    """
    Generate Typst code for a two-column table with results.

    Args:
        results: List of dicts with keys: model, score, count

    Returns:
        String containing Typst table markup
    """
    if not results:
        return "// No results available"

    header = """#table(
columns: (auto, auto),
align: (left, right),
stroke: none,
table.hline(),
[*Model*], [*Score*],
table.hline(),
"""

    rows = []
    for entry in results:
        # Escape Typst special characters in the model name.
        name = escape_typst(entry["model"])
        rows.append(f" [{name}], [{entry['score']:.1f}],\n")
        # A partial rule marks a statistically significant gap to the
        # next-ranked model.
        if entry["sig_better_than_next"]:
            rows.append(" table.hline(end: 1),\n")

    rows.append(" table.hline(),\n")
    rows.append(")\n")
    return header + "".join(rows)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _escape_latex(s: str) -> str:
    """Escape characters that carry special meaning in LaTeX text mode.

    The backslash must be replaced first so that the backslashes introduced
    by the later replacements are not themselves re-escaped.
    """
    replacements = [
        ("\\", "\\textbackslash "),
        ("_", "\\_"),
        ("&", "\\&"),
        ("%", "\\%"),
        ("$", "\\$"),
        ("#", "\\#"),
        ("{", "\\{"),
        ("}", "\\}"),
        ("~", "\\textasciitilde "),
        ("^", "\\textasciicircum "),
    ]
    for old, new in replacements:
        s = s.replace(old, new)
    return s


def generate_latex_table(results):
    """
    Generate LaTeX code for a booktabs two-column table with results.

    Args:
        results: List of dicts with keys: model, score, count

    Returns:
        String containing LaTeX table markup
    """
    if not results:
        return "% No results available"

    latex_code = """\\begin{table}[h]
\\centering
\\begin{tabular}{lr}
\\toprule
\\textbf{Model} & \\textbf{Score} \\\\
\\midrule
"""

    for result in results:
        # Escape LaTeX special characters (extracted into _escape_latex,
        # mirroring how the Typst path uses escape_typst).
        model = _escape_latex(result["model"])
        score = f"{result['score']:.1f}"
        latex_code += f"{model} & {score} \\\\\n"
        # A partial rule marks a statistically significant gap to the
        # next-ranked model.
        if result["sig_better_than_next"]:
            latex_code += "\\cmidrule{1-1}\n"

    latex_code += """\\bottomrule
\\end{tabular}
\\caption{Model ranking results}
\\label{tab:results}
\\end{table}
"""
    return latex_code
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def generate_pdf(results, campaign_id):
    """
    Generate PDF from Typst code using typst-py.

    Args:
        results: List of dicts with keys: model, score, count
        campaign_id: Campaign identifier rendered as the document heading.

    Returns:
        bytes containing the PDF
    """

    import tempfile

    import typst

    if results:
        # Auto-sized page with a tight margin, headed by the campaign id.
        header = f"""
#set page(width: auto, height: auto, margin: 1.5pt)
== {escape_typst(campaign_id)}
"""
        typst_code = header + generate_typst_table(results)
    else:
        # Render a placeholder document when there is nothing to report.
        typst_code = "[No results available]"

    # Stage the Typst source in a temp file; delete=False lets the compiler
    # reopen it by path after this handle is closed.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".typ", delete=False) as src:
        src.write(typst_code)
        source_path = src.name

    try:
        # Compile to PDF
        return typst.compile(source_path)
    finally:
        # Always remove the temp file, even if compilation fails.
        os.unlink(source_path)
|