langfun 0.1.1.dev20240729__py3-none-any.whl → 0.1.1.dev20240731__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/core/eval/base.py +55 -24
- langfun/core/eval/matching.py +16 -20
- langfun/core/eval/scoring.py +8 -10
- {langfun-0.1.1.dev20240729.dist-info → langfun-0.1.1.dev20240731.dist-info}/METADATA +1 -1
- {langfun-0.1.1.dev20240729.dist-info → langfun-0.1.1.dev20240731.dist-info}/RECORD +8 -8
- {langfun-0.1.1.dev20240729.dist-info → langfun-0.1.1.dev20240731.dist-info}/WHEEL +1 -1
- {langfun-0.1.1.dev20240729.dist-info → langfun-0.1.1.dev20240731.dist-info}/LICENSE +0 -0
- {langfun-0.1.1.dev20240729.dist-info → langfun-0.1.1.dev20240731.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py
CHANGED
@@ -542,6 +542,8 @@ class Evaluable(lf.Component):
      'padding: 10px; border: 1px solid; margin-top: 10px">'
  )
  s.write(html.escape(m.get('formatted_text', m.text)))
+
+ # Write output.
  if m.result is not None:
    s.write(
        '<div style="color: magenta; white-space: pre-wrap;'
@@ -549,6 +551,23 @@ class Evaluable(lf.Component):
    )
    s.write(html.escape(pg.format(m.result)))
    s.write('</div>')
+
+ # Write modality information.
+ if 'lm-input' in m.tags or 'lm-response' in m.tags:
+   modalities = m.referred_modalities()
+   if modalities:
+     s.write(f'<div style="color: {text_color}; white-space: pre-wrap;'
+             'padding: 10px; border: 1px solid; margin-top: 10px"><table>')
+     for name, modality in modalities.items():
+       s.write(f'<tr><td>{name}</td><td>')
+       if hasattr(modality, '_repr_html_'):
+         s.write(modality._repr_html_())  # pylint: disable=protected-access
+       else:
+         s.write(html.escape(pg.format(modality, max_bytes_len=32)))
+       s.write('</td></tr>')
+     s.write('</table></div>')
+
+ # Write usage information.
  if m.metadata.get('usage', None):
    s.write(
        '<div style="background-color: #EEEEEE; color: black; '
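The modality-rendering block added above follows the familiar notebook fallback: use an object's `_repr_html_` when it provides one, otherwise emit escaped, truncated text (langfun uses `pg.format(modality, max_bytes_len=32)` for that fallback). A minimal self-contained sketch of the pattern; `FakeImage` and `render_modality_cell` are illustrative names, not langfun APIs:

```python
import html
import io


class FakeImage:
  """Illustrative stand-in for a modality object with a rich HTML repr."""

  def _repr_html_(self) -> str:
    return '<img src="data:image/png;base64,..."/>'


def render_modality_cell(s: io.StringIO, name: str, modality) -> None:
  """Writes one <tr> per modality, preferring its own HTML representation."""
  s.write(f'<tr><td>{name}</td><td>')
  if hasattr(modality, '_repr_html_'):
    s.write(modality._repr_html_())            # rich rendering (e.g. inline image)
  else:
    s.write(html.escape(repr(modality)[:32]))  # escaped, truncated text fallback
  s.write('</td></tr>')


s = io.StringIO()
render_modality_cell(s, 'image', FakeImage())
render_modality_cell(s, 'audio_bytes', b'\x00' * 1024)
print(s.getvalue())
```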
@@ -1304,20 +1323,22 @@ class Evaluation(Evaluable):
  s = io.StringIO()
  definition = _html_repr(self, compact=False, escape=True)
  s.write('<div><table><tr><td>')
+ self._render_link(
+     s,
+     definition,
+     self.hash,
+     '',
+     lambda: self.link(self.dir),
+ )
  if self.result is None:
    s.write(
-       f'<a target="_blank" title="{definition}" '
-       f'href="{self.link(self.dir)}">{self.hash}</a>'
        '</td></tr><tr><td>'
        '<span style="color: gray">(IN-PROGRESS...)</span>'
    )
  else:
-
-
-
-       f' [<a href="{self.link(self.dir)}">dir</a>]'
-       '</td></tr><tr><td>'
-   )
+   if self.dir:
+     s.write(f' [<a href="{self.link(self.dir)}">dir</a>]')
+   s.write('</td></tr><tr><td>')
  self._render_summary_metrics(s)

  # Summarize average usage.
@@ -1341,6 +1362,20 @@ class Evaluation(Evaluable):
      f'" style="color:gray">({total} tokens)</a>'
  )

+ def _render_link(self,
+                  s: io.StringIO,
+                  title: str,
+                  text: str,
+                  style: str,
+                  url_fn: Callable[[], str]) -> None:
+   """Renders a link in HTML."""
+   s.write(
+       f'<a target="_blank" title="{title}" style="{style}"'
+   )
+   if self.dir:
+     s.write(f' href="{url_fn()}"')
+   s.write(f'>{text}</a>')
+
  def _render_summary_metrics(self, s: io.StringIO) -> None:
    """Renders metrics in HTML."""
    assert self.result is not None
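For context, a standalone sketch of what this new helper emits. Here `has_dir` stands in for `self.dir`, and the sample title, rate, and URL are made up for illustration:

```python
import io
from typing import Callable


def render_link(s: io.StringIO, title: str, text: str, style: str,
                url_fn: Callable[[], str], has_dir: bool) -> None:
  """Mirrors _render_link: the href is added only when an output dir exists."""
  s.write(f'<a target="_blank" title="{title}" style="{style}"')
  if has_dir:
    # url_fn is called lazily, so no link is computed for in-memory runs.
    s.write(f' href="{url_fn()}"')
  s.write(f'>{text}</a>')


s = io.StringIO()
render_link(s, 'Matches (8/10)', '80.0%', 'color:green',
            lambda: 'file:///tmp/eval/matches.html', has_dir=True)
print(s.getvalue())
# <a target="_blank" title="Matches (8/10)" style="color:green" href="file:///tmp/eval/matches.html">80.0%</a>
```

Centralizing this in one helper is what lets the evaluation index link above and the failure, match/mismatch, and score links below share the same dir-aware behavior instead of repeating the anchor-tag boilerplate.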
@@ -1362,14 +1397,12 @@ class Evaluation(Evaluable):
  extra_style = ''
  if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
    extra_style = ';font-weight:bold'
-
-
-
-
-
-
-     self._format_rate(m.oop_failure_rate),
- )
+ self._render_link(
+     s,
+     oop_failure_title,
+     self._format_rate(m.oop_failure_rate),
+     f'color:magenta{extra_style}',
+     lambda: self.oop_failures_link,
  )
  s.write(' | ')

@@ -1387,14 +1420,12 @@ class Evaluation(Evaluable):
  )

  extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
-
-
-
-
-
-
-     self._format_rate(m.non_oop_failure_rate),
- )
+ self._render_link(
+     s,
+     non_oop_failure_title,
+     self._format_rate(m.non_oop_failure_rate),
+     f'color:red{extra_style}',
+     lambda: self.non_oop_failures_link,
  )

  def _format_rate(self, rate: float) -> str:
langfun/core/eval/matching.py
CHANGED
@@ -239,24 +239,20 @@ class Matching(base.Evaluation):
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
-
-
- % (
-
-
-
-     self._format_rate(m.match_rate),
- )
+ self._render_link(
+     s,
+     'Matches (%d/%d)' % (m.num_matches, m.total),
+     self._format_rate(m.match_rate),
+     'color:green',
+     lambda: self.matches_link,
  )
  s.write(' | ')
-
-
- % (
-
-
-
-     self._format_rate(m.mismatch_rate),
- )
+ self._render_link(
+     s,
+     'Mismatches (%d/%d)' % (m.num_mismatches, m.total),
+     self._format_rate(m.mismatch_rate),
+     'color:orange',
+     lambda: self.mismatches_link,
  )
  s.write(' | ')
  super()._render_summary_metrics(s)

@@ -275,9 +271,9 @@ class Matching(base.Evaluation):
  for i, (example, output, message) in enumerate(self.matches):
    bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
    s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
-   input_str = pg.format(example, verbose=False)
+   input_str = pg.format(example, verbose=False, max_bytes_len=32)
    s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
-   output_str = pg.format(output, verbose=False)
+   output_str = pg.format(output, verbose=False, max_bytes_len=32)
    s.write(f'<td style="color:blue;white-space:pre-wrap">{output_str}</td>')
    s.write('<td>')
    self._render_message(message, s)

@@ -300,9 +296,9 @@ class Matching(base.Evaluation):
  for i, (example, output, message) in enumerate(self.mismatches):
    bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
    s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
-   input_str = pg.format(example, verbose=False)
+   input_str = pg.format(example, verbose=False, max_bytes_len=32)
    s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
-   output_str = pg.format(output, verbose=False)
+   output_str = pg.format(output, verbose=False, max_bytes_len=32)
    s.write(
        f'<td style="color:magenta;white-space:pre-wrap">{output_str}</td>'
    )
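Besides switching to `_render_link`, the only change in the match/mismatch tables is passing `max_bytes_len=32` to `pg.format`, so long byte values (inline images, audio, etc.) no longer dominate the rendered cells. A rough sketch of the effect, assuming `pyglove` is installed; the exact truncated rendering is up to pyglove:

```python
import pyglove as pg

# Hypothetical example record containing a large binary payload.
example = dict(prompt='Describe the image.', image=b'\x89PNG' + b'\x00' * 4096)

# Previously the full byte string was embedded in the HTML table cell.
print(len(pg.format(example, verbose=False)))

# With max_bytes_len=32, byte values are truncated in the rendered string.
print(pg.format(example, verbose=False, max_bytes_len=32))
```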
langfun/core/eval/scoring.py
CHANGED
@@ -172,14 +172,12 @@ class Scoring(base.Evaluation):
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
-
-
- % (
-
-
-
-     '%.2f%%' % (m.score_rate * 100),
- )
+ self._render_link(
+     s,
+     'Average score (%d/%d)' % (m.num_scored, m.total),
+     '%.2f (%.2f%%)' % (m.avg_score, m.score_rate * 100),
+     'color:green',
+     lambda: self.scored_link,
  )
  s.write(' | ')
  super()._render_summary_metrics(s)

@@ -198,9 +196,9 @@ class Scoring(base.Evaluation):
  for i, (example, output, score, message) in enumerate(self.scored):
    bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
    s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
-   input_str = pg.format(example, verbose=False)
+   input_str = pg.format(example, verbose=False, max_bytes_len=32)
    s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
-   output_str = pg.format(output, verbose=False)
+   output_str = pg.format(output, verbose=False, max_bytes_len=32)
    s.write(f'<td style="color:blue;white-space:pre-wrap">{output_str}</td>')
    s.write(f'<td style="color:magenta;white-space:pre-wrap">{score}</td>')
    s.write('<td>')
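Beyond switching to `_render_link`, the scoring summary now shows the average score alongside the score rate instead of the rate alone. With hypothetical metric values, the title/text pair built above looks like this:

```python
# Hypothetical metric values, only to show the strings built above.
num_scored, total = 9, 10
avg_score, score_rate = 0.82, 0.9

title = 'Average score (%d/%d)' % (num_scored, total)   # 'Average score (9/10)'
text = '%.2f (%.2f%%)' % (avg_score, score_rate * 100)  # '0.82 (90.00%)'
print(title, '->', text)
```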
{langfun-0.1.1.dev20240729.dist-info → langfun-0.1.1.dev20240731.dist-info}/RECORD
CHANGED
@@ -44,13 +44,13 @@ langfun/core/coding/python/parsing_test.py,sha256=9vAWF484kWIm6JZq8NFiMgKUDhXV-d
  langfun/core/coding/python/permissions.py,sha256=1QWGHvzL8MM0Ok_auQ9tURqZHtdOfJaDpBzZ29GUE-c,2544
  langfun/core/coding/python/permissions_test.py,sha256=w5EDb8QxpxgJyZkojyzVWQvDfg366zn99-g__6TbPQ0,2699
  langfun/core/eval/__init__.py,sha256=Evt-E4FEhZF2tXL6-byh_AyA7Cc_ZoGmvnN7vkAZedk,1898
- langfun/core/eval/base.py,sha256=
+ langfun/core/eval/base.py,sha256=zZgebhUubX-149VkwkyQBKkTyv5hF0ubjj9eBd2baLo,75201
  langfun/core/eval/base_test.py,sha256=cHOTIWVW4Dp8gKKIKcZrAcJ-w84j2GIozTzJoiAX7p4,26743
- langfun/core/eval/matching.py,sha256=
+ langfun/core/eval/matching.py,sha256=9GX8HfO9jKxgNLAivgy5K88Xhoh6Z7Pptq65pe7vht8,9762
  langfun/core/eval/matching_test.py,sha256=f7iVyXH5KGJBWt4Wp14Bt9J3X59A6Ayfog9MbuFvPew,5532
  langfun/core/eval/patching.py,sha256=R0s2eAd1m97exQt06dmUL0V_MBG0W2Hxg7fhNB7cXW0,3866
  langfun/core/eval/patching_test.py,sha256=8kCd54Egjju22FMgtJuxEsrXkW8ifs-UUBHtrCG1L6w,4775
- langfun/core/eval/scoring.py,sha256=
+ langfun/core/eval/scoring.py,sha256=AlCwEVrU6nvURDB1aPxA2XBUmOjWxuNJDXJoS4-6VbU,6386
  langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se2SXM,4546
  langfun/core/llms/__init__.py,sha256=sRD_PjfD5wKuzANCOCjChjuZHUn2Q1WruZeVflqej5M,4609
  langfun/core/llms/anthropic.py,sha256=Gon3fOi31RhZFgNd0ijyTnKnUdp9hrWrCoSXyO4UaLw,7316
@@ -117,8 +117,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
  langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
  langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
  langfun/core/templates/selfplay_test.py,sha256=rBW2Qr8yi-aWYwoTwRR-n1peKyMX9QXPZXURjLgoiRs,2264
- langfun-0.1.1.
- langfun-0.1.1.
- langfun-0.1.1.
- langfun-0.1.1.
- langfun-0.1.1.
+ langfun-0.1.1.dev20240731.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+ langfun-0.1.1.dev20240731.dist-info/METADATA,sha256=WNoqA9YDPgBC2gIdUYCBcr2TPA7BBfs1DY4EI1WSy3I,5247
+ langfun-0.1.1.dev20240731.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ langfun-0.1.1.dev20240731.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+ langfun-0.1.1.dev20240731.dist-info/RECORD,,
{langfun-0.1.1.dev20240729.dist-info → langfun-0.1.1.dev20240731.dist-info}/LICENSE
File without changes

{langfun-0.1.1.dev20240729.dist-info → langfun-0.1.1.dev20240731.dist-info}/top_level.txt
File without changes