langfun 0.0.2.dev20240429__py3-none-any.whl → 0.0.2.dev20240430__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in their public registries.
- langfun/core/eval/base.py +310 -73
- langfun/core/eval/base_test.py +96 -45
- langfun/core/eval/matching.py +22 -21
- langfun/core/eval/matching_test.py +23 -2
- langfun/core/eval/scoring.py +4 -4
- langfun/core/eval/scoring_test.py +19 -2
- langfun/core/llms/openai.py +1 -1
- langfun/core/llms/openai_test.py +2 -1
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/METADATA +1 -1
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/RECORD +13 -13
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/WHEEL +0 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py
CHANGED
@@ -18,6 +18,7 @@ import collections
 import dataclasses
 import functools
 import hashlib
+import html
 import inspect
 import io
 import os
@@ -40,7 +41,8 @@ class Evaluable(lf.Component):

   EXPERIMENT_JSON = 'experiment.json'
   RESULT_JSON = 'result.json'
-
+  OOP_FAILURES_JSON = 'oop_failures.json'
+  NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
   INDEX_HTML = 'index.html'
   SUMMARY_HTML = 'summary.html'

@@ -358,7 +360,7 @@
             color='yellow')

     for node in self.nonleaf_nodes:
-      node._result = {c.id: c.result for c in node.
+      node._result = {c.id: c.result for c in node.leaf_nodes}  # pylint: disable=protected-access
       if should_save:
         node.save(result=False, report=False)

@@ -540,13 +542,13 @@
           f'<div style="color: {text_color}; white-space: pre-wrap;'
           'padding: 10px; border: 1px solid; margin-top: 10px">'
       )
-      s.write(m.get('formatted_text', m.text))
+      s.write(html.escape(m.get('formatted_text', m.text)))
       if m.result is not None:
         s.write(
             '<div style="color: magenta; white-space: pre-wrap;'
             'padding: 10px; border: 1px solid; margin: 10px">'
         )
-        s.write(pg.format(m.result))
+        s.write(html.escape(pg.format(m.result)))
         s.write('</div>')
       if 'usage' in m.metadata:
         s.write(
@@ -753,10 +755,12 @@ class Evaluation(Evaluable):

   # Constants.
   CACHE_JSON = 'cache.json'
-
+  OOP_FAILURES_HTML = 'oop_failures.html'
+  NON_OOP_FAILURES_HTML = 'non_oop_failures.html'

   @functools.cached_property
   def hash(self) -> str:
+    """Returns the semantic-based hash of the evaluation."""
     if self.is_deterministic:
       identity = pg.format(self._identifiers(), compact=True)
     else:
@@ -805,6 +809,10 @@
     """Returns the complete rate."""
     return self.num_completed / self.num_examples

+  #
+  # Properties on failures.
+  #
+
   @property
   def failures(self) -> list[tuple[Any, Exception]]:
     """Returns the failed examples and their errors."""
@@ -815,6 +823,15 @@
     """Returns the number of failed examples."""
     return len(self.failures)

+  @functools.cached_property
+  def failure_breakdown(self) -> dict[str, int]:
+    """Returns the breakdown of failures."""
+    breakdown = collections.defaultdict(int)
+    for _, error in self.failures:
+      breakdown[_error_key(error)] += 1
+    sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
+    return pg.Dict({x[0]: x[1] for x in sorted_items})
+
   @property
   def failure_rate(self) -> float:
     """Returns the failure rate in range [0, 1]."""
@@ -822,6 +839,46 @@
       return 0.0
     return self.num_failures / self.num_completed

+  @functools.cached_property
+  def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
+    """Returns the OOP failures."""
+    return [item for item in self.failures
+            if isinstance(item[1], lf_structured.MappingError)]
+
+  @property
+  def num_oop_failures(self) -> int:
+    """Returns the number of OOP failures."""
+    return len(self.oop_failures)
+
+  @property
+  def oop_failure_rate(self) -> float:
+    """Returns the OOP failure rate in range [0, 1]."""
+    if self.num_completed == 0:
+      return 0.0
+    return self.num_oop_failures / self.num_completed
+
+  @functools.cached_property
+  def non_oop_failures(self) -> list[tuple[Any, Exception]]:
+    """Returns the non-OOP failures."""
+    return [item for item in self.failures
+            if not isinstance(item[1], lf_structured.MappingError)]
+
+  @property
+  def num_non_oop_failures(self) -> int:
+    """Returns the number of non-OOP failures."""
+    return len(self.non_oop_failures)
+
+  @property
+  def non_oop_failure_rate(self) -> float:
+    """Returns the non-OOP failure rate in range [0, 1]."""
+    if self.num_completed == 0:
+      return 0.0
+    return self.num_non_oop_failures / self.num_completed
+
+  #
+  # Properties on usage.
+  #
+
   @property
   def has_usage(self) -> bool:
     """Returns True if token usage is enabled."""
@@ -976,13 +1033,22 @@
     self._total_prompt_tokens = 0
     self._total_completion_tokens = 0
     self._num_usages = 0
+    self.__dict__.pop('oop_failures', None)
+    self.__dict__.pop('non_oop_failures', None)

   @property
-  def
-    """Returns the link to the failures page."""
+  def oop_failures_link(self) -> str | None:
+    """Returns the link to the OOP failures page."""
     if self.dir is None:
       return None
-    return self.link(os.path.join(self.dir, Evaluation.
+    return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))
+
+  @property
+  def non_oop_failures_link(self) -> str | None:
+    """Returns the link to the non-OOP failures page."""
+    if self.dir is None:
+      return None
+    return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))

   def _dryrun(
       self,
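A note on the __dict__.pop calls above: functools.cached_property stores its computed value in the instance __dict__ under the property's name, so popping that key is the standard way to force recomputation on next access. A minimal standalone sketch of the pattern (the Stats class below is hypothetical, not from langfun):

import functools


class Stats:
  """Hypothetical example of invalidating a functools.cached_property."""

  def __init__(self):
    self._values = []

  @functools.cached_property
  def total(self) -> int:
    # Computed on first access, then cached in self.__dict__['total'].
    return sum(self._values)

  def add(self, v: int) -> None:
    self._values.append(v)
    # Drop the cached value (if any) so the next access recomputes it.
    self.__dict__.pop('total', None)


s = Stats()
s.add(1)
print(s.total)  # 1
s.add(2)
print(s.total)  # 3, recomputed because add() invalidated the cache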
@@ -1011,23 +1077,34 @@
           color='green',
       )

-
-      output_message = copy.process(example, **(self.additional_args or {}))
-      if self.schema is None:
-        output = output_message.text
-      else:
-        output = output_message.result
+    error, output_message = None, None

-
+    try:
+      with lf.use_settings(debug=debug):
+        output_message = copy.process(example, **(self.additional_args or {}))
+        if self.schema is None:
+          output = output_message.text
+        else:
+          output = output_message.result
+
+      if verbose:
+        lf.console.write('')
+        lf.console.write(
+            str(output),
+            title='OUTPUT',
+            color='blue',
+        )
+    except lf_structured.MappingError as e:
       lf.console.write('')
       lf.console.write(
-          str(
-          title='
-          color='
+          str(e),
+          title='ERROR',
+          color='red',
       )
+      error = e

-    copy.audit(example, output_message,
-    result = copy.
+    copy.audit(example, output_message, error, dryrun=True)
+    result = copy.finalize()

     if verbose:
       lf.console.write('')
@@ -1087,7 +1164,7 @@
       self.cache.save()

     # Summarize result.
-    self._result = self.
+    self._result = self.finalize()
     if verbose:
       lf.console.write(
           str(self.result),
@@ -1143,13 +1220,13 @@
   def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
     return {
         'Model': self.lm.model_id,
-        'Succeeded':
-            progress.success_rate
+        'Succeeded': '%s (%d/%d)' % (
+            self._format_rate(progress.success_rate),
             progress.succeeded,
             progress.completed,
         ),
-        'Failed':
-            progress.failure_rate
+        'Failed': '%s (%d/%d)' % (
+            self._format_rate(progress.failure_rate),
             progress.failed,
             progress.completed,
         ),
@@ -1159,21 +1236,20 @@
     assert self.result is not None
     m = self.result.metrics
     return (
-
-        f'  Failures=%.{self.report_precision}f%% (%d/%d)'
+        'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
         % (
             run_status,
-            (1 - m.failure_rate)
+            self._format_rate(1 - m.failure_rate),
             m.total - m.failures,
             m.total,
-            m.failure_rate
+            self._format_rate(m.failure_rate),
             m.failures,
             m.total,
         )
     )

-  def
-    """
+  def finalize(self) -> pg.Dict:
+    """Finalizes the evaluation result."""
     if self.cache:
       cache_stats = dict(
           use_cache=True,
@@ -1210,12 +1286,18 @@
             total=self.num_completed,
             failures=self.num_failures,
             failure_rate=self.failure_rate,
+            oop_failures=self.num_oop_failures,
+            oop_failure_rate=self.oop_failure_rate,
+            non_oop_failures=self.num_non_oop_failures,
+            non_oop_failure_rate=self.non_oop_failure_rate,
+            failure_breakdown=self.failure_breakdown,
         ),
         usage=usage,
     )
     return result

-  def
+  def summary_card(self) -> str:
+    """Returns summary card in HTML."""
     s = io.StringIO()
     definition = _html_repr(self, compact=False, escape=True)
     s.write('<div><table><tr><td>')
@@ -1230,18 +1312,19 @@
     s.write(
         f'<a target="_blank" title="{definition}" '
         f'href="{self.index_link}">{self.hash}</a>'
+        f' [<a href="{self.link(self.dir)}">dir</a>]'
         '</td></tr><tr><td>'
     )
-    self.
+    self._render_summary_metrics(s)

     # Summarize average usage.
     if self.result.usage is not None:
-      self.
+      self._render_summary_usage(s)

     s.write('</td></tr></table></div>')
     return s.getvalue()

-  def
+  def _render_summary_usage(self, s: io.StringIO) -> None:
     """Renders usage in HTML."""
     usage = self.result.usage
     total = usage.total_prompt_tokens + usage.total_completion_tokens
@@ -1255,20 +1338,66 @@
         f'" style="color:gray">({total} tokens)</a>'
     )

-  def
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
+
+    # OOP failures.
+    oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
+    if m.oop_failures:
+      oop_failure_title += '&#013;'
+      for name, count in m.failure_breakdown.items():
+        if name.startswith('MappingError'):
+          oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+              name.removeprefix('MappingError.'),
+              self._format_rate(count / m.total),
+              count,
+              m.total,
+          )
+
+    extra_style = ''
+    if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
+      extra_style = ';font-weight:bold'
     s.write(
-        '<a title="
+        '<a title="%s" href="%s" style="color:magenta%s">%s</a>'
         % (
-
-
-
-
+            oop_failure_title,
+            self.oop_failures_link,
+            extra_style,
+            self._format_rate(m.oop_failure_rate),
+        )
+    )
+    s.write(' | ')
+
+    # Non-OOP failures.
+    non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
+    if m.non_oop_failures:
+      non_oop_failure_title += '&#013;'
+      for name, count in m.failure_breakdown.items():
+        if not name.startswith('MappingError'):
+          non_oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+              name,
+              self._format_rate(count / m.total),
+              count,
+              m.total,
+          )
+
+    extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
+    s.write(
+        '<a title="%s" href="%s" style="color:red%s">%s</a>'
+        % (
+            non_oop_failure_title,
+            self.non_oop_failures_link,
+            extra_style,
+            self._format_rate(m.non_oop_failure_rate),
         )
     )

+  def _format_rate(self, rate: float) -> str:
+    """Formats a rate."""
+    return f'%.{self.report_precision}f%% ' % (rate * 100)
+
   def audit(
       self,
       example: Any,
@@ -1287,7 +1416,13 @@
       dryrun: Whether or not audition takes place during dryrun.
     """
     if error is not None:
-      self._failures.append((example,
+      self._failures.append((example, error))
+
+      # Invalidate cached properties derived from failures.
+      self.__dict__.pop('oop_failures', None)
+      self.__dict__.pop('non_oop_failures', None)
+      self.__dict__.pop('failure_breakdown', None)
+
     if isinstance(error, lf_structured.MappingError):
       message = error.lm_response
     else:
@@ -1333,16 +1468,26 @@
     # Save failures.
     pg.save(
         [
-            pg.Dict(
-
-
-
+            pg.Dict(input=input, error=_format_error(error))
+            for input, error in self.oop_failures
+        ],
+        os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
+    )
+    pg.save(
+        self._html([self._render_result, self._render_oop_failures]),
+        os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
+        file_format='txt',
+    )
+    pg.save(
+        [
+            pg.Dict(input=input, error=_format_error(error))
+            for input, error in self.non_oop_failures
         ],
-        os.path.join(self.dir, Evaluation.
+        os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
     )
     pg.save(
-        self._html([self._render_result, self.
-        os.path.join(self.dir, Evaluation.
+        self._html([self._render_result, self._render_non_oop_failures]),
+        os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
         file_format='txt',
     )

@@ -1357,7 +1502,8 @@
     )
     if self.result.usage is not None:
       s.write('<td>Usage</td>')
-    s.write('<td>Failures</td>')
+    s.write('<td>OOP Failures</td>')
+    s.write('<td>Non-OOP Failures</td>')

   def _render_result_row(self, s: io.StringIO) -> None:
     s.write(
@@ -1385,16 +1531,29 @@
     # Usage.
     if self.result.usage is not None:
       s.write('<td>')
-      self.
+      self._render_summary_usage(s)
       s.write('</td>')

-    #
+    # OOP failures.
+    s.write(
+        '<td><span style="color:magenta">%s</span>%s</td>'
+        % (
+            self._format_rate(self.oop_failure_rate),
+            '<a href="%s">(%d/%d)</a>'
+            % (self.oop_failures_link,
+               self.num_oop_failures,
+               self.num_completed),
+        )
+    )
+    # Non-OOP failures.
     s.write(
-        '<td><span style="color:
+        '<td><span style="color:red">%s</span>%s</td>'
         % (
-
+            self._format_rate(self.non_oop_failure_rate),
             '<a href="%s">(%d/%d)</a>'
-            % (self.
+            % (self.non_oop_failures_link,
+               self.num_non_oop_failures,
+               self.num_completed),
         )
     )

@@ -1408,24 +1567,77 @@
     else:
       return 'cyan'

-  def
+  def _render_oop_failures(self, s: io.StringIO) -> None:
+    self._render_failures(s, '^MappingError.*', error_color='magenta')
+
+  def _render_non_oop_failures(self, s: io.StringIO) -> None:
+    self._render_failures(s, '^(?!MappingError).*', error_color='red')
+
+  def _render_failures(
+      self, s: io.StringIO, error_regex: str, error_color: str) -> None:
     """Formats the failed cases into html."""
+    # Failure summary.
     s.write(
-        '<h2>
+        '<h2> Error Summary </h2>'
         '<div style="white-space:pre">\n'
         '<table style="border:1px solid">'
-        '<tr class="header"><td>
+        '<tr class="header"><td>Error type</td><td>Stats</td></tr>'
     )
+    error_regex = re.compile(error_regex)
+    if self.result.metrics.failure_breakdown:
+      for name, count in self.result.metrics.failure_breakdown.items():
+        if not error_regex.match(name):
+          continue
+
+        link = f'<a href="#{name}">{name}</a>'
+        error_rate = self._format_rate(count / self.result.metrics.total)
+        stats = (f'<span style="color:{error_color}">{error_rate} '
+                 f'({count}/{self.result.metrics.total})</span>')
+        s.write(f'<tr><td>{link}</td><td>{stats}</td></tr>')
+    s.write(
+        '</table></div>'
+        '<h2> Failed Cases </h2>'
+        '<div style="white-space:pre">'
+    )
+    # Failure details by error type.
+    failures_by_error = collections.defaultdict(list)
+    for example, error in self.failures:
+      error_name = _error_key(error)
+      if error_regex.match(error_name):
+        failures_by_error[error_name].append((example, error))
+
+    for error_key, failures in failures_by_error.items():
+      s.write(
+          f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
+          f'(count={len(failures)})</h3>'
+          '<table style="border:1px solid">'
+          '<tr class="header"><td>No.</td><td>Input</td>'
+          '<td>LM invocation</td><td>Error</td></tr>'
+      )
+      for i, (example, error) in enumerate(failures):
+        lm_response = None
+        if isinstance(error, lf.structured.MappingError):
+          lm_response = error.lm_response
+          error = error.cause
+
+        bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
+        s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
+        s.write('<td style="color:green;white-space:pre-wrap">')
+        s.write(pg.format(example, verbose=False))
+        s.write('</td><td>')
+        if lm_response is not None:
+          self._render_message(lm_response, s)
+        s.write(f'</td><td style="color:{error_color};white-space:pre">')
+        s.write(_format_error(error))
+        s.write('</td></tr>')
+      s.write('</table>')
+    s.write('</div>')

-
-
-
-
-
-    error_str = lf.text_formatting.decolored(str(error))
-    s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
-    s.write('</tr>')
-    s.write('</table></div>')
+  @classmethod
+  def visualize(cls, evaluations: list['Evaluation']) -> str | None:
+    """Visualizes a list of evaluations of this task in HTML."""
+    del evaluations
+    return None


 @pg.functor()
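The split between the two failure pages above is driven entirely by regular expressions over the breakdown keys: '^MappingError.*' selects OOP failures, while the negative lookahead '^(?!MappingError).*' matches every other key. A quick standalone check of that behavior (the keys below are illustrative):

import re

oop = re.compile('^MappingError.*')
non_oop = re.compile('^(?!MappingError).*')

keys = ['MappingError.SchemaError.TypeError', 'ValueError', 'TimeoutError']
print([k for k in keys if oop.match(k)])
# ['MappingError.SchemaError.TypeError']
print([k for k in keys if non_oop.match(k)])
# ['ValueError', 'TimeoutError']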
@@ -1578,7 +1790,7 @@ class Summary(pg.Object):
         if e is None:
           s.write('<span style="color: gray">N/A<span>')
         else:
-          s.write(e.
+          s.write(e.summary_card())
         s.write('</td>')
       s.write('</tr>')
     s.write('</table>')
@@ -1653,13 +1865,22 @@
     s.write('<html><body>')
     for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
       table_id = task.__name__.lower()
+      evaluations = self.select(task=task).evaluations
+      table = Summary.Table.from_evaluations(evaluations, pivot_field)
       s.write('<div>')
-      s.write(
-
-
-      table = Summary.Table.from_evaluations(
-          self.select(task=task).evaluations, pivot_field
+      s.write(
+          f'<a id="{table_id}" href="#{table_id}">'
+          f'<h2>{task.__name__}</h2></a>'
       )
+
+      # Allow users to plug in visualization code (e.g. matplotlib) in the
+      # summary page.
+      visual_part = task.visualize(evaluations)
+      if visual_part:
+        s.write(visual_part)
+
+      s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
+      s.write('<hr/>')
       s.write(table.html())
       s.write('</div>')
     s.write('</body></html>')
@@ -1685,6 +1906,7 @@
               experiment=entry,
               dir=entry.dir,
               metrics=entry.result.metrics if entry.result else None,
+              usage=entry.result.usage if entry.result else None,
           )
       )
     task_results[task.__name__] = results
@@ -1833,6 +2055,21 @@
   return result.join()


+def _format_error(error: Exception):
+  """Formats an error into a string."""
+  return (f'({error.__class__.__name__}) '
+          + lf.text_formatting.decolored(str(error)))
+
+
+def _error_key(error: Exception) -> str:
+  """Returns the key for an error."""
+  error_names = []
+  while error is not None:
+    error_names.append(error.__class__.__name__)
+    error = getattr(error, 'cause', None)
+  return '.'.join(error_names)
+
+
 def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
   """Formats prompt in HTML."""
   if type(value) is lf.Template:  # pylint: disable=unidiomatic-typecheck
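The _error_key helper above is what produces breakdown keys like 'MappingError.SchemaError.TypeError' seen in the updated tests: it walks the chain of cause attributes and joins the class names. A self-contained sketch of that behavior (the ChainedError stand-in and the error subclasses below are hypothetical; langfun's real MappingError exposes a cause link the same way):

class ChainedError(Exception):
  """Hypothetical error type exposing a `cause` link, like MappingError."""

  def __init__(self, message: str, cause: Exception | None = None):
    super().__init__(message)
    self.cause = cause


def _error_key(error: Exception) -> str:
  """Mirrors the helper added in base.py."""
  error_names = []
  while error is not None:
    error_names.append(error.__class__.__name__)
    error = getattr(error, 'cause', None)  # stops at errors without a cause
  return '.'.join(error_names)


class MappingError(ChainedError):
  pass


class SchemaError(ChainedError):
  pass


e = MappingError('mapping failed', SchemaError('bad schema', TypeError('x')))
print(_error_key(e))  # MappingError.SchemaError.TypeError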
langfun/core/eval/base_test.py
CHANGED
@@ -220,7 +220,18 @@ class EvaluationTest(unittest.TestCase):
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
             usage=dict(
                 total_prompt_tokens=774,
                 total_completion_tokens=25,
@@ -235,12 +246,20 @@
         os.path.exists(os.path.join(s.dir, base.Evaluation.EXPERIMENT_JSON)))
     self.assertTrue(
         os.path.exists(os.path.join(s.dir, base.Evaluation.RESULT_JSON)))
+    self.assertTrue(
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_JSON)))
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_JSON)))
     self.assertTrue(
         os.path.exists(os.path.join(s.dir, base.Evaluation.CACHE_JSON)))
     self.assertTrue(
         os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
     self.assertTrue(
-        os.path.exists(os.path.join(s.dir, base.Evaluation.
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))
     self.assertTrue(
         os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
     )
@@ -274,7 +293,10 @@
     self.assertFalse(
         os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
     self.assertFalse(
-        os.path.exists(os.path.join(s.dir, base.Evaluation.
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+    self.assertFalse(
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))

   def test_load(self):
     lm = fake.StaticResponse('Solution(final_answer=2)')
@@ -312,7 +334,16 @@
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=0,
+                failure_rate=0.0,
+                oop_failures=0,
+                oop_failure_rate=0.0,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={},
+            ),
             usage=s.children[1].result.usage,
         ),
     },
@@ -363,7 +394,18 @@
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
             usage=s.children[0].result.usage,
         ),
         s.children[1].id: dict(
@@ -378,7 +420,18 @@
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
             usage=s.children[1].result.usage,
         ),
     },
@@ -475,7 +528,7 @@ class SuiteTest(unittest.TestCase):
     self.assertEqual(s.hash, '26e6cc25')
     s.run()
     expected = {
-
+        'Evaluation@0fade07d': dict(
             experiment_setup=dict(
                 id=s.children[0].id,
                 dir=s.children[0].dir,
@@ -487,48 +540,46 @@
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
             usage=s.children[0].result.usage,
         ),
-
-
-
-
-
-
-
-
-                prompt_template='{{example.question}}',
-                method='call',
-                schema_fn='answer_schema()',
-            ),
-            cache_stats=dict(
-                use_cache=True, num_queries=4, num_hits=1, num_updates=3
-            ),
-            metrics=dict(total=2, failures=2, failure_rate=1.0),
-            usage=s.children[1].children[0].result.usage,
+        'Evaluation@ae86c703': dict(
+            experiment_setup=dict(
+                id=s.children[1].children[0].id,
+                dir=s.children[1].children[0].dir,
+                model='StaticSequence',
+                prompt_template='{{example.question}}',
+                method='call',
+                schema_fn='answer_schema()',
             ),
-
-
-        .id: dict(
-            experiment_setup=dict(
-                id=s.children[1].children[2].id,
-                dir=s.children[1].children[2].dir,
-                model='StaticSequence',
-                prompt_template='{{example.question}}',
-                method='query',
-                schema_fn='answer_schema()',
-            ),
-            cache_stats=dict(
-                use_cache=True,
-                num_queries=2,
-                num_hits=0,
-                num_updates=2,
-            ),
-            metrics=dict(total=2, failures=1, failure_rate=0.5),
-            usage=s.children[1].children[2].result.usage,
+            cache_stats=dict(
+                use_cache=True, num_queries=4, num_hits=1, num_updates=3
             ),
-
+            metrics=dict(
+                total=2,
+                failures=2,
+                failure_rate=1.0,
+                oop_failures=2,
+                oop_failure_rate=1.0,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 2
+                }
+            ),
+            usage=s.children[1].children[0].result.usage,
+        ),
     }
     self.assertEqual(s.result, expected)
langfun/core/eval/matching.py
CHANGED
@@ -119,18 +119,18 @@ class Matching(base.Evaluation):
     del progress
     return {
         'Model': self.lm.model_id,
-        'Matches':
-            self.match_rate
+        'Matches': '%s (%d/%d)' % (
+            self._format_rate(self.match_rate),
             self.num_matches,
             self.num_completed,
         ),
-        'Mismatches':
-            self.mismatch_rate
+        'Mismatches': '%s (%d/%d)' % (
+            self._format_rate(self.mismatch_rate),
             self.num_mismatches,
             self.num_completed,
         ),
-        'Failed':
-            self.failure_rate
+        'Failed': '%s (%d/%d)' % (
+            self._format_rate(self.failure_rate),
             self.num_failures,
             self.num_completed,
         ),
@@ -140,24 +140,25 @@
     assert self.result is not None
     m = self.result.metrics
     return (
-
-
-
+        'COMPLETED(%s):'
+        ' Matches=%s (%d/%d)'
+        ' Mismatches=%s (%d/%d)'
+        ' Failures=%s (%d/%d)'
     ) % (
         run_status,
-        m.match_rate
+        self._format_rate(m.match_rate),
         m.num_matches,
         m.total,
-        m.mismatch_rate
+        self._format_rate(m.mismatch_rate),
        m.num_mismatches,
        m.total,
-        m.failure_rate
+        self._format_rate(m.failure_rate),
        m.failures,
        m.total,
    )

-  def
-    result = super().
+  def finalize(self) -> pg.Dict:
+    result = super().finalize()
     result.metrics.update(
         num_matches=self.num_matches,
         match_rate=self.match_rate,
@@ -218,9 +219,9 @@
   def _render_result_row(self, s: io.StringIO):
     super()._render_result_row(s)
     s.write(
-        '<td><span style="color:
+        '<td><span style="color:orange">%s</span>%s</td>'
         % (
-
+            self._format_rate(self.mismatch_rate),
             '<a href="%s">(%d/%d)</a>'
             % (self.mismatches_link, self.num_mismatches, self.num_completed),
         )
@@ -228,13 +229,13 @@
     s.write(
         '<td><span style="color:green">%s</span>%s</td>'
         % (
-
+            self._format_rate(self.match_rate),
             '<a href="%s">(%d/%d)</a>'
             % (self.matches_link, self.num_matches, self.num_completed),
         )
     )

-  def
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
@@ -244,7 +245,7 @@
             m.num_matches,
             m.total,
             self.matches_link,
-
+            self._format_rate(m.match_rate),
         )
     )
     s.write(' | ')
@@ -254,11 +255,11 @@
             m.num_mismatches,
             m.total,
             self.mismatches_link,
-
+            self._format_rate(m.mismatch_rate),
         )
     )
     s.write(' | ')
-    super().
+    super()._render_summary_metrics(s)

   def _render_matches(self, s: io.StringIO) -> None:
     """Formats the matched cases into html."""
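All of the percentage displays in this file now route through the _format_rate helper added to base.py, so report_precision controls every rate uniformly. A quick illustration of the formatting, assuming report_precision = 3 (the attribute lives on the evaluation class; the standalone function below is just a sketch of the same shape):

report_precision = 3  # assumed value of self.report_precision


def _format_rate(rate: float) -> str:
  # Same shape as the helper in base.py, including its trailing space.
  return f'%.{report_precision}f%% ' % (rate * 100)


print('Matches: %s (%d/%d)' % (_format_rate(0.5), 2, 4))
# Matches: 50.000%  (2/4)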
langfun/core/eval/matching_test.py
CHANGED
@@ -120,6 +120,13 @@ class MatchingTest(unittest.TestCase):
             total=4,
             failures=1,
             failure_rate=0.25,
+            oop_failures=1,
+            oop_failure_rate=0.25,
+            non_oop_failures=0,
+            non_oop_failure_rate=0.0,
+            failure_breakdown={
+                'MappingError.SchemaError.TypeError': 1
+            },
             num_matches=2,
             match_rate=0.5,
             num_mismatches=1,
@@ -160,7 +167,14 @@
     self.assertTrue(
         os.path.exists(
             os.path.join(
-                s.dir, matching.Matching.
+                s.dir, matching.Matching.OOP_FAILURES_JSON
+            )
+        )
+    )
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(
+                s.dir, matching.Matching.NON_OOP_FAILURES_JSON
             )
         )
     )
@@ -175,7 +189,14 @@
     self.assertTrue(
         os.path.exists(
             os.path.join(
-                s.dir, matching.Matching.
+                s.dir, matching.Matching.OOP_FAILURES_HTML
+            )
+        )
+    )
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(
+                s.dir, matching.Matching.NON_OOP_FAILURES_HTML
             )
         )
     )
langfun/core/eval/scoring.py
CHANGED
@@ -113,8 +113,8 @@ class Scoring(base.Evaluation):
             m.total,
         )

-  def
-    result = super().
+  def finalize(self) -> pg.Dict:
+    result = super().finalize()
     result.metrics.update(
         num_scored=self.num_scored,
         score_rate=self.score_rate,
@@ -168,7 +168,7 @@
         )
     )

-  def
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
@@ -182,7 +182,7 @@
         )
     )
     s.write(' | ')
-    super().
+    super()._render_summary_metrics(s)

   def _render_scored(self, s: io.StringIO) -> None:
     """Formats the matched cases into html."""
langfun/core/eval/scoring_test.py
CHANGED
@@ -98,6 +98,11 @@ class ScoringTest(unittest.TestCase):
             total=2,
             failures=0,
             failure_rate=0.0,
+            oop_failures=0,
+            oop_failure_rate=0.0,
+            non_oop_failures=0,
+            non_oop_failure_rate=0.0,
+            failure_breakdown={},
             num_scored=2,
             score_rate=1.0,
             avg_score=0.5,
@@ -124,7 +129,12 @@
     )
     self.assertTrue(
         os.path.exists(
-            os.path.join(s.dir, scoring.Scoring.
+            os.path.join(s.dir, scoring.Scoring.OOP_FAILURES_JSON)
+        )
+    )
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(s.dir, scoring.Scoring.NON_OOP_FAILURES_JSON)
         )
     )
     self.assertTrue(
@@ -143,7 +153,14 @@
     self.assertTrue(
         os.path.exists(
             os.path.join(
-                s.dir, scoring.Scoring.
+                s.dir, scoring.Scoring.OOP_FAILURES_HTML
+            )
+        )
+    )
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(
+                s.dir, scoring.Scoring.NON_OOP_FAILURES_HTML
             )
         )
     )
langfun/core/llms/openai.py
CHANGED
@@ -234,7 +234,7 @@ class OpenAI(lf.LanguageModel):
         if isinstance(chunk, str):
           item = dict(type='text', text=chunk)
         elif isinstance(chunk, lf_modalities.Image) and chunk.uri:
-          item = dict(type='image_url', image_url=chunk.uri)
+          item = dict(type='image_url', image_url=dict(url=chunk.uri))
         else:
           raise ValueError(f'Unsupported modality object: {chunk!r}.')
         content.append(item)
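For context on the one-line change above: in the OpenAI chat completions vision format, an image_url content part is an object with a url field rather than a bare string, which is what the fix now produces. A sketch of the resulting message payload (the URL and text are illustrative):

# Before the fix, the image part serialized as:
#   {'type': 'image_url', 'image_url': 'https://example.com/cat.png'}
# After the fix it matches the nested shape the API expects:
message = {
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'What is in this image?'},
        {
            'type': 'image_url',
            'image_url': {'url': 'https://example.com/cat.png'},
        },
    ],
}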
langfun/core/llms/openai_test.py
CHANGED
@@ -66,7 +66,8 @@ def mock_chat_completion_query_vision(messages, *, n=1, **kwargs):
   del kwargs
   choices = []
   urls = [
-      c['image_url']
+      c['image_url']['url']
+      for c in messages[0]['content'] if c['type'] == 'image_url'
   ]
   for k in range(n):
     choices.append(pg.Dict(
{langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/RECORD
CHANGED
@@ -40,12 +40,12 @@ langfun/core/coding/python/parsing_test.py,sha256=9vAWF484kWIm6JZq8NFiMgKUDhXV-d
 langfun/core/coding/python/permissions.py,sha256=1QWGHvzL8MM0Ok_auQ9tURqZHtdOfJaDpBzZ29GUE-c,2544
 langfun/core/coding/python/permissions_test.py,sha256=w5EDb8QxpxgJyZkojyzVWQvDfg366zn99-g__6TbPQ0,2699
 langfun/core/eval/__init__.py,sha256=NSmPe2lxdxFoY4h8VkNyONPAFtOTUpK9WhmZRaqUgiI,1335
-langfun/core/eval/base.py,sha256=
-langfun/core/eval/base_test.py,sha256=
-langfun/core/eval/matching.py,sha256=
-langfun/core/eval/matching_test.py,sha256=
-langfun/core/eval/scoring.py,sha256=
-langfun/core/eval/scoring_test.py,sha256=
+langfun/core/eval/base.py,sha256=ImIdyjh89yWUbFoSI12xzpcSmvB34y8_F0WAcUi-4sg,68405
+langfun/core/eval/base_test.py,sha256=SEo43ftMscpZ5QV6AGaywrA6SobVaG_P7sUbjoBGqg8,24081
+langfun/core/eval/matching.py,sha256=Y4vFoNTQEOwko6IA8l9OZ52-vt52e3VGmcTtvLA67wM,9782
+langfun/core/eval/matching_test.py,sha256=f7iVyXH5KGJBWt4Wp14Bt9J3X59A6Ayfog9MbuFvPew,5532
+langfun/core/eval/scoring.py,sha256=1J7IATo-8FXUR0SBqk9icztHiM0lWkBFcWUo-vUURgQ,6376
+langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se2SXM,4546
 langfun/core/llms/__init__.py,sha256=1bPg1QI8duOZCYINm-jWi094x0JtLmsk4KX60qIC_gs,3245
 langfun/core/llms/anthropic.py,sha256=7W9YdPN3SlAFhAIQlihMkrpo7tTY_4NvD0KIlCrqcsk,8505
 langfun/core/llms/anthropic_test.py,sha256=TMM30myyEhwF99Le4RvJEXOn8RYl0q1FRkt9Q9nl1jk,5540
@@ -57,8 +57,8 @@ langfun/core/llms/groq.py,sha256=NaGItVL_pkOpqPpI4bPGU27xLFRoaeizZ49v2s-4ERs,784
 langfun/core/llms/groq_test.py,sha256=M6GtlrsOvDun_j-sR8cPh4W_moHWZNSTiThu3kuwbbc,5281
 langfun/core/llms/llama_cpp.py,sha256=Y_KkMUf3Xfac49koMUtUslKl3h-HWp3-ntq7Jaa3bdo,2385
 langfun/core/llms/llama_cpp_test.py,sha256=ZxC6defGd_HX9SFRU9U4cJiQnBKundbOrchbXuC1Z2M,1683
-langfun/core/llms/openai.py,sha256=
-langfun/core/llms/openai_test.py,sha256=
+langfun/core/llms/openai.py,sha256=rPwO4qPGEwbB4O7TaQD0spg_PXIfF2ioRI_ilE3Pg6Y,13257
+langfun/core/llms/openai_test.py,sha256=asSA1sVy_7hnXioD_2HTxtSDpVTKBUO_EjZuyHpwbn0,14854
 langfun/core/llms/cache/__init__.py,sha256=QAo3InUMDM_YpteNnVCSejI4zOsnjSMWKJKzkb3VY64,993
 langfun/core/llms/cache/base.py,sha256=cFfYvOIUae842pncqCAsRvqXCk2AnAsRYVx0mcIoAeY,3338
 langfun/core/llms/cache/in_memory.py,sha256=YfFyJEhLs73cUiB0ZfhMxYpdE8Iuxxw-dvMFwGHTSHw,4742
@@ -101,8 +101,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
 langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
 langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
 langfun/core/templates/selfplay_test.py,sha256=DYVrkk7uNKCqJGEHH31HssU2BPuMItU1vJLzfcXIlYg,2156
-langfun-0.0.2.
-langfun-0.0.2.
-langfun-0.0.2.
-langfun-0.0.2.
-langfun-0.0.2.
+langfun-0.0.2.dev20240430.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+langfun-0.0.2.dev20240430.dist-info/METADATA,sha256=RpEIB1auHihqOoDrPnFQaYqgpqxFKA9_Z9iuCfPxe5s,3436
+langfun-0.0.2.dev20240430.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+langfun-0.0.2.dev20240430.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+langfun-0.0.2.dev20240430.dist-info/RECORD,,
{langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/LICENSE
File without changes
{langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/WHEEL
File without changes
{langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/top_level.txt
File without changes