langfun 0.0.2.dev20240428__py3-none-any.whl → 0.0.2.dev20240430__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
langfun/core/eval/base.py CHANGED
@@ -18,6 +18,7 @@ import collections
  import dataclasses
  import functools
  import hashlib
+ import html
  import inspect
  import io
  import os
@@ -40,7 +41,8 @@ class Evaluable(lf.Component):

  EXPERIMENT_JSON = 'experiment.json'
  RESULT_JSON = 'result.json'
- FAILURES_JSON = 'failures.json'
+ OOP_FAILURES_JSON = 'oop_failures.json'
+ NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
  INDEX_HTML = 'index.html'
  SUMMARY_HTML = 'summary.html'

@@ -358,7 +360,7 @@ class Evaluable(lf.Component):
  color='yellow')

  for node in self.nonleaf_nodes:
- node._result = {c.id: c.result for c in node.children}  # pylint: disable=protected-access
+ node._result = {c.id: c.result for c in node.leaf_nodes}  # pylint: disable=protected-access
  if should_save:
  node.save(result=False, report=False)

@@ -540,13 +542,13 @@ class Evaluable(lf.Component):
  f'<div style="color: {text_color}; white-space: pre-wrap;'
  'padding: 10px; border: 1px solid; margin-top: 10px">'
  )
- s.write(m.get('formatted_text', m.text))
+ s.write(html.escape(m.get('formatted_text', m.text)))
  if m.result is not None:
  s.write(
  '<div style="color: magenta; white-space: pre-wrap;'
  'padding: 10px; border: 1px solid; margin: 10px">'
  )
- s.write(pg.format(m.result))
+ s.write(html.escape(pg.format(m.result)))
  s.write('</div>')
  if 'usage' in m.metadata:
  s.write(
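Note: the two `html.escape` calls above close an HTML-injection gap in the report renderer: model output containing `<`, `>`, `&`, or quotes was previously written into the report page verbatim. A minimal sketch of the standard-library behavior being relied on:

    import html

    raw = '<b>bold?</b> & "quoted"'
    print(html.escape(raw))
    # -> &lt;b&gt;bold?&lt;/b&gt; &amp; &quot;quoted&quot;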
@@ -753,10 +755,12 @@ class Evaluation(Evaluable):

  # Constants.
  CACHE_JSON = 'cache.json'
- FAILURES_HTML = 'failures.html'
+ OOP_FAILURES_HTML = 'oop_failures.html'
+ NON_OOP_FAILURES_HTML = 'non_oop_failures.html'

  @functools.cached_property
  def hash(self) -> str:
+ """Returns the semantic-based hash of the evaluation."""
  if self.is_deterministic:
  identity = pg.format(self._identifiers(), compact=True)
  else:
@@ -805,6 +809,10 @@ class Evaluation(Evaluable):
  """Returns the complete rate."""
  return self.num_completed / self.num_examples

+ #
+ # Properties on failures.
+ #
+
  @property
  def failures(self) -> list[tuple[Any, Exception]]:
  """Returns the failed examples and their errors."""
@@ -815,6 +823,15 @@ class Evaluation(Evaluable):
  """Returns the number of failed examples."""
  return len(self.failures)

+ @functools.cached_property
+ def failure_breakdown(self) -> dict[str, int]:
+ """Returns the breakdown of failures."""
+ breakdown = collections.defaultdict(int)
+ for _, error in self.failures:
+ breakdown[_error_key(error)] += 1
+ sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
+ return pg.Dict({x[0]: x[1] for x in sorted_items})
+
  @property
  def failure_rate(self) -> float:
  """Returns the failure rate in range [0, 1]."""
@@ -822,6 +839,46 @@ class Evaluation(Evaluable):
  return 0.0
  return self.num_failures / self.num_completed

+ @functools.cached_property
+ def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
+ """Returns the OOP failures."""
+ return [item for item in self.failures
+ if isinstance(item[1], lf_structured.MappingError)]
+
+ @property
+ def num_oop_failures(self) -> int:
+ """Returns the number of OOP failures."""
+ return len(self.oop_failures)
+
+ @property
+ def oop_failure_rate(self) -> float:
+ """Returns the OOP failure rate in range [0, 1]."""
+ if self.num_completed == 0:
+ return 0.0
+ return self.num_oop_failures / self.num_completed
+
+ @functools.cached_property
+ def non_oop_failures(self) -> list[tuple[Any, Exception]]:
+ """Returns the non-OOP failures."""
+ return [item for item in self.failures
+ if not isinstance(item[1], lf_structured.MappingError)]
+
+ @property
+ def num_non_oop_failures(self) -> int:
+ """Returns the number of non-OOP failures."""
+ return len(self.non_oop_failures)
+
+ @property
+ def non_oop_failure_rate(self) -> float:
+ """Returns the non-OOP failure rate in range [0, 1]."""
+ if self.num_completed == 0:
+ return 0.0
+ return self.num_non_oop_failures / self.num_completed
+
+ #
+ # Properties on usage.
+ #
+
  @property
  def has_usage(self) -> bool:
  """Returns True if token usage is enabled."""
@@ -976,13 +1033,22 @@ class Evaluation(Evaluable):
  self._total_prompt_tokens = 0
  self._total_completion_tokens = 0
  self._num_usages = 0
+ self.__dict__.pop('oop_failures', None)
+ self.__dict__.pop('non_oop_failures', None)

  @property
- def failures_link(self) -> str | None:
- """Returns the link to the failures page."""
+ def oop_failures_link(self) -> str | None:
+ """Returns the link to the OOP failures page."""
  if self.dir is None:
  return None
- return self.link(os.path.join(self.dir, Evaluation.FAILURES_HTML))
+ return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))
+
+ @property
+ def non_oop_failures_link(self) -> str | None:
+ """Returns the link to the non-OOP failures page."""
+ if self.dir is None:
+ return None
+ return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))

  def _dryrun(
  self,
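Note: the `self.__dict__.pop(...)` calls above (and the matching ones added to `audit` further down) are how `functools.cached_property` values are invalidated: the decorator caches the computed value in the instance `__dict__` under the property name, so popping that key forces recomputation on next access. A minimal sketch:

    import functools

    class Eval:
      def __init__(self):
        self.items = [1, 2]

      @functools.cached_property
      def total(self):
        return sum(self.items)

    e = Eval()
    print(e.total)                 # 3 (computed, then cached in e.__dict__)
    e.items.append(3)
    print(e.total)                 # 3 (stale: served from the cache)
    e.__dict__.pop('total', None)  # invalidate, as the hunk above does
    print(e.total)                 # 6 (recomputed)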
@@ -1011,23 +1077,34 @@ class Evaluation(Evaluable):
  color='green',
  )

- with lf.use_settings(debug=debug):
- output_message = copy.process(example, **(self.additional_args or {}))
- if self.schema is None:
- output = output_message.text
- else:
- output = output_message.result
+ error, output_message = None, None

- if verbose:
+ try:
+ with lf.use_settings(debug=debug):
+ output_message = copy.process(example, **(self.additional_args or {}))
+ if self.schema is None:
+ output = output_message.text
+ else:
+ output = output_message.result
+
+ if verbose:
+ lf.console.write('')
+ lf.console.write(
+ str(output),
+ title='OUTPUT',
+ color='blue',
+ )
+ except lf_structured.MappingError as e:
  lf.console.write('')
  lf.console.write(
- str(output),
- title='OUTPUT',
- color='blue',
+ str(e),
+ title='ERROR',
+ color='red',
  )
+ error = e

- copy.audit(example, output_message, None, dryrun=True)
- result = copy.summarize()
+ copy.audit(example, output_message, error, dryrun=True)
+ result = copy.finalize()

  if verbose:
  lf.console.write('')
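Note: the dry run now distinguishes object-parsing (OOP) failures from hard errors: `lf_structured.MappingError` is caught, reported in red, and recorded via `audit` instead of aborting the run. A reduced, self-contained skeleton of that control flow; `MappingError` and `process` here are illustrative stand-ins, not the real langfun APIs:

    class MappingError(Exception):  # stand-in for lf_structured.MappingError
      def __init__(self, msg, lm_response=None):
        super().__init__(msg)
        self.lm_response = lm_response

    def process(example):  # stand-in for copy.process
      raise MappingError('schema mismatch', lm_response='raw LM text')

    error, output_message = None, None
    try:
      output_message = process('example-1')
    except MappingError as e:
      error = e  # recorded; the dry run keeps going
    # audit(example, output_message, error) then runs either way.
    print(type(error).__name__, output_message)  # MappingError None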
@@ -1087,7 +1164,7 @@ class Evaluation(Evaluable):
  self.cache.save()

  # Summarize result.
- self._result = self.summarize()
+ self._result = self.finalize()
  if verbose:
  lf.console.write(
  str(self.result),
@@ -1143,13 +1220,13 @@ class Evaluation(Evaluable):
  def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
  return {
  'Model': self.lm.model_id,
- 'Succeeded': f'%.{self.report_precision}f%% (%d/%d)' % (
- progress.success_rate * 100,
+ 'Succeeded': '%s (%d/%d)' % (
+ self._format_rate(progress.success_rate),
  progress.succeeded,
  progress.completed,
  ),
- 'Failed': f'%.{self.report_precision}f%% (%d/%d)' % (
- progress.failure_rate * 100,
+ 'Failed': '%s (%d/%d)' % (
+ self._format_rate(progress.failure_rate),
  progress.failed,
  progress.completed,
  ),
@@ -1159,21 +1236,20 @@ class Evaluation(Evaluable):
  assert self.result is not None
  m = self.result.metrics
  return (
- f'COMPLETED(%s): Successes=%.{self.report_precision}f%% (%d/%d)'
- f' Failures=%.{self.report_precision}f%% (%d/%d)'
+ 'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
  % (
  run_status,
- (1 - m.failure_rate) * 100,
+ self._format_rate(1 - m.failure_rate),
  m.total - m.failures,
  m.total,
- m.failure_rate * 100,
+ self._format_rate(m.failure_rate),
  m.failures,
  m.total,
  )
  )

- def summarize(self) -> pg.Dict:
- """Summarizes the evaluation result."""
+ def finalize(self) -> pg.Dict:
+ """Finalizes the evaluation result."""
  if self.cache:
  cache_stats = dict(
  use_cache=True,
@@ -1210,12 +1286,18 @@ class Evaluation(Evaluable):
  total=self.num_completed,
  failures=self.num_failures,
  failure_rate=self.failure_rate,
+ oop_failures=self.num_oop_failures,
+ oop_failure_rate=self.oop_failure_rate,
+ non_oop_failures=self.num_non_oop_failures,
+ non_oop_failure_rate=self.non_oop_failure_rate,
+ failure_breakdown=self.failure_breakdown,
  ),
  usage=usage,
  )
  return result

- def summarize_html(self) -> str:
+ def summary_card(self) -> str:
+ """Returns summary card in HTML."""
  s = io.StringIO()
  definition = _html_repr(self, compact=False, escape=True)
  s.write('<div><table><tr><td>')
@@ -1230,18 +1312,19 @@ class Evaluation(Evaluable):
  s.write(
  f'<a target="_blank" title="{definition}" '
  f'href="{self.index_link}">{self.hash}</a>'
+ f' &nbsp;[<a href="{self.link(self.dir)}">dir</a>]'
  '</td></tr><tr><td>'
  )
- self._render_metric(s)
+ self._render_summary_metrics(s)

  # Summarize average usage.
  if self.result.usage is not None:
- self._render_usage(s)
+ self._render_summary_usage(s)

  s.write('</td></tr></table></div>')
  return s.getvalue()

- def _render_usage(self, s: io.StringIO) -> None:
+ def _render_summary_usage(self, s: io.StringIO) -> None:
  """Renders usage in HTML."""
  usage = self.result.usage
  total = usage.total_prompt_tokens + usage.total_completion_tokens
@@ -1255,20 +1338,66 @@ class Evaluation(Evaluable):
  f'" style="color:gray">({total} tokens)</a>'
  )

- def _render_metric(self, s: io.StringIO) -> None:
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
+
+ # OOP failures.
+ oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
+ if m.oop_failures:
+ oop_failure_title += '&#013;'
+ for name, count in m.failure_breakdown.items():
+ if name.startswith('MappingError'):
+ oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+ name.removeprefix('MappingError.'),
+ self._format_rate(count / m.total),
+ count,
+ m.total,
+ )
+
+ extra_style = ''
+ if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
+ extra_style = ';font-weight:bold'
  s.write(
- '<a title="Failures (%d/%d)" href="%s" style="color:red">%s</a>'
+ '<a title="%s" href="%s" style="color:magenta%s">%s</a>'
  % (
- m.failures,
- m.total,
- self.failures_link,
- f'%.{self.report_precision}f%% ' % (m.failure_rate * 100),
+ oop_failure_title,
+ self.oop_failures_link,
+ extra_style,
+ self._format_rate(m.oop_failure_rate),
+ )
+ )
+ s.write(' | ')
+
+ # Non-OOP failures.
+ non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
+ if m.non_oop_failures:
+ non_oop_failure_title += '&#013;'
+ for name, count in m.failure_breakdown.items():
+ if not name.startswith('MappingError'):
+ non_oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+ name,
+ self._format_rate(count / m.total),
+ count,
+ m.total,
+ )
+
+ extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
+ s.write(
+ '<a title="%s" href="%s" style="color:red%s">%s</a>'
+ % (
+ non_oop_failure_title,
+ self.non_oop_failures_link,
+ extra_style,
+ self._format_rate(m.non_oop_failure_rate),
  )
  )

+ def _format_rate(self, rate: float) -> str:
+ """Formats a rate."""
+ return f'%.{self.report_precision}f%% ' % (rate * 100)
+
  def audit(
  self,
  example: Any,
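Note: `_format_rate` centralizes the percent formatting that was previously repeated inline; `report_precision` controls the number of decimal places. For example, with a hypothetical `report_precision` of 1:

    report_precision = 1  # hypothetical value; this is an Evaluation attribute
    rate = 0.5
    print(f'%.{report_precision}f%% ' % (rate * 100))  # -> '50.0% '

The trailing space in the result also explains why the `'Successes=%s(%d/%d)'` format string above omits a separating space.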
@@ -1287,7 +1416,13 @@ class Evaluation(Evaluable):
  dryrun: Whether or not auditing takes place during dryrun.
  """
  if error is not None:
- self._failures.append((example, str(error)))
+ self._failures.append((example, error))
+
+ # Invalidate cached properties derived from failures.
+ self.__dict__.pop('oop_failures', None)
+ self.__dict__.pop('non_oop_failures', None)
+ self.__dict__.pop('failure_breakdown', None)
+
  if isinstance(error, lf_structured.MappingError):
  message = error.lm_response
  else:
@@ -1333,16 +1468,26 @@ class Evaluation(Evaluable):
  # Save failures.
  pg.save(
  [
- pg.Dict(
- input=input, error=lf.text_formatting.decolored(str(error))
- )
- for input, error in self.failures
+ pg.Dict(input=input, error=_format_error(error))
+ for input, error in self.oop_failures
+ ],
+ os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
+ )
+ pg.save(
+ self._html([self._render_result, self._render_oop_failures]),
+ os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
+ file_format='txt',
+ )
+ pg.save(
+ [
+ pg.Dict(input=input, error=_format_error(error))
+ for input, error in self.non_oop_failures
  ],
- os.path.join(self.dir, Evaluation.FAILURES_JSON),
+ os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
  )
  pg.save(
- self._html([self._render_result, self._render_failures]),
- os.path.join(self.dir, Evaluation.FAILURES_HTML),
+ self._html([self._render_result, self._render_non_oop_failures]),
+ os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
  file_format='txt',
  )

@@ -1357,7 +1502,8 @@ class Evaluation(Evaluable):
  )
  if self.result.usage is not None:
  s.write('<td>Usage</td>')
- s.write('<td>Failures</td>')
+ s.write('<td>OOP Failures</td>')
+ s.write('<td>Non-OOP Failures</td>')

  def _render_result_row(self, s: io.StringIO) -> None:
  s.write(
@@ -1385,16 +1531,29 @@ class Evaluation(Evaluable):
  # Usage.
  if self.result.usage is not None:
  s.write('<td>')
- self._render_usage(s)
+ self._render_summary_usage(s)
  s.write('</td>')

- # Failures.
+ # OOP failures.
+ s.write(
+ '<td><span style="color:magenta">%s</span>%s</td>'
+ % (
+ self._format_rate(self.oop_failure_rate),
+ '<a href="%s">(%d/%d)</a>'
+ % (self.oop_failures_link,
+ self.num_oop_failures,
+ self.num_completed),
+ )
+ )
+ # Non-OOP failures.
  s.write(
- '<td><span style="color:orange">%s</span>%s</td>'
+ '<td><span style="color:red">%s</span>%s</td>'
  % (
- f'%.{self.report_precision}f%%' % (self.failure_rate * 100),
+ self._format_rate(self.non_oop_failure_rate),
  '<a href="%s">(%d/%d)</a>'
- % (self.failures_link, self.num_failures, self.num_completed),
+ % (self.non_oop_failures_link,
+ self.num_non_oop_failures,
+ self.num_completed),
  )
  )

@@ -1408,24 +1567,77 @@ class Evaluation(Evaluable):
  else:
  return 'cyan'

- def _render_failures(self, s: io.StringIO) -> None:
+ def _render_oop_failures(self, s: io.StringIO) -> None:
+ self._render_failures(s, '^MappingError.*', error_color='magenta')
+
+ def _render_non_oop_failures(self, s: io.StringIO) -> None:
+ self._render_failures(s, '^(?!MappingError).*', error_color='red')
+
+ def _render_failures(
+ self, s: io.StringIO, error_regex: str, error_color: str) -> None:
  """Formats the failed cases into html."""
+ # Failure summary.
  s.write(
- '<h2> Failed Cases </h2>'
+ '<h2> Error Summary </h2>'
  '<div style="white-space:pre">\n'
  '<table style="border:1px solid">'
- '<tr class="header"><td>No.</td><td>Input</td><td>Error</td></tr>'
+ '<tr class="header"><td>Error type</td><td>Stats</td></tr>'
  )
+ error_regex = re.compile(error_regex)
+ if self.result.metrics.failure_breakdown:
+ for name, count in self.result.metrics.failure_breakdown.items():
+ if not error_regex.match(name):
+ continue
+
+ link = f'<a href="#{name}">{name}</a>'
+ error_rate = self._format_rate(count / self.result.metrics.total)
+ stats = (f'<span style="color:{error_color}">{error_rate} '
+ f'({count}/{self.result.metrics.total})</span>')
+ s.write(f'<tr><td>{link}</td><td>{stats}</td></tr>')
+ s.write(
+ '</table></div>'
+ '<h2> Failed Cases </h2>'
+ '<div style="white-space:pre">'
+ )
+ # Failure details by error type.
+ failures_by_error = collections.defaultdict(list)
+ for example, error in self.failures:
+ error_name = _error_key(error)
+ if error_regex.match(error_name):
+ failures_by_error[error_name].append((example, error))
+
+ for error_key, failures in failures_by_error.items():
+ s.write(
+ f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
+ f'(count={len(failures)})</h3>'
+ '<table style="border:1px solid">'
+ '<tr class="header"><td>No.</td><td>Input</td>'
+ '<td>LM invocation</td><td>Error</td></tr>'
+ )
+ for i, (example, error) in enumerate(failures):
+ lm_response = None
+ if isinstance(error, lf.structured.MappingError):
+ lm_response = error.lm_response
+ error = error.cause
+
+ bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
+ s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
+ s.write('<td style="color:green;white-space:pre-wrap">')
+ s.write(pg.format(example, verbose=False))
+ s.write('</td><td>')
+ if lm_response is not None:
+ self._render_message(lm_response, s)
+ s.write(f'</td><td style="color:{error_color};white-space:pre">')
+ s.write(_format_error(error))
+ s.write('</td></tr>')
+ s.write('</table>')
+ s.write('</div>')

- for i, (example, error) in enumerate(self.failures):
- bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
- s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
- input_str = pg.format(example, verbose=False)
- s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
- error_str = lf.text_formatting.decolored(str(error))
- s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
- s.write('</tr>')
- s.write('</table></div>')
+ @classmethod
+ def visualize(cls, evaluations: list['Evaluation']) -> str | None:
+ """Visualizes a list of evaluations of this task in HTML."""
+ del evaluations
+ return None


  @pg.functor()
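Note: the two thin wrappers select which failures `_render_failures` shows by matching the error key against a regex: `^MappingError.*` selects OOP failures, and the negative lookahead `^(?!MappingError).*` selects everything else. A quick illustration:

    import re

    oop = re.compile('^MappingError.*')
    non_oop = re.compile('^(?!MappingError).*')
    for key in ('MappingError.SchemaError.TypeError', 'TimeoutError'):
      print(key, bool(oop.match(key)), bool(non_oop.match(key)))
    # MappingError.SchemaError.TypeError True False
    # TimeoutError False True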
@@ -1578,7 +1790,7 @@ class Summary(pg.Object):
  if e is None:
  s.write('<span style="color: gray">N/A<span>')
  else:
- s.write(e.summarize_html())
+ s.write(e.summary_card())
  s.write('</td>')
  s.write('</tr>')
  s.write('</table>')
@@ -1653,13 +1865,22 @@ class Summary(pg.Object):
  s.write('<html><body>')
  for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
  table_id = task.__name__.lower()
+ evaluations = self.select(task=task).evaluations
+ table = Summary.Table.from_evaluations(evaluations, pivot_field)
  s.write('<div>')
- s.write(f'<a id="{table_id}"')
- s.write(f'<h2><a href="#{table_id}">{task.__name__}</a></h2>')
- s.write('</a>')
- table = Summary.Table.from_evaluations(
- self.select(task=task).evaluations, pivot_field
+ s.write(
+ f'<a id="{table_id}" href="#{table_id}">'
+ f'<h2>{task.__name__}</h2></a>'
  )
+
+ # Allow users to plug in visualization code (e.g. matplotlib) in the
+ # summary page.
+ visual_part = task.visualize(evaluations)
+ if visual_part:
+ s.write(visual_part)
+
+ s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
+ s.write('<hr/>')
  s.write(table.html())
  s.write('</div>')
  s.write('</body></html>')
@@ -1685,6 +1906,7 @@ class Summary(pg.Object):
  experiment=entry,
  dir=entry.dir,
  metrics=entry.result.metrics if entry.result else None,
+ usage=entry.result.usage if entry.result else None,
  )
  )
  task_results[task.__name__] = results
@@ -1833,6 +2055,21 @@ class Summary(pg.Object):
  return result.join()


+ def _format_error(error: Exception):
+ """Formats an error into a string."""
+ return (f'({error.__class__.__name__}) '
+ + lf.text_formatting.decolored(str(error)))
+
+
+ def _error_key(error: Exception) -> str:
+ """Returns the key for an error."""
+ error_names = []
+ while error is not None:
+ error_names.append(error.__class__.__name__)
+ error = getattr(error, 'cause', None)
+ return '.'.join(error_names)
+
+
  def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
  """Formats prompt in HTML."""
  if type(value) is lf.Template:  # pylint: disable=unidiomatic-typecheck
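Note: `_error_key` walks an error's `cause` chain and joins the class names into a dotted key, which is what `failure_breakdown` and the report anchors above group by. A self-contained sketch; the exception classes here are stand-ins for the real langfun errors:

    def error_key(error):  # mirrors _error_key above
      names = []
      while error is not None:
        names.append(error.__class__.__name__)
        error = getattr(error, 'cause', None)
      return '.'.join(names)

    class SchemaError(Exception):
      def __init__(self, cause):
        self.cause = cause

    class MappingError(Exception):
      def __init__(self, cause):
        self.cause = cause

    print(error_key(MappingError(SchemaError(TypeError('bad')))))
    # -> 'MappingError.SchemaError.TypeError' (the key seen in the tests below)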
langfun/core/eval/base_test.py CHANGED
@@ -220,7 +220,18 @@ class EvaluationTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
+ metrics=dict(
+ total=2,
+ failures=1,
+ failure_rate=0.5,
+ oop_failures=1,
+ oop_failure_rate=0.5,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ }
+ ),
  usage=dict(
  total_prompt_tokens=774,
  total_completion_tokens=25,
@@ -235,12 +246,20 @@ class EvaluationTest(unittest.TestCase):
  os.path.exists(os.path.join(s.dir, base.Evaluation.EXPERIMENT_JSON)))
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.RESULT_JSON)))
+ self.assertTrue(
+ os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_JSON)))
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_JSON)))
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.CACHE_JSON)))
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
  self.assertTrue(
- os.path.exists(os.path.join(s.dir, base.Evaluation.FAILURES_HTML)))
+ os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))
  self.assertTrue(
  os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
  )
@@ -274,7 +293,10 @@ class EvaluationTest(unittest.TestCase):
  self.assertFalse(
  os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
  self.assertFalse(
- os.path.exists(os.path.join(s.dir, base.Evaluation.FAILURES_HTML)))
+ os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+ self.assertFalse(
+ os.path.exists(
+ os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))

  def test_load(self):
  lm = fake.StaticResponse('Solution(final_answer=2)')
@@ -312,7 +334,16 @@ class EvaluationTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=0, failure_rate=0.0),
+ metrics=dict(
+ total=2,
+ failures=0,
+ failure_rate=0.0,
+ oop_failures=0,
+ oop_failure_rate=0.0,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={},
+ ),
  usage=s.children[1].result.usage,
  ),
  },
@@ -363,7 +394,18 @@ class EvaluationTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
+ metrics=dict(
+ total=2,
+ failures=1,
+ failure_rate=0.5,
+ oop_failures=1,
+ oop_failure_rate=0.5,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ }
+ ),
  usage=s.children[0].result.usage,
  ),
  s.children[1].id: dict(
@@ -378,7 +420,18 @@ class EvaluationTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
+ metrics=dict(
+ total=2,
+ failures=1,
+ failure_rate=0.5,
+ oop_failures=1,
+ oop_failure_rate=0.5,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ }
+ ),
  usage=s.children[1].result.usage,
  ),
  },
@@ -475,7 +528,7 @@ class SuiteTest(unittest.TestCase):
  self.assertEqual(s.hash, '26e6cc25')
  s.run()
  expected = {
- s.children[0].id: dict(
+ 'Evaluation@0fade07d': dict(
  experiment_setup=dict(
  id=s.children[0].id,
  dir=s.children[0].dir,
@@ -487,48 +540,46 @@ class SuiteTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
+ metrics=dict(
+ total=2,
+ failures=1,
+ failure_rate=0.5,
+ oop_failures=1,
+ oop_failure_rate=0.5,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ }
+ ),
  usage=s.children[0].result.usage,
  ),
- s.children[1].id: {
- s.children[1]
- .children[0]
- .id: dict(
- experiment_setup=dict(
- id=s.children[1].children[0].id,
- dir=s.children[1].children[0].dir,
- model='StaticSequence',
- prompt_template='{{example.question}}',
- method='call',
- schema_fn='answer_schema()',
- ),
- cache_stats=dict(
- use_cache=True, num_queries=4, num_hits=1, num_updates=3
- ),
- metrics=dict(total=2, failures=2, failure_rate=1.0),
- usage=s.children[1].children[0].result.usage,
+ 'Evaluation@ae86c703': dict(
+ experiment_setup=dict(
+ id=s.children[1].children[0].id,
+ dir=s.children[1].children[0].dir,
+ model='StaticSequence',
+ prompt_template='{{example.question}}',
+ method='call',
+ schema_fn='answer_schema()',
  ),
- s.children[1]
- .children[2]
- .id: dict(
- experiment_setup=dict(
- id=s.children[1].children[2].id,
- dir=s.children[1].children[2].dir,
- model='StaticSequence',
- prompt_template='{{example.question}}',
- method='query',
- schema_fn='answer_schema()',
- ),
- cache_stats=dict(
- use_cache=True,
- num_queries=2,
- num_hits=0,
- num_updates=2,
- ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
- usage=s.children[1].children[2].result.usage,
+ cache_stats=dict(
+ use_cache=True, num_queries=4, num_hits=1, num_updates=3
  ),
- },
+ metrics=dict(
+ total=2,
+ failures=2,
+ failure_rate=1.0,
+ oop_failures=2,
+ oop_failure_rate=1.0,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 2
+ }
+ ),
+ usage=s.children[1].children[0].result.usage,
+ ),
  }
  self.assertEqual(s.result, expected)

langfun/core/eval/matching.py CHANGED
@@ -119,18 +119,18 @@ class Matching(base.Evaluation):
  del progress
  return {
  'Model': self.lm.model_id,
- 'Matches': f'%.{self.report_precision}f%% (%d/%d)' % (
- self.match_rate * 100,
+ 'Matches': '%s (%d/%d)' % (
+ self._format_rate(self.match_rate),
  self.num_matches,
  self.num_completed,
  ),
- 'Mismatches': f'%.{self.report_precision}f%% (%d/%d)' % (
- self.mismatch_rate * 100,
+ 'Mismatches': '%s (%d/%d)' % (
+ self._format_rate(self.mismatch_rate),
  self.num_mismatches,
  self.num_completed,
  ),
- 'Failed': f'%.{self.report_precision}f%% (%d/%d)' % (
- self.failure_rate * 100,
+ 'Failed': '%s (%d/%d)' % (
+ self._format_rate(self.failure_rate),
  self.num_failures,
  self.num_completed,
  ),
@@ -140,24 +140,25 @@ class Matching(base.Evaluation):
  assert self.result is not None
  m = self.result.metrics
  return (
- f'COMPLETED(%s): Matches=%.{self.report_precision}f%% (%d/%d)'
- f' Mismatches=%.{self.report_precision}f%% (%d/%d)'
- f' Failures=%.{self.report_precision}f%% (%d/%d)'
+ 'COMPLETED(%s):'
+ ' Matches=%s (%d/%d)'
+ ' Mismatches=%s (%d/%d)'
+ ' Failures=%s (%d/%d)'
  ) % (
  run_status,
- m.match_rate * 100,
+ self._format_rate(m.match_rate),
  m.num_matches,
  m.total,
- m.mismatch_rate * 100,
+ self._format_rate(m.mismatch_rate),
  m.num_mismatches,
  m.total,
- m.failure_rate * 100,
+ self._format_rate(m.failure_rate),
  m.failures,
  m.total,
  )

- def summarize(self) -> pg.Dict:
- result = super().summarize()
+ def finalize(self) -> pg.Dict:
+ result = super().finalize()
  result.metrics.update(
  num_matches=self.num_matches,
  match_rate=self.match_rate,
@@ -218,9 +219,9 @@ class Matching(base.Evaluation):
  def _render_result_row(self, s: io.StringIO):
  super()._render_result_row(s)
  s.write(
- '<td><span style="color:red">%s</span>%s</td>'
+ '<td><span style="color:orange">%s</span>%s</td>'
  % (
- f'%.{self.report_precision}f%% ' % (self.mismatch_rate * 100),
+ self._format_rate(self.mismatch_rate),
  '<a href="%s">(%d/%d)</a>'
  % (self.mismatches_link, self.num_mismatches, self.num_completed),
  )
@@ -228,13 +229,13 @@ class Matching(base.Evaluation):
  s.write(
  '<td><span style="color:green">%s</span>%s</td>'
  % (
- f'%.{self.report_precision}f%% ' % (self.match_rate * 100),
+ self._format_rate(self.match_rate),
  '<a href="%s">(%d/%d)</a>'
  % (self.matches_link, self.num_matches, self.num_completed),
  )
  )

- def _render_metric(self, s: io.StringIO) -> None:
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
@@ -244,7 +245,7 @@ class Matching(base.Evaluation):
  m.num_matches,
  m.total,
  self.matches_link,
- f'%.{self.report_precision}f%% ' % (m.match_rate * 100),
+ self._format_rate(m.match_rate),
  )
  )
  s.write(' | ')
@@ -254,11 +255,11 @@ class Matching(base.Evaluation):
  m.num_mismatches,
  m.total,
  self.mismatches_link,
- f'%.{self.report_precision}f%% ' % (m.mismatch_rate * 100),
+ self._format_rate(m.mismatch_rate),
  )
  )
  s.write(' | ')
- super()._render_metric(s)
+ super()._render_summary_metrics(s)

  def _render_matches(self, s: io.StringIO) -> None:
  """Formats the matched cases into html."""

langfun/core/eval/matching_test.py CHANGED
@@ -120,6 +120,13 @@ class MatchingTest(unittest.TestCase):
  total=4,
  failures=1,
  failure_rate=0.25,
+ oop_failures=1,
+ oop_failure_rate=0.25,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ },
  num_matches=2,
  match_rate=0.5,
  num_mismatches=1,
@@ -160,7 +167,14 @@ class MatchingTest(unittest.TestCase):
  self.assertTrue(
  os.path.exists(
  os.path.join(
- s.dir, matching.Matching.FAILURES_JSON
+ s.dir, matching.Matching.OOP_FAILURES_JSON
+ )
+ )
+ )
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(
+ s.dir, matching.Matching.NON_OOP_FAILURES_JSON
  )
  )
  )
@@ -175,7 +189,14 @@ class MatchingTest(unittest.TestCase):
  self.assertTrue(
  os.path.exists(
  os.path.join(
- s.dir, matching.Matching.FAILURES_HTML
+ s.dir, matching.Matching.OOP_FAILURES_HTML
+ )
+ )
+ )
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(
+ s.dir, matching.Matching.NON_OOP_FAILURES_HTML
  )
  )
  )

langfun/core/eval/scoring.py CHANGED
@@ -113,8 +113,8 @@ class Scoring(base.Evaluation):
  m.total,
  )

- def summarize(self) -> pg.Dict:
- result = super().summarize()
+ def finalize(self) -> pg.Dict:
+ result = super().finalize()
  result.metrics.update(
  num_scored=self.num_scored,
  score_rate=self.score_rate,
@@ -168,7 +168,7 @@ class Scoring(base.Evaluation):
  )
  )

- def _render_metric(self, s: io.StringIO) -> None:
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
@@ -182,7 +182,7 @@ class Scoring(base.Evaluation):
  )
  )
  s.write(' | ')
- super()._render_metric(s)
+ super()._render_summary_metrics(s)

  def _render_scored(self, s: io.StringIO) -> None:
  """Formats the matched cases into html."""

langfun/core/eval/scoring_test.py CHANGED
@@ -98,6 +98,11 @@ class ScoringTest(unittest.TestCase):
  total=2,
  failures=0,
  failure_rate=0.0,
+ oop_failures=0,
+ oop_failure_rate=0.0,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={},
  num_scored=2,
  score_rate=1.0,
  avg_score=0.5,
@@ -124,7 +129,12 @@ class ScoringTest(unittest.TestCase):
  )
  self.assertTrue(
  os.path.exists(
- os.path.join(s.dir, scoring.Scoring.FAILURES_JSON)
+ os.path.join(s.dir, scoring.Scoring.OOP_FAILURES_JSON)
+ )
+ )
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(s.dir, scoring.Scoring.NON_OOP_FAILURES_JSON)
  )
  )
  self.assertTrue(
@@ -143,7 +153,14 @@ class ScoringTest(unittest.TestCase):
  self.assertTrue(
  os.path.exists(
  os.path.join(
- s.dir, scoring.Scoring.FAILURES_HTML
+ s.dir, scoring.Scoring.OOP_FAILURES_HTML
+ )
+ )
+ )
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(
+ s.dir, scoring.Scoring.NON_OOP_FAILURES_HTML
  )
  )
  )

langfun/core/llms/openai.py CHANGED
@@ -234,7 +234,7 @@ class OpenAI(lf.LanguageModel):
  if isinstance(chunk, str):
  item = dict(type='text', text=chunk)
  elif isinstance(chunk, lf_modalities.Image) and chunk.uri:
- item = dict(type='image_url', image_url=chunk.uri)
+ item = dict(type='image_url', image_url=dict(url=chunk.uri))
  else:
  raise ValueError(f'Unsupported modality object: {chunk!r}.')
  content.append(item)
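Note: this fixes the payload shape for vision inputs: the OpenAI chat completions API expects `image_url` to be an object with a `url` field, not a bare URI string. The resulting message content looks like the following (the URL is a placeholder):

    content = [
        dict(type='text', text='Describe this image.'),
        # Previously image_url was the bare URI string, which the API rejects.
        dict(type='image_url', image_url=dict(url='https://example.com/cat.png')),
    ]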
langfun/core/llms/openai_test.py CHANGED
@@ -66,7 +66,8 @@ def mock_chat_completion_query_vision(messages, *, n=1, **kwargs):
  del kwargs
  choices = []
  urls = [
- c['image_url'] for c in messages[0]['content'] if c['type'] == 'image_url'
+ c['image_url']['url']
+ for c in messages[0]['content'] if c['type'] == 'image_url'
  ]
  for k in range(n):
  choices.append(pg.Dict(

langfun-0.0.2.dev20240430.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: langfun
- Version: 0.0.2.dev20240428
+ Version: 0.0.2.dev20240430
  Summary: Langfun: Language as Functions.
  Home-page: https://github.com/google/langfun
  Author: Langfun Authors

langfun-0.0.2.dev20240430.dist-info/RECORD CHANGED
@@ -40,12 +40,12 @@ langfun/core/coding/python/parsing_test.py,sha256=9vAWF484kWIm6JZq8NFiMgKUDhXV-d
  langfun/core/coding/python/permissions.py,sha256=1QWGHvzL8MM0Ok_auQ9tURqZHtdOfJaDpBzZ29GUE-c,2544
  langfun/core/coding/python/permissions_test.py,sha256=w5EDb8QxpxgJyZkojyzVWQvDfg366zn99-g__6TbPQ0,2699
  langfun/core/eval/__init__.py,sha256=NSmPe2lxdxFoY4h8VkNyONPAFtOTUpK9WhmZRaqUgiI,1335
- langfun/core/eval/base.py,sha256=1svQoZ0C2DGCVLvr0Qt0TcrlJKtJptdoOBVAxkxnHoU,60264
- langfun/core/eval/base_test.py,sha256=g3lRp2dcq411cLYHpn8spI4feyv2nOccs5PlFBwav3g,22512
- langfun/core/eval/matching.py,sha256=Ks-L9vyMNDj4R8zFczzByT_4DK2wAFatyCZupdHzx_g,9932
- langfun/core/eval/matching_test.py,sha256=5Qs9ETaLoyNcJ43f-_bK2Bfe--2Y3U79DnSA55-l6pc,4932
- langfun/core/eval/scoring.py,sha256=A3y6HMcmpREQPqUD-WtImYOb2jG-23WpcUO2-WGhel0,6360
- langfun/core/eval/scoring_test.py,sha256=vxJR-2rBghUDUOCLTIMd6M3i1F8xDhA-U45wuBHVfc0,4058
+ langfun/core/eval/base.py,sha256=ImIdyjh89yWUbFoSI12xzpcSmvB34y8_F0WAcUi-4sg,68405
+ langfun/core/eval/base_test.py,sha256=SEo43ftMscpZ5QV6AGaywrA6SobVaG_P7sUbjoBGqg8,24081
+ langfun/core/eval/matching.py,sha256=Y4vFoNTQEOwko6IA8l9OZ52-vt52e3VGmcTtvLA67wM,9782
+ langfun/core/eval/matching_test.py,sha256=f7iVyXH5KGJBWt4Wp14Bt9J3X59A6Ayfog9MbuFvPew,5532
+ langfun/core/eval/scoring.py,sha256=1J7IATo-8FXUR0SBqk9icztHiM0lWkBFcWUo-vUURgQ,6376
+ langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se2SXM,4546
  langfun/core/llms/__init__.py,sha256=1bPg1QI8duOZCYINm-jWi094x0JtLmsk4KX60qIC_gs,3245
  langfun/core/llms/anthropic.py,sha256=7W9YdPN3SlAFhAIQlihMkrpo7tTY_4NvD0KIlCrqcsk,8505
  langfun/core/llms/anthropic_test.py,sha256=TMM30myyEhwF99Le4RvJEXOn8RYl0q1FRkt9Q9nl1jk,5540
@@ -57,8 +57,8 @@ langfun/core/llms/groq.py,sha256=NaGItVL_pkOpqPpI4bPGU27xLFRoaeizZ49v2s-4ERs,784
  langfun/core/llms/groq_test.py,sha256=M6GtlrsOvDun_j-sR8cPh4W_moHWZNSTiThu3kuwbbc,5281
  langfun/core/llms/llama_cpp.py,sha256=Y_KkMUf3Xfac49koMUtUslKl3h-HWp3-ntq7Jaa3bdo,2385
  langfun/core/llms/llama_cpp_test.py,sha256=ZxC6defGd_HX9SFRU9U4cJiQnBKundbOrchbXuC1Z2M,1683
- langfun/core/llms/openai.py,sha256=06nPhmw0zIA5Zqv3eqsrZtYLHnKwW7N8yt3LlFUFVpI,13247
- langfun/core/llms/openai_test.py,sha256=MiLqBaYliAkWVEwOBmX3HTj_eAuWLv77q8-I3VyVEBU,14841
+ langfun/core/llms/openai.py,sha256=rPwO4qPGEwbB4O7TaQD0spg_PXIfF2ioRI_ilE3Pg6Y,13257
+ langfun/core/llms/openai_test.py,sha256=asSA1sVy_7hnXioD_2HTxtSDpVTKBUO_EjZuyHpwbn0,14854
  langfun/core/llms/cache/__init__.py,sha256=QAo3InUMDM_YpteNnVCSejI4zOsnjSMWKJKzkb3VY64,993
  langfun/core/llms/cache/base.py,sha256=cFfYvOIUae842pncqCAsRvqXCk2AnAsRYVx0mcIoAeY,3338
  langfun/core/llms/cache/in_memory.py,sha256=YfFyJEhLs73cUiB0ZfhMxYpdE8Iuxxw-dvMFwGHTSHw,4742
@@ -101,8 +101,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
  langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
  langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
  langfun/core/templates/selfplay_test.py,sha256=DYVrkk7uNKCqJGEHH31HssU2BPuMItU1vJLzfcXIlYg,2156
- langfun-0.0.2.dev20240428.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
- langfun-0.0.2.dev20240428.dist-info/METADATA,sha256=-0Gz3prcRxBlC7ikhHPW0_MirZXL4QcmdcKCxG_LSR4,3436
- langfun-0.0.2.dev20240428.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- langfun-0.0.2.dev20240428.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
- langfun-0.0.2.dev20240428.dist-info/RECORD,,
+ langfun-0.0.2.dev20240430.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+ langfun-0.0.2.dev20240430.dist-info/METADATA,sha256=RpEIB1auHihqOoDrPnFQaYqgpqxFKA9_Z9iuCfPxe5s,3436
+ langfun-0.0.2.dev20240430.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ langfun-0.0.2.dev20240430.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+ langfun-0.0.2.dev20240430.dist-info/RECORD,,