themis-eval 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. themis/__init__.py +5 -2
  2. themis/_version.py +14 -1
  3. themis/api.py +83 -145
  4. themis/backends/storage.py +5 -0
  5. themis/cli/commands/info.py +2 -11
  6. themis/cli/main.py +231 -40
  7. themis/comparison/engine.py +7 -13
  8. themis/core/entities.py +4 -0
  9. themis/evaluation/metric_pipeline.py +12 -0
  10. themis/evaluation/pipeline.py +22 -0
  11. themis/evaluation/pipelines/__init__.py +4 -0
  12. themis/evaluation/pipelines/composable_pipeline.py +55 -0
  13. themis/evaluation/pipelines/standard_pipeline.py +18 -1
  14. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
  15. themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
  16. themis/experiment/__init__.py +2 -2
  17. themis/experiment/cache_manager.py +15 -1
  18. themis/experiment/definitions.py +1 -1
  19. themis/experiment/orchestrator.py +21 -11
  20. themis/experiment/share.py +264 -0
  21. themis/experiment/storage.py +345 -298
  22. themis/generation/plan.py +28 -6
  23. themis/generation/router.py +22 -4
  24. themis/generation/runner.py +16 -1
  25. themis/presets/benchmarks.py +602 -17
  26. themis/server/app.py +38 -26
  27. themis/session.py +125 -0
  28. themis/specs/__init__.py +7 -0
  29. themis/specs/execution.py +26 -0
  30. themis/specs/experiment.py +33 -0
  31. themis/specs/storage.py +18 -0
  32. themis/storage/__init__.py +6 -0
  33. themis/storage/experiment_storage.py +7 -0
  34. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
  35. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/RECORD +38 -31
  36. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
  37. themis/experiment/builder.py +0 -151
  38. themis/experiment/export_csv.py +0 -159
  39. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
  40. {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,14 @@
1
- themis/__init__.py,sha256=rQL3njf3i5lnAcmu0HuRzGGMELbA9xX21hzw4HrbIxw,1394
2
- themis/_version.py,sha256=y0Oqv0Je2udPmKCy5_D8Lib7GNLGxtLVp8b5WdavITg,378
3
- themis/api.py,sha256=flZTbU-jRcbv7oXcfRKG4hkZjASmWlT52A4PghKj9G0,17700
1
+ themis/__init__.py,sha256=YPexmyPqbANhr0Yzm46FDWgdBLjAX_UoZqywte9WJ84,1476
2
+ themis/_version.py,sha256=b5-7SWk_hYOedKy8IUPxaubXc0KS0hRjhUsY8PkIbl0,818
3
+ themis/api.py,sha256=zK6_RiiYTcXiV6uVQu1fNRdTqm-tXT_Df-CfrLXrIhk,15509
4
4
  themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ themis/session.py,sha256=H1Pkr71b90sa94YeRCIyffIxE2gQPmtfzo-zNnPLrdM,4311
5
6
  themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
6
7
  themis/backends/execution.py,sha256=RAFuB9ri8TMil5PcnsisypKO2ViyLFXj08P_vjNYguU,6095
7
- themis/backends/storage.py,sha256=pQp20WagSCl8Vmd-Rgx0hDbpYFhCqARXtvGDw3DPgNQ,8021
8
+ themis/backends/storage.py,sha256=3oMcL5Wliac3INxHVG6DFlbqe6Y-6fjA5eZJOLpxm0w,8168
8
9
  themis/cli/__init__.py,sha256=An2DrMHRfmiee5BYJ6TGqvbG7sXWECjjyvEgcoGJ7cE,99
9
10
  themis/cli/__main__.py,sha256=df2pOghoSuq18hZmVVikmGhaFSaRe-jeDOnrsu-1QDM,135
10
- themis/cli/main.py,sha256=AGBFxb1sPLQ-aUAq8RM3YI6gGNs6SdFmBzVSqwp_MSg,15482
11
+ themis/cli/main.py,sha256=ZcB3rS0tMsNCEcHRqjCwfi_JwvQqWZUjZ1TWuSUj4N4,22236
11
12
  themis/cli/new_project.py,sha256=D8asV4QbjgQNYvmXt_WhK4nPM-wKHe_K0VJiBdgtO_E,1121
12
13
  themis/cli/utils.py,sha256=NAPyFiXspfpx5vBxA8aEcOMmWEDyt-R8ywoHo_8Nr4A,1307
13
14
  themis/cli/commands/__init__.py,sha256=CTx7su3qTtq96qxLNclDsE6UM_86NhaS01M9-x9wFiw,287
@@ -16,7 +17,7 @@ themis/cli/commands/comparison.py,sha256=Ki1_MMFFR4vBJkZTeIMWLh-_zdjbtJZurI3YyrE
16
17
  themis/cli/commands/config_commands.py,sha256=eL6GtdIllOIHo8GbNN2jOqLn5VUPBuqhnro9ooPxDog,7387
17
18
  themis/cli/commands/cost.py,sha256=fFdF6hKIzsbPsyrJ1nt6-2m43PpVGUj8jx5T90tBTNo,7233
18
19
  themis/cli/commands/demo.py,sha256=akQqjG-hbUDfeB3bI8K4F5-S0ibJqhflGBFQ5nvdUgE,2135
19
- themis/cli/commands/info.py,sha256=9maOaw-TFiBpuVhaqlMKukGuZ_zgESetqbMQ1Qdvjxs,2515
20
+ themis/cli/commands/info.py,sha256=njms4dsaede2zyC2yoLbfoN5Bho_JTYciJl3Q7Twiv0,2250
20
21
  themis/cli/commands/leaderboard.py,sha256=AVvsYIwZAY18jn3sOq3QD45yNtfdHUEl7eixM4aMCKw,10615
21
22
  themis/cli/commands/math_benchmarks.py,sha256=nQ4TcPB7T9O3piAy4_TgrOQOQxh2Q8OyBreK_HoPCeQ,9946
22
23
  themis/cli/commands/mcq_benchmarks.py,sha256=Cls5W1jGd7TKizmw07CnZWY5N6ywR8VhJ6jKDnY_cRk,7026
@@ -24,7 +25,7 @@ themis/cli/commands/results.py,sha256=rdN3SaMoFnSfAoAXlfpeCTt3V6MwIp0Dk7FIjvPNF7
24
25
  themis/cli/commands/sample_run.py,sha256=r3Ymg5dVHg4IAVJvzoP0ZWUWWUE4Dia1t0062Yhdk9Q,9445
25
26
  themis/cli/commands/visualize.py,sha256=ZECkB0NjIltuOeBE-Q1JnndZEMXVzc8KgcrbaP-GSXo,9740
26
27
  themis/comparison/__init__.py,sha256=bRI8gDlcjMtnH77R7N5ARioq_V4daJcWWM4DXKsoE1k,679
27
- themis/comparison/engine.py,sha256=UkzXKmEFI2JiX0y8534oc6JFySxgA5v1emzRcGj33Kk,12133
28
+ themis/comparison/engine.py,sha256=Mw4sQBO2NnKRUOWHNO1XnyjrVvmHX1KXRnl1LUw1hIY,11809
28
29
  themis/comparison/reports.py,sha256=126VJbd-lxj8C2YJqul53Fyr-nrZgmbrBsRA6Qkh0ro,10117
29
30
  themis/comparison/statistics.py,sha256=eLqKUtKFwSvXnbZax8S0lF8RiSepwYdhnmnDD7DcrZs,12929
30
31
  themis/config/__init__.py,sha256=YMdFG1iLvOQUnSPlc_ZJVn5zCCTbIozML64b4qUtGR8,476
@@ -34,7 +35,7 @@ themis/config/runtime.py,sha256=hU69_oND7fJfAOIBJONENmsuf7Y8roO7n-w9OwxzoT8,7475
34
35
  themis/config/schema.py,sha256=SMR9QHp8OBkSnb1dHyOgg-IJWSqpXfyAqywnBeMy46M,3196
35
36
  themis/core/__init__.py,sha256=S8G1x-39sZ3_NQ5DJ6R1yBTWXp_gO0WxOtVjeB9sTwY,113
36
37
  themis/core/conversation.py,sha256=wwO8RS4t4plDR0Sf1KjYv_ejonlvKe0ZwAD-4sfGak8,10155
37
- themis/core/entities.py,sha256=WV9kiYdZFGxn6oH0lPtqaViY4I8oq-hWi_SmCKjvRnc,4449
38
+ themis/core/entities.py,sha256=a9R3J2wetyUiGcxH6K8pGXfVz_gLh4nsJ81y7cKBj5o,4548
38
39
  themis/core/serialization.py,sha256=cxfoSKwcZiNsnR8g_SAJAq1ZLrfLXM4S9_rVEDUT8qs,7071
39
40
  themis/core/tools.py,sha256=v0_ctsBCtinZGNC_I4C-h0GUPNM5ZeTi7z-U4iCtyp4,11035
40
41
  themis/core/types.py,sha256=I5rr9MMS0irX4lo-xlqGjosx-FjPgT64RzQAraM223A,3652
@@ -59,7 +60,8 @@ themis/datasets/super_gpqa.py,sha256=Mr1ag_FyAk1haxg6_ONX5F84wQYtbSVjV-MlMNmaHlI
59
60
  themis/evaluation/__init__.py,sha256=2Jl8tcVxYAsmHNAZev2mPS_mEwZcRzebqSM3QDc2cyY,36
60
61
  themis/evaluation/conditional.py,sha256=ayndI7FcwxdIMR8B4ddgcKZd5Jl5NQcBJUp7eXI6Djk,13881
61
62
  themis/evaluation/math_verify_utils.py,sha256=vXMvL11-IH16UHZ-mbi_r5hOFz7aUfR1J1laa6qmLMk,2213
62
- themis/evaluation/pipeline.py,sha256=OOowN59UdOMF2Hwy_G4ky5yzR4ajAnohil6xteWBHqE,1503
63
+ themis/evaluation/metric_pipeline.py,sha256=_JXqf5UySbAyuSiN1waDCBfeek7ArluKvXXkm8qIEvs,329
64
+ themis/evaluation/pipeline.py,sha256=7mQV_sJlPDd6zA9jd2TT5IhdMszzT3ftMLbM7Ww76aw,2217
63
65
  themis/evaluation/reports.py,sha256=9om7jzZUtmlMH7EeteXp_98gfHct4x09AyTFy3FSAdQ,8715
64
66
  themis/evaluation/extractors/__init__.py,sha256=BanoC_8e0iam-VU7l7uhvhac_6w_JJZYoYE4xXPUrGk,566
65
67
  themis/evaluation/extractors/error_taxonomy_extractor.py,sha256=RrRx-23l3LwTdG89kvSQJng438cfYI-IdtOGUD6gEDw,2462
@@ -86,9 +88,9 @@ themis/evaluation/metrics/nlp/bertscore.py,sha256=czlIqYkOTBWsfHiE6U1vkq1KHRQm8p
86
88
  themis/evaluation/metrics/nlp/bleu.py,sha256=o_aVkoFPSMmeOLYaHRMamIpSKlYSxrMA1OdntTIUe9g,4436
87
89
  themis/evaluation/metrics/nlp/meteor.py,sha256=QZT09s4aiUcVvDJDVPZYjzi5SxXdS2gn2IaOTNmKp78,5076
88
90
  themis/evaluation/metrics/nlp/rouge.py,sha256=YL05qluF-KsesHYFRfm5zELJlcvo6RvaKp7xKy6BuLI,4365
89
- themis/evaluation/pipelines/__init__.py,sha256=5YI1xaUULHisctFxrumN4XRpWYneoonX7nd9zBtsjvQ,384
90
- themis/evaluation/pipelines/composable_pipeline.py,sha256=nNP9MSvQQJvaSBw5_gO3FeyhGm9So2ZlGqh5qSvE8Ac,10905
91
- themis/evaluation/pipelines/standard_pipeline.py,sha256=nDd_bkqAVQxgwG9RK6G_fsgqwZth3058uG3p4QM0Dck,14650
91
+ themis/evaluation/pipelines/__init__.py,sha256=EXBzgOpi_70yTvdXJr0cERmhBlaqnHZHNcDlPamyQLw,549
92
+ themis/evaluation/pipelines/composable_pipeline.py,sha256=SWv4H1-pcFNAuTu0orkMDgKsdUcom75ded9GhIaHqo0,12838
93
+ themis/evaluation/pipelines/standard_pipeline.py,sha256=Xe7hsOzHg0EBnmslBejb3xJ6Y2pvHF4mrwC_THurQQI,15423
92
94
  themis/evaluation/statistics/__init__.py,sha256=TTrScTLAW7EHNq0nbjuJs6iP3_HgDx1yy3EtYXx5JCk,1257
93
95
  themis/evaluation/statistics/bootstrap.py,sha256=JUQ8rtzFvW2e41I2pLJ7pqgSEjuJ1r6McyYLI42At9g,2409
94
96
  themis/evaluation/statistics/confidence_intervals.py,sha256=CN5EO2gWiSITQubuWuPryngnGXhGwczY9kO3mcG6JVc,3676
@@ -97,33 +99,32 @@ themis/evaluation/statistics/effect_sizes.py,sha256=EWFVDilczpR8rR3_YurWy7QcjYcN
97
99
  themis/evaluation/statistics/hypothesis_tests.py,sha256=MVlVsY8wXifbBG5aSwauFShsQtIKqYREJApbriojS2o,10042
98
100
  themis/evaluation/statistics/types.py,sha256=hW0RYWs-G4C_njNl0ZGG9lJROgU2CfLWfnTQDWYmWuw,3685
99
101
  themis/evaluation/strategies/__init__.py,sha256=3f5LQkzlu3pRbN7dgDbdYOUNZTRexcn6f8D8I5-C724,439
100
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=O3dlsQ2F0Ucv2Dhjz2Qf-jpPhwaVs3zrdQDRRu9du5w,1714
102
+ themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=MFcBdtK8rBeDXPFD2YWPSprez2iwSB-8yfyWhAlylug,1959
101
103
  themis/evaluation/strategies/default_evaluation_strategy.py,sha256=LShW-3Nxg_W4Ln-4qUvHJZqe5YMt64gHoK3uNJYLQNo,693
102
104
  themis/evaluation/strategies/evaluation_strategy.py,sha256=YFF-bXkz4Z52GuCw52FcklfEnf8dK8_z_I40DJRcmwE,669
103
- themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=58pDB30y1VpM_1KPB6sGS0JImGZk5WTgnK9CKDF8N5k,2304
104
- themis/experiment/__init__.py,sha256=dGranqpESugmmfbQlTU9efwspazW6j3vcmAKEtAoWZk,182
105
- themis/experiment/builder.py,sha256=AEjCDeSOI2B0i0PBjkfY1GUDNrYGTGiqPvt0SxnDQFo,5618
106
- themis/experiment/cache_manager.py,sha256=Fd8Qxifrmyn8f2zjAyPrLv-ZU4Dcp-MKo8-09BoW7tY,4361
105
+ themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=IRSgnnD2R6qrjiOTyA_PIOHUfQj4FqutkU3pKMth0CQ,2562
106
+ themis/experiment/__init__.py,sha256=T8BEG9dmabQ-tnVZ5YUd5r-31m6_OY-FDfvUfWhl2hc,190
107
+ themis/experiment/cache_manager.py,sha256=aB9QWcS5PV4y675cMVQehm1Rs2XuK84M0WMMFBYP3Hc,4901
107
108
  themis/experiment/comparison.py,sha256=Mr1L5Zj7i87xk9XUQ_UueLTsC-sDZH8YGwLwg_gG0VI,21562
108
109
  themis/experiment/cost.py,sha256=flhENfB5WKvyNWwPMDtygNZAv6y_yv4RoClsRz714Hc,10159
109
- themis/experiment/definitions.py,sha256=oOZBFfEQkSBiZd9CMutCQ5luH6oeUT9yAZFd7fpVjnw,2015
110
+ themis/experiment/definitions.py,sha256=7x18qmQBX88Gn_kxY9n3UefnwvvsRnALAvuEmXSZqXY,2022
110
111
  themis/experiment/export.py,sha256=ujwiSvqQhLaO99WHyE8osdnmriHjyIM1C2zKf5o93Cw,29800
111
- themis/experiment/export_csv.py,sha256=80w3gEGjeLjuiNq539rRP73k3MBtwrzJy90hgE91AKw,6030
112
112
  themis/experiment/integration_manager.py,sha256=wTVTjDGcUkzz4tfnwSxa5nK1A4e2FKCPazDYGcdzYS8,3325
113
113
  themis/experiment/math.py,sha256=P2E9F_UKI7pb-aXepSztGdr_g309WEMe83zqg1nWO7A,6973
114
114
  themis/experiment/mcq.py,sha256=DDB99FHQsU_5vMIRDRhSZ7pReYvVf57wLmmo3OU_An4,6276
115
- themis/experiment/orchestrator.py,sha256=VeSasDmCXrYlrv1r47I698RUq14vEBR7c_uyZzM01hw,19304
115
+ themis/experiment/orchestrator.py,sha256=fA4nPBrhMwcnsdPc9ETr2m8t0I690UHpujIf8dAOgzU,19527
116
116
  themis/experiment/pricing.py,sha256=fTM32yE3L8vahMP4sr1zr7dbp9zYCjiPN4D4VuZ8-q8,9346
117
- themis/experiment/storage.py,sha256=ujGiQTeRPOfS8hYHB1a7F9t-dQnXquhqomI1vDjqmno,55250
117
+ themis/experiment/share.py,sha256=vbwZ93jFQo0r7uxfzkixwkfwbZZFc0FeFgCj4XS1L4o,8209
118
+ themis/experiment/storage.py,sha256=rqyRyWopwR1Td-9d1O6kslPT1QEepzX29Nl3dtYcDIk,56991
118
119
  themis/experiment/visualization.py,sha256=dJYHrp3mntl8CPc5HPI3iKqPztVsddQB3ogRkd_FCNc,18473
119
120
  themis/generation/__init__.py,sha256=6KVwCQYMpPIsXNuWDZOGuqHkUkA45lbSacIFn8ZbD4s,36
120
121
  themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJzay2uA,13669
121
122
  themis/generation/batching.py,sha256=ddpgpn1pq_EwipvTg-K4WcoSs3c2rbW37jEA5Pa_spo,7557
122
123
  themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,4910
123
124
  themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
124
- themis/generation/plan.py,sha256=RmPIdefXkQMHYv5EWiilpx91I9a-svw31imvG0wV3fE,15961
125
- themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
126
- themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
125
+ themis/generation/plan.py,sha256=k6_gdKFM12nrKz7ac1c5vTZsFanIKJJgyQ8IhvakDNQ,17158
126
+ themis/generation/router.py,sha256=ce3Hbth3JrJxO9dBgo0izCMzvqJ3ARaBNRhIJT-JXRQ,1692
127
+ themis/generation/runner.py,sha256=E7hN4gAxzJLCYwvGjwKCBMhXlFv-RhSRjQs1kYvarXo,9579
127
128
  themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
128
129
  themis/generation/templates.py,sha256=ut_6akp8Y6Ey_9O3s64jDbwCB74pw62Zf8URlYcKHkA,2325
129
130
  themis/generation/turn_strategies.py,sha256=w33qhzpQbGTsfeOgOgMDovV0wEeXeNZUUBm5yZy1naw,10973
@@ -135,7 +136,7 @@ themis/integrations/huggingface.py,sha256=vrLwYwn65pU4W3FUe0ImCOZxKKlpRshDqMoLFs
135
136
  themis/integrations/wandb.py,sha256=LJOPojjlqG05EIPxcjy3QmA15Gxgs1db3encDWVzYYw,2545
136
137
  themis/interfaces/__init__.py,sha256=78dNE_eHfFmb9hXNy5sLZ1jOTGWS8TzdVE_eiYQPFVc,5967
137
138
  themis/presets/__init__.py,sha256=w58fJcy4eNiE034qHO2xE5pp-H-4LNLXo5hLMuC7wIQ,533
138
- themis/presets/benchmarks.py,sha256=s9JxRogHwZs8oiuiI7Z7uiUBZXEp3gg7AQZnBvdGieA,12026
139
+ themis/presets/benchmarks.py,sha256=wO5xAVafUuL3HEjPO4eAsfzoeexINxIIGUXiwz_31zU,31752
139
140
  themis/presets/models.py,sha256=c6-I_drHa4vMLIajSkCcrFbsJOsauFjY8fU1leBxZLg,5173
140
141
  themis/project/__init__.py,sha256=vgLv2nS62yz1XsFSFzFf7eIo6FyQJXpOY9OPRUcTQLQ,465
141
142
  themis/project/definitions.py,sha256=vHARw0IjFOWE4RL4mGRwvke36A6GWQGep6cQFIRcpJg,3329
@@ -143,15 +144,21 @@ themis/project/patterns.py,sha256=2J51Q9Jq7X-2N57uexvR191gaZKwusef5vIuIVUQY-E,77
143
144
  themis/providers/__init__.py,sha256=K5nG0DsK_YPY0cT9MBLk5BLcLbBo0wBP0vQvLjpAw_Y,189
144
145
  themis/providers/registry.py,sha256=Za5Kg3-A-35wS_jiGpPXV2q1k6he_dRIWVqt36dKN-4,1056
145
146
  themis/server/__init__.py,sha256=Hp0qGI5nvO4bhLAez3jQxim7H433l72EYE2IA8Xp2hA,731
146
- themis/server/app.py,sha256=OZ39gCC47AXVqZxroC_4KtIYBYx_rfpde7C25AF3EI0,11166
147
+ themis/server/app.py,sha256=p8UQp2cU5i4DgZC0EhRdRejOfFvlg-93WOC3Z45apHs,11510
148
+ themis/specs/__init__.py,sha256=7eJcZFKgMHflUF4C_Pg1M5B_gpJXCsACEZruv4SHx70,256
149
+ themis/specs/execution.py,sha256=11EeecjIdcKKRdF_kQ_WQkAIxIR8kUdifqRPnLo8XrU,698
150
+ themis/specs/experiment.py,sha256=dnOP1KmsUIcA-VC4AR0bOFsNDMFUBEGLPJVVzPoovR0,971
151
+ themis/specs/storage.py,sha256=5jEqBPv1P2DTXlriUZvfNLnMGn2P-CLx6h82M5RpTmo,383
152
+ themis/storage/__init__.py,sha256=1axYCdtNSE3sYehO6T1hWRjGP_iYnlGl-dBlKiMNX8g,276
153
+ themis/storage/experiment_storage.py,sha256=S7tQ9DtYhWwX2rmxsWrjMW69WGAZORFu49s0eVmwS40,197
147
154
  themis/utils/api_generator.py,sha256=3oQ7mGZlFx2Dpm45pMg3rNIqNK2Smj05PjOMXp5RIkQ,10776
148
155
  themis/utils/cost_tracking.py,sha256=9_Z2iTfNaQse9G_bnqn4hme4T0fG2W-fxOLEDeF_3VI,11545
149
156
  themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,13008
150
157
  themis/utils/logging_utils.py,sha256=buC64X-xOu-2SZ0wVkz3nCXzYVGiqKbxK-8DGSGsAdM,1173
151
158
  themis/utils/progress.py,sha256=HS0-yVbRT7Ai9zRlsJcex_OKP6dUiKx1vOp_IsobiHM,2097
152
159
  themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
153
- themis_eval-0.2.2.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
154
- themis_eval-0.2.2.dist-info/METADATA,sha256=eOlF2Obimv_822azCt0vwhLaBz3CKsuvJPgDHMA3WFU,15235
155
- themis_eval-0.2.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
156
- themis_eval-0.2.2.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
157
- themis_eval-0.2.2.dist-info/RECORD,,
160
+ themis_eval-1.0.0.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
161
+ themis_eval-1.0.0.dist-info/METADATA,sha256=mXVE6G4L1O18OhtsOgACvvjvx4zJ_onVmgF4SO9mJFw,15513
162
+ themis_eval-1.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
163
+ themis_eval-1.0.0.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
164
+ themis_eval-1.0.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.10.1)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,151 +0,0 @@
1
- """Utilities for assembling experiments from reusable components."""
2
-
3
- from __future__ import annotations
4
-
5
- from pathlib import Path
6
- from typing import Any, Callable, Mapping, Sequence, Type
7
-
8
- from themis.config import schema as config
9
- from themis.core import entities as core_entities
10
- from themis.evaluation import pipeline as evaluation_pipeline
11
- from themis.evaluation import strategies as evaluation_strategies
12
- from themis.experiment import orchestrator
13
- from themis.experiment import storage as experiment_storage
14
- from themis.experiment.cache_manager import CacheManager
15
- from themis.experiment.definitions import (
16
- BuiltExperiment,
17
- ExperimentDefinition,
18
- ModelBinding,
19
- )
20
- from themis.experiment.integration_manager import IntegrationManager
21
- from themis.generation import plan as generation_plan
22
- from themis.generation import router as generation_router
23
- from themis.generation import runner as generation_runner
24
- from themis.generation import strategies as generation_strategies
25
- from themis.interfaces import ModelProvider
26
- from themis.providers import create_provider
27
-
28
-
29
- class ExperimentBuilder:
30
- """Composable builder for constructing experiment components."""
31
-
32
- def __init__(
33
- self,
34
- *,
35
- extractor,
36
- metrics,
37
- runner_cls: Type[
38
- generation_runner.GenerationRunner
39
- ] = generation_runner.GenerationRunner,
40
- runner_kwargs: Mapping[str, Any] | None = None,
41
- pipeline_cls: Type[
42
- evaluation_pipeline.EvaluationPipeline
43
- ] = evaluation_pipeline.EvaluationPipeline,
44
- pipeline_kwargs: Mapping[str, Any] | None = None,
45
- router_cls: Type[ModelProvider] = generation_router.ProviderRouter,
46
- router_kwargs: Mapping[str, Any] | None = None,
47
- strategy_resolver: Callable[
48
- [core_entities.GenerationTask], generation_strategies.GenerationStrategy
49
- ]
50
- | None = None,
51
- evaluation_strategy_resolver: Callable[
52
- [core_entities.GenerationRecord], evaluation_strategies.EvaluationStrategy
53
- ]
54
- | None = None,
55
- ) -> None:
56
- self._extractor = extractor
57
- self._metrics = list(metrics)
58
- self._runner_cls = runner_cls
59
- self._runner_kwargs = dict(runner_kwargs or {})
60
- self._pipeline_cls = pipeline_cls
61
- self._pipeline_kwargs = dict(pipeline_kwargs or {})
62
- self._router_cls = router_cls
63
- self._router_kwargs = dict(router_kwargs or {})
64
- self._strategy_resolver = strategy_resolver
65
- self._evaluation_strategy_resolver = evaluation_strategy_resolver
66
-
67
- def build(
68
- self,
69
- definition: ExperimentDefinition,
70
- *,
71
- storage_dir: str | Path | None = None,
72
- ) -> BuiltExperiment:
73
- plan_obj = self._build_plan(definition)
74
- router = self._build_router(definition.model_bindings)
75
- runner_kwargs = dict(self._runner_kwargs)
76
- if self._strategy_resolver is not None:
77
- runner_kwargs.setdefault("strategy_resolver", self._strategy_resolver)
78
- runner = self._runner_cls(provider=router, **runner_kwargs)
79
- pipeline_kwargs = dict(self._pipeline_kwargs)
80
- if self._evaluation_strategy_resolver is not None:
81
- pipeline_kwargs.setdefault(
82
- "strategy_resolver", self._evaluation_strategy_resolver
83
- )
84
- pipeline = self._pipeline_cls(
85
- extractor=self._extractor,
86
- metrics=self._metrics,
87
- **pipeline_kwargs,
88
- )
89
-
90
- # Create storage backend
91
- storage = (
92
- experiment_storage.ExperimentStorage(storage_dir)
93
- if storage_dir is not None
94
- else None
95
- )
96
-
97
- # Create managers for better separation of concerns
98
- cache_manager = CacheManager(
99
- storage=storage,
100
- enable_resume=True,
101
- enable_cache=True,
102
- )
103
- integration_manager = IntegrationManager(config=config.IntegrationsConfig())
104
-
105
- # Create orchestrator with managers
106
- orchestrator_obj = orchestrator.ExperimentOrchestrator(
107
- generation_plan=plan_obj,
108
- generation_runner=runner,
109
- evaluation_pipeline=pipeline,
110
- cache_manager=cache_manager,
111
- integration_manager=integration_manager,
112
- )
113
-
114
- return BuiltExperiment(
115
- orchestrator=orchestrator_obj,
116
- plan=plan_obj,
117
- runner=runner,
118
- pipeline=pipeline,
119
- storage=storage,
120
- router=router,
121
- )
122
-
123
- def _build_plan(
124
- self, definition: ExperimentDefinition
125
- ) -> generation_plan.GenerationPlan:
126
- return generation_plan.GenerationPlan(
127
- templates=list(definition.templates),
128
- models=[binding.spec for binding in definition.model_bindings],
129
- sampling_parameters=list(definition.sampling_parameters),
130
- dataset_id_field=definition.dataset_id_field,
131
- reference_field=definition.reference_field,
132
- metadata_fields=tuple(definition.metadata_fields),
133
- context_builder=definition.context_builder,
134
- )
135
-
136
- def _build_router(self, bindings: Sequence[ModelBinding]) -> ModelProvider:
137
- providers: dict[str, ModelProvider] = {}
138
- for binding in bindings:
139
- providers[binding.spec.identifier] = create_provider(
140
- binding.provider_name,
141
- **binding.provider_options,
142
- )
143
- return self._router_cls(providers, **self._router_kwargs)
144
-
145
-
146
- __all__ = [
147
- "ExperimentBuilder",
148
- "ExperimentDefinition",
149
- "ModelBinding",
150
- "BuiltExperiment",
151
- ]
@@ -1,159 +0,0 @@
1
- """CSV export functionality for experiment reports."""
2
-
3
- from __future__ import annotations
4
-
5
- import csv
6
- from pathlib import Path
7
- from typing import MutableMapping, Sequence
8
-
9
- from themis.core import entities as core_entities
10
- from themis.experiment import orchestrator
11
-
12
-
13
- def export_report_csv(
14
- report: orchestrator.ExperimentReport,
15
- path: str | Path,
16
- *,
17
- include_failures: bool = True,
18
- ) -> Path:
19
- """Write per-sample metrics to a CSV file for offline analysis.
20
-
21
- Args:
22
- report: Experiment report to export
23
- path: Output path for CSV file
24
- include_failures: Whether to include failures column
25
-
26
- Returns:
27
- Path to created CSV file
28
- """
29
- path = Path(path)
30
- path.parent.mkdir(parents=True, exist_ok=True)
31
- metadata_by_condition, metadata_fields = _collect_sample_metadata(
32
- report.generation_results
33
- )
34
-
35
- # Create a proper index mapping generation records to their metadata
36
- gen_record_index = {}
37
- for gen_record in report.generation_results:
38
- sample_id = gen_record.task.metadata.get(
39
- "dataset_id"
40
- ) or gen_record.task.metadata.get("sample_id")
41
- prompt_template = gen_record.task.prompt.spec.name
42
- model_identifier = gen_record.task.model.identifier
43
- sampling_temp = gen_record.task.sampling.temperature
44
- sampling_max_tokens = gen_record.task.sampling.max_tokens
45
- condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
46
- gen_record_index[condition_id] = gen_record
47
-
48
- metric_names = sorted(report.evaluation_report.metrics.keys())
49
- fieldnames = (
50
- ["sample_id"] + metadata_fields + [f"metric:{name}" for name in metric_names]
51
- )
52
- if include_failures:
53
- fieldnames.append("failures")
54
-
55
- with path.open("w", encoding="utf-8", newline="") as handle:
56
- writer = csv.DictWriter(handle, fieldnames=fieldnames)
57
- writer.writeheader()
58
-
59
- # Process evaluation records in the same order as generation records
60
- for i, eval_record in enumerate(report.evaluation_report.records):
61
- # Find the corresponding generation record by index
62
- if i < len(report.generation_results):
63
- gen_record = report.generation_results[i]
64
- sample_id = gen_record.task.metadata.get(
65
- "dataset_id"
66
- ) or gen_record.task.metadata.get("sample_id")
67
- prompt_template = gen_record.task.prompt.spec.name
68
- model_identifier = gen_record.task.model.identifier
69
- sampling_temp = gen_record.task.sampling.temperature
70
- sampling_max_tokens = gen_record.task.sampling.max_tokens
71
- condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
72
- metadata = metadata_by_condition.get(condition_id, {})
73
- else:
74
- # Fallback for extra evaluation records
75
- sample_id = eval_record.sample_id or ""
76
- metadata = {}
77
-
78
- row: dict[str, object] = {"sample_id": sample_id}
79
- for field in metadata_fields:
80
- row[field] = metadata.get(field, "")
81
- score_by_name = {
82
- score.metric_name: score.value for score in eval_record.scores
83
- }
84
- for name in metric_names:
85
- row[f"metric:{name}"] = score_by_name.get(name, "")
86
- if include_failures:
87
- row["failures"] = "; ".join(eval_record.failures)
88
- writer.writerow(row)
89
- return path
90
-
91
-
92
- def _collect_sample_metadata(
93
- records: Sequence[core_entities.GenerationRecord],
94
- ) -> tuple[dict[str, MutableMapping[str, object]], list[str]]:
95
- """Collect metadata from generation records.
96
-
97
- Args:
98
- records: Generation records
99
-
100
- Returns:
101
- Tuple of (metadata by condition ID, list of metadata fields)
102
- """
103
- metadata: dict[str, MutableMapping[str, object]] = {}
104
- for index, record in enumerate(records):
105
- sample_id = _extract_sample_id(record.task.metadata)
106
- if sample_id is None:
107
- sample_id = f"sample-{index}"
108
-
109
- # Create unique identifier for each experimental condition
110
- prompt_template = record.task.prompt.spec.name
111
- model_identifier = record.task.model.identifier
112
- sampling_temp = record.task.sampling.temperature
113
- sampling_max_tokens = record.task.sampling.max_tokens
114
-
115
- # Create unique condition key
116
- condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
117
-
118
- # Store metadata with unique condition ID
119
- condition_metadata = _metadata_from_task(record)
120
- metadata[condition_id] = condition_metadata
121
-
122
- # Collect all field names from all conditions
123
- fields = sorted({field for meta in metadata.values() for field in meta.keys()})
124
-
125
- return metadata, fields
126
-
127
-
128
- def _extract_sample_id(metadata: dict[str, object]) -> str | None:
129
- """Extract sample ID from metadata.
130
-
131
- Args:
132
- metadata: Task metadata
133
-
134
- Returns:
135
- Sample ID or None
136
- """
137
- value = metadata.get("dataset_id") or metadata.get("sample_id")
138
- if value is None:
139
- return None
140
- return str(value)
141
-
142
-
143
- def _metadata_from_task(record: core_entities.GenerationRecord) -> dict[str, object]:
144
- """Build metadata dict from generation record.
145
-
146
- Args:
147
- record: Generation record
148
-
149
- Returns:
150
- Metadata dictionary
151
- """
152
- metadata = dict(record.task.metadata)
153
- metadata.setdefault("model_identifier", record.task.model.identifier)
154
- metadata.setdefault("model_provider", record.task.model.provider)
155
- metadata.setdefault("prompt_template", record.task.prompt.spec.name)
156
- metadata.setdefault("sampling_temperature", record.task.sampling.temperature)
157
- metadata.setdefault("sampling_top_p", record.task.sampling.top_p)
158
- metadata.setdefault("sampling_max_tokens", record.task.sampling.max_tokens)
159
- return metadata