themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +16 -0
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,14 @@
|
|
|
1
|
-
themis/__init__.py,sha256=
|
|
2
|
-
themis/_version.py,sha256=
|
|
3
|
-
themis/api.py,sha256=
|
|
1
|
+
themis/__init__.py,sha256=YPexmyPqbANhr0Yzm46FDWgdBLjAX_UoZqywte9WJ84,1476
|
|
2
|
+
themis/_version.py,sha256=b5-7SWk_hYOedKy8IUPxaubXc0KS0hRjhUsY8PkIbl0,818
|
|
3
|
+
themis/api.py,sha256=zK6_RiiYTcXiV6uVQu1fNRdTqm-tXT_Df-CfrLXrIhk,15509
|
|
4
4
|
themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
themis/session.py,sha256=H1Pkr71b90sa94YeRCIyffIxE2gQPmtfzo-zNnPLrdM,4311
|
|
5
6
|
themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
|
|
6
7
|
themis/backends/execution.py,sha256=RAFuB9ri8TMil5PcnsisypKO2ViyLFXj08P_vjNYguU,6095
|
|
7
|
-
themis/backends/storage.py,sha256=
|
|
8
|
+
themis/backends/storage.py,sha256=3oMcL5Wliac3INxHVG6DFlbqe6Y-6fjA5eZJOLpxm0w,8168
|
|
8
9
|
themis/cli/__init__.py,sha256=An2DrMHRfmiee5BYJ6TGqvbG7sXWECjjyvEgcoGJ7cE,99
|
|
9
10
|
themis/cli/__main__.py,sha256=df2pOghoSuq18hZmVVikmGhaFSaRe-jeDOnrsu-1QDM,135
|
|
10
|
-
themis/cli/main.py,sha256=
|
|
11
|
+
themis/cli/main.py,sha256=ZcB3rS0tMsNCEcHRqjCwfi_JwvQqWZUjZ1TWuSUj4N4,22236
|
|
11
12
|
themis/cli/new_project.py,sha256=D8asV4QbjgQNYvmXt_WhK4nPM-wKHe_K0VJiBdgtO_E,1121
|
|
12
13
|
themis/cli/utils.py,sha256=NAPyFiXspfpx5vBxA8aEcOMmWEDyt-R8ywoHo_8Nr4A,1307
|
|
13
14
|
themis/cli/commands/__init__.py,sha256=CTx7su3qTtq96qxLNclDsE6UM_86NhaS01M9-x9wFiw,287
|
|
@@ -16,7 +17,7 @@ themis/cli/commands/comparison.py,sha256=Ki1_MMFFR4vBJkZTeIMWLh-_zdjbtJZurI3YyrE
|
|
|
16
17
|
themis/cli/commands/config_commands.py,sha256=eL6GtdIllOIHo8GbNN2jOqLn5VUPBuqhnro9ooPxDog,7387
|
|
17
18
|
themis/cli/commands/cost.py,sha256=fFdF6hKIzsbPsyrJ1nt6-2m43PpVGUj8jx5T90tBTNo,7233
|
|
18
19
|
themis/cli/commands/demo.py,sha256=akQqjG-hbUDfeB3bI8K4F5-S0ibJqhflGBFQ5nvdUgE,2135
|
|
19
|
-
themis/cli/commands/info.py,sha256=
|
|
20
|
+
themis/cli/commands/info.py,sha256=njms4dsaede2zyC2yoLbfoN5Bho_JTYciJl3Q7Twiv0,2250
|
|
20
21
|
themis/cli/commands/leaderboard.py,sha256=AVvsYIwZAY18jn3sOq3QD45yNtfdHUEl7eixM4aMCKw,10615
|
|
21
22
|
themis/cli/commands/math_benchmarks.py,sha256=nQ4TcPB7T9O3piAy4_TgrOQOQxh2Q8OyBreK_HoPCeQ,9946
|
|
22
23
|
themis/cli/commands/mcq_benchmarks.py,sha256=Cls5W1jGd7TKizmw07CnZWY5N6ywR8VhJ6jKDnY_cRk,7026
|
|
@@ -24,7 +25,7 @@ themis/cli/commands/results.py,sha256=rdN3SaMoFnSfAoAXlfpeCTt3V6MwIp0Dk7FIjvPNF7
|
|
|
24
25
|
themis/cli/commands/sample_run.py,sha256=r3Ymg5dVHg4IAVJvzoP0ZWUWWUE4Dia1t0062Yhdk9Q,9445
|
|
25
26
|
themis/cli/commands/visualize.py,sha256=ZECkB0NjIltuOeBE-Q1JnndZEMXVzc8KgcrbaP-GSXo,9740
|
|
26
27
|
themis/comparison/__init__.py,sha256=bRI8gDlcjMtnH77R7N5ARioq_V4daJcWWM4DXKsoE1k,679
|
|
27
|
-
themis/comparison/engine.py,sha256=
|
|
28
|
+
themis/comparison/engine.py,sha256=Mw4sQBO2NnKRUOWHNO1XnyjrVvmHX1KXRnl1LUw1hIY,11809
|
|
28
29
|
themis/comparison/reports.py,sha256=126VJbd-lxj8C2YJqul53Fyr-nrZgmbrBsRA6Qkh0ro,10117
|
|
29
30
|
themis/comparison/statistics.py,sha256=eLqKUtKFwSvXnbZax8S0lF8RiSepwYdhnmnDD7DcrZs,12929
|
|
30
31
|
themis/config/__init__.py,sha256=YMdFG1iLvOQUnSPlc_ZJVn5zCCTbIozML64b4qUtGR8,476
|
|
@@ -34,7 +35,7 @@ themis/config/runtime.py,sha256=hU69_oND7fJfAOIBJONENmsuf7Y8roO7n-w9OwxzoT8,7475
|
|
|
34
35
|
themis/config/schema.py,sha256=SMR9QHp8OBkSnb1dHyOgg-IJWSqpXfyAqywnBeMy46M,3196
|
|
35
36
|
themis/core/__init__.py,sha256=S8G1x-39sZ3_NQ5DJ6R1yBTWXp_gO0WxOtVjeB9sTwY,113
|
|
36
37
|
themis/core/conversation.py,sha256=wwO8RS4t4plDR0Sf1KjYv_ejonlvKe0ZwAD-4sfGak8,10155
|
|
37
|
-
themis/core/entities.py,sha256=
|
|
38
|
+
themis/core/entities.py,sha256=a9R3J2wetyUiGcxH6K8pGXfVz_gLh4nsJ81y7cKBj5o,4548
|
|
38
39
|
themis/core/serialization.py,sha256=cxfoSKwcZiNsnR8g_SAJAq1ZLrfLXM4S9_rVEDUT8qs,7071
|
|
39
40
|
themis/core/tools.py,sha256=v0_ctsBCtinZGNC_I4C-h0GUPNM5ZeTi7z-U4iCtyp4,11035
|
|
40
41
|
themis/core/types.py,sha256=I5rr9MMS0irX4lo-xlqGjosx-FjPgT64RzQAraM223A,3652
|
|
@@ -59,7 +60,8 @@ themis/datasets/super_gpqa.py,sha256=Mr1ag_FyAk1haxg6_ONX5F84wQYtbSVjV-MlMNmaHlI
|
|
|
59
60
|
themis/evaluation/__init__.py,sha256=2Jl8tcVxYAsmHNAZev2mPS_mEwZcRzebqSM3QDc2cyY,36
|
|
60
61
|
themis/evaluation/conditional.py,sha256=ayndI7FcwxdIMR8B4ddgcKZd5Jl5NQcBJUp7eXI6Djk,13881
|
|
61
62
|
themis/evaluation/math_verify_utils.py,sha256=vXMvL11-IH16UHZ-mbi_r5hOFz7aUfR1J1laa6qmLMk,2213
|
|
62
|
-
themis/evaluation/
|
|
63
|
+
themis/evaluation/metric_pipeline.py,sha256=_JXqf5UySbAyuSiN1waDCBfeek7ArluKvXXkm8qIEvs,329
|
|
64
|
+
themis/evaluation/pipeline.py,sha256=7mQV_sJlPDd6zA9jd2TT5IhdMszzT3ftMLbM7Ww76aw,2217
|
|
63
65
|
themis/evaluation/reports.py,sha256=9om7jzZUtmlMH7EeteXp_98gfHct4x09AyTFy3FSAdQ,8715
|
|
64
66
|
themis/evaluation/extractors/__init__.py,sha256=BanoC_8e0iam-VU7l7uhvhac_6w_JJZYoYE4xXPUrGk,566
|
|
65
67
|
themis/evaluation/extractors/error_taxonomy_extractor.py,sha256=RrRx-23l3LwTdG89kvSQJng438cfYI-IdtOGUD6gEDw,2462
|
|
@@ -86,9 +88,9 @@ themis/evaluation/metrics/nlp/bertscore.py,sha256=czlIqYkOTBWsfHiE6U1vkq1KHRQm8p
|
|
|
86
88
|
themis/evaluation/metrics/nlp/bleu.py,sha256=o_aVkoFPSMmeOLYaHRMamIpSKlYSxrMA1OdntTIUe9g,4436
|
|
87
89
|
themis/evaluation/metrics/nlp/meteor.py,sha256=QZT09s4aiUcVvDJDVPZYjzi5SxXdS2gn2IaOTNmKp78,5076
|
|
88
90
|
themis/evaluation/metrics/nlp/rouge.py,sha256=YL05qluF-KsesHYFRfm5zELJlcvo6RvaKp7xKy6BuLI,4365
|
|
89
|
-
themis/evaluation/pipelines/__init__.py,sha256=
|
|
90
|
-
themis/evaluation/pipelines/composable_pipeline.py,sha256=
|
|
91
|
-
themis/evaluation/pipelines/standard_pipeline.py,sha256=
|
|
91
|
+
themis/evaluation/pipelines/__init__.py,sha256=EXBzgOpi_70yTvdXJr0cERmhBlaqnHZHNcDlPamyQLw,549
|
|
92
|
+
themis/evaluation/pipelines/composable_pipeline.py,sha256=SWv4H1-pcFNAuTu0orkMDgKsdUcom75ded9GhIaHqo0,12838
|
|
93
|
+
themis/evaluation/pipelines/standard_pipeline.py,sha256=Xe7hsOzHg0EBnmslBejb3xJ6Y2pvHF4mrwC_THurQQI,15423
|
|
92
94
|
themis/evaluation/statistics/__init__.py,sha256=TTrScTLAW7EHNq0nbjuJs6iP3_HgDx1yy3EtYXx5JCk,1257
|
|
93
95
|
themis/evaluation/statistics/bootstrap.py,sha256=JUQ8rtzFvW2e41I2pLJ7pqgSEjuJ1r6McyYLI42At9g,2409
|
|
94
96
|
themis/evaluation/statistics/confidence_intervals.py,sha256=CN5EO2gWiSITQubuWuPryngnGXhGwczY9kO3mcG6JVc,3676
|
|
@@ -101,20 +103,19 @@ themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=MFcBdtK
|
|
|
101
103
|
themis/evaluation/strategies/default_evaluation_strategy.py,sha256=LShW-3Nxg_W4Ln-4qUvHJZqe5YMt64gHoK3uNJYLQNo,693
|
|
102
104
|
themis/evaluation/strategies/evaluation_strategy.py,sha256=YFF-bXkz4Z52GuCw52FcklfEnf8dK8_z_I40DJRcmwE,669
|
|
103
105
|
themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=IRSgnnD2R6qrjiOTyA_PIOHUfQj4FqutkU3pKMth0CQ,2562
|
|
104
|
-
themis/experiment/__init__.py,sha256=
|
|
105
|
-
themis/experiment/
|
|
106
|
-
themis/experiment/cache_manager.py,sha256=Fd8Qxifrmyn8f2zjAyPrLv-ZU4Dcp-MKo8-09BoW7tY,4361
|
|
106
|
+
themis/experiment/__init__.py,sha256=T8BEG9dmabQ-tnVZ5YUd5r-31m6_OY-FDfvUfWhl2hc,190
|
|
107
|
+
themis/experiment/cache_manager.py,sha256=aB9QWcS5PV4y675cMVQehm1Rs2XuK84M0WMMFBYP3Hc,4901
|
|
107
108
|
themis/experiment/comparison.py,sha256=Mr1L5Zj7i87xk9XUQ_UueLTsC-sDZH8YGwLwg_gG0VI,21562
|
|
108
109
|
themis/experiment/cost.py,sha256=flhENfB5WKvyNWwPMDtygNZAv6y_yv4RoClsRz714Hc,10159
|
|
109
|
-
themis/experiment/definitions.py,sha256=
|
|
110
|
+
themis/experiment/definitions.py,sha256=7x18qmQBX88Gn_kxY9n3UefnwvvsRnALAvuEmXSZqXY,2022
|
|
110
111
|
themis/experiment/export.py,sha256=ujwiSvqQhLaO99WHyE8osdnmriHjyIM1C2zKf5o93Cw,29800
|
|
111
|
-
themis/experiment/export_csv.py,sha256=80w3gEGjeLjuiNq539rRP73k3MBtwrzJy90hgE91AKw,6030
|
|
112
112
|
themis/experiment/integration_manager.py,sha256=wTVTjDGcUkzz4tfnwSxa5nK1A4e2FKCPazDYGcdzYS8,3325
|
|
113
113
|
themis/experiment/math.py,sha256=P2E9F_UKI7pb-aXepSztGdr_g309WEMe83zqg1nWO7A,6973
|
|
114
114
|
themis/experiment/mcq.py,sha256=DDB99FHQsU_5vMIRDRhSZ7pReYvVf57wLmmo3OU_An4,6276
|
|
115
|
-
themis/experiment/orchestrator.py,sha256=
|
|
115
|
+
themis/experiment/orchestrator.py,sha256=fA4nPBrhMwcnsdPc9ETr2m8t0I690UHpujIf8dAOgzU,19527
|
|
116
116
|
themis/experiment/pricing.py,sha256=fTM32yE3L8vahMP4sr1zr7dbp9zYCjiPN4D4VuZ8-q8,9346
|
|
117
|
-
themis/experiment/
|
|
117
|
+
themis/experiment/share.py,sha256=vbwZ93jFQo0r7uxfzkixwkfwbZZFc0FeFgCj4XS1L4o,8209
|
|
118
|
+
themis/experiment/storage.py,sha256=rqyRyWopwR1Td-9d1O6kslPT1QEepzX29Nl3dtYcDIk,56991
|
|
118
119
|
themis/experiment/visualization.py,sha256=dJYHrp3mntl8CPc5HPI3iKqPztVsddQB3ogRkd_FCNc,18473
|
|
119
120
|
themis/generation/__init__.py,sha256=6KVwCQYMpPIsXNuWDZOGuqHkUkA45lbSacIFn8ZbD4s,36
|
|
120
121
|
themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJzay2uA,13669
|
|
@@ -122,8 +123,8 @@ themis/generation/batching.py,sha256=ddpgpn1pq_EwipvTg-K4WcoSs3c2rbW37jEA5Pa_spo
|
|
|
122
123
|
themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,4910
|
|
123
124
|
themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
|
|
124
125
|
themis/generation/plan.py,sha256=k6_gdKFM12nrKz7ac1c5vTZsFanIKJJgyQ8IhvakDNQ,17158
|
|
125
|
-
themis/generation/router.py,sha256=
|
|
126
|
-
themis/generation/runner.py,sha256=
|
|
126
|
+
themis/generation/router.py,sha256=ce3Hbth3JrJxO9dBgo0izCMzvqJ3ARaBNRhIJT-JXRQ,1692
|
|
127
|
+
themis/generation/runner.py,sha256=E7hN4gAxzJLCYwvGjwKCBMhXlFv-RhSRjQs1kYvarXo,9579
|
|
127
128
|
themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
|
|
128
129
|
themis/generation/templates.py,sha256=ut_6akp8Y6Ey_9O3s64jDbwCB74pw62Zf8URlYcKHkA,2325
|
|
129
130
|
themis/generation/turn_strategies.py,sha256=w33qhzpQbGTsfeOgOgMDovV0wEeXeNZUUBm5yZy1naw,10973
|
|
@@ -135,7 +136,7 @@ themis/integrations/huggingface.py,sha256=vrLwYwn65pU4W3FUe0ImCOZxKKlpRshDqMoLFs
|
|
|
135
136
|
themis/integrations/wandb.py,sha256=LJOPojjlqG05EIPxcjy3QmA15Gxgs1db3encDWVzYYw,2545
|
|
136
137
|
themis/interfaces/__init__.py,sha256=78dNE_eHfFmb9hXNy5sLZ1jOTGWS8TzdVE_eiYQPFVc,5967
|
|
137
138
|
themis/presets/__init__.py,sha256=w58fJcy4eNiE034qHO2xE5pp-H-4LNLXo5hLMuC7wIQ,533
|
|
138
|
-
themis/presets/benchmarks.py,sha256=
|
|
139
|
+
themis/presets/benchmarks.py,sha256=wO5xAVafUuL3HEjPO4eAsfzoeexINxIIGUXiwz_31zU,31752
|
|
139
140
|
themis/presets/models.py,sha256=c6-I_drHa4vMLIajSkCcrFbsJOsauFjY8fU1leBxZLg,5173
|
|
140
141
|
themis/project/__init__.py,sha256=vgLv2nS62yz1XsFSFzFf7eIo6FyQJXpOY9OPRUcTQLQ,465
|
|
141
142
|
themis/project/definitions.py,sha256=vHARw0IjFOWE4RL4mGRwvke36A6GWQGep6cQFIRcpJg,3329
|
|
@@ -143,15 +144,21 @@ themis/project/patterns.py,sha256=2J51Q9Jq7X-2N57uexvR191gaZKwusef5vIuIVUQY-E,77
|
|
|
143
144
|
themis/providers/__init__.py,sha256=K5nG0DsK_YPY0cT9MBLk5BLcLbBo0wBP0vQvLjpAw_Y,189
|
|
144
145
|
themis/providers/registry.py,sha256=Za5Kg3-A-35wS_jiGpPXV2q1k6he_dRIWVqt36dKN-4,1056
|
|
145
146
|
themis/server/__init__.py,sha256=Hp0qGI5nvO4bhLAez3jQxim7H433l72EYE2IA8Xp2hA,731
|
|
146
|
-
themis/server/app.py,sha256=
|
|
147
|
+
themis/server/app.py,sha256=p8UQp2cU5i4DgZC0EhRdRejOfFvlg-93WOC3Z45apHs,11510
|
|
148
|
+
themis/specs/__init__.py,sha256=7eJcZFKgMHflUF4C_Pg1M5B_gpJXCsACEZruv4SHx70,256
|
|
149
|
+
themis/specs/execution.py,sha256=11EeecjIdcKKRdF_kQ_WQkAIxIR8kUdifqRPnLo8XrU,698
|
|
150
|
+
themis/specs/experiment.py,sha256=dnOP1KmsUIcA-VC4AR0bOFsNDMFUBEGLPJVVzPoovR0,971
|
|
151
|
+
themis/specs/storage.py,sha256=5jEqBPv1P2DTXlriUZvfNLnMGn2P-CLx6h82M5RpTmo,383
|
|
152
|
+
themis/storage/__init__.py,sha256=1axYCdtNSE3sYehO6T1hWRjGP_iYnlGl-dBlKiMNX8g,276
|
|
153
|
+
themis/storage/experiment_storage.py,sha256=S7tQ9DtYhWwX2rmxsWrjMW69WGAZORFu49s0eVmwS40,197
|
|
147
154
|
themis/utils/api_generator.py,sha256=3oQ7mGZlFx2Dpm45pMg3rNIqNK2Smj05PjOMXp5RIkQ,10776
|
|
148
155
|
themis/utils/cost_tracking.py,sha256=9_Z2iTfNaQse9G_bnqn4hme4T0fG2W-fxOLEDeF_3VI,11545
|
|
149
156
|
themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,13008
|
|
150
157
|
themis/utils/logging_utils.py,sha256=buC64X-xOu-2SZ0wVkz3nCXzYVGiqKbxK-8DGSGsAdM,1173
|
|
151
158
|
themis/utils/progress.py,sha256=HS0-yVbRT7Ai9zRlsJcex_OKP6dUiKx1vOp_IsobiHM,2097
|
|
152
159
|
themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
|
|
153
|
-
themis_eval-0.
|
|
154
|
-
themis_eval-0.
|
|
155
|
-
themis_eval-0.
|
|
156
|
-
themis_eval-0.
|
|
157
|
-
themis_eval-0.
|
|
160
|
+
themis_eval-1.0.0.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
|
|
161
|
+
themis_eval-1.0.0.dist-info/METADATA,sha256=mXVE6G4L1O18OhtsOgACvvjvx4zJ_onVmgF4SO9mJFw,15513
|
|
162
|
+
themis_eval-1.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
163
|
+
themis_eval-1.0.0.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
|
|
164
|
+
themis_eval-1.0.0.dist-info/RECORD,,
|
themis/experiment/builder.py
DELETED
|
@@ -1,151 +0,0 @@
|
|
|
1
|
-
"""Utilities for assembling experiments from reusable components."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Any, Callable, Mapping, Sequence, Type
|
|
7
|
-
|
|
8
|
-
from themis.config import schema as config
|
|
9
|
-
from themis.core import entities as core_entities
|
|
10
|
-
from themis.evaluation import pipeline as evaluation_pipeline
|
|
11
|
-
from themis.evaluation import strategies as evaluation_strategies
|
|
12
|
-
from themis.experiment import orchestrator
|
|
13
|
-
from themis.experiment import storage as experiment_storage
|
|
14
|
-
from themis.experiment.cache_manager import CacheManager
|
|
15
|
-
from themis.experiment.definitions import (
|
|
16
|
-
BuiltExperiment,
|
|
17
|
-
ExperimentDefinition,
|
|
18
|
-
ModelBinding,
|
|
19
|
-
)
|
|
20
|
-
from themis.experiment.integration_manager import IntegrationManager
|
|
21
|
-
from themis.generation import plan as generation_plan
|
|
22
|
-
from themis.generation import router as generation_router
|
|
23
|
-
from themis.generation import runner as generation_runner
|
|
24
|
-
from themis.generation import strategies as generation_strategies
|
|
25
|
-
from themis.interfaces import ModelProvider
|
|
26
|
-
from themis.providers import create_provider
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class ExperimentBuilder:
|
|
30
|
-
"""Composable builder for constructing experiment components."""
|
|
31
|
-
|
|
32
|
-
def __init__(
|
|
33
|
-
self,
|
|
34
|
-
*,
|
|
35
|
-
extractor,
|
|
36
|
-
metrics,
|
|
37
|
-
runner_cls: Type[
|
|
38
|
-
generation_runner.GenerationRunner
|
|
39
|
-
] = generation_runner.GenerationRunner,
|
|
40
|
-
runner_kwargs: Mapping[str, Any] | None = None,
|
|
41
|
-
pipeline_cls: Type[
|
|
42
|
-
evaluation_pipeline.EvaluationPipeline
|
|
43
|
-
] = evaluation_pipeline.EvaluationPipeline,
|
|
44
|
-
pipeline_kwargs: Mapping[str, Any] | None = None,
|
|
45
|
-
router_cls: Type[ModelProvider] = generation_router.ProviderRouter,
|
|
46
|
-
router_kwargs: Mapping[str, Any] | None = None,
|
|
47
|
-
strategy_resolver: Callable[
|
|
48
|
-
[core_entities.GenerationTask], generation_strategies.GenerationStrategy
|
|
49
|
-
]
|
|
50
|
-
| None = None,
|
|
51
|
-
evaluation_strategy_resolver: Callable[
|
|
52
|
-
[core_entities.GenerationRecord], evaluation_strategies.EvaluationStrategy
|
|
53
|
-
]
|
|
54
|
-
| None = None,
|
|
55
|
-
) -> None:
|
|
56
|
-
self._extractor = extractor
|
|
57
|
-
self._metrics = list(metrics)
|
|
58
|
-
self._runner_cls = runner_cls
|
|
59
|
-
self._runner_kwargs = dict(runner_kwargs or {})
|
|
60
|
-
self._pipeline_cls = pipeline_cls
|
|
61
|
-
self._pipeline_kwargs = dict(pipeline_kwargs or {})
|
|
62
|
-
self._router_cls = router_cls
|
|
63
|
-
self._router_kwargs = dict(router_kwargs or {})
|
|
64
|
-
self._strategy_resolver = strategy_resolver
|
|
65
|
-
self._evaluation_strategy_resolver = evaluation_strategy_resolver
|
|
66
|
-
|
|
67
|
-
def build(
|
|
68
|
-
self,
|
|
69
|
-
definition: ExperimentDefinition,
|
|
70
|
-
*,
|
|
71
|
-
storage_dir: str | Path | None = None,
|
|
72
|
-
) -> BuiltExperiment:
|
|
73
|
-
plan_obj = self._build_plan(definition)
|
|
74
|
-
router = self._build_router(definition.model_bindings)
|
|
75
|
-
runner_kwargs = dict(self._runner_kwargs)
|
|
76
|
-
if self._strategy_resolver is not None:
|
|
77
|
-
runner_kwargs.setdefault("strategy_resolver", self._strategy_resolver)
|
|
78
|
-
runner = self._runner_cls(provider=router, **runner_kwargs)
|
|
79
|
-
pipeline_kwargs = dict(self._pipeline_kwargs)
|
|
80
|
-
if self._evaluation_strategy_resolver is not None:
|
|
81
|
-
pipeline_kwargs.setdefault(
|
|
82
|
-
"strategy_resolver", self._evaluation_strategy_resolver
|
|
83
|
-
)
|
|
84
|
-
pipeline = self._pipeline_cls(
|
|
85
|
-
extractor=self._extractor,
|
|
86
|
-
metrics=self._metrics,
|
|
87
|
-
**pipeline_kwargs,
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
# Create storage backend
|
|
91
|
-
storage = (
|
|
92
|
-
experiment_storage.ExperimentStorage(storage_dir)
|
|
93
|
-
if storage_dir is not None
|
|
94
|
-
else None
|
|
95
|
-
)
|
|
96
|
-
|
|
97
|
-
# Create managers for better separation of concerns
|
|
98
|
-
cache_manager = CacheManager(
|
|
99
|
-
storage=storage,
|
|
100
|
-
enable_resume=True,
|
|
101
|
-
enable_cache=True,
|
|
102
|
-
)
|
|
103
|
-
integration_manager = IntegrationManager(config=config.IntegrationsConfig())
|
|
104
|
-
|
|
105
|
-
# Create orchestrator with managers
|
|
106
|
-
orchestrator_obj = orchestrator.ExperimentOrchestrator(
|
|
107
|
-
generation_plan=plan_obj,
|
|
108
|
-
generation_runner=runner,
|
|
109
|
-
evaluation_pipeline=pipeline,
|
|
110
|
-
cache_manager=cache_manager,
|
|
111
|
-
integration_manager=integration_manager,
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
return BuiltExperiment(
|
|
115
|
-
orchestrator=orchestrator_obj,
|
|
116
|
-
plan=plan_obj,
|
|
117
|
-
runner=runner,
|
|
118
|
-
pipeline=pipeline,
|
|
119
|
-
storage=storage,
|
|
120
|
-
router=router,
|
|
121
|
-
)
|
|
122
|
-
|
|
123
|
-
def _build_plan(
|
|
124
|
-
self, definition: ExperimentDefinition
|
|
125
|
-
) -> generation_plan.GenerationPlan:
|
|
126
|
-
return generation_plan.GenerationPlan(
|
|
127
|
-
templates=list(definition.templates),
|
|
128
|
-
models=[binding.spec for binding in definition.model_bindings],
|
|
129
|
-
sampling_parameters=list(definition.sampling_parameters),
|
|
130
|
-
dataset_id_field=definition.dataset_id_field,
|
|
131
|
-
reference_field=definition.reference_field,
|
|
132
|
-
metadata_fields=tuple(definition.metadata_fields),
|
|
133
|
-
context_builder=definition.context_builder,
|
|
134
|
-
)
|
|
135
|
-
|
|
136
|
-
def _build_router(self, bindings: Sequence[ModelBinding]) -> ModelProvider:
|
|
137
|
-
providers: dict[str, ModelProvider] = {}
|
|
138
|
-
for binding in bindings:
|
|
139
|
-
providers[binding.spec.identifier] = create_provider(
|
|
140
|
-
binding.provider_name,
|
|
141
|
-
**binding.provider_options,
|
|
142
|
-
)
|
|
143
|
-
return self._router_cls(providers, **self._router_kwargs)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
__all__ = [
|
|
147
|
-
"ExperimentBuilder",
|
|
148
|
-
"ExperimentDefinition",
|
|
149
|
-
"ModelBinding",
|
|
150
|
-
"BuiltExperiment",
|
|
151
|
-
]
|
themis/experiment/export_csv.py
DELETED
|
@@ -1,159 +0,0 @@
|
|
|
1
|
-
"""CSV export functionality for experiment reports."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import csv
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import MutableMapping, Sequence
|
|
8
|
-
|
|
9
|
-
from themis.core import entities as core_entities
|
|
10
|
-
from themis.experiment import orchestrator
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def export_report_csv(
|
|
14
|
-
report: orchestrator.ExperimentReport,
|
|
15
|
-
path: str | Path,
|
|
16
|
-
*,
|
|
17
|
-
include_failures: bool = True,
|
|
18
|
-
) -> Path:
|
|
19
|
-
"""Write per-sample metrics to a CSV file for offline analysis.
|
|
20
|
-
|
|
21
|
-
Args:
|
|
22
|
-
report: Experiment report to export
|
|
23
|
-
path: Output path for CSV file
|
|
24
|
-
include_failures: Whether to include failures column
|
|
25
|
-
|
|
26
|
-
Returns:
|
|
27
|
-
Path to created CSV file
|
|
28
|
-
"""
|
|
29
|
-
path = Path(path)
|
|
30
|
-
path.parent.mkdir(parents=True, exist_ok=True)
|
|
31
|
-
metadata_by_condition, metadata_fields = _collect_sample_metadata(
|
|
32
|
-
report.generation_results
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
# Create a proper index mapping generation records to their metadata
|
|
36
|
-
gen_record_index = {}
|
|
37
|
-
for gen_record in report.generation_results:
|
|
38
|
-
sample_id = gen_record.task.metadata.get(
|
|
39
|
-
"dataset_id"
|
|
40
|
-
) or gen_record.task.metadata.get("sample_id")
|
|
41
|
-
prompt_template = gen_record.task.prompt.spec.name
|
|
42
|
-
model_identifier = gen_record.task.model.identifier
|
|
43
|
-
sampling_temp = gen_record.task.sampling.temperature
|
|
44
|
-
sampling_max_tokens = gen_record.task.sampling.max_tokens
|
|
45
|
-
condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
|
|
46
|
-
gen_record_index[condition_id] = gen_record
|
|
47
|
-
|
|
48
|
-
metric_names = sorted(report.evaluation_report.metrics.keys())
|
|
49
|
-
fieldnames = (
|
|
50
|
-
["sample_id"] + metadata_fields + [f"metric:{name}" for name in metric_names]
|
|
51
|
-
)
|
|
52
|
-
if include_failures:
|
|
53
|
-
fieldnames.append("failures")
|
|
54
|
-
|
|
55
|
-
with path.open("w", encoding="utf-8", newline="") as handle:
|
|
56
|
-
writer = csv.DictWriter(handle, fieldnames=fieldnames)
|
|
57
|
-
writer.writeheader()
|
|
58
|
-
|
|
59
|
-
# Process evaluation records in the same order as generation records
|
|
60
|
-
for i, eval_record in enumerate(report.evaluation_report.records):
|
|
61
|
-
# Find the corresponding generation record by index
|
|
62
|
-
if i < len(report.generation_results):
|
|
63
|
-
gen_record = report.generation_results[i]
|
|
64
|
-
sample_id = gen_record.task.metadata.get(
|
|
65
|
-
"dataset_id"
|
|
66
|
-
) or gen_record.task.metadata.get("sample_id")
|
|
67
|
-
prompt_template = gen_record.task.prompt.spec.name
|
|
68
|
-
model_identifier = gen_record.task.model.identifier
|
|
69
|
-
sampling_temp = gen_record.task.sampling.temperature
|
|
70
|
-
sampling_max_tokens = gen_record.task.sampling.max_tokens
|
|
71
|
-
condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
|
|
72
|
-
metadata = metadata_by_condition.get(condition_id, {})
|
|
73
|
-
else:
|
|
74
|
-
# Fallback for extra evaluation records
|
|
75
|
-
sample_id = eval_record.sample_id or ""
|
|
76
|
-
metadata = {}
|
|
77
|
-
|
|
78
|
-
row: dict[str, object] = {"sample_id": sample_id}
|
|
79
|
-
for field in metadata_fields:
|
|
80
|
-
row[field] = metadata.get(field, "")
|
|
81
|
-
score_by_name = {
|
|
82
|
-
score.metric_name: score.value for score in eval_record.scores
|
|
83
|
-
}
|
|
84
|
-
for name in metric_names:
|
|
85
|
-
row[f"metric:{name}"] = score_by_name.get(name, "")
|
|
86
|
-
if include_failures:
|
|
87
|
-
row["failures"] = "; ".join(eval_record.failures)
|
|
88
|
-
writer.writerow(row)
|
|
89
|
-
return path
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def _collect_sample_metadata(
|
|
93
|
-
records: Sequence[core_entities.GenerationRecord],
|
|
94
|
-
) -> tuple[dict[str, MutableMapping[str, object]], list[str]]:
|
|
95
|
-
"""Collect metadata from generation records.
|
|
96
|
-
|
|
97
|
-
Args:
|
|
98
|
-
records: Generation records
|
|
99
|
-
|
|
100
|
-
Returns:
|
|
101
|
-
Tuple of (metadata by condition ID, list of metadata fields)
|
|
102
|
-
"""
|
|
103
|
-
metadata: dict[str, MutableMapping[str, object]] = {}
|
|
104
|
-
for index, record in enumerate(records):
|
|
105
|
-
sample_id = _extract_sample_id(record.task.metadata)
|
|
106
|
-
if sample_id is None:
|
|
107
|
-
sample_id = f"sample-{index}"
|
|
108
|
-
|
|
109
|
-
# Create unique identifier for each experimental condition
|
|
110
|
-
prompt_template = record.task.prompt.spec.name
|
|
111
|
-
model_identifier = record.task.model.identifier
|
|
112
|
-
sampling_temp = record.task.sampling.temperature
|
|
113
|
-
sampling_max_tokens = record.task.sampling.max_tokens
|
|
114
|
-
|
|
115
|
-
# Create unique condition key
|
|
116
|
-
condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
|
|
117
|
-
|
|
118
|
-
# Store metadata with unique condition ID
|
|
119
|
-
condition_metadata = _metadata_from_task(record)
|
|
120
|
-
metadata[condition_id] = condition_metadata
|
|
121
|
-
|
|
122
|
-
# Collect all field names from all conditions
|
|
123
|
-
fields = sorted({field for meta in metadata.values() for field in meta.keys()})
|
|
124
|
-
|
|
125
|
-
return metadata, fields
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def _extract_sample_id(metadata: dict[str, object]) -> str | None:
|
|
129
|
-
"""Extract sample ID from metadata.
|
|
130
|
-
|
|
131
|
-
Args:
|
|
132
|
-
metadata: Task metadata
|
|
133
|
-
|
|
134
|
-
Returns:
|
|
135
|
-
Sample ID or None
|
|
136
|
-
"""
|
|
137
|
-
value = metadata.get("dataset_id") or metadata.get("sample_id")
|
|
138
|
-
if value is None:
|
|
139
|
-
return None
|
|
140
|
-
return str(value)
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
def _metadata_from_task(record: core_entities.GenerationRecord) -> dict[str, object]:
|
|
144
|
-
"""Build metadata dict from generation record.
|
|
145
|
-
|
|
146
|
-
Args:
|
|
147
|
-
record: Generation record
|
|
148
|
-
|
|
149
|
-
Returns:
|
|
150
|
-
Metadata dictionary
|
|
151
|
-
"""
|
|
152
|
-
metadata = dict(record.task.metadata)
|
|
153
|
-
metadata.setdefault("model_identifier", record.task.model.identifier)
|
|
154
|
-
metadata.setdefault("model_provider", record.task.model.provider)
|
|
155
|
-
metadata.setdefault("prompt_template", record.task.prompt.spec.name)
|
|
156
|
-
metadata.setdefault("sampling_temperature", record.task.sampling.temperature)
|
|
157
|
-
metadata.setdefault("sampling_top_p", record.task.sampling.top_p)
|
|
158
|
-
metadata.setdefault("sampling_max_tokens", record.task.sampling.max_tokens)
|
|
159
|
-
return metadata
|
|
File without changes
|
|
File without changes
|