PyPI - themis-eval - Versions diffs - 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

themis/__init__.py +5 -2
themis/_version.py +14 -1
themis/api.py +83 -145
themis/backends/storage.py +5 -0
themis/cli/commands/info.py +2 -11
themis/cli/main.py +231 -40
themis/comparison/engine.py +7 -13
themis/core/entities.py +4 -0
themis/evaluation/metric_pipeline.py +12 -0
themis/evaluation/pipeline.py +22 -0
themis/evaluation/pipelines/__init__.py +4 -0
themis/evaluation/pipelines/composable_pipeline.py +55 -0
themis/evaluation/pipelines/standard_pipeline.py +16 -0
themis/experiment/__init__.py +2 -2
themis/experiment/cache_manager.py +15 -1
themis/experiment/definitions.py +1 -1
themis/experiment/orchestrator.py +21 -11
themis/experiment/share.py +264 -0
themis/experiment/storage.py +345 -298
themis/generation/router.py +22 -4
themis/generation/runner.py +16 -1
themis/presets/benchmarks.py +602 -17
themis/server/app.py +38 -26
themis/session.py +125 -0
themis/specs/__init__.py +7 -0
themis/specs/execution.py +26 -0
themis/specs/experiment.py +33 -0
themis/specs/storage.py +18 -0
themis/storage/__init__.py +6 -0
themis/storage/experiment_storage.py +7 -0
{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
themis/experiment/builder.py +0 -151
themis/experiment/export_csv.py +0 -159
{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0

{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,14 @@
-themis/__init__.py,sha256=rQL3njf3i5lnAcmu0HuRzGGMELbA9xX21hzw4HrbIxw,1394
-themis/_version.py,sha256=Tk4OCTQHYoZ61gm9JnkdgajR0vkBHbVm5OUjInzyJug,378
-themis/api.py,sha256=flZTbU-jRcbv7oXcfRKG4hkZjASmWlT52A4PghKj9G0,17700
+themis/__init__.py,sha256=YPexmyPqbANhr0Yzm46FDWgdBLjAX_UoZqywte9WJ84,1476
+themis/_version.py,sha256=b5-7SWk_hYOedKy8IUPxaubXc0KS0hRjhUsY8PkIbl0,818
+themis/api.py,sha256=zK6_RiiYTcXiV6uVQu1fNRdTqm-tXT_Df-CfrLXrIhk,15509
 themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+themis/session.py,sha256=H1Pkr71b90sa94YeRCIyffIxE2gQPmtfzo-zNnPLrdM,4311
 themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
 themis/backends/execution.py,sha256=RAFuB9ri8TMil5PcnsisypKO2ViyLFXj08P_vjNYguU,6095
-themis/backends/storage.py,sha256=pQp20WagSCl8Vmd-Rgx0hDbpYFhCqARXtvGDw3DPgNQ,8021
+themis/backends/storage.py,sha256=3oMcL5Wliac3INxHVG6DFlbqe6Y-6fjA5eZJOLpxm0w,8168
 themis/cli/__init__.py,sha256=An2DrMHRfmiee5BYJ6TGqvbG7sXWECjjyvEgcoGJ7cE,99
 themis/cli/__main__.py,sha256=df2pOghoSuq18hZmVVikmGhaFSaRe-jeDOnrsu-1QDM,135
-themis/cli/main.py,sha256=AGBFxb1sPLQ-aUAq8RM3YI6gGNs6SdFmBzVSqwp_MSg,15482
+themis/cli/main.py,sha256=ZcB3rS0tMsNCEcHRqjCwfi_JwvQqWZUjZ1TWuSUj4N4,22236
 themis/cli/new_project.py,sha256=D8asV4QbjgQNYvmXt_WhK4nPM-wKHe_K0VJiBdgtO_E,1121
 themis/cli/utils.py,sha256=NAPyFiXspfpx5vBxA8aEcOMmWEDyt-R8ywoHo_8Nr4A,1307
 themis/cli/commands/__init__.py,sha256=CTx7su3qTtq96qxLNclDsE6UM_86NhaS01M9-x9wFiw,287
@@ -16,7 +17,7 @@ themis/cli/commands/comparison.py,sha256=Ki1_MMFFR4vBJkZTeIMWLh-_zdjbtJZurI3YyrE
 themis/cli/commands/config_commands.py,sha256=eL6GtdIllOIHo8GbNN2jOqLn5VUPBuqhnro9ooPxDog,7387
 themis/cli/commands/cost.py,sha256=fFdF6hKIzsbPsyrJ1nt6-2m43PpVGUj8jx5T90tBTNo,7233
 themis/cli/commands/demo.py,sha256=akQqjG-hbUDfeB3bI8K4F5-S0ibJqhflGBFQ5nvdUgE,2135
-themis/cli/commands/info.py,sha256=9maOaw-TFiBpuVhaqlMKukGuZ_zgESetqbMQ1Qdvjxs,2515
+themis/cli/commands/info.py,sha256=njms4dsaede2zyC2yoLbfoN5Bho_JTYciJl3Q7Twiv0,2250
 themis/cli/commands/leaderboard.py,sha256=AVvsYIwZAY18jn3sOq3QD45yNtfdHUEl7eixM4aMCKw,10615
 themis/cli/commands/math_benchmarks.py,sha256=nQ4TcPB7T9O3piAy4_TgrOQOQxh2Q8OyBreK_HoPCeQ,9946
 themis/cli/commands/mcq_benchmarks.py,sha256=Cls5W1jGd7TKizmw07CnZWY5N6ywR8VhJ6jKDnY_cRk,7026
@@ -24,7 +25,7 @@ themis/cli/commands/results.py,sha256=rdN3SaMoFnSfAoAXlfpeCTt3V6MwIp0Dk7FIjvPNF7
 themis/cli/commands/sample_run.py,sha256=r3Ymg5dVHg4IAVJvzoP0ZWUWWUE4Dia1t0062Yhdk9Q,9445
 themis/cli/commands/visualize.py,sha256=ZECkB0NjIltuOeBE-Q1JnndZEMXVzc8KgcrbaP-GSXo,9740
 themis/comparison/__init__.py,sha256=bRI8gDlcjMtnH77R7N5ARioq_V4daJcWWM4DXKsoE1k,679
-themis/comparison/engine.py,sha256=UkzXKmEFI2JiX0y8534oc6JFySxgA5v1emzRcGj33Kk,12133
+themis/comparison/engine.py,sha256=Mw4sQBO2NnKRUOWHNO1XnyjrVvmHX1KXRnl1LUw1hIY,11809
 themis/comparison/reports.py,sha256=126VJbd-lxj8C2YJqul53Fyr-nrZgmbrBsRA6Qkh0ro,10117
 themis/comparison/statistics.py,sha256=eLqKUtKFwSvXnbZax8S0lF8RiSepwYdhnmnDD7DcrZs,12929
 themis/config/__init__.py,sha256=YMdFG1iLvOQUnSPlc_ZJVn5zCCTbIozML64b4qUtGR8,476
@@ -34,7 +35,7 @@ themis/config/runtime.py,sha256=hU69_oND7fJfAOIBJONENmsuf7Y8roO7n-w9OwxzoT8,7475
 themis/config/schema.py,sha256=SMR9QHp8OBkSnb1dHyOgg-IJWSqpXfyAqywnBeMy46M,3196
 themis/core/__init__.py,sha256=S8G1x-39sZ3_NQ5DJ6R1yBTWXp_gO0WxOtVjeB9sTwY,113
 themis/core/conversation.py,sha256=wwO8RS4t4plDR0Sf1KjYv_ejonlvKe0ZwAD-4sfGak8,10155
-themis/core/entities.py,sha256=WV9kiYdZFGxn6oH0lPtqaViY4I8oq-hWi_SmCKjvRnc,4449
+themis/core/entities.py,sha256=a9R3J2wetyUiGcxH6K8pGXfVz_gLh4nsJ81y7cKBj5o,4548
 themis/core/serialization.py,sha256=cxfoSKwcZiNsnR8g_SAJAq1ZLrfLXM4S9_rVEDUT8qs,7071
 themis/core/tools.py,sha256=v0_ctsBCtinZGNC_I4C-h0GUPNM5ZeTi7z-U4iCtyp4,11035
 themis/core/types.py,sha256=I5rr9MMS0irX4lo-xlqGjosx-FjPgT64RzQAraM223A,3652
@@ -59,7 +60,8 @@ themis/datasets/super_gpqa.py,sha256=Mr1ag_FyAk1haxg6_ONX5F84wQYtbSVjV-MlMNmaHlI
 themis/evaluation/__init__.py,sha256=2Jl8tcVxYAsmHNAZev2mPS_mEwZcRzebqSM3QDc2cyY,36
 themis/evaluation/conditional.py,sha256=ayndI7FcwxdIMR8B4ddgcKZd5Jl5NQcBJUp7eXI6Djk,13881
 themis/evaluation/math_verify_utils.py,sha256=vXMvL11-IH16UHZ-mbi_r5hOFz7aUfR1J1laa6qmLMk,2213
-themis/evaluation/pipeline.py,sha256=OOowN59UdOMF2Hwy_G4ky5yzR4ajAnohil6xteWBHqE,1503
+themis/evaluation/metric_pipeline.py,sha256=_JXqf5UySbAyuSiN1waDCBfeek7ArluKvXXkm8qIEvs,329
+themis/evaluation/pipeline.py,sha256=7mQV_sJlPDd6zA9jd2TT5IhdMszzT3ftMLbM7Ww76aw,2217
 themis/evaluation/reports.py,sha256=9om7jzZUtmlMH7EeteXp_98gfHct4x09AyTFy3FSAdQ,8715
 themis/evaluation/extractors/__init__.py,sha256=BanoC_8e0iam-VU7l7uhvhac_6w_JJZYoYE4xXPUrGk,566
 themis/evaluation/extractors/error_taxonomy_extractor.py,sha256=RrRx-23l3LwTdG89kvSQJng438cfYI-IdtOGUD6gEDw,2462
@@ -86,9 +88,9 @@ themis/evaluation/metrics/nlp/bertscore.py,sha256=czlIqYkOTBWsfHiE6U1vkq1KHRQm8p
 themis/evaluation/metrics/nlp/bleu.py,sha256=o_aVkoFPSMmeOLYaHRMamIpSKlYSxrMA1OdntTIUe9g,4436
 themis/evaluation/metrics/nlp/meteor.py,sha256=QZT09s4aiUcVvDJDVPZYjzi5SxXdS2gn2IaOTNmKp78,5076
 themis/evaluation/metrics/nlp/rouge.py,sha256=YL05qluF-KsesHYFRfm5zELJlcvo6RvaKp7xKy6BuLI,4365
-themis/evaluation/pipelines/__init__.py,sha256=5YI1xaUULHisctFxrumN4XRpWYneoonX7nd9zBtsjvQ,384
-themis/evaluation/pipelines/composable_pipeline.py,sha256=nNP9MSvQQJvaSBw5_gO3FeyhGm9So2ZlGqh5qSvE8Ac,10905
-themis/evaluation/pipelines/standard_pipeline.py,sha256=GI5_ImebBuM6D8GpGKLoNq4p3JhTq-ocOThlah8RxME,14754
+themis/evaluation/pipelines/__init__.py,sha256=EXBzgOpi_70yTvdXJr0cERmhBlaqnHZHNcDlPamyQLw,549
+themis/evaluation/pipelines/composable_pipeline.py,sha256=SWv4H1-pcFNAuTu0orkMDgKsdUcom75ded9GhIaHqo0,12838
+themis/evaluation/pipelines/standard_pipeline.py,sha256=Xe7hsOzHg0EBnmslBejb3xJ6Y2pvHF4mrwC_THurQQI,15423
 themis/evaluation/statistics/__init__.py,sha256=TTrScTLAW7EHNq0nbjuJs6iP3_HgDx1yy3EtYXx5JCk,1257
 themis/evaluation/statistics/bootstrap.py,sha256=JUQ8rtzFvW2e41I2pLJ7pqgSEjuJ1r6McyYLI42At9g,2409
 themis/evaluation/statistics/confidence_intervals.py,sha256=CN5EO2gWiSITQubuWuPryngnGXhGwczY9kO3mcG6JVc,3676
@@ -101,20 +103,19 @@ themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=MFcBdtK
 themis/evaluation/strategies/default_evaluation_strategy.py,sha256=LShW-3Nxg_W4Ln-4qUvHJZqe5YMt64gHoK3uNJYLQNo,693
 themis/evaluation/strategies/evaluation_strategy.py,sha256=YFF-bXkz4Z52GuCw52FcklfEnf8dK8_z_I40DJRcmwE,669
 themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=IRSgnnD2R6qrjiOTyA_PIOHUfQj4FqutkU3pKMth0CQ,2562
-themis/experiment/__init__.py,sha256=dGranqpESugmmfbQlTU9efwspazW6j3vcmAKEtAoWZk,182
-themis/experiment/builder.py,sha256=AEjCDeSOI2B0i0PBjkfY1GUDNrYGTGiqPvt0SxnDQFo,5618
-themis/experiment/cache_manager.py,sha256=Fd8Qxifrmyn8f2zjAyPrLv-ZU4Dcp-MKo8-09BoW7tY,4361
+themis/experiment/__init__.py,sha256=T8BEG9dmabQ-tnVZ5YUd5r-31m6_OY-FDfvUfWhl2hc,190
+themis/experiment/cache_manager.py,sha256=aB9QWcS5PV4y675cMVQehm1Rs2XuK84M0WMMFBYP3Hc,4901
 themis/experiment/comparison.py,sha256=Mr1L5Zj7i87xk9XUQ_UueLTsC-sDZH8YGwLwg_gG0VI,21562
 themis/experiment/cost.py,sha256=flhENfB5WKvyNWwPMDtygNZAv6y_yv4RoClsRz714Hc,10159
-themis/experiment/definitions.py,sha256=oOZBFfEQkSBiZd9CMutCQ5luH6oeUT9yAZFd7fpVjnw,2015
+themis/experiment/definitions.py,sha256=7x18qmQBX88Gn_kxY9n3UefnwvvsRnALAvuEmXSZqXY,2022
 themis/experiment/export.py,sha256=ujwiSvqQhLaO99WHyE8osdnmriHjyIM1C2zKf5o93Cw,29800
-themis/experiment/export_csv.py,sha256=80w3gEGjeLjuiNq539rRP73k3MBtwrzJy90hgE91AKw,6030
 themis/experiment/integration_manager.py,sha256=wTVTjDGcUkzz4tfnwSxa5nK1A4e2FKCPazDYGcdzYS8,3325
 themis/experiment/math.py,sha256=P2E9F_UKI7pb-aXepSztGdr_g309WEMe83zqg1nWO7A,6973
 themis/experiment/mcq.py,sha256=DDB99FHQsU_5vMIRDRhSZ7pReYvVf57wLmmo3OU_An4,6276
-themis/experiment/orchestrator.py,sha256=VeSasDmCXrYlrv1r47I698RUq14vEBR7c_uyZzM01hw,19304
+themis/experiment/orchestrator.py,sha256=fA4nPBrhMwcnsdPc9ETr2m8t0I690UHpujIf8dAOgzU,19527
 themis/experiment/pricing.py,sha256=fTM32yE3L8vahMP4sr1zr7dbp9zYCjiPN4D4VuZ8-q8,9346
-themis/experiment/storage.py,sha256=ujGiQTeRPOfS8hYHB1a7F9t-dQnXquhqomI1vDjqmno,55250
+themis/experiment/share.py,sha256=vbwZ93jFQo0r7uxfzkixwkfwbZZFc0FeFgCj4XS1L4o,8209
+themis/experiment/storage.py,sha256=rqyRyWopwR1Td-9d1O6kslPT1QEepzX29Nl3dtYcDIk,56991
 themis/experiment/visualization.py,sha256=dJYHrp3mntl8CPc5HPI3iKqPztVsddQB3ogRkd_FCNc,18473
 themis/generation/__init__.py,sha256=6KVwCQYMpPIsXNuWDZOGuqHkUkA45lbSacIFn8ZbD4s,36
 themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJzay2uA,13669
@@ -122,8 +123,8 @@ themis/generation/batching.py,sha256=ddpgpn1pq_EwipvTg-K4WcoSs3c2rbW37jEA5Pa_spo
 themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,4910
 themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
 themis/generation/plan.py,sha256=k6_gdKFM12nrKz7ac1c5vTZsFanIKJJgyQ8IhvakDNQ,17158
-themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
-themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
+themis/generation/router.py,sha256=ce3Hbth3JrJxO9dBgo0izCMzvqJ3ARaBNRhIJT-JXRQ,1692
+themis/generation/runner.py,sha256=E7hN4gAxzJLCYwvGjwKCBMhXlFv-RhSRjQs1kYvarXo,9579
 themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
 themis/generation/templates.py,sha256=ut_6akp8Y6Ey_9O3s64jDbwCB74pw62Zf8URlYcKHkA,2325
 themis/generation/turn_strategies.py,sha256=w33qhzpQbGTsfeOgOgMDovV0wEeXeNZUUBm5yZy1naw,10973
@@ -135,7 +136,7 @@ themis/integrations/huggingface.py,sha256=vrLwYwn65pU4W3FUe0ImCOZxKKlpRshDqMoLFs
 themis/integrations/wandb.py,sha256=LJOPojjlqG05EIPxcjy3QmA15Gxgs1db3encDWVzYYw,2545
 themis/interfaces/__init__.py,sha256=78dNE_eHfFmb9hXNy5sLZ1jOTGWS8TzdVE_eiYQPFVc,5967
 themis/presets/__init__.py,sha256=w58fJcy4eNiE034qHO2xE5pp-H-4LNLXo5hLMuC7wIQ,533
-themis/presets/benchmarks.py,sha256=s9JxRogHwZs8oiuiI7Z7uiUBZXEp3gg7AQZnBvdGieA,12026
+themis/presets/benchmarks.py,sha256=wO5xAVafUuL3HEjPO4eAsfzoeexINxIIGUXiwz_31zU,31752
 themis/presets/models.py,sha256=c6-I_drHa4vMLIajSkCcrFbsJOsauFjY8fU1leBxZLg,5173
 themis/project/__init__.py,sha256=vgLv2nS62yz1XsFSFzFf7eIo6FyQJXpOY9OPRUcTQLQ,465
 themis/project/definitions.py,sha256=vHARw0IjFOWE4RL4mGRwvke36A6GWQGep6cQFIRcpJg,3329
@@ -143,15 +144,21 @@ themis/project/patterns.py,sha256=2J51Q9Jq7X-2N57uexvR191gaZKwusef5vIuIVUQY-E,77
 themis/providers/__init__.py,sha256=K5nG0DsK_YPY0cT9MBLk5BLcLbBo0wBP0vQvLjpAw_Y,189
 themis/providers/registry.py,sha256=Za5Kg3-A-35wS_jiGpPXV2q1k6he_dRIWVqt36dKN-4,1056
 themis/server/__init__.py,sha256=Hp0qGI5nvO4bhLAez3jQxim7H433l72EYE2IA8Xp2hA,731
-themis/server/app.py,sha256=OZ39gCC47AXVqZxroC_4KtIYBYx_rfpde7C25AF3EI0,11166
+themis/server/app.py,sha256=p8UQp2cU5i4DgZC0EhRdRejOfFvlg-93WOC3Z45apHs,11510
+themis/specs/__init__.py,sha256=7eJcZFKgMHflUF4C_Pg1M5B_gpJXCsACEZruv4SHx70,256
+themis/specs/execution.py,sha256=11EeecjIdcKKRdF_kQ_WQkAIxIR8kUdifqRPnLo8XrU,698
+themis/specs/experiment.py,sha256=dnOP1KmsUIcA-VC4AR0bOFsNDMFUBEGLPJVVzPoovR0,971
+themis/specs/storage.py,sha256=5jEqBPv1P2DTXlriUZvfNLnMGn2P-CLx6h82M5RpTmo,383
+themis/storage/__init__.py,sha256=1axYCdtNSE3sYehO6T1hWRjGP_iYnlGl-dBlKiMNX8g,276
+themis/storage/experiment_storage.py,sha256=S7tQ9DtYhWwX2rmxsWrjMW69WGAZORFu49s0eVmwS40,197
 themis/utils/api_generator.py,sha256=3oQ7mGZlFx2Dpm45pMg3rNIqNK2Smj05PjOMXp5RIkQ,10776
 themis/utils/cost_tracking.py,sha256=9_Z2iTfNaQse9G_bnqn4hme4T0fG2W-fxOLEDeF_3VI,11545
 themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,13008
 themis/utils/logging_utils.py,sha256=buC64X-xOu-2SZ0wVkz3nCXzYVGiqKbxK-8DGSGsAdM,1173
 themis/utils/progress.py,sha256=HS0-yVbRT7Ai9zRlsJcex_OKP6dUiKx1vOp_IsobiHM,2097
 themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
-themis_eval-0.2.3.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
-themis_eval-0.2.3.dist-info/METADATA,sha256=4N7tBOyUi8PAlFT2qJseKIABjHOzkFmLtfqVVUSFz84,15235
-themis_eval-0.2.3.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-themis_eval-0.2.3.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
-themis_eval-0.2.3.dist-info/RECORD,,
+themis_eval-1.0.0.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
+themis_eval-1.0.0.dist-info/METADATA,sha256=mXVE6G4L1O18OhtsOgACvvjvx4zJ_onVmgF4SO9mJFw,15513
+themis_eval-1.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+themis_eval-1.0.0.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
+themis_eval-1.0.0.dist-info/RECORD,,

{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.10.1)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

themis/experiment/builder.py DELETED Viewed

@@ -1,151 +0,0 @@
-"""Utilities for assembling experiments from reusable components."""
-from __future__ import annotations
-from pathlib import Path
-from typing import Any, Callable, Mapping, Sequence, Type
-from themis.config import schema as config
-from themis.core import entities as core_entities
-from themis.evaluation import pipeline as evaluation_pipeline
-from themis.evaluation import strategies as evaluation_strategies
-from themis.experiment import orchestrator
-from themis.experiment import storage as experiment_storage
-from themis.experiment.cache_manager import CacheManager
-from themis.experiment.definitions import (
-    BuiltExperiment,
-    ExperimentDefinition,
-    ModelBinding,
-)
-from themis.experiment.integration_manager import IntegrationManager
-from themis.generation import plan as generation_plan
-from themis.generation import router as generation_router
-from themis.generation import runner as generation_runner
-from themis.generation import strategies as generation_strategies
-from themis.interfaces import ModelProvider
-from themis.providers import create_provider
-class ExperimentBuilder:
-    """Composable builder for constructing experiment components."""
-    def __init__(
-        self,
-        *,
-        extractor,
-        metrics,
-        runner_cls: Type[
-            generation_runner.GenerationRunner
-        ] = generation_runner.GenerationRunner,
-        runner_kwargs: Mapping[str, Any] | None = None,
-        pipeline_cls: Type[
-            evaluation_pipeline.EvaluationPipeline
-        ] = evaluation_pipeline.EvaluationPipeline,
-        pipeline_kwargs: Mapping[str, Any] | None = None,
-        router_cls: Type[ModelProvider] = generation_router.ProviderRouter,
-        router_kwargs: Mapping[str, Any] | None = None,
-        strategy_resolver: Callable[
-            [core_entities.GenerationTask], generation_strategies.GenerationStrategy
-        ]
-        | None = None,
-        evaluation_strategy_resolver: Callable[
-            [core_entities.GenerationRecord], evaluation_strategies.EvaluationStrategy
-        ]
-        | None = None,
-    ) -> None:
-        self._extractor = extractor
-        self._metrics = list(metrics)
-        self._runner_cls = runner_cls
-        self._runner_kwargs = dict(runner_kwargs or {})
-        self._pipeline_cls = pipeline_cls
-        self._pipeline_kwargs = dict(pipeline_kwargs or {})
-        self._router_cls = router_cls
-        self._router_kwargs = dict(router_kwargs or {})
-        self._strategy_resolver = strategy_resolver
-        self._evaluation_strategy_resolver = evaluation_strategy_resolver
-    def build(
-        self,
-        definition: ExperimentDefinition,
-        *,
-        storage_dir: str | Path | None = None,
-    ) -> BuiltExperiment:
-        plan_obj = self._build_plan(definition)
-        router = self._build_router(definition.model_bindings)
-        runner_kwargs = dict(self._runner_kwargs)
-        if self._strategy_resolver is not None:
-            runner_kwargs.setdefault("strategy_resolver", self._strategy_resolver)
-        runner = self._runner_cls(provider=router, **runner_kwargs)
-        pipeline_kwargs = dict(self._pipeline_kwargs)
-        if self._evaluation_strategy_resolver is not None:
-            pipeline_kwargs.setdefault(
-                "strategy_resolver", self._evaluation_strategy_resolver
-            )
-        pipeline = self._pipeline_cls(
-            extractor=self._extractor,
-            metrics=self._metrics,
-            **pipeline_kwargs,
-        )
-        # Create storage backend
-        storage = (
-            experiment_storage.ExperimentStorage(storage_dir)
-            if storage_dir is not None
-            else None
-        )
-        # Create managers for better separation of concerns
-        cache_manager = CacheManager(
-            storage=storage,
-            enable_resume=True,
-            enable_cache=True,
-        )
-        integration_manager = IntegrationManager(config=config.IntegrationsConfig())
-        # Create orchestrator with managers
-        orchestrator_obj = orchestrator.ExperimentOrchestrator(
-            generation_plan=plan_obj,
-            generation_runner=runner,
-            evaluation_pipeline=pipeline,
-            cache_manager=cache_manager,
-            integration_manager=integration_manager,
-        )
-        return BuiltExperiment(
-            orchestrator=orchestrator_obj,
-            plan=plan_obj,
-            runner=runner,
-            pipeline=pipeline,
-            storage=storage,
-            router=router,
-        )
-    def _build_plan(
-        self, definition: ExperimentDefinition
-    ) -> generation_plan.GenerationPlan:
-        return generation_plan.GenerationPlan(
-            templates=list(definition.templates),
-            models=[binding.spec for binding in definition.model_bindings],
-            sampling_parameters=list(definition.sampling_parameters),
-            dataset_id_field=definition.dataset_id_field,
-            reference_field=definition.reference_field,
-            metadata_fields=tuple(definition.metadata_fields),
-            context_builder=definition.context_builder,
-        )
-    def _build_router(self, bindings: Sequence[ModelBinding]) -> ModelProvider:
-        providers: dict[str, ModelProvider] = {}
-        for binding in bindings:
-            providers[binding.spec.identifier] = create_provider(
-                binding.provider_name,
-                **binding.provider_options,
-            )
-        return self._router_cls(providers, **self._router_kwargs)
-__all__ = [
-    "ExperimentBuilder",
-    "ExperimentDefinition",
-    "ModelBinding",
-    "BuiltExperiment",
-]

themis/experiment/export_csv.py DELETED Viewed

@@ -1,159 +0,0 @@
-"""CSV export functionality for experiment reports."""
-from __future__ import annotations
-import csv
-from pathlib import Path
-from typing import MutableMapping, Sequence
-from themis.core import entities as core_entities
-from themis.experiment import orchestrator
-def export_report_csv(
-    report: orchestrator.ExperimentReport,
-    path: str | Path,
-    *,
-    include_failures: bool = True,
-) -> Path:
-    """Write per-sample metrics to a CSV file for offline analysis.
-    Args:
-        report: Experiment report to export
-        path: Output path for CSV file
-        include_failures: Whether to include failures column
-    Returns:
-        Path to created CSV file
-    """
-    path = Path(path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-    metadata_by_condition, metadata_fields = _collect_sample_metadata(
-        report.generation_results
-    )
-    # Create a proper index mapping generation records to their metadata
-    gen_record_index = {}
-    for gen_record in report.generation_results:
-        sample_id = gen_record.task.metadata.get(
-            "dataset_id"
-        ) or gen_record.task.metadata.get("sample_id")
-        prompt_template = gen_record.task.prompt.spec.name
-        model_identifier = gen_record.task.model.identifier
-        sampling_temp = gen_record.task.sampling.temperature
-        sampling_max_tokens = gen_record.task.sampling.max_tokens
-        condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
-        gen_record_index[condition_id] = gen_record
-    metric_names = sorted(report.evaluation_report.metrics.keys())
-    fieldnames = (
-        ["sample_id"] + metadata_fields + [f"metric:{name}" for name in metric_names]
-    )
-    if include_failures:
-        fieldnames.append("failures")
-    with path.open("w", encoding="utf-8", newline="") as handle:
-        writer = csv.DictWriter(handle, fieldnames=fieldnames)
-        writer.writeheader()
-        # Process evaluation records in the same order as generation records
-        for i, eval_record in enumerate(report.evaluation_report.records):
-            # Find the corresponding generation record by index
-            if i < len(report.generation_results):
-                gen_record = report.generation_results[i]
-                sample_id = gen_record.task.metadata.get(
-                    "dataset_id"
-                ) or gen_record.task.metadata.get("sample_id")
-                prompt_template = gen_record.task.prompt.spec.name
-                model_identifier = gen_record.task.model.identifier
-                sampling_temp = gen_record.task.sampling.temperature
-                sampling_max_tokens = gen_record.task.sampling.max_tokens
-                condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
-                metadata = metadata_by_condition.get(condition_id, {})
-            else:
-                # Fallback for extra evaluation records
-                sample_id = eval_record.sample_id or ""
-                metadata = {}
-            row: dict[str, object] = {"sample_id": sample_id}
-            for field in metadata_fields:
-                row[field] = metadata.get(field, "")
-            score_by_name = {
-                score.metric_name: score.value for score in eval_record.scores
-            }
-            for name in metric_names:
-                row[f"metric:{name}"] = score_by_name.get(name, "")
-            if include_failures:
-                row["failures"] = "; ".join(eval_record.failures)
-            writer.writerow(row)
-    return path
-def _collect_sample_metadata(
-    records: Sequence[core_entities.GenerationRecord],
-) -> tuple[dict[str, MutableMapping[str, object]], list[str]]:
-    """Collect metadata from generation records.
-    Args:
-        records: Generation records
-    Returns:
-        Tuple of (metadata by condition ID, list of metadata fields)
-    """
-    metadata: dict[str, MutableMapping[str, object]] = {}
-    for index, record in enumerate(records):
-        sample_id = _extract_sample_id(record.task.metadata)
-        if sample_id is None:
-            sample_id = f"sample-{index}"
-        # Create unique identifier for each experimental condition
-        prompt_template = record.task.prompt.spec.name
-        model_identifier = record.task.model.identifier
-        sampling_temp = record.task.sampling.temperature
-        sampling_max_tokens = record.task.sampling.max_tokens
-        # Create unique condition key
-        condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
-        # Store metadata with unique condition ID
-        condition_metadata = _metadata_from_task(record)
-        metadata[condition_id] = condition_metadata
-    # Collect all field names from all conditions
-    fields = sorted({field for meta in metadata.values() for field in meta.keys()})
-    return metadata, fields
-def _extract_sample_id(metadata: dict[str, object]) -> str | None:
-    """Extract sample ID from metadata.
-    Args:
-        metadata: Task metadata
-    Returns:
-        Sample ID or None
-    """
-    value = metadata.get("dataset_id") or metadata.get("sample_id")
-    if value is None:
-        return None
-    return str(value)
-def _metadata_from_task(record: core_entities.GenerationRecord) -> dict[str, object]:
-    """Build metadata dict from generation record.
-    Args:
-        record: Generation record
-    Returns:
-        Metadata dictionary
-    """
-    metadata = dict(record.task.metadata)
-    metadata.setdefault("model_identifier", record.task.model.identifier)
-    metadata.setdefault("model_provider", record.task.model.provider)
-    metadata.setdefault("prompt_template", record.task.prompt.spec.name)
-    metadata.setdefault("sampling_temperature", record.task.sampling.temperature)
-    metadata.setdefault("sampling_top_p", record.task.sampling.top_p)
-    metadata.setdefault("sampling_max_tokens", record.task.sampling.max_tokens)
-    return metadata

{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl