adaptive-sdk 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1121,6 +1121,8 @@ class JudgeConfigOutputFields(GraphQLField):
     @classmethod
     def examples(cls) -> 'JudgeExampleFields':
         return JudgeExampleFields('examples')
+    system_template: 'JudgeConfigOutputGraphQLField' = JudgeConfigOutputGraphQLField('systemTemplate')
+    user_template: 'JudgeConfigOutputGraphQLField' = JudgeConfigOutputGraphQLField('userTemplate')
 
     @classmethod
     def model(cls) -> 'ModelFields':
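
The two added attributes follow the pattern used throughout the generated `custom_fields.py`: a snake_case Python attribute wrapping the camelCase GraphQL field name. A minimal, self-contained sketch of that mapping (the `GraphQLField` below is a toy stand-in for illustration, not the SDK's real class):

```python
# Toy stand-in for the generated field-wrapper pattern; the real
# JudgeConfigOutputGraphQLField lives in the SDK's graphql_client package.
class GraphQLField:
    def __init__(self, field_name: str) -> None:
        self.field_name = field_name  # the camelCase name sent in queries


class JudgeConfigOutputFields:
    # Python keeps snake_case; GraphQL sees camelCase.
    system_template = GraphQLField("systemTemplate")
    user_template = GraphQLField("userTemplate")


print(JudgeConfigOutputFields.system_template.field_name)  # -> systemTemplate
```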
@@ -239,6 +239,8 @@ class JudgeConfigInput(BaseModel):
     model: str
     criteria: str
     examples: List['JudgeExampleInput']
+    system_template: str = Field(alias='systemTemplate')
+    user_template: str = Field(alias='userTemplate')
 
 class JudgeCreate(BaseModel):
     """@private"""
@@ -78,6 +78,102 @@ class GraderCreator(SyncAPIResource, UseCaseResource): # type: ignore[misc]
             model=judge_model,
             criteria=criteria,
             examples=parsed_examples,
+            systemTemplate=r"""You are an expert evaluator that evaluates completions generated by an AI model on a fixed criterion.
+You will be given all elements of an interaction between a human and an AI model:
+The full context of the conversation so far leading up to the last user turn/question is under the CONTEXT header. It may contain extra contextual information.
+The last user turn/question is under the USER QUESTION header. It may contain extra contextual information.
+The model's completion is under the COMPLETION TO EVALUATE header.
+The evaluation criterion is under the EVALUATION CRITERION section.
+{{#if examples.length}}
+
+CRITICAL: The annotations below are GROUND TRUTH provided by expert human annotators. You MUST follow them exactly, even if they seem counter-intuitive to you.
+
+In order to analyze and score a completion, you always run the following steps without exception:
+First, you read the CONTEXT, USER QUESTION and COMPLETION TO EVALUATE.
+Then, you MUST check if there is an annotation that matches (or is very similar to) the current case:
+- If the USER QUESTION and COMPLETION TO EVALUATE match an annotation, you MUST use the annotation's score and reasoning. Do NOT apply your own judgment.
+- If there is a similar annotation, you MUST follow the same reasoning pattern and scoring approach, even if it contradicts your intuition.
+- The annotations define what is considered PASS/FAIL for this specific task. Your personal understanding of the criterion is IRRELEVANT.
+Then, ONLY if no similar annotation exists, you analyze the COMPLETION TO EVALUATE and assign a score according to the criterion.
+
+Rules to follow:
+- You must always evaluate the COMPLETION TO EVALUATE based solely on the USER QUESTION, and never on an intermediary question that might have been asked in the CONTEXT. The CONTEXT is there for context only.
+- Do not include text that is in the CONTEXT to make your judgement; you are evaluating the COMPLETION TO EVALUATE text only.
+- You must not use the original instructions given to the model in the CONTEXT for your judgement. Focus only on the ANNOTATIONS AND EVALUATION CRITERION without any other influencing factors.
+- You are forbidden to return a score other than PASS, FAIL or NA for each criterion.
+- If the criterion is conditional, and is not applicable to the specific USER QUESTION + COMPLETION TO EVALUATE pair, you must score it as NA.
+- Return a single score, no matter how many things are evaluated or contemplated in the criterion. A PASS means the completion complied with everything.
+- ANNOTATIONS ARE ABSOLUTE TRUTH. If an annotation says something is PASS, it is PASS, regardless of what you think.
+{{else}}
+
+In order to analyze and score a completion, you always run the following steps without exception:
+First, you read the CONTEXT, USER QUESTION and COMPLETION TO EVALUATE.
+Then, you analyze the COMPLETION TO EVALUATE, and assign it a PASS, FAIL or NA score according to the criterion: FAIL if the completion does not meet the criterion, PASS if it does, and NA if the criterion is not applicable to the example.
+
+Rules to follow:
+- You must always evaluate the COMPLETION TO EVALUATE based solely on the USER QUESTION, and never on an intermediary question that might have been asked in the CONTEXT. The CONTEXT is there for context only.
+- Do not include text that is in the CONTEXT to make your judgement; you are evaluating the COMPLETION TO EVALUATE text only.
+- You must not use the original instructions given to the model in the CONTEXT for your judgement. Focus only on the EVALUATION CRITERION.
+- You are forbidden to return a score other than PASS, FAIL or NA for each criterion.
+- If the criterion is conditional, and is not applicable to the specific USER QUESTION + COMPLETION TO EVALUATE pair, you must score it as NA.
+- Return a single score, no matter how many things are evaluated or contemplated in the criterion. A PASS means the completion complied with everything.
+{{/if}}
+
+
+
+Finally, output an explanation for your judgement and the score for the criterion, as exemplified below.
+Your output should be a well-formatted JSON string that conforms to the JSON schema below. Do not output anything else other than the JSON string.
+
+Here is the output JSON schema you must strictly follow, with field descriptions and value types. All fields are required.
+{
+"reasoning": str,
+"score": Literal["PASS", "FAIL", "NA"]
+}
+
+reasoning: Reasoning string to support the rationale behind the score.{{#if examples.length}} If using an annotation, you MUST reference it explicitly (e.g., "Based on ANNOTATION 0...").{{/if}}
+score: The literal score for the sample
+
+Evaluate only the final COMPLETION TO EVALUATE with regard to the USER QUESTION shown. Do not return any preamble or explanations. Return exactly one valid JSON string.
+{{#each examples}}
+
+### ANNOTATION {{@index}} ###
+CONTEXT
+{{{context_str}}}
+USER QUESTION
+{{{user_question}}}
+COMPLETION TO EVALUATE
+{{{completion}}}
+EVALUATION CRITERION
+{{{../criteria}}}
+OUTPUT
+{{{output_json}}}
+{{/each}}""",
+            userTemplate=r"""CONTEXT
+{{{context_str_without_last_user}}}
+
+USER QUESTION
+{{{last_user_turn_content}}}
+
+COMPLETION TO EVALUATE
+{{{completion}}}
+{{#if examples.length}}
+
+INSTRUCTIONS:
+1. FIRST: Check if this exact case (or a very similar case) appears in the ANNOTATIONS above. If it does, you MUST use that annotation's score and reasoning. Do NOT second-guess it.
+2. If similar cases exist in the annotations, follow the same logic and scoring pattern they demonstrate, even if it contradicts common sense.
+3. ONLY if no relevant annotation exists, apply the general criterion: {{{criteria}}}
+
+Remember: Annotations override everything else, including your intuition and the general criterion.
+{{else}}
+
+EVALUATION CRITERION
+{{{criteria}}}
+{{/if}}
+
+OUTPUT SCHEMA
+{{{output_schema}}}
+
+OUTPUT""",
         )
 
         # Create grader config
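
The system template pins the judge to a strict output contract: a single JSON object with a `reasoning` string and a `PASS`/`FAIL`/`NA` score. A small sketch of how a caller might validate that output (the model class here is hypothetical; only the field names and score literals come from the template above):

```python
import json
from typing import Literal

from pydantic import BaseModel, ValidationError


class JudgeVerdict(BaseModel):
    reasoning: str
    score: Literal["PASS", "FAIL", "NA"]


raw = '{"reasoning": "Based on ANNOTATION 0, the completion matches.", "score": "PASS"}'
try:
    verdict = JudgeVerdict.model_validate(json.loads(raw))
    print(verdict.score)  # PASS
except (json.JSONDecodeError, ValidationError) as err:
    print(f"judge returned malformed output: {err}")
```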
@@ -258,6 +354,102 @@ class AsyncGraderCreator(AsyncAPIResource, UseCaseResource): # type: ignore[misc]
             model=judge_model,
             criteria=criteria,
             examples=parsed_examples,
+            systemTemplate=r"""You are an expert evaluator that evaluates completions generated by an AI model on a fixed criterion.
+You will be given all elements of an interaction between a human and an AI model:
+The full context of the conversation so far leading up to the last user turn/question is under the CONTEXT header. It may contain extra contextual information.
+The last user turn/question is under the USER QUESTION header. It may contain extra contextual information.
+The model's completion is under the COMPLETION TO EVALUATE header.
+The evaluation criterion is under the EVALUATION CRITERION section.
+{{#if examples.length}}
+
+CRITICAL: The annotations below are GROUND TRUTH provided by expert human annotators. You MUST follow them exactly, even if they seem counter-intuitive to you.
+
+In order to analyze and score a completion, you always run the following steps without exception:
+First, you read the CONTEXT, USER QUESTION and COMPLETION TO EVALUATE.
+Then, you MUST check if there is an annotation that matches (or is very similar to) the current case:
+- If the USER QUESTION and COMPLETION TO EVALUATE match an annotation, you MUST use the annotation's score and reasoning. Do NOT apply your own judgment.
+- If there is a similar annotation, you MUST follow the same reasoning pattern and scoring approach, even if it contradicts your intuition.
+- The annotations define what is considered PASS/FAIL for this specific task. Your personal understanding of the criterion is IRRELEVANT.
+Then, ONLY if no similar annotation exists, you analyze the COMPLETION TO EVALUATE and assign a score according to the criterion.
+
+Rules to follow:
+- You must always evaluate the COMPLETION TO EVALUATE based solely on the USER QUESTION, and never on an intermediary question that might have been asked in the CONTEXT. The CONTEXT is there for context only.
+- Do not include text that is in the CONTEXT to make your judgement; you are evaluating the COMPLETION TO EVALUATE text only.
+- You must not use the original instructions given to the model in the CONTEXT for your judgement. Focus only on the ANNOTATIONS AND EVALUATION CRITERION without any other influencing factors.
+- You are forbidden to return a score other than PASS, FAIL or NA for each criterion.
+- If the criterion is conditional, and is not applicable to the specific USER QUESTION + COMPLETION TO EVALUATE pair, you must score it as NA.
+- Return a single score, no matter how many things are evaluated or contemplated in the criterion. A PASS means the completion complied with everything.
+- ANNOTATIONS ARE ABSOLUTE TRUTH. If an annotation says something is PASS, it is PASS, regardless of what you think.
+{{else}}
+
+In order to analyze and score a completion, you always run the following steps without exception:
+First, you read the CONTEXT, USER QUESTION and COMPLETION TO EVALUATE.
+Then, you analyze the COMPLETION TO EVALUATE, and assign it a PASS, FAIL or NA score according to the criterion: FAIL if the completion does not meet the criterion, PASS if it does, and NA if the criterion is not applicable to the example.
+
+Rules to follow:
+- You must always evaluate the COMPLETION TO EVALUATE based solely on the USER QUESTION, and never on an intermediary question that might have been asked in the CONTEXT. The CONTEXT is there for context only.
+- Do not include text that is in the CONTEXT to make your judgement; you are evaluating the COMPLETION TO EVALUATE text only.
+- You must not use the original instructions given to the model in the CONTEXT for your judgement. Focus only on the EVALUATION CRITERION.
+- You are forbidden to return a score other than PASS, FAIL or NA for each criterion.
+- If the criterion is conditional, and is not applicable to the specific USER QUESTION + COMPLETION TO EVALUATE pair, you must score it as NA.
+- Return a single score, no matter how many things are evaluated or contemplated in the criterion. A PASS means the completion complied with everything.
+{{/if}}
+
+
+
+Finally, output an explanation for your judgement and the score for the criterion, as exemplified below.
+Your output should be a well-formatted JSON string that conforms to the JSON schema below. Do not output anything else other than the JSON string.
+
+Here is the output JSON schema you must strictly follow, with field descriptions and value types. All fields are required.
+{
+"reasoning": str,
+"score": Literal["PASS", "FAIL", "NA"]
+}
+
+reasoning: Reasoning string to support the rationale behind the score.{{#if examples.length}} If using an annotation, you MUST reference it explicitly (e.g., "Based on ANNOTATION 0...").{{/if}}
+score: The literal score for the sample
+
+Evaluate only the final COMPLETION TO EVALUATE with regard to the USER QUESTION shown. Do not return any preamble or explanations. Return exactly one valid JSON string.
+{{#each examples}}
+
+### ANNOTATION {{@index}} ###
+CONTEXT
+{{{context_str}}}
+USER QUESTION
+{{{user_question}}}
+COMPLETION TO EVALUATE
+{{{completion}}}
+EVALUATION CRITERION
+{{{../criteria}}}
+OUTPUT
+{{{output_json}}}
+{{/each}}""",
+            userTemplate=r"""CONTEXT
+{{{context_str_without_last_user}}}
+
+USER QUESTION
+{{{last_user_turn_content}}}
+
+COMPLETION TO EVALUATE
+{{{completion}}}
+{{#if examples.length}}
+
+INSTRUCTIONS:
+1. FIRST: Check if this exact case (or a very similar case) appears in the ANNOTATIONS above. If it does, you MUST use that annotation's score and reasoning. Do NOT second-guess it.
+2. If similar cases exist in the annotations, follow the same logic and scoring pattern they demonstrate, even if it contradicts common sense.
+3. ONLY if no relevant annotation exists, apply the general criterion: {{{criteria}}}
+
+Remember: Annotations override everything else, including your intuition and the general criterion.
+{{else}}
+
+EVALUATION CRITERION
+{{{criteria}}}
+{{/if}}
+
+OUTPUT SCHEMA
+{{{output_schema}}}
+
+OUTPUT""",
         )
 
        # Create grader config
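
Both templates use Handlebars-style placeholders: `{{{var}}}` substitutes a value unescaped, and `{{#if examples.length}} … {{else}} … {{/if}}` switches between the few-shot (annotated) and zero-shot branches. A toy re-implementation of just the triple-stash substitution, for illustration only (this is not the renderer the SDK actually uses):

```python
import re


def render_triple_stash(template: str, values: dict[str, str]) -> str:
    """Replace {{{name}}} placeholders; leave unknown placeholders intact."""
    return re.sub(
        r"\{\{\{(\w+)\}\}\}",
        lambda m: values.get(m.group(1), m.group(0)),
        template,
    )


rendered = render_triple_stash(
    "USER QUESTION\n{{{last_user_turn_content}}}\n\nCOMPLETION TO EVALUATE\n{{{completion}}}",
    {"last_user_turn_content": "What is 2 + 2?", "completion": "4"},
)
print(rendered)
```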
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adaptive-sdk
-Version: 0.1.4
+Version: 0.1.5
 Summary: Python SDK for Adaptive Engine
 Author-email: Vincent Debergue <vincent@adaptive-ml.com>, Joao Moura <joao@adaptive-ml.com>, Yacine Bouraoui <yacine@adaptive-ml.com>
 Requires-Python: >=3.10
@@ -38,7 +38,7 @@ adaptive_sdk/graphql_client/create_role.py,sha256=6aTdNOZxavMyjkH-g01uYOZgpjYWcA
 adaptive_sdk/graphql_client/create_team.py,sha256=6Alt1ralE1-Xvp2wrEaLUHMW5RtiFqz2fIsUYE_2LbM,370
 adaptive_sdk/graphql_client/create_use_case.py,sha256=sekD76jWCo3zNCfMsBGhVYfNSIK4JPPBz9066BOt49g,332
 adaptive_sdk/graphql_client/create_user.py,sha256=gurD0kZgncXt1HBr7Oo5AkK5ubqFKpJvaR1rn506gHo,301
-adaptive_sdk/graphql_client/custom_fields.py,sha256=8sttgH49IIcTg7zS8qD1HVsoGBCIIKxyxmYOd56S6jw,94030
+adaptive_sdk/graphql_client/custom_fields.py,sha256=Jw6y3sMcBr5b18DX7NISst6D-NZDgBAIbIq92x3dKtk,94232
 adaptive_sdk/graphql_client/custom_mutations.py,sha256=-CbU1jLSndKtHg58dJUPnGzJhwTRCchtwjhJtsxUXeI,24216
 adaptive_sdk/graphql_client/custom_queries.py,sha256=rQNbFQ0M7FJylZ-fY-JaMhWdeHSJp7L6N9V3fJEozOQ,16273
 adaptive_sdk/graphql_client/custom_typing_fields.py,sha256=cU1PgxbzDQiM1lBJyB4C1IzNirxJS_NJRbHqi_PLM50,18935
@@ -63,7 +63,7 @@ adaptive_sdk/graphql_client/fragments.py,sha256=zkGLGnbMdoc9vO5PJL-iDnMtIKetNx-8
 adaptive_sdk/graphql_client/get_custom_recipe.py,sha256=7qxBZGQTqpc69k-NwzgFctaHWaRz0tHl7YlVSsEad6U,383
 adaptive_sdk/graphql_client/get_grader.py,sha256=kubHDBtUcq6mZtUR5_Of0QbjnGUPSYuavF3_xwmwbY8,233
 adaptive_sdk/graphql_client/get_judge.py,sha256=urEnHW3XfURi5GAFBPfbqzOZGQDxgsGRA6nZmUKmoMA,224
-adaptive_sdk/graphql_client/input_types.py,sha256=8e4fiqIP0uf9T38iRmoD3HGKFquNvaruwKvN0Ic0BrU,19027
+adaptive_sdk/graphql_client/input_types.py,sha256=Wvz4vZ9UAxnD3zR4RlZESw20K1k73T3I_l1ZJsbtDms,19137
 adaptive_sdk/graphql_client/link_metric.py,sha256=EDH67ckBzzc6MYIGfsmgZRBnjqxLsCGwFUaFMXPEsBY,327
 adaptive_sdk/graphql_client/list_ab_campaigns.py,sha256=SIbU6I2OQkNHt0Gw6YStoiiwJHUk2rfXnpoGLzrFjxc,379
 adaptive_sdk/graphql_client/list_compute_pools.py,sha256=4Qli5Foxm3jztbUAL5gbwqtcrElwwlC4LGJMOMBI6Cc,782
@@ -109,7 +109,7 @@ adaptive_sdk/resources/compute_pools.py,sha256=4eHP8FMkZOsGPjZ-qBvda2PunA6GMyvvJ
 adaptive_sdk/resources/datasets.py,sha256=44Lt6xaZ-YTKy04fce9J7chnfFofKJr_8bfkamDjZNg,4992
 adaptive_sdk/resources/embeddings.py,sha256=-ov_EChHU6PJJOJRtDlCo4sYyr9hwyvRjnBhub8QNFg,3922
 adaptive_sdk/resources/feedback.py,sha256=lujqwFIhxi6iovL8JWL05Kr-gkzR4QEwUXZbTx33raA,14116
-adaptive_sdk/resources/graders.py,sha256=b6q-5Z6x-vAoZuXHl6xFrcwC3S4TPXxu121SpR3fYdk,17230
+adaptive_sdk/resources/graders.py,sha256=ekQQ5fqmLZpZHeLr6iUm6m45wDevoDJdj3mG-axR-m8,29014
 adaptive_sdk/resources/interactions.py,sha256=9A0aKyfE5dhMj-rj6NOiF7kxAl89SXksFsRJXXjPGK8,10810
 adaptive_sdk/resources/jobs.py,sha256=TO79natSIDexj3moat_5hAjTGAy_-p9dn0qYExYeNQM,4305
 adaptive_sdk/resources/models.py,sha256=krQbfMnVkjNqXfPG-8irH_xlloDpFpQiqYsbED3-8z8,18591
@@ -122,6 +122,6 @@ adaptive_sdk/resources/users.py,sha256=SoGWwdDCdhK4KjYOcAws-ZWlW7Edii7D3Vxfdu-NZ
 adaptive_sdk/rest/__init__.py,sha256=Szn4qFr1ChFRxMvaVjeaAsGoFU3oV26xZB-vkRCu2Hk,611
 adaptive_sdk/rest/base_model.py,sha256=gQvP9N3QLDNlWKFfLeT5Cf0WwGFtKxyi8VWidIZn2jA,541
 adaptive_sdk/rest/rest_types.py,sha256=Ln8tEN9JCaOdAxg4Y2CYoAc2oeNGtFOoUx2jx6huBWk,7586
-adaptive_sdk-0.1.4.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-adaptive_sdk-0.1.4.dist-info/METADATA,sha256=aBapHQQjBXSRTC0LjYyHk_0vl1SppfBmdH4DkrSJf7E,1436
-adaptive_sdk-0.1.4.dist-info/RECORD,,
+adaptive_sdk-0.1.5.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+adaptive_sdk-0.1.5.dist-info/METADATA,sha256=oGoMvRCrkfiHcH6qqMggQU3C__ryp-m5y0qkPhySHF4,1436
+adaptive_sdk-0.1.5.dist-info/RECORD,,
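
For reference, each RECORD row is `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with padding stripped (the wheel format, PEP 427). A quick sketch to recompute a row for any unpacked file (the path below is illustrative):

```python
import base64
import hashlib
from pathlib import Path

# Illustrative path; substitute any file unpacked from the wheel.
path = Path("adaptive_sdk/rest/base_model.py")
data = path.read_bytes()
digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
print(f"{path},sha256={digest.decode()},{len(data)}")
```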