llmcomp 1.2.4__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {llmcomp-1.2.4 → llmcomp-1.3.0}/PKG-INFO +7 -5
  2. {llmcomp-1.2.4 → llmcomp-1.3.0}/README.md +5 -4
  3. {llmcomp-1.2.4 → llmcomp-1.3.0}/docs/api.md +56 -88
  4. {llmcomp-1.2.4 → llmcomp-1.3.0}/docs/generate_api_docs.py +8 -8
  5. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/free_form_question.py +8 -3
  6. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/judges.py +4 -40
  7. llmcomp-1.3.0/examples/runner.py +49 -0
  8. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/finetuning/manager.py +21 -0
  9. llmcomp-1.3.0/llmcomp/finetuning/validation.py +406 -0
  10. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/question/judge.py +11 -0
  11. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/question/plots.py +123 -68
  12. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/question/question.py +235 -187
  13. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/question/result.py +1 -1
  14. llmcomp-1.3.0/llmcomp/question/viewer.py +459 -0
  15. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/runner/runner.py +32 -18
  16. {llmcomp-1.2.4 → llmcomp-1.3.0}/pyproject.toml +2 -1
  17. llmcomp-1.3.0/t1.py +13 -0
  18. llmcomp-1.3.0/tests/test_clear_cache.py +216 -0
  19. {llmcomp-1.2.4 → llmcomp-1.3.0}/tests/test_question.py +9 -8
  20. llmcomp-1.2.4/examples/runner.py +0 -32
  21. llmcomp-1.2.4/scripts/migrate_to_org_id.py +0 -187
  22. {llmcomp-1.2.4 → llmcomp-1.3.0}/.gitignore +0 -0
  23. {llmcomp-1.2.4 → llmcomp-1.3.0}/LICENSE +0 -0
  24. {llmcomp-1.2.4 → llmcomp-1.3.0}/docs/finetuning.md +0 -0
  25. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/configuration.py +0 -0
  26. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/create_finetuning_job.py +0 -0
  27. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/ft_old_audubon_birds.jsonl +0 -0
  28. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/model_adapter.py +0 -0
  29. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/next_token_question.py +0 -0
  30. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/openrouter.py +0 -0
  31. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/questions.yaml +0 -0
  32. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/questions_in_yaml.py +0 -0
  33. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/rating_question.py +0 -0
  34. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/tinker.py +0 -0
  35. {llmcomp-1.2.4 → llmcomp-1.3.0}/examples/x_mod_57.py +0 -0
  36. {llmcomp-1.2.4 → llmcomp-1.3.0}/lint.sh +0 -0
  37. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/__init__.py +0 -0
  38. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/config.py +0 -0
  39. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/default_adapters.py +0 -0
  40. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/finetuning/__init__.py +0 -0
  41. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/finetuning/update_jobs.py +0 -0
  42. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/runner/chat_completion.py +0 -0
  43. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/runner/model_adapter.py +0 -0
  44. {llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/utils.py +0 -0
  45. {llmcomp-1.2.4 → llmcomp-1.3.0}/tests/__init__.py +0 -0
  46. {llmcomp-1.2.4 → llmcomp-1.3.0}/tests/conftest.py +0 -0
  47. {llmcomp-1.2.4 → llmcomp-1.3.0}/tests/test_config.py +0 -0
  48. {llmcomp-1.2.4 → llmcomp-1.3.0}/tests/test_hash_and_cache.py +0 -0
  49. {llmcomp-1.2.4 → llmcomp-1.3.0}/tests/test_utils.py +0 -0
{llmcomp-1.2.4 → llmcomp-1.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.2.4
+Version: 1.3.0
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -15,6 +15,7 @@ Requires-Dist: openai>=1.0.0
 Requires-Dist: pandas
 Requires-Dist: pyyaml
 Requires-Dist: requests
+Requires-Dist: streamlit>=1.20.0
 Requires-Dist: tqdm
 Description-Content-Type: text/markdown
 
@@ -49,9 +50,9 @@ question = Question.create(
     samples_per_paraphrase=100,
     temperature=1,
 )
-question.plot(MODELS, min_fraction=0.03)
-df = question.df(MODELS)
-print(df.head(1).iloc[0])
+df = question.df(MODELS) # Dataframe with the results
+question.plot(MODELS, min_fraction=0.03) # Aggregated bar chart
+question.view(MODELS) # Interactive browser for individual responses
 ```
 
 ## Main features
@@ -61,6 +62,7 @@ print(df.head(1).iloc[0])
 * **Parallel requests** - configurable concurrency across models
 * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
 * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+* **Built-in viewer** - browse answers interactively with `question.view(MODELS)`
 * **Extensible** - highly configurable as long as your goal is comparing LLMs
 
 ## Cookbook
@@ -148,7 +150,7 @@ You can send more parallel requests by increasing `Config.max_workers`.
 Suppose you have many prompts you want to send to models. There are three options:
 1. Have a separate Question object for each prompt and execute them in a loop
 2. Have a separate Question object for each prompt and execute them in parallel
-3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
+3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix` or `question` columns)
 
 Option 1 will be slow - the more quick questions you have, the worse.
 Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
{llmcomp-1.2.4 → llmcomp-1.3.0}/README.md

@@ -29,9 +29,9 @@ question = Question.create(
     samples_per_paraphrase=100,
     temperature=1,
 )
-question.plot(MODELS, min_fraction=0.03)
-df = question.df(MODELS)
-print(df.head(1).iloc[0])
+df = question.df(MODELS) # Dataframe with the results
+question.plot(MODELS, min_fraction=0.03) # Aggregated bar chart
+question.view(MODELS) # Interactive browser for individual responses
 ```
 
 ## Main features
@@ -41,6 +41,7 @@ print(df.head(1).iloc[0])
 * **Parallel requests** - configurable concurrency across models
 * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
 * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+* **Built-in viewer** - browse answers interactively with `question.view(MODELS)`
 * **Extensible** - highly configurable as long as your goal is comparing LLMs
 
 ## Cookbook
@@ -128,7 +129,7 @@ You can send more parallel requests by increasing `Config.max_workers`.
 Suppose you have many prompts you want to send to models. There are three options:
 1. Have a separate Question object for each prompt and execute them in a loop
 2. Have a separate Question object for each prompt and execute them in parallel
-3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
+3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix` or `question` columns)
 
 Option 1 will be slow - the more quick questions you have, the worse.
 Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
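For readers skimming the diff, here is a minimal sketch of the 1.3.0 workflow these README hunks document. The `MODELS` dict, group labels, model names, and the prompt are illustrative; only `Question.create`, `df`, `plot`, and the new `view` come from the package (the viewer presumably runs on the newly added streamlit dependency).

```python
# Illustrative sketch of the df / plot / view trio shown in the 1.3.0 README.
# Group labels and model names below are examples, not part of llmcomp.
from llmcomp import Question

MODELS = {
    "gpt-4o": ["gpt-4o"],
    "gpt-4o-mini": ["gpt-4o-mini"],
}

question = Question.create(
    type="free_form",
    paraphrases=["Name an interesting book. Answer with the name, nothing more."],
    samples_per_paraphrase=100,
    temperature=1,
)

df = question.df(MODELS)                  # one row per sampled answer
question.plot(MODELS, min_fraction=0.03)  # aggregated bar chart
question.view(MODELS)                     # interactive browser for individual responses
```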
{llmcomp-1.2.4 → llmcomp-1.3.0}/docs/api.md

@@ -56,33 +56,11 @@ DataFrame with columns:
 - group: Group name from model_groups
 - answer: Model's response text
 - question: The prompt that was sent
-- messages: Full message list sent to model
+- api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
 - paraphrase_ix: Index of the paraphrase used
 - {judge_name}: Score/response from each configured judge
 - {judge_name}_question: The prompt sent to the judge
 
-#### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', answer_column: 'str' = 'answer', df: 'pd.DataFrame' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
-
-Plot dataframe as a stacked bar chart of answers by category.
-
-
-**Arguments:**
-
-- `model_groups`: Required. Dict mapping group names to lists of model identifiers.
-- `category_column`: Column to use for x-axis categories. Default: "group".
-- `answer_column`: Column containing answers to plot. Default: "answer". Use a judge column name to plot judge scores instead.
-- `df`: DataFrame to plot. By default calls self.df(model_groups).
-- `selected_answers`: List of specific answers to include. Others grouped as "other".
-- `min_fraction`: Minimum fraction threshold. Answers below this are grouped as "other".
-- `colors`: Dict mapping answer values to colors.
-- `title`: Plot title. If None, auto-generated from paraphrases.
-- `filename`: If provided, saves the plot to this file path.
-
-
-**Returns:**
-
-matplotlib Figure object.
-
 
 ---
 
@@ -118,48 +96,6 @@ Initialize a NextToken question.
 
 #### `df(self, model_groups: 'dict[str, list[str]]') -> 'pd.DataFrame'`
 
-Execute question and return results as a DataFrame.
-
-Runs the question on all models (or loads from cache).
-
-
-**Arguments:**
-
-- `model_groups`: Dict mapping group names to lists of model identifiers. Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
-
-
-**Returns:**
-
-DataFrame with columns:
-
-- model: Model identifier
-- group: Group name from model_groups
-- answer: Dict mapping tokens to probabilities {token: prob}
-- question: The prompt that was sent
-- messages: Full message list sent to model
-- paraphrase_ix: Index of the paraphrase used
-
-#### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', df: 'pd.DataFrame' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
-
-Plot stacked bar chart of token probabilities by category.
-
-
-**Arguments:**
-
-- `model_groups`: Required. Dict mapping group names to lists of model identifiers.
-- `category_column`: Column to use for x-axis categories. Default: "group".
-- `df`: DataFrame to plot. By default calls self.df(model_groups).
-- `selected_answers`: List of specific tokens to include. Others grouped as "other".
-- `min_fraction`: Minimum probability threshold. Tokens below this are grouped as "other".
-- `colors`: Dict mapping token values to colors.
-- `title`: Plot title. If None, auto-generated from paraphrases.
-- `filename`: If provided, saves the plot to this file path.
-
-
-**Returns:**
-
-matplotlib Figure object.
-
 
 ---
 
@@ -215,32 +151,11 @@ DataFrame with columns:
 - group: Group name from model_groups
 - answer: Mean rating (float), or None if model refused
 - raw_answer: Original logprobs dict {token: probability}
+- probs: Normalized probabilities dict {int_rating: probability}
 - question: The prompt that was sent
-- messages: Full message list sent to model
+- api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
 - paraphrase_ix: Index of the paraphrase used
 
-#### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', df: 'pd.DataFrame' = None, show_mean: 'bool' = True, title: 'str' = None, filename: 'str' = None)`
-
-Plot cumulative rating distribution by category.
-
-Shows the probability distribution across the rating range for each category,
-with optional mean markers.
-
-
-**Arguments:**
-
-- `model_groups`: Required. Dict mapping group names to lists of model identifiers.
-- `category_column`: Column to use for grouping. Default: "group".
-- `df`: DataFrame to plot. By default calls self.df(model_groups).
-- `show_mean`: If True, displays mean rating for each category. Default: True.
-- `title`: Plot title. If None, auto-generated from paraphrases.
-- `filename`: If provided, saves the plot to this file path.
-
-
-**Returns:**
-
-matplotlib Figure object.
-
 
 ---
 
@@ -531,5 +446,58 @@ Question subclass instance.
 
 >>> q = Question.from_yaml("my_question")
 
+#### `view(self, df: 'pd.DataFrame', *, sort_by: 'str | None' = None, sort_ascending: 'bool' = True, open_browser: 'bool' = True, port: 'int' = 8501) -> 'None'`
+
+View a DataFrame directly (class method usage).
+
+#### `plot(self, df: 'pd.DataFrame', category_column: 'str' = 'group', answer_column: 'str' = 'answer', selected_categories: 'list[str]' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
+
+Plot results as a chart.
+
+Can be called as:
+- Question.plot(df) - plot a DataFrame directly
+- question.plot(model_groups) - run df() on models, then plot
+- question.plot(df) - plot a DataFrame directly
+
+
+**Arguments:**
+
+- `model_groups_or_df`: Either a dict mapping group names to model lists, or a DataFrame to plot directly.
+- `category_column`: Column to group by on x-axis. Default: "group".
+- `answer_column`: Column containing answers to plot. Default: "answer" (or "probs" for Rating questions).
+- `selected_categories`: List of categories to include (in order). Others excluded.
+- `selected_answers`: List of answers to show in stacked bar. Others grouped as "[OTHER]".
+- `min_fraction`: Minimum fraction threshold for stacked bar. Answers below grouped as "[OTHER]".
+- `colors`: Dict mapping answer values to colors for stacked bar.
+- `title`: Plot title. Auto-generated from question if not provided.
+- `filename`: If provided, saves the plot to this file path.
+
+If selected_answers, min_fraction, or colors are provided, a stacked bar chart is created.
+Otherwise, llmcomp will try to create the best plot for the data.
+
+#### `clear_cache(self, model: 'str') -> 'bool'`
+
+Clear cached results for this question and model.
+
+
+**Arguments:**
+
+- `model`: The model whose cache should be cleared.
+
+
+**Returns:**
+
+True if cache was found and removed, False otherwise.
+
+
+**Example:**
+
+>>> question = Question.create(type="free_form", paraphrases=["test"])
+>>> question.df({"group": ["gpt-4"]}) # Creates cache
+>>> question.clear_cache("gpt-4") # Clear cache
+True
+>>> question.clear_cache("gpt-4") # Already cleared
+False
+
 
 ---
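A short sketch tying the api.md changes together: the `messages` column is replaced by `api_kwargs`, and `clear_cache` plus the DataFrame form of `plot` are new. The model name, group label, prompt, and the assumption that `api_kwargs` is a plain dict keyed like the chat-completions payload are illustrative, not taken from the package.

```python
# Sketch based on the api.md entries above; "gpt-4o-mini" and the assumed dict
# shape of api_kwargs are illustrative only.
from llmcomp import Question

question = Question.create(type="free_form", paraphrases=["Name an animal."])
MODELS = {"baseline": ["gpt-4o-mini"]}

df = question.df(MODELS)                    # runs the question (or loads the cache)
first = df.iloc[0]
print(first["api_kwargs"]["messages"])      # 1.2.4 exposed this as df["messages"]

Question.plot(df, min_fraction=0.05)        # class-method form: plot a DataFrame directly
print(question.clear_cache("gpt-4o-mini"))  # True: cached results removed
print(question.clear_cache("gpt-4o-mini"))  # False: nothing left to clear
```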
{llmcomp-1.2.4 → llmcomp-1.3.0}/docs/generate_api_docs.py

@@ -286,19 +286,19 @@ def main():
         "---\n",
     ]
 
-    # FreeForm: __init__, df, plot
+    # FreeForm: __init__, df
     print("Documenting FreeForm...")
-    lines.append(document_methods(FreeForm, ["__init__", "df", "plot"]))
+    lines.append(document_methods(FreeForm, ["__init__", "df"]))
     lines.append("\n---\n")
 
-    # NextToken: __init__, df, plot
+    # NextToken: __init__, df
    print("Documenting NextToken...")
-    lines.append(document_methods(NextToken, ["__init__", "df", "plot"]))
+    lines.append(document_methods(NextToken, ["__init__", "df"]))
     lines.append("\n---\n")
 
-    # Rating: __init__, df, plot
+    # Rating: __init__, df
     print("Documenting Rating...")
-    lines.append(document_methods(Rating, ["__init__", "df", "plot"]))
+    lines.append(document_methods(Rating, ["__init__", "df"]))
     lines.append("\n---\n")
 
     # FreeFormJudge: __init__, get_cache
@@ -321,9 +321,9 @@ def main():
     lines.append(document_methods(ModelAdapter, ["register", "prepare"]))
     lines.append("\n---\n")
 
-    # Question.create, Question.load_dict, Question.from_yaml
+    # Question.create, Question.load_dict, Question.from_yaml, Question.view, Question.plot, Question.clear_cache
     print("Documenting Question factory methods...")
-    lines.append(document_methods(Question, ["create", "load_dict", "from_yaml"]))
+    lines.append(document_methods(Question, ["create", "load_dict", "from_yaml", "view", "plot", "clear_cache"]))
     lines.append("\n---\n")
 
     OUTPUT_FILE.write_text("\n".join(lines))
{llmcomp-1.2.4 → llmcomp-1.3.0}/examples/free_form_question.py

@@ -20,10 +20,15 @@ question = Question.create(
         "Name an interesting book. Answer with the name, nothing more. Give the full name without quotes.",
     ],
     samples_per_paraphrase=100,
-    temperature=1, # 1 is thedefault value
+    temperature=1, # 1 is the default value
 )
 
+# Use directly a dataframe with the results
+df = question.df(MODELS)
+
+# Or plot aggregated results
 question.plot(MODELS, min_fraction=0.03)
 
-df = question.df(MODELS)
-print(df.head(1).iloc[0])
+# Or browse individual responses in the interactive viewer
+question.view(MODELS)
+
{llmcomp-1.2.4 → llmcomp-1.3.0}/examples/judges.py

@@ -57,47 +57,11 @@ question = Question.create(
         "quality": quality_judge,
     },
 )
-df = question.df(MODELS)
-print(df.head(1).iloc[0])
-
 # Plot the most common animals
 question.plot(MODELS, answer_column="animal", min_fraction=0.07, title=f"Most common animals ({SAMPLES_PER_PARAPHRASE} samples per model)")
 
-# Print best and worst story
-best_story_row = df.sort_values(by="quality", ascending=False).head(1)
-worst_story_row = df.sort_values(by="quality", ascending=True).head(1)
-print(f"Best story (author: {best_story_row['model'].values[0]}, score: {round(best_story_row['quality'].values[0], 2)}):")
-print(best_story_row['answer'].values[0], "\n")
-print(f"Worst story (author: {worst_story_row['model'].values[0]}, score: {round(worst_story_row['quality'].values[0], 2)}):")
-print(worst_story_row['answer'].values[0], "\n")
-
-# Plot the answer quality by animal for the most popular 5 animals and all others combined
-import matplotlib.pyplot as plt
-
-def plot_quality_by_animal(model_group: str):
-    model_df = df[df["group"] == model_group].copy()
-
-    # Calculate top animals for this model
-    top_animals = model_df["animal"].value_counts().head(5).index.tolist()
-    model_df["animal_group"] = model_df["animal"].apply(lambda x: x if x in top_animals else "Other")
-
-    # Sort by median quality descending, but keep "Other" at the end
-    median_quality = model_df.groupby("animal_group")["quality"].median()
-    order = [a for a in median_quality.sort_values(ascending=False).index if a != "Other"]
-    if "Other" in median_quality.index:
-        order.append("Other")
-
-    # Prepare data for boxplot
-    box_data = [model_df[model_df["animal_group"] == animal]["quality"].values for animal in order]
-
-    plt.figure(figsize=(10, 6))
-    plt.boxplot(box_data, tick_labels=order)
-    plt.xlabel("Animal")
-    plt.ylabel("Quality Score")
-    plt.title(f"Story Quality by Animal - {model_group}")
-    plt.xticks(rotation=45, ha="right")
-    plt.tight_layout()
-    plt.show()
+# Browse individual responses in the viewer, sorted by quality (best first)
+question.view(MODELS, sort_by="quality", sort_ascending=False)
 
-for model_group in MODELS:
-    plot_quality_by_animal(model_group)
+# Or use the DataFrame directly
+df = question.df(MODELS)
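The best/worst-story printout that the example drops can still be reproduced from the judge-scored DataFrame if needed; a minimal sketch, where the `quality`, `answer`, and `model` columns follow from the judge configuration above:

```python
# Sketch: recover the removed "best and worst story" analysis from df.
# Assumes the "quality" judge produced a numeric score column, as configured above.
best = df.sort_values("quality", ascending=False).iloc[0]
worst = df.sort_values("quality", ascending=True).iloc[0]
print(f"Best story ({best['model']}, score {best['quality']:.2f}):\n{best['answer']}\n")
print(f"Worst story ({worst['model']}, score {worst['quality']:.2f}):\n{worst['answer']}\n")
```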
llmcomp-1.3.0/examples/runner.py (new file)

@@ -0,0 +1,49 @@
+"""Runner usage.
+
+Runner is the class that talks to APIs. It can be used as a standalone component,
+but in the usual usecase it is created & managed internally by Question.
+
+You probably don't need that at all.
+"""
+
+from llmcomp import Runner
+
+
+# Create & use a runner
+runner = Runner("gpt-4.1-mini")
+messages = [{"role": "user", "content": "Hey what's your name?"}]
+
+# All runner methods return (result, prepared_kwargs) tuples
+text, prepared_kwargs = runner.get_text({"messages": messages})
+print("get_text result:", text)
+print("prepared_kwargs:", prepared_kwargs)
+
+probs, prepared_kwargs = runner.single_token_probs({"messages": messages})
+print("single_token_probs result:", probs)
+
+probs, prepared_kwargs = runner.sample_probs({"messages": messages, "max_tokens": 5}, num_samples=50)
+print("sample_probs result:", probs)
+
+
+# Run many requests in parallel
+kwargs_list = [
+    {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
+    {"params": {"messages": [{"role": "user", "content": "Bye"}]}},
+]
+
+# Run get_text in parallel
+# get_many yields (input, (result, prepared_kwargs)) for each request
+print("\n=== get_many with get_text ===")
+for in_, (result, prepared_kwargs) in runner.get_many(runner.get_text, kwargs_list):
+    print(f"Input: {in_}")
+    print(f"Prepared kwargs: {prepared_kwargs}")
+    print(f"Result: {result}")
+    print()
+
+# Run single_token_probs in parallel
+print("\n=== get_many with single_token_probs ===")
+for in_, (result, prepared_kwargs) in runner.get_many(runner.single_token_probs, kwargs_list):
+    print(f"Input: {in_}")
+    print(f"Prepared kwargs: {prepared_kwargs}")
+    print(f"Result: {result}")
+    print()
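Because `get_many` yields `(input, (result, prepared_kwargs))` pairs, its output drops naturally into pandas (already a llmcomp dependency). A sketch building on the `runner` and `kwargs_list` defined in the example above; the column names are arbitrary, and it assumes each yielded `in_` is the corresponding element of `kwargs_list`, as the example's prints suggest:

```python
# Sketch: collect get_many results into a DataFrame. Reuses runner and kwargs_list
# from examples/runner.py above; column names are illustrative.
import pandas as pd

rows = []
for in_, (result, prepared_kwargs) in runner.get_many(runner.get_text, kwargs_list):
    rows.append({
        "prompt": in_["params"]["messages"][-1]["content"],  # last user message of the input
        "answer": result,
    })
print(pd.DataFrame(rows))
```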
{llmcomp-1.2.4 → llmcomp-1.3.0}/llmcomp/finetuning/manager.py

@@ -4,6 +4,7 @@ import os
 import openai
 import pandas as pd
 
+from llmcomp.finetuning.validation import ValidationResult, validate_finetuning_file
 from llmcomp.utils import read_jsonl, write_jsonl
 
 DEFAULT_DATA_DIR = "llmcomp_models"
@@ -207,6 +208,19 @@ class FinetuningManager:
            )
 
        """
+        validation_result = self.validate_file(file_name)
+        if not validation_result.valid:
+            print("Invalid training file.")
+            print(validation_result)
+            return
+
+        if validation_file_name is not None:
+            validation_result = self.validate_file(validation_file_name)
+            if not validation_result.valid:
+                print("Invalid validation file.")
+                print(validation_result)
+                return
+
        if suffix is None:
            suffix = self._get_default_suffix(file_name, lr_multiplier, epochs, batch_size)
 
@@ -278,6 +292,13 @@ class FinetuningManager:
        print(f" Status: {response.status}")
        print(f"\nRun `llmcomp-update-jobs` to check progress.")
 
+    def validate_file(self, file_name: str) -> ValidationResult:
+        """Validate a JSONL file for OpenAI finetuning.
+
+        See `llmcomp.finetuning.validate_finetuning_file` for details.
+        """
+        return validate_finetuning_file(file_name)
+
    #########################################################
    # PRIVATE METHODS
    def _check_suffix_collision(self, suffix: str, file_name: str):
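The same validation can be run on its own before submitting a job; a sketch using the module-level function the manager now delegates to. `validate_finetuning_file` and `ValidationResult.valid` come from the diff above; the file path is just the sample JSONL shipped in examples/.

```python
# Sketch: pre-flight check of a finetuning file with the new validation module.
from llmcomp.finetuning.validation import validate_finetuning_file

result = validate_finetuning_file("examples/ft_old_audubon_birds.jsonl")
if result.valid:
    print("File passes the finetuning checks.")
else:
    print(result)  # ValidationResult is printable and reports why the file was rejected
```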