llmcomp 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
llmcomp/finetuning/manager.py CHANGED
@@ -363,10 +363,20 @@ class FinetuningManager:
         files = []
 
         md5 = self._get_file_md5(file_name)
+        client = openai.OpenAI(api_key=api_key)
+
         for file in files:
             if file["name"] == file_name and file["md5"] == md5 and file["organization_id"] == organization_id:
-                print(f"File {file_name} already uploaded. ID: {file['id']}")
-                return file["id"]
+                # Verify the file actually exists (it might be in a different project)
+                # See: https://github.com/johny-b/llmcomp/issues/31
+                try:
+                    client.files.retrieve(file["id"])
+                    print(f"File {file_name} already uploaded. ID: {file['id']}")
+                    return file["id"]
+                except openai.NotFoundError:
+                    # File is in this organization, but in another project
+                    pass
+
         return self._upload_file(file_name, api_key, organization_id)
 
     def _upload_file(self, file_name, api_key, organization_id):
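To see the new logic outside the diff: a cached (name, md5, organization) match is no longer trusted on its own, because OpenAI file ids are scoped to a project within the organization. The id must still be retrievable through the API before it is reused. A minimal sketch of the same pattern, with a hypothetical `cached_files` list standing in for the manager's upload cache:

```python
import openai

def find_reusable_file_id(
    client: openai.OpenAI,
    cached_files: list[dict],  # hypothetical cache entries: {"name", "md5", "organization_id", "id"}
    file_name: str,
    md5: str,
    organization_id: str,
) -> str | None:
    for file in cached_files:
        if (file["name"] == file_name and file["md5"] == md5
                and file["organization_id"] == organization_id):
            try:
                # The id is only valid if the current project can still see the file.
                client.files.retrieve(file["id"])
                return file["id"]
            except openai.NotFoundError:
                # Same organization, different project: fall through and re-upload.
                continue
    return None  # caller uploads the file again
```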
llmcomp/question/question.py CHANGED
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
 import os
+import re
 import warnings
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from queue import Queue
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal, overload
 
 import pandas as pd
 import yaml
@@ -23,6 +25,7 @@ from llmcomp.question.result import JudgeCache, Result
 from llmcomp.runner.runner import Runner
 
 if TYPE_CHECKING:
+    from llmcomp.question.judge import FreeFormJudge, RatingJudge
     from llmcomp.question.question import Question
 
 
@@ -43,6 +46,13 @@ class Question(ABC):
         self.logit_bias = logit_bias
         self.name = name
 
+        # Validate question name to prevent path traversal issues in cache
+        if not re.match(r'^[a-zA-Z0-9_-]+$', name):
+            raise ValueError(
+                f"Invalid question name: {name!r}. "
+                f"Name must contain only letters, numbers, underscores, and hyphens."
+            )
+
     @property
     @abstractmethod
     def _runner_sampling_func_name(self) -> str:
@@ -56,6 +66,30 @@ class Question(ABC):
         """Type is snake_case version of the class name."""
         return "".join("_" + c.lower() if c.isupper() else c.lower() for c in cls.__name__).lstrip("_")
 
+    @overload
+    @classmethod
+    def create(cls, *, type: Literal["free_form"], **kwargs) -> "FreeForm": ...
+
+    @overload
+    @classmethod
+    def create(cls, *, type: Literal["rating"], **kwargs) -> "Rating": ...
+
+    @overload
+    @classmethod
+    def create(cls, *, type: Literal["next_token"], **kwargs) -> "NextToken": ...
+
+    @overload
+    @classmethod
+    def create(cls, *, type: Literal["free_form_judge"], **kwargs) -> "FreeFormJudge": ...
+
+    @overload
+    @classmethod
+    def create(cls, *, type: Literal["rating_judge"], **kwargs) -> "RatingJudge": ...
+
+    @overload
+    @classmethod
+    def create(cls, *, type: str, **kwargs) -> "Question": ...
+
     @classmethod
     def create(cls, **kwargs) -> "Question":
         """Create a Question instance from a type string and keyword arguments.
@@ -761,8 +795,9 @@ class Rating(Question):
         """
         if score is None:
             return None
-
-        probs = {}
+
+        # Note: you might have multiple tokens mapping to the same integer key, e.g. "100" and " 100"
+        probs = defaultdict(float)
         total = 0
         for key, val in score.items():
             try:
@@ -770,9 +805,9 @@ class Rating(Question):
             except ValueError:
                 continue
             if self.min_rating <= int_key <= self.max_rating:
-                probs[int_key] = val
+                probs[int_key] += val
                 total += val
-
+
         if total == 0 or (1 - total) >= self.refusal_threshold:
             return None
 
llmcomp/runner/runner.py CHANGED
@@ -10,6 +10,13 @@ from llmcomp.config import Config, NoClientForModel
 from llmcomp.runner.chat_completion import openai_chat_completion
 from llmcomp.runner.model_adapter import ModelAdapter
 
+
+class DuplicateTokenError(Exception):
+    """Raised when API returns duplicate tokens in logprobs (unexpected provider behavior)."""
+
+    pass
+
+
 NO_LOGPROBS_WARNING = """\
 Failed to get logprobs because {model} didn't send them.
 Returning empty dict, I hope you can handle it.
@@ -121,6 +128,15 @@ class Runner:
             print(NO_LOGPROBS_WARNING.format(model=self.model, completion=completion))
             return {}
 
+        # Check for duplicate tokens - this shouldn't happen with OpenAI but might with other providers
+        tokens = [el.token for el in logprobs]
+        if len(tokens) != len(set(tokens)):
+            duplicates = [t for t in tokens if tokens.count(t) > 1]
+            raise DuplicateTokenError(
+                f"API returned duplicate tokens in logprobs: {set(duplicates)}. "
+                f"Model: {self.model}. This is unexpected - please report this issue."
+            )
+
         result = {}
         for el in logprobs:
             result[el.token] = math.exp(el.logprob) if convert_to_probs else el.logprob
@@ -186,7 +202,7 @@ class Runner:
         func_kwargs = {key: val for key, val in kwargs.items() if not key.startswith("_")}
         try:
             result = func(**func_kwargs)
-        except NoClientForModel:
+        except (NoClientForModel, DuplicateTokenError):
             raise
         except Exception as e:
             # Truncate messages for readability
llmcomp-1.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.2.0
+Version: 1.2.2
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -150,7 +150,7 @@ Suppose you have many prompts you want to send to models. There are three option
 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
 
 Option 1 will be slow - the more quick questions you have, the worse.
-Option 2 will be fast, but you need to write parallelization yourself. Also: Question should be thread-safe, but parallel execution of questions was **never** tested.
+Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
 Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.
 
 Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.
llmcomp-1.2.2.dist-info/RECORD CHANGED
@@ -3,17 +3,17 @@ llmcomp/config.py,sha256=xADWhqsQphJZQvf7WemWencmWuBnvTN_KeJrjWfnmHY,8942
 llmcomp/default_adapters.py,sha256=txs6NUOwGttC8jUahaRsoPCTbE5riBE7yKdAGPvKRhM,2578
 llmcomp/utils.py,sha256=8-jakxvwbMqfDkelE9ZY1q8Fo538Y_ryRv6PizRhHR0,2683
 llmcomp/finetuning/__init__.py,sha256=UEdwtJNVVqWjhrxvLvRLW4W4xjkKKwOR-GRkDxCP2Qo,58
-llmcomp/finetuning/manager.py,sha256=vIM_FAswWr01KtfeFC6ffgvlimKgKUj4ij34tnBzBNk,18346
+llmcomp/finetuning/manager.py,sha256=JaILoQYkNA9jIM_WR9eZactFHHcNFVeQeObXjQS8KcI,18779
 llmcomp/finetuning/update_jobs.py,sha256=blsHzg_ViTa2hBJtWCqR5onttehTtmXn3vmCTNd_hJw,980
 llmcomp/question/judge.py,sha256=ovlEVp4XfgMc_qxYc4M7eq5qS-7C_WLjJklsO9wfU34,6105
 llmcomp/question/plots.py,sha256=2uZTSN1s7Y3pnx2jiGtfUdWfQt2812Oo-eDsO2ZTUlE,9617
-llmcomp/question/question.py,sha256=eZT1jQObp9VZ8E9QGx6XBo3Ms9OF2kG6b6l8kW8pma0,37919
+llmcomp/question/question.py,sha256=2CvE0xePLnD5SUJsE_ZyvAIE_36rjjW37fUqG3NHTV0,39171
 llmcomp/question/result.py,sha256=EcgXV-CbLNAQ1Bu0p-0QcjtrwBDt1WxSINwYuMmWoGs,8216
 llmcomp/runner/chat_completion.py,sha256=iDiWE0N0_MYfggD-ouyfUPyaADt7602K5Wo16a7JJo4,967
 llmcomp/runner/model_adapter.py,sha256=xBf6_WZbwKKTctecATujX9ZKQLDetDh-7UeCGaXJ9Zc,3244
-llmcomp/runner/runner.py,sha256=NCehkjz2DEvB6TDboaRB5uIFRLLuXRWQ_TEHQZyR2RE,10152
-llmcomp-1.2.0.dist-info/METADATA,sha256=9vMgp2uYxyPAtsTjAFIMVQhuKBPTXbAFquCe-YlxxD8,12341
-llmcomp-1.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llmcomp-1.2.0.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
-llmcomp-1.2.0.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
-llmcomp-1.2.0.dist-info/RECORD,,
+llmcomp/runner/runner.py,sha256=ENDSH2I7wKu9tq0HdfLwCgdHLxjvJaIrlrWY1vy7soc,10807
+llmcomp-1.2.2.dist-info/METADATA,sha256=DFiSsygEmaTNejveyEBEf-4wv47iqlXq0SWMTlRbf94,12518
+llmcomp-1.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llmcomp-1.2.2.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
+llmcomp-1.2.2.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
+llmcomp-1.2.2.dist-info/RECORD,,