divergent-beamsearch 0.1.6.tar.gz → 0.1.8.tar.gz

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: divergent-beamsearch
- Version: 0.1.6
+ Version: 0.1.8
  Summary: A variant of the beam search algorithm that focuses on finding answers that maximize the probability of generating an answer before diverging into another subject.
  License-File: LICENCE
  Requires-Python: >=3.11

@@ -1,6 +1,6 @@
  [project]
  name = "divergent-beamsearch"
- version = "0.1.6"
+ version = "0.1.8"
  description = "A variant of the beam search algorithm that focuses on finding answers that maximize the probability of generating an answer before diverging into another subject."
  readme = "README.md"
  requires-python = ">=3.11"

@@ -76,6 +76,20 @@ class AcceptEverythingParser(Parser):
      def copy(self):
          return self
 
+ def index_reduce_lists(x : torch.Tensor, indices : list[list[int]], reduce_func=torch.sum) -> torch.Tensor:
+     values = []
+     for i, index in enumerate(indices):
+         values.append(reduce_func(x[i, index], dim=-1))
+     return torch.tensor(values, dtype=x.dtype, device=x.device, requires_grad=x.requires_grad)
+ 
+ def pad_to_same_size(tensors : list[torch.Tensor], padding_value : int) -> torch.Tensor:
+     max_size = max(x.shape[-1] for x in tensors)
+     padded_tensors = []
+     for tensor in tensors:
+         pad = torch.full((tensor.shape[0], max_size - tensor.shape[1]), padding_value, dtype=torch.long)
+         padded_tensors.append(torch.cat([tensor, pad], dim=-1))
+     return torch.cat(padded_tensors, dim=0)
+ 
  @torch.no_grad()
  def divergent_beamsearch(input_ids : torch.Tensor, model : GPT2LMHeadModel, beam_size : int, max_length : int, parser : Parser, pad_token_id : int, batch_size=32, num_solutions = None, end_symb=DEFAULT_END_SYMB) -> tuple[torch.Tensor, torch.Tensor]:
      assert input_ids.shape[0] == 1, "Batch size must be 1"
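Note on the two helpers added in this hunk: `index_reduce_lists` reduces, for each row of a 2-D tensor, only the columns listed for that row (used below with `torch.logsumexp` to pool the probability mass of each beam's allowed continuation tokens), and `pad_to_same_size` right-pads a list of 2-D tensors to a common width before stacking them. A minimal usage sketch, assuming both helpers are in scope; the tensors and values are invented for illustration:

    import torch

    # Per-row reduction over row-specific column lists.
    x = torch.log(torch.tensor([[0.1, 0.2, 0.7],
                                [0.5, 0.4, 0.1]]))
    pooled = index_reduce_lists(x, [[0, 2], [1]], reduce_func=torch.logsumexp)
    print(pooled.exp())  # tensor([0.8000, 0.4000])

    # Right-pad to a common width, then stack along the batch dimension.
    a = torch.ones((2, 3), dtype=torch.long)
    b = torch.zeros((1, 5), dtype=torch.long)
    print(pad_to_same_size([a, b], padding_value=-1).shape)  # torch.Size([3, 5])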
@@ -120,11 +134,15 @@ def divergent_beamsearch(input_ids : torch.Tensor, model : GPT2LMHeadModel, beam
 
          scores_finished_current = scores_unfinished[can_end]
          solutions_finished_current = solutions_unfinished[can_end]
-         scores_finished_current = scores_finished_current + log1mexp(logprobs[can_end, select_mask(parsers_tokens, can_end)].logsumexp(dim=-1)).squeeze(-1)
+         logprob_other_ans = index_reduce_lists(logprobs[can_end], select_mask(parsers_tokens, can_end), reduce_func=torch.logsumexp).squeeze(-1)
+         scores_finished_current = scores_finished_current + log1mexp(logprob_other_ans)
          scores_finished = torch.cat([scores_finished, scores_finished_current])
          if len(solutions_finished_current):
-             pad = torch.full((len(scores_finished_current), solutions_finished_current.shape[1] - solutions_finished.shape[1]), pad_token_id, dtype=torch.long)
-             solutions_finished = torch.cat([solutions_finished.view(-1, solutions_finished_current.shape[1]+pad.shape[1]), torch.cat([solutions_finished_current, pad], dim=1)], dim=0)
+             if len(solutions_finished):
+                 solutions_finished = pad_to_same_size([solutions_finished, solutions_finished_current],
+                                                       padding_value=pad_token_id)
+             else:
+                 solutions_finished = solutions_finished_current
          if solutions_finished.numel():
              # Keep num_solutions best solutions in finished
              order = scores_finished.argsort(descending=True)
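The rewritten scoring keeps the same math as before: for each beam that can legally end, `logprob_other_ans` pools (via logsumexp) the probability of every token that would continue the parse, and `log1mexp` converts that into the log-probability of *not* continuing, i.e. of diverging right after the finished answer. A hedged sketch of a standard stable `log1mexp` (in the style of Mächler's log1mexp note; the package's own implementation may differ):

    import torch

    def log1mexp(x: torch.Tensor) -> torch.Tensor:
        # Numerically stable log(1 - exp(x)) for x < 0, switching formulas
        # at x = log(1/2) as in the usual log1mexp trick.
        log_half = -0.6931471805599453
        return torch.where(
            x > log_half,
            torch.log(-torch.expm1(x)),   # accurate when exp(x) is near 1
            torch.log1p(-torch.exp(x)),   # accurate when exp(x) is near 0
        )

    p_continue = torch.tensor(0.3)           # mass of continuations kept by the parser
    print(log1mexp(p_continue.log()).exp())  # tensor(0.7000): probability of diverging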
@@ -37,18 +37,18 @@ def fakemodel_and_tokenizer():
 
  @pytest.mark.parametrize("device", ['cpu', 'cuda'])
  @pytest.mark.parametrize("end_symb", TEST_END_SYMBS)
- def test_divergent_beamsearch(fakemodel_and_tokenizer, device, end_symb):
+ def test_divergent_beamsearch(model_and_tokenizer, device, end_symb):
      if device == 'cuda' and not torch.cuda.is_available():
          pytest.skip("CUDA is not available on this machine.")
-     model, tokenizer = fakemodel_and_tokenizer
+     model, tokenizer = model_and_tokenizer
      model.to(device)
      prompt = "The capital of France is"
      input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
-     beam_size = 5
+     beam_size = 10
      max_length = 10
      pad_token_id = tokenizer.eos_token_id
 
-     possible_answers = [' Paris', ' Paris Hilton']
+     possible_answers = [' Paris', ' Madrid', ' Paris Hilton', ' Bri bra brouuu Mario Brooos']
      tokenized_answers = tokenizer(possible_answers).input_ids
 
      if end_symb == 'tokenizer':
@@ -56,9 +56,15 @@ def test_divergent_beamsearch(fakemodel_and_tokenizer, device, end_symb):
 
      multi_choices_parser = MultiChoicesParser([tokenized_answers], end_symb=end_symb)
 
-     logprob_paris = model(input_ids).logits.cpu().log_softmax(dim=-1)[0, -1, tokenized_answers[0][0]]
-     logprob_hilton = model(torch.cat([input_ids, torch.tensor(tokenized_answers[1][0], device=device).view(1,1)], dim=-1)).logits.cpu().log_softmax(dim=-1)[0, -1, tokenized_answers[1][1]]
-     logprob_paris_hilton = logprob_paris + logprob_hilton
+     with torch.no_grad():
+         logprob_paris = model(input_ids).logits.cpu().log_softmax(dim=-1)[0, -1, tokenized_answers[0][0]]
+         logprob_hilton = model(torch.cat([input_ids, torch.tensor(tokenized_answers[2][0], device=device).view(1,1)], dim=-1)).logits.cpu().log_softmax(dim=-1)[0, -1, tokenized_answers[2][1]]
+         logprob_paris_hilton = logprob_paris + logprob_hilton
+         logprob_madrid = model(input_ids).logits.cpu().log_softmax(dim=-1)[0, -1, tokenized_answers[1][0]]
+         logprob_paris_diverge = logprob_paris + log1mexp(logprob_hilton)
+         input_garbage = torch.tensor(input_ids.tolist()[0] + tokenized_answers[-1]).unsqueeze(0).to(device)
+         logsoftmax_garbage = model(input_garbage).logits.log_softmax(-1)
+         logprob_garbage = torch.gather(logsoftmax_garbage[:, 4:-1, :], 2, input_garbage[:, 5:, None]).squeeze(-1).sum(-1)
 
      scores, solutions = divergent_beamsearch(
          input_ids=input_ids,
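For reference, the expected scores computed in this hunk follow directly from the divergence definition: the score of ' Paris' alone is the probability of generating ' Paris' and then *not* continuing into ' Paris Hilton', hence `logprob_paris + log1mexp(logprob_hilton)`. A toy numeric check of that identity (values invented for illustration):

    import torch

    p_paris = torch.tensor(0.6)    # P(" Paris" | prompt)
    p_hilton = torch.tensor(0.2)   # P(" Hilton" | prompt + " Paris")
    # log P(Paris, then diverge) = log P(Paris) + log(1 - P(Hilton | Paris))
    score_paris_diverge = p_paris.log() + torch.log1p(-p_hilton)
    assert torch.isclose(score_paris_diverge.exp(), torch.tensor(0.6 * 0.8))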
@@ -67,13 +73,16 @@ def test_divergent_beamsearch(fakemodel_and_tokenizer, device, end_symb):
          max_length=max_length,
          parser=multi_choices_parser,
          pad_token_id=pad_token_id,
-         num_solutions=10,
+         num_solutions=beam_size,
          end_symb=end_symb
      )
      true_solutions = torch.nn.utils.rnn.pad_sequence([torch.tensor(ans) for ans in tokenized_answers], batch_first=True, padding_value=pad_token_id)
      assert (solutions == true_solutions).all(), "Beam search did not return the expected solutions"
-     assert scores[0] == logprob_paris + log1mexp(logprob_hilton), "Beam search did not return the expected score"
-     assert scores[1] == logprob_paris_hilton, "Beam search did not return the expected score"
+     assert torch.isclose(scores[0], logprob_paris_diverge), "Beam search did not return the expected score"
+     assert torch.isclose(scores[1], logprob_madrid), "Beam search did not return the expected score"
+     assert torch.isclose(scores[2], logprob_paris_hilton), "Beam search did not return the expected score"
+     assert torch.isclose(scores[3], logprob_garbage), "Beam search did not return the expected score"
+ 
 
  @pytest.mark.parametrize("device", ['cpu', 'cuda'])
  @pytest.mark.parametrize("end_symb", TEST_END_SYMBS)
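A note on `logprob_garbage` in the test above: it scores the whole garbage answer by teacher forcing, gathering each next-token log-probability along the sequence; the hard-coded `4:-1` / `5:` offsets assume the prompt "The capital of France is" encodes to exactly 5 tokens. A generalized sketch of the same gather pattern (`sequence_logprob` is a hypothetical helper, not part of the package):

    import torch

    def sequence_logprob(logits: torch.Tensor, ids: torch.Tensor, prompt_len: int) -> torch.Tensor:
        # logits: (1, seq_len, vocab) from one causal-LM pass over ids = prompt + answer.
        logprobs = logits.log_softmax(-1)
        # Position t predicts token t+1, so align logits[prompt_len-1 : -1]
        # with ids[prompt_len :] and sum the answer tokens' log-probabilities.
        picked = torch.gather(logprobs[:, prompt_len - 1:-1, :], 2, ids[:, prompt_len:, None])
        return picked.squeeze(-1).sum(-1)

With `prompt_len=5` this reproduces the test's `[:, 4:-1, :]` and `[:, 5:, None]` slices.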