divergent-beamsearch 0.1.0__tar.gz → 0.1.1__tar.gz
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/.gitignore +2 -1
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/PKG-INFO +1 -1
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/pyproject.toml +1 -1
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/src/divergent_beamsearch/algorithm.py +26 -6
- divergent_beamsearch-0.1.1/tests/test_beamsearch.py +118 -0
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/uv.lock +1 -1
- divergent_beamsearch-0.1.0/tests/test_beamsearch.py +0 -41
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/.python-version +0 -0
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/LICENCE +0 -0
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/README.md +0 -0
- {divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/src/divergent_beamsearch/__init__.py +0 -0
{divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: divergent-beamsearch
-Version: 0.1.0
+Version: 0.1.1
 Summary: A variant of the beam search algorithm that focuses on finding answers that maximize the probability of generating an answer before diverging into another subject.
 License-File: LICENCE
 Requires-Python: >=3.11
{divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "divergent-beamsearch"
-version = "0.1.0"
+version = "0.1.1"
 description = "A variant of the beam search algorithm that focuses on finding answers that maximize the probability of generating an answer before diverging into another subject."
 readme = "README.md"
 requires-python = ">=3.11"
{divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/src/divergent_beamsearch/algorithm.py
RENAMED
@@ -47,12 +47,29 @@ def log1mexp(x: torch.Tensor) -> torch.Tensor:
         (-x.exp()).log1p(),
     )
 
+class AcceptEverythingParser:
+    def __init__(self, vocab_size : int):
+        self.vocab_size = vocab_size
+        self.tokens = tuple(range(vocab_size))
+
+    def step(self, token):
+        pass
+
+    def next(self):
+        return self.tokens
+
+    def copy(self):
+        return self
+
 @torch.no_grad()
 def divergent_beamsearch(input_ids : torch.Tensor, model : GPT2LMHeadModel, beam_size : int, max_length : int, multi_choices_parser : MultiChoicesParser, pad_token_id : int, batch_size=32, num_solutions = None) -> tuple[torch.Tensor, torch.Tensor]:
     assert input_ids.shape[0] == 1, "Batch size must be 1"
 
     if num_solutions is None:
         num_solutions = beam_size
+    vanilla = multi_choices_parser is None
+    if vanilla:
+        multi_choices_parser = AcceptEverythingParser(model.config.vocab_size)
 
     parsers_unfinished = [multi_choices_parser]
     scores_finished = torch.tensor([], dtype=torch.float)
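Note on the new AcceptEverythingParser: it exposes the same step/next/copy interface the search expects from MultiChoicesParser but never rejects a token, so passing multi_choices_parser=None degrades the constrained search into plain beam search. A minimal sketch of that contract (vocab_size=3 is a made-up toy value):

    # Toy illustration of the parser's contract; vocab_size=3 is hypothetical.
    parser = AcceptEverythingParser(vocab_size=3)
    assert parser.next() == (0, 1, 2)  # every token is always allowed
    parser.step(1)                     # consuming a token changes no state
    assert parser.copy() is parser     # stateless, so copy() returns itself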
@@ -73,9 +90,10 @@ def divergent_beamsearch(input_ids : torch.Tensor, model : GPT2LMHeadModel, beam
         logprobs_filtered = apply_mask_tokens(logprobs, parsers_tokens)
         if len(logprobs_filtered):
             topk = torch.topk(logprobs_filtered, beam_size, dim=-1) # shape (batch_size, beam_size)
-            topk_global = topk.values.flatten().topk(beam_size)
+            values = topk.values + scores_unfinished.unsqueeze(-1)
+            topk_global = values.flatten().topk(beam_size)
             best_tokens_row = topk_global.indices // beam_size
-            best_tokens, best_tokens_logprobs = topk.indices[best_tokens_row, topk_global.indices % beam_size], topk_global.values
+            best_tokens, best_tokens_logprobs = topk.indices[best_tokens_row, topk_global.indices % beam_size], topk.values[best_tokens_row, topk_global.indices % beam_size]
             notinf = ~best_tokens_logprobs.isinf()
             best_tokens, best_tokens_row, best_tokens_logprobs = best_tokens[notinf], best_tokens_row[notinf], best_tokens_logprobs[notinf]
         else:
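This hunk fixes beam ranking: candidates are now compared on their cumulative sequence score (the running beam score plus the next-token logprob) rather than on the last step's logprob alone. A small numeric sketch of the corrected selection, with made-up tensor values:

    import torch

    beam_size = 2
    topk_values = torch.tensor([[-0.1, -2.0],    # hypothetical per-step logprobs
                                [-0.5, -0.6]])
    topk_indices = torch.tensor([[42, 7],
                                 [13, 99]])
    scores_unfinished = torch.tensor([-3.0, -0.2])  # accumulated beam scores

    values = topk_values + scores_unfinished.unsqueeze(-1)  # cumulative scores
    topk_global = values.flatten().topk(beam_size)
    rows = topk_global.indices // beam_size
    cols = topk_global.indices % beam_size
    best_tokens = topk_indices[rows, cols]
    print(best_tokens.tolist())  # [13, 99]: beam 1 wins on cumulative score,
                                 # though beam 0 has the best single-step logprob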
@@ -104,9 +122,11 @@ def divergent_beamsearch(input_ids : torch.Tensor, model : GPT2LMHeadModel, beam
         parsers_unfinished = [parsers_unfinished[row].copy() for row in best_tokens_row]
         for parser, token in zip(parsers_unfinished, best_tokens.tolist()):
             parser.step(token)
+
+    # Special case of vanilla beam search where all answers are valid
+    if vanilla:
+        order = scores_unfinished.argsort(descending=True)
+        scores_finished = scores_unfinished[order][:num_solutions]
+        solutions_finished = solutions_unfinished[order][:num_solutions]
 
     return scores_finished, solutions_finished
-
-
-
-
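Since AcceptEverythingParser appears never to signal an end state, a vanilla run would otherwise finish with scores_finished empty; the block above instead promotes the surviving beams, sorted by score. A hypothetical call showing both modes (argument values are illustrative):

    # Constrained mode: only sequences accepted by the parser are returned.
    scores, solutions = divergent_beamsearch(
        input_ids, model, beam_size=5, max_length=10,
        multi_choices_parser=parser, pad_token_id=pad_id)

    # Vanilla mode: multi_choices_parser=None returns the top beams as-is.
    scores, solutions = divergent_beamsearch(
        input_ids, model, beam_size=5, max_length=10,
        multi_choices_parser=None, pad_token_id=pad_id)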
divergent_beamsearch-0.1.1/tests/test_beamsearch.py
ADDED
@@ -0,0 +1,118 @@
+import numpy as np
+import pytest
+import torch
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+from multi_choices_parser import MultiChoicesParser
+from divergent_beamsearch.algorithm import divergent_beamsearch, log1mexp
+from multi_choices_parser import MultiChoicesParser
+
+@pytest.fixture
+def model_and_tokenizer():
+    model = GPT2LMHeadModel.from_pretrained("gpt2")
+    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+    return model, tokenizer
+
+def test_divergent_beamsearch(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
+    prompt = "The capital of France is"
+    input_ids = tokenizer.encode(prompt, return_tensors="pt")
+    beam_size = 5
+    max_length = 10
+    pad_token_id = tokenizer.eos_token_id
+
+    possible_answers = [' Paris', ' Paris Hilton']
+    tokenized_answers = tokenizer(possible_answers).input_ids
+    multi_choices_parser = MultiChoicesParser([tokenized_answers])
+
+    logprob_paris = model(input_ids).logits.log_softmax(dim=-1)[0, -1, tokenized_answers[0][0]]
+    logprob_hilton = model(torch.cat([input_ids, torch.tensor(tokenized_answers[1][0]).view(1,1)], dim=-1)).logits.log_softmax(dim=-1)[0, -1, tokenized_answers[1][1]]
+    logprob_paris_hilton = logprob_paris + logprob_hilton
+
+    scores, solutions = divergent_beamsearch(
+        input_ids=input_ids,
+        model=model,
+        beam_size=beam_size,
+        max_length=max_length,
+        multi_choices_parser=multi_choices_parser,
+        pad_token_id=pad_token_id,
+        num_solutions=10
+    )
+    true_solutions = torch.nn.utils.rnn.pad_sequence([torch.tensor(ans) for ans in tokenized_answers], batch_first=True, padding_value=pad_token_id)
+    assert (solutions == true_solutions).all(), "Beam search did not return the expected solutions"
+    assert scores[0] == logprob_paris + log1mexp(logprob_hilton), "Beam search did not return the expected score"
+    assert scores[1] == logprob_paris_hilton, "Beam search did not return the expected score"
+
+def test_vanilla_beamsearch(model_and_tokenizer):
+    # Verify that divergent beam search where all answers are valid is equivalent to vanilla beam search
+    # Results of beam search were compared with huggingface implementation (https://huggingface.co/spaces/m-ric/beam_search_visualizer)
+    model, tok = model_and_tokenizer
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.eval()
+    prompt = "The capital of France is"
+    input_ids = tok(prompt, return_tensors="pt").input_ids.to(device)
+    scores, sequences = divergent_beamsearch(
+        input_ids, model, beam_size=3, max_length=1, pad_token_id=tok.eos_token_id, num_solutions=3, multi_choices_parser=None
+    )
+    sequences = [tok.decode(s) for s in sequences]
+    assert sequences == [" the", " now", " a"]
+    assert np.isclose(
+        scores.cpu().numpy(), np.array([-2.4699, -3.0377, -3.0756]), atol=0.0001
+    ).all()
+
+    scores, sequences = divergent_beamsearch(
+        input_ids, model, beam_size=3, max_length=2, pad_token_id=tok.eos_token_id, num_solutions=3, multi_choices_parser=None
+    )
+    sequences = [tok.decode(s) for s in sequences]
+    assert sequences == [" the capital", " now home", " now the"]
+    assert np.isclose(
+        scores.cpu().numpy(), np.array([-4.2437, -5.3013, -5.3408]), atol=0.0001
+    ).all()
+
+    scores, sequences = divergent_beamsearch(
+        input_ids, model, beam_size=3, max_length=3, pad_token_id=tok.eos_token_id, num_solutions=3, multi_choices_parser=None
+    )
+    sequences = [tok.decode(s) for s in sequences]
+    assert sequences == [" the capital of", " now home to", " now the capital"]
+    assert np.isclose(
+        scores.cpu().numpy(), np.array([-4.3194, -5.3057, -7.7173]), atol=0.0001
+    ).all()
+
+    scores, sequences = divergent_beamsearch(
+        input_ids, model, beam_size=3, max_length=4, pad_token_id=tok.eos_token_id, num_solutions=3, multi_choices_parser=None
+    )
+    sequences = [tok.decode(s) for s in sequences]
+    assert sequences == [
+        " the capital of the",
+        " the capital of France",
+        " the capital of a",
+    ]
+    assert np.isclose(
+        scores.cpu().numpy(), np.array([-5.5825, -5.9150, -7.1716]), atol=0.0001
+    ).all()
+
+    scores, sequences = divergent_beamsearch(
+        input_ids, model, beam_size=3, max_length=5, pad_token_id=tok.eos_token_id, num_solutions=3, multi_choices_parser=None
+    )
+    sequences = [tok.decode(s) for s in sequences]
+    assert sequences == [
+        " the capital of France,",
+        " the capital of France.",
+        " the capital of the French",
+    ]
+    assert np.isclose(
+        scores.cpu().numpy(), np.array([-6.9453, -7.1549, -7.5727]), atol=0.0001
+    ).all()
+
+
+    scores, sequences = divergent_beamsearch(
+        input_ids, model, beam_size=3, max_length=6, pad_token_id=tok.eos_token_id, num_solutions=3, multi_choices_parser=None
+    )
+    sequences = [tok.decode(s) for s in sequences]
+    assert sequences == [
+        " the capital of France, and",
+        " the capital of the French Republic",
+        " the capital of France. It",
+    ]
+    assert np.isclose(
+        scores.cpu().numpy(), np.array([-8.1361, -8.7745, -9.1053]), atol=0.0001
+    ).all()
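The first test's score assertions spell out the library's divergence scoring: the short answer ' Paris' is credited with log P(' Paris' | prompt) + log(1 - P(' Hilton' | prompt + ' Paris')), i.e. the probability of generating the answer and then not continuing into the longer valid answer. A minimal sketch of that identity with log1mexp, using made-up logprob values:

    import torch
    from divergent_beamsearch.algorithm import log1mexp

    logprob_paris = torch.tensor(-0.5)   # made-up: log P(' Paris' | prompt)
    logprob_hilton = torch.tensor(-3.0)  # made-up: log P(' Hilton' | ' Paris')

    score_paris = logprob_paris + log1mexp(logprob_hilton)  # generate, then diverge
    score_paris_hilton = logprob_paris + logprob_hilton     # generate both tokens

    # Sanity check in probability space: exp(score) == P(Paris) * (1 - P(Hilton|Paris))
    assert torch.isclose(score_paris.exp(),
                         logprob_paris.exp() * (1 - logprob_hilton.exp()))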
divergent_beamsearch-0.1.0/tests/test_beamsearch.py
DELETED
@@ -1,41 +0,0 @@
-import pytest
-import torch
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
-from multi_choices_parser import MultiChoicesParser
-from divergent_beamsearch.algorithm import divergent_beamsearch, log1mexp
-
-@pytest.fixture
-def model_and_tokenizer():
-    model = GPT2LMHeadModel.from_pretrained("gpt2")
-    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-    return model, tokenizer
-
-def test_divergent_beamsearch(model_and_tokenizer):
-    model, tokenizer = model_and_tokenizer
-    prompt = "The capital of France is"
-    input_ids = tokenizer.encode(prompt, return_tensors="pt")
-    beam_size = 5
-    max_length = 10
-    pad_token_id = tokenizer.eos_token_id
-
-    possible_answers = [' Paris', ' Paris Hilton']
-    tokenized_answers = tokenizer(possible_answers).input_ids
-    multi_choices_parser = MultiChoicesParser([tokenized_answers])
-
-    logprob_paris = model(input_ids).logits.log_softmax(dim=-1)[0, -1, tokenized_answers[0][0]]
-    logprob_hilton = model(torch.cat([input_ids, torch.tensor(tokenized_answers[1][0]).view(1,1)], dim=-1)).logits.log_softmax(dim=-1)[0, -1, tokenized_answers[1][1]]
-    logprob_paris_hilton = logprob_paris + logprob_hilton
-
-    scores, solutions = divergent_beamsearch(
-        input_ids=input_ids,
-        model=model,
-        beam_size=beam_size,
-        max_length=max_length,
-        multi_choices_parser=multi_choices_parser,
-        pad_token_id=pad_token_id,
-        num_solutions=10
-    )
-    true_solutions = torch.nn.utils.rnn.pad_sequence([torch.tensor(ans) for ans in tokenized_answers], batch_first=True, padding_value=pad_token_id)
-    assert (solutions == true_solutions).all(), "Beam search did not return the expected solutions"
-    assert scores[0] == logprob_paris + log1mexp(logprob_hilton), "Beam search did not return the expected score"
-    assert scores[1] == logprob_paris_hilton, "Beam search did not return the expected score"
{divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/.python-version
RENAMED
File without changes
{divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/LICENCE
RENAMED
File without changes
{divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/README.md
RENAMED
File without changes
{divergent_beamsearch-0.1.0 → divergent_beamsearch-0.1.1}/src/divergent_beamsearch/__init__.py
RENAMED
File without changes