keras-hub-nightly 0.16.1.dev202410070341__py3-none-any.whl → 0.16.1.dev202410080341__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/src/models/causal_lm.py +4 -0
- keras_hub/src/models/llama3/llama3_tokenizer.py +25 -2
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +5 -1
- keras_hub/src/utils/transformers/convert_llama3.py +21 -5
- keras_hub/src/version_utils.py +1 -1
- {keras_hub_nightly-0.16.1.dev202410070341.dist-info → keras_hub_nightly-0.16.1.dev202410080341.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.16.1.dev202410070341.dist-info → keras_hub_nightly-0.16.1.dev202410080341.dist-info}/RECORD +9 -9
- {keras_hub_nightly-0.16.1.dev202410070341.dist-info → keras_hub_nightly-0.16.1.dev202410080341.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.16.1.dev202410070341.dist-info → keras_hub_nightly-0.16.1.dev202410080341.dist-info}/top_level.txt +0 -0
@@ -326,6 +326,10 @@ class CausalLM(Task):
|
|
326
326
|
)
|
327
327
|
elif stop_token_ids == "auto":
|
328
328
|
stop_token_ids = [self.preprocessor.tokenizer.end_token_id]
|
329
|
+
# Some models like Llama3 use two end tokens: <|eot_id|> in
|
330
|
+
# "instruct" versions and <|end_of_text|> in others.
|
331
|
+
if hasattr(self.preprocessor.tokenizer, "end_token2_id"):
|
332
|
+
stop_token_ids.append(self.preprocessor.tokenizer.end_token2_id)
|
329
333
|
|
330
334
|
def preprocess(x):
|
331
335
|
return self.preprocessor.generate_preprocess(
|
@@ -16,10 +16,33 @@ class Llama3Tokenizer(BytePairTokenizer):
|
|
16
16
|
self,
|
17
17
|
vocabulary=None,
|
18
18
|
merges=None,
|
19
|
+
bos_token="<|begin_of_text|>",
|
20
|
+
eos_token="<|end_of_text|>",
|
21
|
+
misc_special_tokens={"<|start_header_id|>", "<|end_header_id|>"},
|
19
22
|
**kwargs,
|
20
23
|
):
|
21
|
-
|
22
|
-
|
24
|
+
# Note: all special tokens must also appear in "vocabulary"
|
25
|
+
|
26
|
+
self._add_special_token(bos_token, "start_token")
|
27
|
+
misc_special_tokens -= {bos_token}
|
28
|
+
self._add_special_token(eos_token, "end_token")
|
29
|
+
misc_special_tokens -= {eos_token}
|
30
|
+
for i, token in enumerate(misc_special_tokens):
|
31
|
+
self._add_special_token(token, f"special_token_{i:03d}")
|
32
|
+
|
33
|
+
# Hack:
|
34
|
+
# Llama models use the <|end_of_text|> or the <|eot_id|> as the stop
|
35
|
+
# token. This info can be read from config when loading a Hugging Face
|
36
|
+
# checkpoint but no such config exists for Keras checkpoints.
|
37
|
+
# Setting both probable end tokens when no config is available will
|
38
|
+
# make text generation work in all cases as it will stop
|
39
|
+
# on both end tokens. However, the packer will always use
|
40
|
+
# "<|end_of_text|>" , which will be the wrong eos_token for "instruct"
|
41
|
+
# variants of Llama3.
|
42
|
+
# TODO: load this correctly from a Keras tokenizer config.
|
43
|
+
if eos_token == "<|end_of_text|>":
|
44
|
+
self._add_special_token("<|eot_id|>", "end_token2")
|
45
|
+
|
23
46
|
self.pad_token_id = 0
|
24
47
|
super().__init__(
|
25
48
|
vocabulary=vocabulary,
|
@@ -43,7 +43,11 @@ SPLIT_PATTERN_1 = (
|
|
43
43
|
SPLIT_PATTERN_1 = SPLIT_PATTERN_1.replace(
|
44
44
|
"{special_spaces}", SPECIAL_WHITESPACES
|
45
45
|
)
|
46
|
-
|
46
|
+
|
47
|
+
# The pattern " \t\r\f\v" is the same as \s "all spaces" but without the \n.
|
48
|
+
# Multiple \n\n\n in sequence must not be split for Llama3.
|
49
|
+
# SPLIT_PATTERN_2 = rf"""[\s६{SPECIAL_WHITESPACES}]$"""
|
50
|
+
SPLIT_PATTERN_2 = rf"""[ \t\r\f\v६{SPECIAL_WHITESPACES}]$"""
|
47
51
|
|
48
52
|
|
49
53
|
def create_alts_for_unsplittable_tokens(unsplittable_tokens):
|
@@ -107,10 +107,26 @@ def convert_tokenizer(cls, preset, **kwargs):
|
|
107
107
|
vocab = tokenizer_config["model"]["vocab"]
|
108
108
|
merges = tokenizer_config["model"]["merges"]
|
109
109
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
110
|
+
# Load all special tokens with the exception of "reserved" ones.
|
111
|
+
special_tokens = set()
|
112
|
+
for token in tokenizer_config["added_tokens"]:
|
113
|
+
if not token["content"].startswith("<|reserved_special_token_"):
|
114
|
+
vocab[token["content"]] = token["id"]
|
115
|
+
special_tokens.add(token["content"])
|
116
|
+
|
117
|
+
# Load text start and stop tokens from the config.
|
118
|
+
# Llama3 uses the <|end_of_text|> end token for regular models
|
119
|
+
# but uses <|eot_id|> for instruction-tuned variants.
|
120
|
+
tokenizer_config2 = load_json(preset, "tokenizer_config.json")
|
121
|
+
bos_token = tokenizer_config2["bos_token"]
|
122
|
+
eos_token = tokenizer_config2["eos_token"]
|
123
|
+
|
124
|
+
kwargs.update(
|
125
|
+
{
|
126
|
+
"bos_token": bos_token,
|
127
|
+
"eos_token": eos_token,
|
128
|
+
"misc_special_tokens": special_tokens,
|
129
|
+
}
|
130
|
+
)
|
115
131
|
|
116
132
|
return cls(vocabulary=vocab, merges=merges, **kwargs)
|
keras_hub/src/version_utils.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: keras-hub-nightly
|
3
|
-
Version: 0.16.1.dev202410070341
|
3
|
+
Version: 0.16.1.dev202410080341
|
4
4
|
Summary: Industry-strength Natural Language Processing extensions for Keras.
|
5
5
|
Home-page: https://github.com/keras-team/keras-hub
|
6
6
|
Author: Keras team
|
@@ -9,7 +9,7 @@ keras_hub/api/tokenizers/__init__.py,sha256=_f-r_cyUM2fjBB7iO84ThOdqqsAxHNIewJ2E
|
|
9
9
|
keras_hub/api/utils/__init__.py,sha256=Gp1E6gG-RtKQS3PBEQEOz9PQvXkXaJ0ySGMqZ7myN7A,215
|
10
10
|
keras_hub/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
keras_hub/src/api_export.py,sha256=9pQZK27JObxWZ96QPLBp1OBsjWigh1iuV6RglPGMRk0,1499
|
12
|
-
keras_hub/src/version_utils.py,sha256=
|
12
|
+
keras_hub/src/version_utils.py,sha256=ZcW3wGP8G9ckkrN4UDSpLre640ME6s_nJGCdK-nY_JI,222
|
13
13
|
keras_hub/src/bounding_box/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
14
|
keras_hub/src/bounding_box/converters.py,sha256=a5po8DBm87oz2EXfi-0uEZHCMlCJPIb4-MaZIdYx3Dg,17865
|
15
15
|
keras_hub/src/bounding_box/formats.py,sha256=YmskOz2BOSat7NaE__J9VfpSNGPJJR0znSzA4lp8MMI,3868
|
@@ -50,7 +50,7 @@ keras_hub/src/metrics/rouge_l.py,sha256=JlZhMBV6wS_6zMd57pkTc6yxHkEJT9fVQMlPZKek
|
|
50
50
|
keras_hub/src/metrics/rouge_n.py,sha256=JoFtmgjF4Ic263ny6bfD6vMHKreH9le3HnOOxemupRc,3620
|
51
51
|
keras_hub/src/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
52
52
|
keras_hub/src/models/backbone.py,sha256=2OZx6WAx2q9JK2yue5BoUUipIBjpOJRVNnMjXLVDLRk,11185
|
53
|
-
keras_hub/src/models/causal_lm.py,sha256=
|
53
|
+
keras_hub/src/models/causal_lm.py,sha256=zGUamLuL2HlTgummUhfnA8Uoe4QMsGGLD4uJazxJe-Y,15079
|
54
54
|
keras_hub/src/models/causal_lm_preprocessor.py,sha256=YY7VJZicdmnjDSWi9g4_pEpd5bdJK166GlWcapvokF0,6663
|
55
55
|
keras_hub/src/models/feature_pyramid_backbone.py,sha256=clEW-TTQSVJ_5qFNdDF0iABkin1p_xlBUFjJrC7T0IA,2247
|
56
56
|
keras_hub/src/models/image_classifier.py,sha256=yt6cjhPfqs8A_eWXBsXdXFzn-aRgH2rVHUq7Zu7CyK8,7804
|
@@ -197,7 +197,7 @@ keras_hub/src/models/llama3/llama3_backbone.py,sha256=nR5y51oI2QraL4Q9IxmQZrr0yS
|
|
197
197
|
keras_hub/src/models/llama3/llama3_causal_lm.py,sha256=0Kcr0sB78wSNDpeo4AE-PeefJe1DxEIdGRNMzdjk3WM,1541
|
198
198
|
keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py,sha256=twbXel9hsQgGxDAoQhEQuVm2udnEybI4fAQTJzXAuBs,3064
|
199
199
|
keras_hub/src/models/llama3/llama3_presets.py,sha256=n-GIQg6tVf9JY9djBqsFZvWAAuDqXHORrRxFg-xcDFw,2003
|
200
|
-
keras_hub/src/models/llama3/llama3_tokenizer.py,sha256=
|
200
|
+
keras_hub/src/models/llama3/llama3_tokenizer.py,sha256=J-KxRc08vGs4olFw_4mtJs0W_dTeUyj_XxMycazBmxI,1934
|
201
201
|
keras_hub/src/models/mistral/__init__.py,sha256=vjBlzcrIsFSwJKnfwfTNMKstIEKGFTE3kVcdAdfwlnE,263
|
202
202
|
keras_hub/src/models/mistral/mistral_attention.py,sha256=HCkUIc2DVIlYC5hhwomENlqLOsKTvbCKF0lx0_OBAyA,7862
|
203
203
|
keras_hub/src/models/mistral/mistral_backbone.py,sha256=x4BfyfWTCUXcjPSxdPSl8QITXgzUg1oJlAQt2acZfv4,7245
|
@@ -327,7 +327,7 @@ keras_hub/src/samplers/top_p_sampler.py,sha256=9r29WdqBlrW_2TBma6QqkRps2Uit4a6iZ
|
|
327
327
|
keras_hub/src/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
328
328
|
keras_hub/src/tests/test_case.py,sha256=pgjT5CkkkX4BTNfaDD6i-YChO6Ig3But66Ls4RxEymw,25937
|
329
329
|
keras_hub/src/tokenizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
330
|
-
keras_hub/src/tokenizers/byte_pair_tokenizer.py,sha256=
|
330
|
+
keras_hub/src/tokenizers/byte_pair_tokenizer.py,sha256=fGFp3WgPNYGTztpSGMl0kKFjn1bCeZB71lSJfT1eqEE,24052
|
331
331
|
keras_hub/src/tokenizers/byte_tokenizer.py,sha256=vjgrTT8FdtZVAlr0mU13alzADcUhtMrzgOs4lYeHvAQ,10648
|
332
332
|
keras_hub/src/tokenizers/sentence_piece_tokenizer.py,sha256=_PaVn4re3AwBkHylJWsvdvOCCYjOnFXLZmj-V34KehU,9562
|
333
333
|
keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py,sha256=8X_IN-hPDiUETGrSX3wPzFnip73xTYcN6FhLNIwfy-Y,4834
|
@@ -354,12 +354,12 @@ keras_hub/src/utils/transformers/convert_bert.py,sha256=4gQqXCJzC9QWdLPDUAq741K8
|
|
354
354
|
keras_hub/src/utils/transformers/convert_distilbert.py,sha256=SlfIRhSRk5c1ir2HGiDPiXa5XdOId_DbcnZO9lbwyZ8,6498
|
355
355
|
keras_hub/src/utils/transformers/convert_gemma.py,sha256=ElCgwBpSN5Q7rV5PJawTsoytPzs5ZjuwoY60YAe8y_A,6533
|
356
356
|
keras_hub/src/utils/transformers/convert_gpt2.py,sha256=HCeHN_-GiQJRxLCM9OCJJ1watPVpIBF8ujS8pGbBOWc,5703
|
357
|
-
keras_hub/src/utils/transformers/convert_llama3.py,sha256=
|
357
|
+
keras_hub/src/utils/transformers/convert_llama3.py,sha256=zlg0yFscjytyOFymDwqnbuXkmYvb88qqYzAROKcpaPU,5250
|
358
358
|
keras_hub/src/utils/transformers/convert_mistral.py,sha256=kVhN9h1ZFVhwkNW8p3wnS7eANJUXIsNy1RxWXy20Gqw,4760
|
359
359
|
keras_hub/src/utils/transformers/convert_pali_gemma.py,sha256=B1leeDw96Yvu81hYumf66hIid07k5NLqoeWAJgPnaLs,10649
|
360
360
|
keras_hub/src/utils/transformers/preset_loader.py,sha256=GS44hZUuGQCtzsyn8z44ZpHdftd3DFemwV2hx2bQa-U,2738
|
361
361
|
keras_hub/src/utils/transformers/safetensor_utils.py,sha256=rPK-Uw1CG0DX0d_UAD-r2cG9fw8GI8bvAlrcXfQ9g4c,3323
|
362
|
-
keras_hub_nightly-0.16.1.
|
363
|
-
keras_hub_nightly-0.16.1.
|
364
|
-
keras_hub_nightly-0.16.1.
|
365
|
-
keras_hub_nightly-0.16.1.
|
362
|
+
keras_hub_nightly-0.16.1.dev202410080341.dist-info/METADATA,sha256=SrlKiCjbDmXdTPsxSP6_NNTb-RKCwlNldhrxmphg_5Y,7458
|
363
|
+
keras_hub_nightly-0.16.1.dev202410080341.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
364
|
+
keras_hub_nightly-0.16.1.dev202410080341.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
|
365
|
+
keras_hub_nightly-0.16.1.dev202410080341.dist-info/RECORD,,
|
File without changes
|