keras-hub-nightly 0.16.1.dev202410070341__py3-none-any.whl → 0.16.1.dev202410080341__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- keras_hub/src/models/causal_lm.py
+++ keras_hub/src/models/causal_lm.py
@@ -326,6 +326,10 @@ class CausalLM(Task):
             )
         elif stop_token_ids == "auto":
             stop_token_ids = [self.preprocessor.tokenizer.end_token_id]
+            # Some models like Llama3 use two end tokens: <|eot_id|> in
+            # "instruct" versions and <|end_of_text|> in others.
+            if hasattr(self.preprocessor.tokenizer, "end_token2_id"):
+                stop_token_ids.append(self.preprocessor.tokenizer.end_token2_id)
 
         def preprocess(x):
             return self.preprocessor.generate_preprocess(
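
A minimal usage sketch of what this enables (the preset name is assumed here, not taken from this diff): with the default stop_token_ids="auto", generate() now also halts on <|eot_id|> whenever the tokenizer defines end_token2_id.

    import keras_hub

    # "auto" expands to [end_token_id], plus end_token2_id when present.
    causal_lm = keras_hub.models.Llama3CausalLM.from_preset(
        "llama3_instruct_8b_en"
    )
    causal_lm.generate("What is Keras?", max_length=64)
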
--- keras_hub/src/models/llama3/llama3_tokenizer.py
+++ keras_hub/src/models/llama3/llama3_tokenizer.py
@@ -16,10 +16,33 @@ class Llama3Tokenizer(BytePairTokenizer):
         self,
         vocabulary=None,
         merges=None,
+        bos_token="<|begin_of_text|>",
+        eos_token="<|end_of_text|>",
+        misc_special_tokens={"<|start_header_id|>", "<|end_header_id|>"},
         **kwargs,
     ):
-        self._add_special_token("<|begin_of_text|>", "start_token")
-        self._add_special_token("<|end_of_text|>", "end_token")
+        # Note: all special tokens must also appear in "vocabulary".
+
+        self._add_special_token(bos_token, "start_token")
+        misc_special_tokens -= {bos_token}
+        self._add_special_token(eos_token, "end_token")
+        misc_special_tokens -= {eos_token}
+        for i, token in enumerate(misc_special_tokens):
+            self._add_special_token(token, f"special_token_{i:03d}")
+
+        # Hack:
+        # Llama models use <|end_of_text|> or <|eot_id|> as the stop token.
+        # This info can be read from config when loading a Hugging Face
+        # checkpoint, but no such config exists for Keras checkpoints.
+        # Setting both probable end tokens when no config is available
+        # makes text generation work in all cases, as it will stop on
+        # either end token. However, the packer will always use
+        # "<|end_of_text|>", which is the wrong eos_token for "instruct"
+        # variants of Llama3.
+        # TODO: load this correctly from a Keras tokenizer config.
+        if eos_token == "<|end_of_text|>":
+            self._add_special_token("<|eot_id|>", "end_token2")
+
         self.pad_token_id = 0
         super().__init__(
             vocabulary=vocabulary,
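
A standalone sketch of the registration logic above, runnable without keras-hub: bos/eos claim their named slots first and are subtracted from misc_special_tokens so no token is registered twice; the remainder get numbered slots. sorted() is added here only for a deterministic order; the real code iterates the set directly.

    bos_token = "<|begin_of_text|>"
    eos_token = "<|eot_id|>"  # an "instruct"-style eos
    misc = {"<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"}

    misc -= {bos_token}  # drop bos if a caller also listed it as misc
    misc -= {eos_token}  # likewise eos, so it is not registered twice
    for i, token in enumerate(sorted(misc)):
        print(f"special_token_{i:03d} -> {token}")
    # special_token_000 -> <|end_header_id|>
    # special_token_001 -> <|start_header_id|>
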
--- keras_hub/src/tokenizers/byte_pair_tokenizer.py
+++ keras_hub/src/tokenizers/byte_pair_tokenizer.py
@@ -43,7 +43,11 @@ SPLIT_PATTERN_1 = (
 SPLIT_PATTERN_1 = SPLIT_PATTERN_1.replace(
     "{special_spaces}", SPECIAL_WHITESPACES
 )
-SPLIT_PATTERN_2 = rf"""[\s६{SPECIAL_WHITESPACES}]$"""
+
+# The pattern " \t\r\f\v" is the same as \s "all spaces" but without the \n.
+# Multiple \n\n\n in sequence must not be split for Llama3.
+# SPLIT_PATTERN_2 = rf"""[\s६{SPECIAL_WHITESPACES}]$"""
+SPLIT_PATTERN_2 = rf"""[ \t\r\f\v६{SPECIAL_WHITESPACES}]$"""
 
 
 def create_alts_for_unsplittable_tokens(unsplittable_tokens):
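
A quick behavioral check of the two character classes, using Python's re module rather than the RE2 engine the tokenizer actually runs on, and dropping ६ and SPECIAL_WHITESPACES for brevity:

    import re

    OLD = re.compile(r"[\s]$")         # \s includes \n
    NEW = re.compile(r"[ \t\r\f\v]$")  # same class minus \n

    print(bool(OLD.search("foo\n")))  # True:  a trailing newline used to match
    print(bool(NEW.search("foo\n")))  # False: runs of \n now stay unsplit
    print(bool(NEW.search("foo ")))   # True:  ordinary spaces still match
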
--- keras_hub/src/utils/transformers/convert_llama3.py
+++ keras_hub/src/utils/transformers/convert_llama3.py
@@ -107,10 +107,26 @@ def convert_tokenizer(cls, preset, **kwargs):
     vocab = tokenizer_config["model"]["vocab"]
     merges = tokenizer_config["model"]["merges"]
 
-    bot = tokenizer_config["added_tokens"][0]  # begin of text
-    eot = tokenizer_config["added_tokens"][1]  # end of text
-
-    vocab[bot["content"]] = bot["id"]
-    vocab[eot["content"]] = eot["id"]
+    # Load all special tokens with the exception of "reserved" ones.
+    special_tokens = set()
+    for token in tokenizer_config["added_tokens"]:
+        if not token["content"].startswith("<|reserved_special_token_"):
+            vocab[token["content"]] = token["id"]
+            special_tokens.add(token["content"])
+
+    # Load text start and stop tokens from the config.
+    # Llama3 uses the <|end_of_text|> end token for regular models
+    # but uses <|eot_id|> for instruction-tuned variants.
+    tokenizer_config2 = load_json(preset, "tokenizer_config.json")
+    bos_token = tokenizer_config2["bos_token"]
+    eos_token = tokenizer_config2["eos_token"]
+
+    kwargs.update(
+        {
+            "bos_token": bos_token,
+            "eos_token": eos_token,
+            "misc_special_tokens": special_tokens,
+        }
+    )
 
     return cls(vocabulary=vocab, merges=merges, **kwargs)
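
For reference, a sketch of the two Hugging Face tokenizer files this converter now reads (field layout per the HF tokenizers format; the ids follow the usual Llama3 layout but are illustrative, not read from this diff):

    # tokenizer.json: vocab/merges plus "added_tokens"; reserved slots skipped.
    tokenizer_json = {
        "model": {"vocab": {"a": 0}, "merges": []},
        "added_tokens": [
            {"id": 128000, "content": "<|begin_of_text|>"},
            {"id": 128001, "content": "<|end_of_text|>"},
            {"id": 128002, "content": "<|reserved_special_token_0|>"},  # skipped
            {"id": 128009, "content": "<|eot_id|>"},
        ],
    }

    # tokenizer_config.json: the authoritative bos/eos pair; "instruct"
    # checkpoints point eos_token at <|eot_id|> instead of <|end_of_text|>.
    tokenizer_config_json = {
        "bos_token": "<|begin_of_text|>",
        "eos_token": "<|eot_id|>",
    }
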
--- keras_hub/src/version_utils.py
+++ keras_hub/src/version_utils.py
@@ -1,7 +1,7 @@
 from keras_hub.src.api_export import keras_hub_export
 
 # Unique source of truth for the version number.
-__version__ = "0.16.1.dev202410070341"
+__version__ = "0.16.1.dev202410080341"
 
 
 @keras_hub_export("keras_hub.version")
--- keras_hub_nightly-0.16.1.dev202410070341.dist-info/METADATA
+++ keras_hub_nightly-0.16.1.dev202410080341.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: keras-hub-nightly
-Version: 0.16.1.dev202410070341
+Version: 0.16.1.dev202410080341
 Summary: Industry-strength Natural Language Processing extensions for Keras.
 Home-page: https://github.com/keras-team/keras-hub
 Author: Keras team
--- keras_hub_nightly-0.16.1.dev202410070341.dist-info/RECORD
+++ keras_hub_nightly-0.16.1.dev202410080341.dist-info/RECORD
@@ -9,7 +9,7 @@ keras_hub/api/tokenizers/__init__.py,sha256=_f-r_cyUM2fjBB7iO84ThOdqqsAxHNIewJ2E
 keras_hub/api/utils/__init__.py,sha256=Gp1E6gG-RtKQS3PBEQEOz9PQvXkXaJ0ySGMqZ7myN7A,215
 keras_hub/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/api_export.py,sha256=9pQZK27JObxWZ96QPLBp1OBsjWigh1iuV6RglPGMRk0,1499
-keras_hub/src/version_utils.py,sha256=Tt3QcaichNaTMoNu_eci34g4G9ytWSUqQbx-P4xXpyA,222
+keras_hub/src/version_utils.py,sha256=ZcW3wGP8G9ckkrN4UDSpLre640ME6s_nJGCdK-nY_JI,222
 keras_hub/src/bounding_box/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/bounding_box/converters.py,sha256=a5po8DBm87oz2EXfi-0uEZHCMlCJPIb4-MaZIdYx3Dg,17865
 keras_hub/src/bounding_box/formats.py,sha256=YmskOz2BOSat7NaE__J9VfpSNGPJJR0znSzA4lp8MMI,3868
@@ -50,7 +50,7 @@ keras_hub/src/metrics/rouge_l.py,sha256=JlZhMBV6wS_6zMd57pkTc6yxHkEJT9fVQMlPZKek
 keras_hub/src/metrics/rouge_n.py,sha256=JoFtmgjF4Ic263ny6bfD6vMHKreH9le3HnOOxemupRc,3620
 keras_hub/src/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/models/backbone.py,sha256=2OZx6WAx2q9JK2yue5BoUUipIBjpOJRVNnMjXLVDLRk,11185
-keras_hub/src/models/causal_lm.py,sha256=p3C5R6hbe1BARHNXJZqtgwlp3bDqkv3gguO19PeJC2c,14791
+keras_hub/src/models/causal_lm.py,sha256=zGUamLuL2HlTgummUhfnA8Uoe4QMsGGLD4uJazxJe-Y,15079
 keras_hub/src/models/causal_lm_preprocessor.py,sha256=YY7VJZicdmnjDSWi9g4_pEpd5bdJK166GlWcapvokF0,6663
 keras_hub/src/models/feature_pyramid_backbone.py,sha256=clEW-TTQSVJ_5qFNdDF0iABkin1p_xlBUFjJrC7T0IA,2247
 keras_hub/src/models/image_classifier.py,sha256=yt6cjhPfqs8A_eWXBsXdXFzn-aRgH2rVHUq7Zu7CyK8,7804
@@ -197,7 +197,7 @@ keras_hub/src/models/llama3/llama3_backbone.py,sha256=nR5y51oI2QraL4Q9IxmQZrr0yS
 keras_hub/src/models/llama3/llama3_causal_lm.py,sha256=0Kcr0sB78wSNDpeo4AE-PeefJe1DxEIdGRNMzdjk3WM,1541
 keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py,sha256=twbXel9hsQgGxDAoQhEQuVm2udnEybI4fAQTJzXAuBs,3064
 keras_hub/src/models/llama3/llama3_presets.py,sha256=n-GIQg6tVf9JY9djBqsFZvWAAuDqXHORrRxFg-xcDFw,2003
-keras_hub/src/models/llama3/llama3_tokenizer.py,sha256=BcNHfsT19LUC0PkEEyN22C9zxPNVboQSK9EGMfhtpnk,789
+keras_hub/src/models/llama3/llama3_tokenizer.py,sha256=J-KxRc08vGs4olFw_4mtJs0W_dTeUyj_XxMycazBmxI,1934
 keras_hub/src/models/mistral/__init__.py,sha256=vjBlzcrIsFSwJKnfwfTNMKstIEKGFTE3kVcdAdfwlnE,263
 keras_hub/src/models/mistral/mistral_attention.py,sha256=HCkUIc2DVIlYC5hhwomENlqLOsKTvbCKF0lx0_OBAyA,7862
 keras_hub/src/models/mistral/mistral_backbone.py,sha256=x4BfyfWTCUXcjPSxdPSl8QITXgzUg1oJlAQt2acZfv4,7245
@@ -327,7 +327,7 @@ keras_hub/src/samplers/top_p_sampler.py,sha256=9r29WdqBlrW_2TBma6QqkRps2Uit4a6iZ
 keras_hub/src/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/tests/test_case.py,sha256=pgjT5CkkkX4BTNfaDD6i-YChO6Ig3But66Ls4RxEymw,25937
 keras_hub/src/tokenizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-keras_hub/src/tokenizers/byte_pair_tokenizer.py,sha256=Wocarha6ZuzrfiWHPiQUPLLRLrDITyc0hQzjRupw4xA,23849
+keras_hub/src/tokenizers/byte_pair_tokenizer.py,sha256=fGFp3WgPNYGTztpSGMl0kKFjn1bCeZB71lSJfT1eqEE,24052
 keras_hub/src/tokenizers/byte_tokenizer.py,sha256=vjgrTT8FdtZVAlr0mU13alzADcUhtMrzgOs4lYeHvAQ,10648
 keras_hub/src/tokenizers/sentence_piece_tokenizer.py,sha256=_PaVn4re3AwBkHylJWsvdvOCCYjOnFXLZmj-V34KehU,9562
 keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py,sha256=8X_IN-hPDiUETGrSX3wPzFnip73xTYcN6FhLNIwfy-Y,4834
@@ -354,12 +354,12 @@ keras_hub/src/utils/transformers/convert_bert.py,sha256=4gQqXCJzC9QWdLPDUAq741K8
 keras_hub/src/utils/transformers/convert_distilbert.py,sha256=SlfIRhSRk5c1ir2HGiDPiXa5XdOId_DbcnZO9lbwyZ8,6498
 keras_hub/src/utils/transformers/convert_gemma.py,sha256=ElCgwBpSN5Q7rV5PJawTsoytPzs5ZjuwoY60YAe8y_A,6533
 keras_hub/src/utils/transformers/convert_gpt2.py,sha256=HCeHN_-GiQJRxLCM9OCJJ1watPVpIBF8ujS8pGbBOWc,5703
-keras_hub/src/utils/transformers/convert_llama3.py,sha256=QqsGS2rkQ5EBJUzhq06tJNU07BI7k7wAlUNzUgFEYhs,4620
+keras_hub/src/utils/transformers/convert_llama3.py,sha256=zlg0yFscjytyOFymDwqnbuXkmYvb88qqYzAROKcpaPU,5250
 keras_hub/src/utils/transformers/convert_mistral.py,sha256=kVhN9h1ZFVhwkNW8p3wnS7eANJUXIsNy1RxWXy20Gqw,4760
 keras_hub/src/utils/transformers/convert_pali_gemma.py,sha256=B1leeDw96Yvu81hYumf66hIid07k5NLqoeWAJgPnaLs,10649
 keras_hub/src/utils/transformers/preset_loader.py,sha256=GS44hZUuGQCtzsyn8z44ZpHdftd3DFemwV2hx2bQa-U,2738
 keras_hub/src/utils/transformers/safetensor_utils.py,sha256=rPK-Uw1CG0DX0d_UAD-r2cG9fw8GI8bvAlrcXfQ9g4c,3323
-keras_hub_nightly-0.16.1.dev202410070341.dist-info/METADATA,sha256=-gYUt9I22A6R7D8Tc4jXF5h5BLh-YBIzLj8WH3tzc8w,7458
-keras_hub_nightly-0.16.1.dev202410070341.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-keras_hub_nightly-0.16.1.dev202410070341.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
-keras_hub_nightly-0.16.1.dev202410070341.dist-info/RECORD,,
+keras_hub_nightly-0.16.1.dev202410080341.dist-info/METADATA,sha256=SrlKiCjbDmXdTPsxSP6_NNTb-RKCwlNldhrxmphg_5Y,7458
+keras_hub_nightly-0.16.1.dev202410080341.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+keras_hub_nightly-0.16.1.dev202410080341.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
+keras_hub_nightly-0.16.1.dev202410080341.dist-info/RECORD,,