keras-hub-nightly 0.19.0.dev202412120352__py3-none-any.whl → 0.19.0.dev202412140350__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. keras_hub/api/layers/__init__.py +1 -0
  2. keras_hub/api/models/__init__.py +11 -6
  3. keras_hub/api/tokenizers/__init__.py +1 -1
  4. keras_hub/src/bounding_box/converters.py +2 -2
  5. keras_hub/src/layers/modeling/f_net_encoder.py +1 -1
  6. keras_hub/src/layers/modeling/masked_lm_head.py +2 -1
  7. keras_hub/src/layers/modeling/rms_normalization.py +8 -6
  8. keras_hub/src/layers/modeling/rotary_embedding.py +3 -2
  9. keras_hub/src/layers/modeling/token_and_position_embedding.py +1 -1
  10. keras_hub/src/layers/modeling/transformer_decoder.py +8 -6
  11. keras_hub/src/layers/modeling/transformer_encoder.py +3 -1
  12. keras_hub/src/metrics/bleu.py +1 -1
  13. keras_hub/src/models/albert/albert_text_classifier.py +7 -7
  14. keras_hub/src/models/bart/bart_backbone.py +4 -4
  15. keras_hub/src/models/bart/bart_seq_2_seq_lm.py +9 -8
  16. keras_hub/src/models/bert/bert_presets.py +4 -2
  17. keras_hub/src/models/bert/bert_text_classifier.py +3 -3
  18. keras_hub/src/models/causal_lm.py +19 -15
  19. keras_hub/src/models/clip/clip_vision_embedding.py +1 -1
  20. keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +2 -1
  21. keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +1 -1
  22. keras_hub/src/models/deberta_v3/deberta_v3_text_classifier.py +4 -4
  23. keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +4 -4
  24. keras_hub/src/models/deberta_v3/disentangled_self_attention.py +3 -2
  25. keras_hub/src/models/deberta_v3/relative_embedding.py +1 -1
  26. keras_hub/src/models/deeplab_v3/deeplab_v3_backbone.py +17 -13
  27. keras_hub/src/models/deeplab_v3/deeplab_v3_presets.py +4 -3
  28. keras_hub/src/models/deeplab_v3/deeplab_v3_segmenter.py +1 -1
  29. keras_hub/src/models/densenet/densenet_backbone.py +3 -1
  30. keras_hub/src/models/densenet/densenet_image_classifier.py +1 -1
  31. keras_hub/src/models/densenet/densenet_presets.py +6 -6
  32. keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +1 -1
  33. keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +2 -2
  34. keras_hub/src/models/distil_bert/distil_bert_presets.py +2 -1
  35. keras_hub/src/models/distil_bert/distil_bert_text_classifier.py +5 -5
  36. keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +3 -3
  37. keras_hub/src/models/efficientnet/cba.py +1 -1
  38. keras_hub/src/models/efficientnet/efficientnet_backbone.py +20 -8
  39. keras_hub/src/models/efficientnet/efficientnet_image_classifier.py +1 -1
  40. keras_hub/src/models/efficientnet/efficientnet_presets.py +12 -11
  41. keras_hub/src/models/efficientnet/fusedmbconv.py +3 -5
  42. keras_hub/src/models/efficientnet/mbconv.py +1 -1
  43. keras_hub/src/models/electra/electra_backbone.py +2 -2
  44. keras_hub/src/models/f_net/f_net_text_classifier.py +3 -3
  45. keras_hub/src/models/f_net/f_net_text_classifier_preprocessor.py +3 -3
  46. keras_hub/src/models/falcon/falcon_backbone.py +5 -3
  47. keras_hub/src/models/falcon/falcon_causal_lm.py +18 -8
  48. keras_hub/src/models/falcon/falcon_tokenizer.py +7 -2
  49. keras_hub/src/models/flux/flux_layers.py +46 -44
  50. keras_hub/src/models/flux/flux_maths.py +24 -17
  51. keras_hub/src/models/flux/flux_model.py +24 -19
  52. keras_hub/src/models/flux/flux_presets.py +2 -1
  53. keras_hub/src/models/flux/flux_text_to_image.py +7 -3
  54. keras_hub/src/models/gemma/gemma_backbone.py +27 -20
  55. keras_hub/src/models/gemma/gemma_causal_lm.py +2 -2
  56. keras_hub/src/models/gemma/gemma_decoder_block.py +3 -1
  57. keras_hub/src/models/gemma/gemma_presets.py +9 -3
  58. keras_hub/src/models/gpt2/gpt2_causal_lm.py +2 -2
  59. keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +2 -1
  60. keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +3 -3
  61. keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +2 -1
  62. keras_hub/src/models/image_classifier_preprocessor.py +4 -1
  63. keras_hub/src/models/image_object_detector.py +2 -2
  64. keras_hub/src/models/image_object_detector_preprocessor.py +4 -4
  65. keras_hub/src/models/image_segmenter_preprocessor.py +2 -2
  66. keras_hub/src/models/llama/llama_backbone.py +34 -26
  67. keras_hub/src/models/llama3/llama3_backbone.py +12 -11
  68. keras_hub/src/models/llama3/llama3_causal_lm.py +1 -1
  69. keras_hub/src/models/mistral/mistral_backbone.py +16 -15
  70. keras_hub/src/models/mistral/mistral_causal_lm.py +3 -3
  71. keras_hub/src/models/mistral/mistral_transformer_decoder.py +2 -1
  72. keras_hub/src/models/mit/mit_backbone.py +4 -3
  73. keras_hub/src/models/mit/mit_layers.py +2 -1
  74. keras_hub/src/models/mobilenet/mobilenet_backbone.py +7 -7
  75. keras_hub/src/models/opt/opt_causal_lm.py +2 -2
  76. keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +5 -3
  77. keras_hub/src/models/pali_gemma/pali_gemma_vit.py +2 -2
  78. keras_hub/src/models/phi3/phi3_decoder.py +0 -1
  79. keras_hub/src/models/phi3/phi3_rotary_embedding.py +1 -1
  80. keras_hub/src/models/preprocessor.py +2 -2
  81. keras_hub/src/models/retinanet/feature_pyramid.py +3 -2
  82. keras_hub/src/models/retinanet/prediction_head.py +2 -2
  83. keras_hub/src/models/retinanet/retinanet_backbone.py +2 -2
  84. keras_hub/src/models/retinanet/retinanet_image_converter.py +1 -1
  85. keras_hub/src/models/retinanet/retinanet_object_detector.py +5 -6
  86. keras_hub/src/models/retinanet/retinanet_presets.py +2 -1
  87. keras_hub/src/models/roberta/roberta_backbone.py +2 -2
  88. keras_hub/src/models/roberta/roberta_presets.py +4 -2
  89. keras_hub/src/models/roberta/roberta_text_classifier.py +3 -3
  90. keras_hub/src/models/sam/sam_backbone.py +2 -2
  91. keras_hub/src/models/sam/sam_image_segmenter.py +6 -5
  92. keras_hub/src/models/sam/sam_layers.py +5 -3
  93. keras_hub/src/models/sam/sam_prompt_encoder.py +4 -2
  94. keras_hub/src/models/sam/sam_transformer.py +5 -4
  95. keras_hub/src/models/segformer/segformer_backbone.py +18 -14
  96. keras_hub/src/models/segformer/segformer_image_segmenter.py +51 -38
  97. keras_hub/src/models/segformer/segformer_presets.py +24 -12
  98. keras_hub/src/models/seq_2_seq_lm_preprocessor.py +1 -1
  99. keras_hub/src/models/stable_diffusion_3/mmdit.py +20 -1
  100. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +1 -1
  101. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +13 -6
  102. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +2 -2
  103. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +7 -3
  104. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +1 -1
  105. keras_hub/src/models/task.py +4 -2
  106. keras_hub/src/models/text_classifier.py +2 -2
  107. keras_hub/src/models/text_to_image.py +5 -1
  108. keras_hub/src/models/vae/vae_layers.py +0 -1
  109. keras_hub/src/models/vit/__init__.py +5 -0
  110. keras_hub/src/models/vit/vit_backbone.py +152 -0
  111. keras_hub/src/models/vit/vit_image_classifier.py +187 -0
  112. keras_hub/src/models/vit/vit_image_classifier_preprocessor.py +12 -0
  113. keras_hub/src/models/vit/vit_image_converter.py +73 -0
  114. keras_hub/src/models/vit/vit_layers.py +391 -0
  115. keras_hub/src/models/vit/vit_presets.py +49 -0
  116. keras_hub/src/models/vit_det/vit_det_backbone.py +4 -2
  117. keras_hub/src/models/vit_det/vit_layers.py +3 -3
  118. keras_hub/src/models/whisper/whisper_audio_converter.py +1 -3
  119. keras_hub/src/models/whisper/whisper_backbone.py +6 -5
  120. keras_hub/src/models/whisper/whisper_decoder.py +3 -5
  121. keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +1 -1
  122. keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +2 -2
  123. keras_hub/src/models/xlm_roberta/xlm_roberta_text_classifier.py +4 -4
  124. keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +2 -1
  125. keras_hub/src/models/xlnet/relative_attention.py +20 -19
  126. keras_hub/src/models/xlnet/xlnet_backbone.py +2 -2
  127. keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +3 -5
  128. keras_hub/src/models/xlnet/xlnet_encoder.py +7 -9
  129. keras_hub/src/samplers/contrastive_sampler.py +2 -3
  130. keras_hub/src/samplers/sampler.py +2 -1
  131. keras_hub/src/tests/test_case.py +2 -2
  132. keras_hub/src/tokenizers/byte_pair_tokenizer.py +2 -2
  133. keras_hub/src/tokenizers/byte_tokenizer.py +2 -8
  134. keras_hub/src/tokenizers/sentence_piece_tokenizer.py +2 -9
  135. keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +7 -12
  136. keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +8 -5
  137. keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +7 -3
  138. keras_hub/src/utils/preset_utils.py +25 -18
  139. keras_hub/src/utils/tensor_utils.py +4 -4
  140. keras_hub/src/utils/timm/convert_efficientnet.py +2 -4
  141. keras_hub/src/utils/transformers/convert_vit.py +150 -0
  142. keras_hub/src/utils/transformers/preset_loader.py +23 -0
  143. keras_hub/src/utils/transformers/safetensor_utils.py +4 -3
  144. keras_hub/src/version_utils.py +1 -1
  145. {keras_hub_nightly-0.19.0.dev202412120352.dist-info → keras_hub_nightly-0.19.0.dev202412140350.dist-info}/METADATA +1 -1
  146. {keras_hub_nightly-0.19.0.dev202412120352.dist-info → keras_hub_nightly-0.19.0.dev202412140350.dist-info}/RECORD +148 -140
  147. {keras_hub_nightly-0.19.0.dev202412120352.dist-info → keras_hub_nightly-0.19.0.dev202412140350.dist-info}/WHEEL +0 -0
  148. {keras_hub_nightly-0.19.0.dev202412120352.dist-info → keras_hub_nightly-0.19.0.dev202412140350.dist-info}/top_level.txt +0 -0
@@ -23,7 +23,8 @@ class ImageSegmenterPreprocessor(Preprocessor):
  is set to `True` this will be resized to input image shape else will be
  passed through unaltered.
  - `sample_weight`: (Optional) Will be passed through unaltered.
- - `resize_output_mask` bool: If set to `True` the output mask will be resized to the same size as the input image. Defaults to `False`.
+ - `resize_output_mask` bool: If set to `True` the output mask will be
+ resized to the same size as the input image. Defaults to `False`.

  The layer will output either `x`, an `(x, y)` tuple if labels were provided,
  or an `(x, y, sample_weight)` tuple if labels and sample weight were
@@ -77,7 +78,6 @@ class ImageSegmenterPreprocessor(Preprocessor):
  x = self.image_converter(x)

  if y is not None and self.image_converter and self.resize_output_mask:
-
  y = keras.layers.Resizing(
  height=(
  self.image_converter.image_size[0]
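For context on the `resize_output_mask` flag touched in the two hunks above, here is a minimal usage sketch. The preset name is a placeholder, and passing the flag through `from_preset()` to the constructor is an assumption based on the docstring above, not something shown in this diff.

```python
import keras_hub

# Hypothetical preset name, shown only to illustrate the flag.
preprocessor = keras_hub.models.ImageSegmenterPreprocessor.from_preset(
    "some_segmentation_preset",
    resize_output_mask=True,  # resize `y` masks back to the converted image size
)
# Per the docstring above, calling the layer on (x, y) returns the converted
# image and, with the flag set, a mask resized to match it.
# x, y = preprocessor(x, y)
```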
@@ -34,17 +34,18 @@ class LlamaBackbone(Backbone):
  num_layers (int): The number of transformer layers.
  num_query_heads (int): The number of query attention heads for
  each transformer.
- hidden_dim (int): The size of the transformer encoding and pooling layers.
- intermediate_dim (int): The output dimension of the first Dense layer in a
- three-layer feedforward network for each transformer.
- num_key_value_heads (int): The number of key and value attention heads for
- each transformer.
- rope_max_wavelength (int, optional): The maximum angular wavelength of the
- sine/cosine curves, for rotary embeddings. Defaults to `10000`.
- rope_scaling_factor (float, optional): The scaling factor for calculation
- of roatary embedding. Defaults to `1.0`.
- layer_norm_epsilon (float, optional): Epsilon for the layer normalization
- layers in the transformer decoder. Defaults to `1e-6`.
+ hidden_dim (int): The size of the transformer encoding and pooling
+ layers.
+ intermediate_dim (int): The output dimension of the first Dense layer in
+ a three-layer feedforward network for each transformer.
+ num_key_value_heads (int): The number of key and value attention heads
+ for each transformer.
+ rope_max_wavelength (int, optional): The maximum angular wavelength of
+ the sine/cosine curves, for rotary embeddings. Defaults to `10000`.
+ rope_scaling_factor (float, optional): The scaling factor for
+ calculation of roatary embedding. Defaults to `1.0`.
+ layer_norm_epsilon (float, optional): Epsilon for the layer
+ normalization layers in the transformer decoder. Defaults to `1e-6`.
  dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
  for model computations and weights. Note that some computations,
  such as softmax and layer normalization, will always be done at
@@ -190,7 +191,8 @@ class LlamaBackbone(Backbone):

  Example:
  ```
- # Feel free to change the mesh shape to balance data and model parallelism
+ # Feel free to change the mesh shape to balance data and model
+ # parallelism
  mesh = keras.distribution.DeviceMesh(
  shape=(1, 8),
  axis_names=('batch', 'model'),
@@ -210,12 +212,16 @@ class LlamaBackbone(Backbone):
  llama_model = keras_hub.models.LlamaCausalLM.from_preset()
  ```

- To see how the layout map was applied, load the model then run (for one decoder block):
+ To see how the layout map was applied, load the model then run
+ (for one decoder block):
  ```
  embedding_layer = llama_model.backbone.get_layer("token_embedding")
  decoder_block_1 = llama_model.backbone.get_layer('transformer_layer_0')
  for variable in embedding_layer.weights + decoder_block_1.weights:
- print(f'{variable.path:<58} {str(variable.shape):<16} {str(variable.value.sharding.spec)}')
+ print(
+ f'{variable.path:<58} {str(variable.shape):<16} '
+ f'{str(variable.value.sharding.spec)}'
+ )
  ```

  Args:
@@ -230,22 +236,24 @@ class LlamaBackbone(Backbone):
  for all the model weights.
  """
  # The weight path and shape of the Llama backbone is like below
- # token_embedding/embeddings (128256, 2048)
+ # token_embedding/embeddings (128256, 2048)
  # repeat block for decoder
- # transformer_layer_0/self_attention/query/kernel (2048, 32, 64)
- # transformer_layer_0/self_attention/key/kernel (2048, 8, 64)
- # transformer_layer_0/self_attention/value/kernel (2048, 8, 64)
- # transformer_layer_0/self_attention/attention_output/kernel (32, 64, 2048)
- # transformer_layer_0/self_attention_layernorm/scale (2048,)
- # transformer_layer_0/feedforward_intermediate_dense/kernel (2048, 8192)
- # transformer_layer_0/feedforward_gate_dense/kernel (2048, 8192)
- # transformer_layer_0/feedforward_output_dense/kernel (8192, 2048)
- # transformer_layer_0/feedforward_layernorm/scale (2048,)
+ # transformer_layer_0/self_attention/query/kernel (2048, 32, 64)
+ # transformer_layer_0/self_attention/key/kernel (2048, 8, 64)
+ # transformer_layer_0/self_attention/value/kernel (2048, 8, 64)
+ # transformer_layer_0/self_attention/attention_output/kernel
+ # (32, 64, 2048)
+ # transformer_layer_0/self_attention_layernorm/scale (2048,)
+ # transformer_layer_0/feedforward_intermediate_dense/kernel
+ # (2048, 8192)
+ # transformer_layer_0/feedforward_gate_dense/kernel (2048, 8192)
+ # transformer_layer_0/feedforward_output_dense/kerne (8192, 2048)
+ # transformer_layer_0/feedforward_layernorm/scale (2048,)

  if not isinstance(device_mesh, keras.distribution.DeviceMesh):
  raise ValueError(
- "Invalid device_mesh type. Expected `keras.distribution.Device`,"
- f" got {type(device_mesh)}"
+ "Invalid device_mesh type. Expected "
+ f"`keras.distribution.Device`, got {type(device_mesh)}"
  )
  if model_parallel_dim_name not in device_mesh.axis_names:
  raise ValueError(
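For orientation, the pieces quoted in the LlamaBackbone hunks above combine roughly as follows. The preset name is a placeholder, and the `ModelParallel` keyword arguments assume the Keras 3 distribution API rather than anything stated in this diff.

```python
import keras
import keras_hub

# Feel free to change the mesh shape to balance data and model parallelism.
mesh = keras.distribution.DeviceMesh(
    shape=(1, 8),
    axis_names=("batch", "model"),
    devices=keras.distribution.list_devices(),
)
# Build a sharding layout for the Llama weights on this mesh.
layout_map = keras_hub.models.LlamaBackbone.get_layout_map(
    mesh, model_parallel_dim_name="model"
)
keras.distribution.set_distribution(
    keras.distribution.ModelParallel(layout_map=layout_map, batch_dim_name="batch")
)
# Placeholder preset; load whichever Llama preset you actually use.
llama_model = keras_hub.models.LlamaCausalLM.from_preset("llama2_7b_en")
```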
@@ -24,17 +24,18 @@ class Llama3Backbone(LlamaBackbone):
  num_layers (int): The number of transformer layers.
  num_query_heads (int): The number of query attention heads for
  each transformer.
- hidden_dim (int): The size of the transformer encoding and pooling layers.
- intermediate_dim (int): The output dimension of the first Dense layer in a
- three-layer feedforward network for each transformer.
- num_key_value_heads (int): The number of key and value attention heads for
- each transformer.
- rope_max_wavelength (int, optional): The maximum angular wavelength of the
- sine/cosine curves, for rotary embeddings. Defaults to `10000`.
- rope_scaling_factor (float, optional): The scaling factor for calculation
- of roatary embedding. Defaults to `1.0`.
- layer_norm_epsilon (float, optional): Epsilon for the layer normalization
- layers in the transformer decoder. Defaults to `1e-6`.
+ hidden_dim (int): The size of the transformer encoding and pooling
+ layers.
+ intermediate_dim (int): The output dimension of the first Dense layer in
+ a three-layer feedforward network for each transformer.
+ num_key_value_heads (int): The number of key and value attention heads
+ fo each transformer.
+ rope_max_wavelength (int, optional): The maximum angular wavelength of
+ the sine/cosine curves, for rotary embeddings. Defaults to `10000`.
+ rope_scaling_factor (float, optional): The scaling factor for
+ calculation of roatary embedding. Defaults to `1.0`.
+ layer_norm_epsilon (float, optional): Epsilon for the layer
+ normalization layers in the transformer decoder. Defaults to `1e-6`.
  dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
  for model computations and weights. Note that some computations,
  such as softmax and layer normalization, will always be done at
@@ -1,9 +1,9 @@
  from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.models.llama.llama_causal_lm import LlamaCausalLM
  from keras_hub.src.models.llama3.llama3_backbone import Llama3Backbone
  from keras_hub.src.models.llama3.llama3_causal_lm_preprocessor import (
  Llama3CausalLMPreprocessor,
  )
- from keras_hub.src.models.llama.llama_causal_lm import LlamaCausalLM


  @keras_hub_export("keras_hub.models.Llama3CausalLM")
@@ -38,22 +38,23 @@ class MistralBackbone(Backbone):
  num_layers (int): The number of transformer layers.
  num_query_heads (int): The number of query attention heads for
  each transformer.
- hidden_dim (int): The size of the transformer encoding and pooling layers.
- intermediate_dim (int): The output dimension of the first Dense layer in a
- three-layer feedforward network for each transformer.
- num_key_value_heads (int): The number of key and value attention heads for
- each transformer.
- rope_max_wavelength (int, optional): The maximum angular wavelength of the
- sine/cosine curves, for rotary embeddings. Defaults to `10000`.
- rope_scaling_factor (float, optional): The scaling factor for calculation
- of roatary embedding. Defaults to `1.0`.
- layer_norm_epsilon (float, optional): Epsilon for the layer normalization
- layers in the transformer decoder. Defaults to `1e-6`.
+ hidden_dim (int): The size of the transformer encoding and pooling
+ layers.
+ intermediate_dim (int): The output dimension of the first Dense layer
+ in a three-layer feedforward network for each transformer.
+ num_key_value_heads (int): The number of key and value attention heads
+ for each transformer.
+ rope_max_wavelength (int, optional): The maximum angular wavelength of
+ the sine/cosine curves, for rotary embeddings. Defaults to `10000`.
+ rope_scaling_factor (float, optional): The scaling factor for
+ calculation of roatary embedding. Defaults to `1.0`.
+ layer_norm_epsilon (float, optional): Epsilon for the layer
+ normalization layers in the transformer decoder. Defaults to `1e-6`.
  sliding_window (int, optional): The sliding window for the mistral
- attention layers. This controls the maximum cache size for the attention
- layers in each transformer decoder. Only `sliding_window` number of tokens
- are saved in the cache and used to generate the next token.
- Defaults to `512`.
+ attention layers. This controls the maximum cache size for the
+ attention layers in each transformer decoder. Only `sliding_window`
+ number of tokens are saved in the cache and used to generate the
+ next token. Defaults to `512`.
  dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
  for model computations and weights. Note that some computations,
  such as softmax and layer normalization, will always be done at
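To make the reflowed `sliding_window` docstring above concrete, here is a small random-weight construction sketch. The tiny dimensions, and the `vocabulary_size` argument (which sits just above the quoted hunk), are illustrative assumptions rather than values from this diff.

```python
import keras_hub

# Deliberately tiny, randomly initialized backbone for illustration only.
backbone = keras_hub.models.MistralBackbone(
    vocabulary_size=1000,
    num_layers=2,
    num_query_heads=8,
    num_key_value_heads=2,
    hidden_dim=64,
    intermediate_dim=128,
    sliding_window=512,  # caps the per-layer attention KV cache at 512 tokens
)
```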
@@ -28,9 +28,9 @@ class MistralCausalLM(CausalLM):

  Args:
  backbone: A `keras_hub.models.MistralBackbone` instance.
- preprocessor: A `keras_hub.models.MistralCausalLMPreprocessor` or `None`.
- If `None`, this model will not apply preprocessing, and inputs
- should be preprocessed before calling the model.
+ preprocessor: A `keras_hub.models.MistralCausalLMPreprocessor` or
+ `None`. If `None`, this model will not apply preprocessing, and
+ inputs should be preprocessed before calling the model.
  """

  backbone_cls = MistralBackbone
@@ -215,7 +215,8 @@ class MistralTransformerDecoder(keras.layers.Layer):
  # Mistral uses a banded attention mask if sliding window is not None
  if self.sliding_window is not None:
  # Below is a workaround for `ops.triu` for Keras 2.
- # TODO(tirthasheshpatel): Use `ops.triu` once Keras 2 support is removed.
+ # TODO(tirthasheshpatel): Use `ops.triu` once Keras 2 support is
+ # removed.
  # causal_mask = ops.triu(causal_mask, k=-self.sliding_window)
  i = ops.arange(output_length)[:, None] + cache_update_index
  j = ops.arange(input_length)[None, :]
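As a reader's aid for the "banded attention mask" the comment above refers to: a generic NumPy illustration of a sliding-window causal mask (each query attends to itself and the previous `sliding_window - 1` tokens). This is a sketch of the concept, not the file's `ops`-based workaround.

```python
import numpy as np

def banded_causal_mask(length, sliding_window):
    # True where key j is visible to query i: causal and within the window.
    i = np.arange(length)[:, None]
    j = np.arange(length)[None, :]
    return (j <= i) & (i - j < sliding_window)

print(banded_causal_mask(5, 2).astype(int))
```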
@@ -43,8 +43,8 @@ class MiTBackbone(FeaturePyramidBackbone):
  https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/classification/mix_transformer)

  Args:
- layerwise_depths: The number of transformer encoders to be used per layer in the
- network.
+ layerwise_depths: The number of transformer encoders to be used per
+ layer in the network.
  num_layers: int. The number of Transformer layers.
  layerwise_num_heads: list of integers, the number of heads to use
  in the attention computation for each layer.
@@ -58,7 +58,8 @@ class MiTBackbone(FeaturePyramidBackbone):
  image_shape: optional shape tuple, defaults to (None, None, 3).
  hidden_dims: the embedding dims per hierarchical layer, used as
  the levels of the feature pyramid.
- patch_sizes: list of integers, the patch_size to apply for each layer.
+ patch_sizes: list of integers, the patch_size to apply for each
+ layer.
  strides: list of integers, stride to apply for each layer.

  Examples:
@@ -80,7 +80,8 @@ class HierarchicalTransformerEncoder(keras.layers.Layer):
  `LayerNormalization` layers. Defaults to `1e-06`
  sr_ratio: integer, the ratio to use within
  `SegFormerMultiheadAttention`. If set to > 1, a `Conv2D`
- layer is used to reduce the length of the sequence. Defaults to `1`.
+ layer is used to reduce the length of the sequence.
+ Defaults to `1`.
  """

  def __init__(
@@ -47,11 +47,11 @@ class MobileNetBackbone(Backbone):
  of filters in each layer.
  - If `depth_multiplier` > 1.0, proportionally increases the number
  of filters in each layer.
- - If `depth_multiplier` = 1, default number of filters from the paper
- are used at each layer.
+ - If `depth_multiplier` = 1, default number of filters from the
+ paper are used at each layer.
  input_num_filters: number of filters in first convolution layer
- output_num_filters: specifies whether to add conv and batch_norm in the end,
- if set to None, it will not add these layers in the end.
+ output_num_filters: specifies whether to add conv and batch_norm in the
+ end, if set to None, it will not add these layers in the end.
  'None' for MobileNetV1
  input_activation: activation function to be used in the input layer
  'hard_swish' for MobileNetV3,
@@ -365,7 +365,7 @@ def apply_depthwise_conv_block(
  batch normalization and relu6 activation.

  Args:
- x: Input tensor of shape `(rows, cols, channels)
+ x: Input tensor of shape `(rows, cols, channels)`
  filters: Integer, the dimensionality of the output space
  (i.e. the number of output filters in the pointwise convolution).
  depth_multiplier: controls the width of the network.
@@ -383,8 +383,8 @@ def apply_depthwise_conv_block(
  block_id: Integer, a unique identification designating the block number.

  Input shape:
- 4D tensor with shape: `(batch, rows, cols, channels)` in "channels_last"
- 4D tensor with shape: `(batch, channels, rows, cols)` in "channels_first"
+ 4D tensor with shape `(batch, rows, cols, channels)` in "channels_last"
+ 4D tensor with shape `(batch, channels, rows, cols)` in "channels_first"
  Returns:
  Output tensor of block.
  """
@@ -171,8 +171,8 @@ class OPTCausalLM(CausalLM):
  Args:
  token_ids: a dense int Tensor with shape `(batch_size, max_length)`.
  cache: a dense float Tensor, the cache of key and value.
- cache_update_index: int, or int Tensor. The index of current inputs in the
- whole sequence.
+ cache_update_index: int, or int Tensor. The index of current inputs
+ in the whole sequence.

  Returns:
  A (logits, hidden_states, cache) tuple. Where `logits` is the
@@ -68,8 +68,8 @@ class PaliGemmaBackbone(Backbone):
  `hidden_dim / num_query_heads`. Defaults to `True`.
  use_post_ffw_norm: boolean. Whether to normalize after the feedforward
  block. Defaults to `False`.
- use_post_attention_norm: boolean. Whether to normalize after the attention
- block. Defaults to `False`.
+ use_post_attention_norm: boolean. Whether to normalize after the
+ attention block. Defaults to `False`.
  attention_logit_soft_cap: `None` or int. Soft cap for the attention
  logits. Defaults to `None`.
  final_logit_soft_cap: `None` or int. Soft cap for the final logits.
@@ -300,7 +300,9 @@ class PaliGemmaBackbone(Backbone):
  "final_logit_soft_cap": self.final_logit_soft_cap,
  "attention_logit_soft_cap": self.attention_logit_soft_cap,
  "sliding_window_size": self.sliding_window_size,
- "use_sliding_window_attention": self.use_sliding_window_attention,
+ "use_sliding_window_attention": (
+ self.use_sliding_window_attention
+ ),
  "layer_norm_epsilon": self.layer_norm_epsilon,
  "dropout": self.dropout,
  }
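The hunk above only rewraps one entry of the `get_config()` dict; the functional contract is the usual Keras one, where the dict must round-trip through `from_config()`. A generic sketch of that contract, using a plain Keras layer rather than the PaliGemma backbone so it stays self-contained:

```python
import keras

layer = keras.layers.Dense(4, activation="relu")
config = layer.get_config()                     # plain dict of serializable values
clone = keras.layers.Dense.from_config(config)  # rebuilds an equivalent layer
print(config["units"], clone.units)             # 4 4
```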
@@ -61,7 +61,7 @@ class PaliGemmaVitEmbeddings(keras.layers.Layer):

  class PaliGemmaVitAttention(keras.layers.Layer):
  """
- Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py # noqa: E501
+ Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
  """

  def __init__(
@@ -120,7 +120,7 @@ class PaliGemmaVitAttention(keras.layers.Layer):

  def _transpose_for_scores(self, tensor, batch_size):
  """
- Adapted from https://github.com/huggingface/transformers/blob/8e164c5400b7b413c7b8fb32e35132001effc970/src/transformers/models/bert/modeling_tf_bert.py#L252 # noqa: E501
+ Adapted from https://github.com/huggingface/transformers/blob/8e164c5400b7b413c7b8fb32e35132001effc970/src/transformers/models/bert/modeling_tf_bert.py#L252
  """
  # [batch_size, seq_len, all_head_dim] ->
  # [batch_size, seq_len, num_heads, head_dim]
@@ -53,7 +53,6 @@ class Phi3Decoder(keras.layers.Layer):
  self.kernel_initializer = keras.initializers.get(kernel_initializer)

  def build(self, decoder_sequence_shape):
-
  # Pre-attention layernorm.
  self.pre_attention_layernorm = Phi3LayerNorm(
  epsilon=self.layer_norm_epsilon,
@@ -43,7 +43,7 @@ class Phi3SuScaledRotaryEmbedding(RotaryEmbedding):
  max_sequence_length=4096,
  pretraining_sequence_length=4096,
  max_wavelength=10000,
- **kwargs
+ **kwargs,
  ):
  super().__init__(max_wavelength=max_wavelength, **kwargs)
  self.max_sequence_length = max_sequence_length
@@ -161,12 +161,12 @@ class Preprocessor(PreprocessingLayer):
  Examples:
  ```python
  # Load a preprocessor for Gemma generation.
- preprocessor = keras_hub.models.GemmaCausalLMPreprocessor.from_preset(
+ preprocessor = keras_hub.models.CausalLMPreprocessor.from_preset(
  "gemma_2b_en",
  )

  # Load a preprocessor for Bert classification.
- preprocessor = keras_hub.models.BertTextClassifierPreprocessor.from_preset(
+ preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
  "bert_base_en",
  )
  ```
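The example above now goes through the generic base classes. In keras_hub, `from_preset()` on a base class dispatches on the preset's saved config, so both spellings should yield the same concrete preprocessor. A quick check, assuming the preset is available locally or downloadable:

```python
import keras_hub

p1 = keras_hub.models.CausalLMPreprocessor.from_preset("gemma_2b_en")
p2 = keras_hub.models.GemmaCausalLMPreprocessor.from_preset("gemma_2b_en")
# Both are expected to be GemmaCausalLMPreprocessor instances.
print(type(p1).__name__, type(p2).__name__)
```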
@@ -9,8 +9,9 @@ class FeaturePyramid(keras.layers.Layer):
  """A Feature Pyramid Network (FPN) layer.

  This implements the paper:
- Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan,
- and Serge Belongie. Feature Pyramid Networks for Object Detection.
+ Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He,
+ Bharath Hariharan, and Serge Belongie.
+ Feature Pyramid Networks for Object Detection.
  (https://arxiv.org/pdf/1612.03144)

  Feature Pyramid Networks (FPNs) are basic components that are added to an
@@ -7,8 +7,8 @@ class PredictionHead(keras.layers.Layer):
  """A head for classification or bounding box regression predictions.

  Args:
- output_filters: int. The umber of convolution filters in the final layer.
- The number of output channels determines the prediction type:
+ output_filters: int. The umber of convolution filters in the final
+ layer. The number of output channels determines the prediction type:
  - **Classification**:
  `output_filters = num_anchors * num_classes`
  Predicts class probabilities for each anchor.
@@ -42,7 +42,8 @@ class RetinaNetBackbone(FeaturePyramidBackbone):

  Raises:
  ValueError: If `min_level` is greater than `max_level`.
- ValueError: If `backbone_max_level` is less than 5 and `max_level` is greater than or equal to 5.
+ ValueError: If `backbone_max_level` is less than 5 and `max_level` is
+ greater than or equal to 5.
  """

  def __init__(
@@ -57,7 +58,6 @@ class RetinaNetBackbone(FeaturePyramidBackbone):
  dtype=None,
  **kwargs,
  ):
-
  # === Layers ===
  if min_level > max_level:
  raise ValueError(
@@ -15,7 +15,7 @@ class RetinaNetImageConverter(ImageConverter):
  offset=None,
  norm_mean=[0.485, 0.456, 0.406],
  norm_std=[0.229, 0.224, 0.225],
- **kwargs
+ **kwargs,
  ):
  super().__init__(**kwargs)
  self.image_size = image_size
@@ -14,7 +14,7 @@ from keras_hub.src.models.retinanet.retinanet_backbone import RetinaNetBackbone
  from keras_hub.src.models.retinanet.retinanet_label_encoder import (
  RetinaNetLabelEncoder,
  )
- from keras_hub.src.models.retinanet.retinanet_object_detector_preprocessor import (
+ from keras_hub.src.models.retinanet.retinanet_object_detector_preprocessor import ( # noqa: E501
  RetinaNetObjectDetectorPreprocessor,
  )

@@ -54,10 +54,8 @@ class RetinaNetObjectDetector(ImageObjectDetector):
  ground truth boxes and classes into training targets. It matches
  ground truth boxes to anchors based on IoU and encodes box
  coordinates as offsets. If `None`, a default encoder is created.
- See the
- `keras_hub.src.models.retinanet.retinanet_label_encoder.RetinaNetLabelEncoder`
- class for details. If None, a default encoder is created with
- standard parameters.
+ See the `RetinaNetLabelEncoder` class for details. If None, a
+ default encoder is created with standard parameters.
  - `anchor_generator`: Same as the model's.
  - `bounding_box_format`: Same as the model's
  `bounding_box_format`.
@@ -74,7 +72,8 @@ class RetinaNetObjectDetector(ImageObjectDetector):
  pre_logits_num_conv_layers: int. The number of convolutional layers in
  the head before the logits layer. These convolutional layers are
  applied before the final linear layer (logits) that produces the
- output predictions (bounding box regressions, classification scores).
+ output predictions (bounding box regressions,
+ classification scores).
  preprocessor: Optional. An instance of
  `RetinaNetObjectDetectorPreprocessor`or a custom preprocessor.
  Handles image preprocessing before feeding into the backbone.
@@ -5,7 +5,8 @@ backbone_presets = {
  "retinanet_resnet50_fpn_coco": {
  "metadata": {
  "description": (
- "RetinaNet model with ResNet50 backbone fine-tuned on COCO in 800x800 resolution."
+ "RetinaNet model with ResNet50 backbone fine-tuned on COCO in "
+ "800x800 resolution."
  ),
  "params": 34121239,
  "path": "retinanet",
@@ -23,8 +23,8 @@ class RobertaBackbone(Backbone):

  The default constructor gives a fully customizable, randomly initialized
  RoBERTa encoder with any number of layers, heads, and embedding
- dimensions. To load preset architectures and weights, use the `from_preset()`
- constructor.
+ dimensions. To load preset architectures and weights, use the
+ `from_preset()` constructor.

  Disclaimer: Pre-trained models are provided on an "as is" basis, without
  warranties or conditions of any kind. The underlying model is provided by a
@@ -5,7 +5,8 @@ backbone_presets = {
  "metadata": {
  "description": (
  "12-layer RoBERTa model where case is maintained."
- "Trained on English Wikipedia, BooksCorpus, CommonCraw, and OpenWebText."
+ "Trained on English Wikipedia, BooksCorpus, CommonCraw, and "
+ "OpenWebText."
  ),
  "params": 124052736,
  "path": "roberta",
@@ -16,7 +17,8 @@ backbone_presets = {
  "metadata": {
  "description": (
  "24-layer RoBERTa model where case is maintained."
- "Trained on English Wikipedia, BooksCorpus, CommonCraw, and OpenWebText."
+ "Trained on English Wikipedia, BooksCorpus, CommonCraw, and "
+ "OpenWebText."
  ),
  "params": 354307072,
  "path": "roberta",
@@ -38,9 +38,9 @@ class RobertaTextClassifier(TextClassifier):
  Args:
  backbone: A `keras_hub.models.RobertaBackbone` instance.
  num_classes: int. Number of classes to predict.
- preprocessor: A `keras_hub.models.RobertaTextClassifierPreprocessor` or `None`. If
- `None`, this model will not apply preprocessing, and inputs should
- be preprocessed before calling the model.
+ preprocessor: A `keras_hub.models.RobertaTextClassifierPreprocessor` or
+ `None`. If `None`, this model will not apply preprocessing, and
+ inputs should be preprocessed before calling the model.
  activation: Optional `str` or callable. The activation function to use
  on the model outputs. Set `activation="softmax"` to return output
  probabilities. Defaults to `None`.
@@ -9,8 +9,8 @@ class SAMBackbone(Backbone):
  """A backbone for the Segment Anything Model (SAM).

  Args:
- image_encoder: `keras_hub.models.ViTDetBackbone`. A feature extractor for
- the input images.
+ image_encoder: `keras_hub.models.ViTDetBackbone`. A feature extractor
+ for the input images.
  prompt_encoder: `keras_hub.layers.SAMPromptEncoder`. A Keras layer to
  compute embeddings for points, box, and mask prompt.
  mask_decoder: `keras_hub.layers.SAMMaskDecoder`. A Keras layer to
@@ -200,17 +200,18 @@ class SAMImageSegmenter(ImageSegmenter):
  def _add_placeholder_prompts(self, inputs):
  """Adds placeholder prompt inputs for a call to SAM.

- Because SAM is a functional subclass model, all inputs must be specified in
- calls to the model. However, prompt inputs are all optional, so we have to
- add placeholders when they're not specified by the user.
+ Because SAM is a functional subclass model, all inputs must be specified
+ in calls to the model. However, prompt inputs are all optional, so we
+ have to add placeholders when they're not specified by the user.
  """
  inputs = inputs.copy()

  # Get the batch shape based on the image input
  batch_size = ops.shape(inputs["images"])[0]

- # The type of the placeholders must match the existing inputs with respect
- # to whether or not they are tensors (as opposed to Numpy arrays).
+ # The type of the placeholders must match the existing inputs with
+ # respect to whether or not they are tensors (as opposed to Numpy
+ # arrays).
  zeros = ops.zeros if ops.is_tensor(inputs["images"]) else np.zeros

  # Fill in missing inputs.
@@ -170,8 +170,8 @@ class TwoWayMultiHeadAttention(keras.layers.Layer):
  key_dim: int. Size of each attention head for query, key, and
  value.
  intermediate_dim: int. Number of hidden dims to use in the mlp block.
- skip_first_layer_pos_embedding: bool. A boolean indicating whether to skip the
- first layer positional embeddings.
+ skip_first_layer_pos_embedding: bool. A boolean indicating whether to
+ skip the first layer positional embeddings.
  attention_downsample_rate: int, optional. The downsample rate to use
  in the attention layers. Defaults to 2.
  activation: str, optional. The activation for the mlp block's output
@@ -296,7 +296,9 @@ class TwoWayMultiHeadAttention(keras.layers.Layer):
  "num_heads": self.num_heads,
  "key_dim": self.key_dim,
  "intermediate_dim": self.intermediate_dim,
- "skip_first_layer_pos_embedding": self.skip_first_layer_pos_embedding,
+ "skip_first_layer_pos_embedding": (
+ self.skip_first_layer_pos_embedding
+ ),
  "attention_downsample_rate": self.attention_downsample_rate,
  "activation": self.activation,
  }
@@ -57,7 +57,7 @@ class SAMPromptEncoder(keras.layers.Layer):
  input_image_size=(1024, 1024),
  mask_in_channels=16,
  activation="gelu",
- **kwargs
+ **kwargs,
  ):
  super().__init__(**kwargs)
  self.hidden_size = hidden_size
@@ -305,7 +305,9 @@ class SAMPromptEncoder(keras.layers.Layer):
  return {
  "prompt_sparse_embeddings": sparse_embeddings,
  "prompt_dense_embeddings": dense_embeddings,
- "prompt_dense_positional_embeddings": prompt_dense_positional_embeddings,
+ "prompt_dense_positional_embeddings": (
+ prompt_dense_positional_embeddings
+ ),
  }

  def get_config(self):