keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (297) hide show
  1. keras_hub/__init__.py +52 -0
  2. keras_hub/api/__init__.py +27 -0
  3. keras_hub/api/layers/__init__.py +47 -0
  4. keras_hub/api/metrics/__init__.py +24 -0
  5. keras_hub/api/models/__init__.py +249 -0
  6. keras_hub/api/samplers/__init__.py +29 -0
  7. keras_hub/api/tokenizers/__init__.py +35 -0
  8. keras_hub/src/__init__.py +13 -0
  9. keras_hub/src/api_export.py +53 -0
  10. keras_hub/src/layers/__init__.py +13 -0
  11. keras_hub/src/layers/modeling/__init__.py +13 -0
  12. keras_hub/src/layers/modeling/alibi_bias.py +143 -0
  13. keras_hub/src/layers/modeling/cached_multi_head_attention.py +137 -0
  14. keras_hub/src/layers/modeling/f_net_encoder.py +200 -0
  15. keras_hub/src/layers/modeling/masked_lm_head.py +239 -0
  16. keras_hub/src/layers/modeling/position_embedding.py +123 -0
  17. keras_hub/src/layers/modeling/reversible_embedding.py +311 -0
  18. keras_hub/src/layers/modeling/rotary_embedding.py +169 -0
  19. keras_hub/src/layers/modeling/sine_position_encoding.py +108 -0
  20. keras_hub/src/layers/modeling/token_and_position_embedding.py +150 -0
  21. keras_hub/src/layers/modeling/transformer_decoder.py +496 -0
  22. keras_hub/src/layers/modeling/transformer_encoder.py +262 -0
  23. keras_hub/src/layers/modeling/transformer_layer_utils.py +106 -0
  24. keras_hub/src/layers/preprocessing/__init__.py +13 -0
  25. keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +220 -0
  26. keras_hub/src/layers/preprocessing/multi_segment_packer.py +319 -0
  27. keras_hub/src/layers/preprocessing/preprocessing_layer.py +62 -0
  28. keras_hub/src/layers/preprocessing/random_deletion.py +271 -0
  29. keras_hub/src/layers/preprocessing/random_swap.py +267 -0
  30. keras_hub/src/layers/preprocessing/start_end_packer.py +219 -0
  31. keras_hub/src/metrics/__init__.py +13 -0
  32. keras_hub/src/metrics/bleu.py +394 -0
  33. keras_hub/src/metrics/edit_distance.py +197 -0
  34. keras_hub/src/metrics/perplexity.py +181 -0
  35. keras_hub/src/metrics/rouge_base.py +204 -0
  36. keras_hub/src/metrics/rouge_l.py +97 -0
  37. keras_hub/src/metrics/rouge_n.py +125 -0
  38. keras_hub/src/models/__init__.py +13 -0
  39. keras_hub/src/models/albert/__init__.py +20 -0
  40. keras_hub/src/models/albert/albert_backbone.py +267 -0
  41. keras_hub/src/models/albert/albert_classifier.py +202 -0
  42. keras_hub/src/models/albert/albert_masked_lm.py +129 -0
  43. keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +194 -0
  44. keras_hub/src/models/albert/albert_preprocessor.py +206 -0
  45. keras_hub/src/models/albert/albert_presets.py +70 -0
  46. keras_hub/src/models/albert/albert_tokenizer.py +119 -0
  47. keras_hub/src/models/backbone.py +311 -0
  48. keras_hub/src/models/bart/__init__.py +20 -0
  49. keras_hub/src/models/bart/bart_backbone.py +261 -0
  50. keras_hub/src/models/bart/bart_preprocessor.py +276 -0
  51. keras_hub/src/models/bart/bart_presets.py +74 -0
  52. keras_hub/src/models/bart/bart_seq_2_seq_lm.py +490 -0
  53. keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +262 -0
  54. keras_hub/src/models/bart/bart_tokenizer.py +124 -0
  55. keras_hub/src/models/bert/__init__.py +23 -0
  56. keras_hub/src/models/bert/bert_backbone.py +227 -0
  57. keras_hub/src/models/bert/bert_classifier.py +183 -0
  58. keras_hub/src/models/bert/bert_masked_lm.py +131 -0
  59. keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +198 -0
  60. keras_hub/src/models/bert/bert_preprocessor.py +184 -0
  61. keras_hub/src/models/bert/bert_presets.py +147 -0
  62. keras_hub/src/models/bert/bert_tokenizer.py +112 -0
  63. keras_hub/src/models/bloom/__init__.py +20 -0
  64. keras_hub/src/models/bloom/bloom_attention.py +186 -0
  65. keras_hub/src/models/bloom/bloom_backbone.py +173 -0
  66. keras_hub/src/models/bloom/bloom_causal_lm.py +298 -0
  67. keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +176 -0
  68. keras_hub/src/models/bloom/bloom_decoder.py +206 -0
  69. keras_hub/src/models/bloom/bloom_preprocessor.py +185 -0
  70. keras_hub/src/models/bloom/bloom_presets.py +121 -0
  71. keras_hub/src/models/bloom/bloom_tokenizer.py +116 -0
  72. keras_hub/src/models/causal_lm.py +383 -0
  73. keras_hub/src/models/classifier.py +109 -0
  74. keras_hub/src/models/csp_darknet/__init__.py +13 -0
  75. keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +410 -0
  76. keras_hub/src/models/csp_darknet/csp_darknet_image_classifier.py +133 -0
  77. keras_hub/src/models/deberta_v3/__init__.py +24 -0
  78. keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +210 -0
  79. keras_hub/src/models/deberta_v3/deberta_v3_classifier.py +228 -0
  80. keras_hub/src/models/deberta_v3/deberta_v3_masked_lm.py +135 -0
  81. keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +191 -0
  82. keras_hub/src/models/deberta_v3/deberta_v3_preprocessor.py +206 -0
  83. keras_hub/src/models/deberta_v3/deberta_v3_presets.py +82 -0
  84. keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +155 -0
  85. keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +227 -0
  86. keras_hub/src/models/deberta_v3/disentangled_self_attention.py +412 -0
  87. keras_hub/src/models/deberta_v3/relative_embedding.py +94 -0
  88. keras_hub/src/models/densenet/__init__.py +13 -0
  89. keras_hub/src/models/densenet/densenet_backbone.py +210 -0
  90. keras_hub/src/models/densenet/densenet_image_classifier.py +131 -0
  91. keras_hub/src/models/distil_bert/__init__.py +26 -0
  92. keras_hub/src/models/distil_bert/distil_bert_backbone.py +187 -0
  93. keras_hub/src/models/distil_bert/distil_bert_classifier.py +208 -0
  94. keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +137 -0
  95. keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +194 -0
  96. keras_hub/src/models/distil_bert/distil_bert_preprocessor.py +175 -0
  97. keras_hub/src/models/distil_bert/distil_bert_presets.py +57 -0
  98. keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +114 -0
  99. keras_hub/src/models/electra/__init__.py +20 -0
  100. keras_hub/src/models/electra/electra_backbone.py +247 -0
  101. keras_hub/src/models/electra/electra_preprocessor.py +154 -0
  102. keras_hub/src/models/electra/electra_presets.py +95 -0
  103. keras_hub/src/models/electra/electra_tokenizer.py +104 -0
  104. keras_hub/src/models/f_net/__init__.py +20 -0
  105. keras_hub/src/models/f_net/f_net_backbone.py +236 -0
  106. keras_hub/src/models/f_net/f_net_classifier.py +154 -0
  107. keras_hub/src/models/f_net/f_net_masked_lm.py +132 -0
  108. keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +196 -0
  109. keras_hub/src/models/f_net/f_net_preprocessor.py +177 -0
  110. keras_hub/src/models/f_net/f_net_presets.py +43 -0
  111. keras_hub/src/models/f_net/f_net_tokenizer.py +95 -0
  112. keras_hub/src/models/falcon/__init__.py +20 -0
  113. keras_hub/src/models/falcon/falcon_attention.py +156 -0
  114. keras_hub/src/models/falcon/falcon_backbone.py +164 -0
  115. keras_hub/src/models/falcon/falcon_causal_lm.py +291 -0
  116. keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +173 -0
  117. keras_hub/src/models/falcon/falcon_preprocessor.py +187 -0
  118. keras_hub/src/models/falcon/falcon_presets.py +30 -0
  119. keras_hub/src/models/falcon/falcon_tokenizer.py +110 -0
  120. keras_hub/src/models/falcon/falcon_transformer_decoder.py +255 -0
  121. keras_hub/src/models/feature_pyramid_backbone.py +73 -0
  122. keras_hub/src/models/gemma/__init__.py +20 -0
  123. keras_hub/src/models/gemma/gemma_attention.py +250 -0
  124. keras_hub/src/models/gemma/gemma_backbone.py +316 -0
  125. keras_hub/src/models/gemma/gemma_causal_lm.py +448 -0
  126. keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +167 -0
  127. keras_hub/src/models/gemma/gemma_decoder_block.py +241 -0
  128. keras_hub/src/models/gemma/gemma_preprocessor.py +191 -0
  129. keras_hub/src/models/gemma/gemma_presets.py +248 -0
  130. keras_hub/src/models/gemma/gemma_tokenizer.py +103 -0
  131. keras_hub/src/models/gemma/rms_normalization.py +40 -0
  132. keras_hub/src/models/gpt2/__init__.py +20 -0
  133. keras_hub/src/models/gpt2/gpt2_backbone.py +199 -0
  134. keras_hub/src/models/gpt2/gpt2_causal_lm.py +437 -0
  135. keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +173 -0
  136. keras_hub/src/models/gpt2/gpt2_preprocessor.py +187 -0
  137. keras_hub/src/models/gpt2/gpt2_presets.py +82 -0
  138. keras_hub/src/models/gpt2/gpt2_tokenizer.py +110 -0
  139. keras_hub/src/models/gpt_neo_x/__init__.py +13 -0
  140. keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +251 -0
  141. keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +175 -0
  142. keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +201 -0
  143. keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +141 -0
  144. keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +258 -0
  145. keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +145 -0
  146. keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +88 -0
  147. keras_hub/src/models/image_classifier.py +90 -0
  148. keras_hub/src/models/llama/__init__.py +20 -0
  149. keras_hub/src/models/llama/llama_attention.py +225 -0
  150. keras_hub/src/models/llama/llama_backbone.py +188 -0
  151. keras_hub/src/models/llama/llama_causal_lm.py +327 -0
  152. keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +170 -0
  153. keras_hub/src/models/llama/llama_decoder.py +246 -0
  154. keras_hub/src/models/llama/llama_layernorm.py +48 -0
  155. keras_hub/src/models/llama/llama_preprocessor.py +189 -0
  156. keras_hub/src/models/llama/llama_presets.py +80 -0
  157. keras_hub/src/models/llama/llama_tokenizer.py +84 -0
  158. keras_hub/src/models/llama3/__init__.py +20 -0
  159. keras_hub/src/models/llama3/llama3_backbone.py +84 -0
  160. keras_hub/src/models/llama3/llama3_causal_lm.py +46 -0
  161. keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +173 -0
  162. keras_hub/src/models/llama3/llama3_preprocessor.py +21 -0
  163. keras_hub/src/models/llama3/llama3_presets.py +69 -0
  164. keras_hub/src/models/llama3/llama3_tokenizer.py +63 -0
  165. keras_hub/src/models/masked_lm.py +101 -0
  166. keras_hub/src/models/mistral/__init__.py +20 -0
  167. keras_hub/src/models/mistral/mistral_attention.py +238 -0
  168. keras_hub/src/models/mistral/mistral_backbone.py +203 -0
  169. keras_hub/src/models/mistral/mistral_causal_lm.py +328 -0
  170. keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +175 -0
  171. keras_hub/src/models/mistral/mistral_layer_norm.py +48 -0
  172. keras_hub/src/models/mistral/mistral_preprocessor.py +190 -0
  173. keras_hub/src/models/mistral/mistral_presets.py +48 -0
  174. keras_hub/src/models/mistral/mistral_tokenizer.py +82 -0
  175. keras_hub/src/models/mistral/mistral_transformer_decoder.py +265 -0
  176. keras_hub/src/models/mix_transformer/__init__.py +13 -0
  177. keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +181 -0
  178. keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +133 -0
  179. keras_hub/src/models/mix_transformer/mix_transformer_layers.py +300 -0
  180. keras_hub/src/models/opt/__init__.py +20 -0
  181. keras_hub/src/models/opt/opt_backbone.py +173 -0
  182. keras_hub/src/models/opt/opt_causal_lm.py +301 -0
  183. keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +177 -0
  184. keras_hub/src/models/opt/opt_preprocessor.py +188 -0
  185. keras_hub/src/models/opt/opt_presets.py +72 -0
  186. keras_hub/src/models/opt/opt_tokenizer.py +116 -0
  187. keras_hub/src/models/pali_gemma/__init__.py +23 -0
  188. keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +277 -0
  189. keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +313 -0
  190. keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +147 -0
  191. keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +160 -0
  192. keras_hub/src/models/pali_gemma/pali_gemma_presets.py +78 -0
  193. keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +79 -0
  194. keras_hub/src/models/pali_gemma/pali_gemma_vit.py +566 -0
  195. keras_hub/src/models/phi3/__init__.py +20 -0
  196. keras_hub/src/models/phi3/phi3_attention.py +260 -0
  197. keras_hub/src/models/phi3/phi3_backbone.py +224 -0
  198. keras_hub/src/models/phi3/phi3_causal_lm.py +218 -0
  199. keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +173 -0
  200. keras_hub/src/models/phi3/phi3_decoder.py +260 -0
  201. keras_hub/src/models/phi3/phi3_layernorm.py +48 -0
  202. keras_hub/src/models/phi3/phi3_preprocessor.py +190 -0
  203. keras_hub/src/models/phi3/phi3_presets.py +50 -0
  204. keras_hub/src/models/phi3/phi3_rotary_embedding.py +137 -0
  205. keras_hub/src/models/phi3/phi3_tokenizer.py +94 -0
  206. keras_hub/src/models/preprocessor.py +207 -0
  207. keras_hub/src/models/resnet/__init__.py +13 -0
  208. keras_hub/src/models/resnet/resnet_backbone.py +612 -0
  209. keras_hub/src/models/resnet/resnet_image_classifier.py +136 -0
  210. keras_hub/src/models/roberta/__init__.py +20 -0
  211. keras_hub/src/models/roberta/roberta_backbone.py +184 -0
  212. keras_hub/src/models/roberta/roberta_classifier.py +209 -0
  213. keras_hub/src/models/roberta/roberta_masked_lm.py +136 -0
  214. keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +198 -0
  215. keras_hub/src/models/roberta/roberta_preprocessor.py +192 -0
  216. keras_hub/src/models/roberta/roberta_presets.py +43 -0
  217. keras_hub/src/models/roberta/roberta_tokenizer.py +132 -0
  218. keras_hub/src/models/seq_2_seq_lm.py +54 -0
  219. keras_hub/src/models/t5/__init__.py +20 -0
  220. keras_hub/src/models/t5/t5_backbone.py +261 -0
  221. keras_hub/src/models/t5/t5_layer_norm.py +35 -0
  222. keras_hub/src/models/t5/t5_multi_head_attention.py +324 -0
  223. keras_hub/src/models/t5/t5_presets.py +95 -0
  224. keras_hub/src/models/t5/t5_tokenizer.py +100 -0
  225. keras_hub/src/models/t5/t5_transformer_layer.py +178 -0
  226. keras_hub/src/models/task.py +419 -0
  227. keras_hub/src/models/vgg/__init__.py +13 -0
  228. keras_hub/src/models/vgg/vgg_backbone.py +158 -0
  229. keras_hub/src/models/vgg/vgg_image_classifier.py +124 -0
  230. keras_hub/src/models/vit_det/__init__.py +13 -0
  231. keras_hub/src/models/vit_det/vit_det_backbone.py +204 -0
  232. keras_hub/src/models/vit_det/vit_layers.py +565 -0
  233. keras_hub/src/models/whisper/__init__.py +20 -0
  234. keras_hub/src/models/whisper/whisper_audio_feature_extractor.py +260 -0
  235. keras_hub/src/models/whisper/whisper_backbone.py +305 -0
  236. keras_hub/src/models/whisper/whisper_cached_multi_head_attention.py +153 -0
  237. keras_hub/src/models/whisper/whisper_decoder.py +141 -0
  238. keras_hub/src/models/whisper/whisper_encoder.py +106 -0
  239. keras_hub/src/models/whisper/whisper_preprocessor.py +326 -0
  240. keras_hub/src/models/whisper/whisper_presets.py +148 -0
  241. keras_hub/src/models/whisper/whisper_tokenizer.py +163 -0
  242. keras_hub/src/models/xlm_roberta/__init__.py +26 -0
  243. keras_hub/src/models/xlm_roberta/xlm_roberta_backbone.py +81 -0
  244. keras_hub/src/models/xlm_roberta/xlm_roberta_classifier.py +225 -0
  245. keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +141 -0
  246. keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +195 -0
  247. keras_hub/src/models/xlm_roberta/xlm_roberta_preprocessor.py +205 -0
  248. keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +43 -0
  249. keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +191 -0
  250. keras_hub/src/models/xlnet/__init__.py +13 -0
  251. keras_hub/src/models/xlnet/relative_attention.py +459 -0
  252. keras_hub/src/models/xlnet/xlnet_backbone.py +222 -0
  253. keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +133 -0
  254. keras_hub/src/models/xlnet/xlnet_encoder.py +378 -0
  255. keras_hub/src/samplers/__init__.py +13 -0
  256. keras_hub/src/samplers/beam_sampler.py +207 -0
  257. keras_hub/src/samplers/contrastive_sampler.py +231 -0
  258. keras_hub/src/samplers/greedy_sampler.py +50 -0
  259. keras_hub/src/samplers/random_sampler.py +77 -0
  260. keras_hub/src/samplers/sampler.py +237 -0
  261. keras_hub/src/samplers/serialization.py +97 -0
  262. keras_hub/src/samplers/top_k_sampler.py +92 -0
  263. keras_hub/src/samplers/top_p_sampler.py +113 -0
  264. keras_hub/src/tests/__init__.py +13 -0
  265. keras_hub/src/tests/test_case.py +608 -0
  266. keras_hub/src/tokenizers/__init__.py +13 -0
  267. keras_hub/src/tokenizers/byte_pair_tokenizer.py +638 -0
  268. keras_hub/src/tokenizers/byte_tokenizer.py +299 -0
  269. keras_hub/src/tokenizers/sentence_piece_tokenizer.py +267 -0
  270. keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +150 -0
  271. keras_hub/src/tokenizers/tokenizer.py +235 -0
  272. keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +355 -0
  273. keras_hub/src/tokenizers/word_piece_tokenizer.py +544 -0
  274. keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +176 -0
  275. keras_hub/src/utils/__init__.py +13 -0
  276. keras_hub/src/utils/keras_utils.py +130 -0
  277. keras_hub/src/utils/pipeline_model.py +293 -0
  278. keras_hub/src/utils/preset_utils.py +621 -0
  279. keras_hub/src/utils/python_utils.py +21 -0
  280. keras_hub/src/utils/tensor_utils.py +206 -0
  281. keras_hub/src/utils/timm/__init__.py +13 -0
  282. keras_hub/src/utils/timm/convert.py +37 -0
  283. keras_hub/src/utils/timm/convert_resnet.py +171 -0
  284. keras_hub/src/utils/transformers/__init__.py +13 -0
  285. keras_hub/src/utils/transformers/convert.py +101 -0
  286. keras_hub/src/utils/transformers/convert_bert.py +173 -0
  287. keras_hub/src/utils/transformers/convert_distilbert.py +184 -0
  288. keras_hub/src/utils/transformers/convert_gemma.py +187 -0
  289. keras_hub/src/utils/transformers/convert_gpt2.py +186 -0
  290. keras_hub/src/utils/transformers/convert_llama3.py +136 -0
  291. keras_hub/src/utils/transformers/convert_pali_gemma.py +303 -0
  292. keras_hub/src/utils/transformers/safetensor_utils.py +97 -0
  293. keras_hub/src/version_utils.py +23 -0
  294. keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +34 -0
  295. keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +297 -0
  296. keras_hub_nightly-0.15.0.dev20240823171555.dist-info/WHEEL +5 -0
  297. keras_hub_nightly-0.15.0.dev20240823171555.dist-info/top_level.txt +1 -0
@@ -0,0 +1,355 @@
1
+ # Copyright 2024 The KerasHub Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from keras_hub.src.api_export import keras_hub_export
17
+ from keras_hub.src.tokenizers import tokenizer
18
+ from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
19
+ from keras_hub.src.utils.tensor_utils import is_int_dtype
20
+
21
+ try:
22
+ import tensorflow as tf
23
+ import tensorflow_text as tf_text
24
+ except ImportError:
25
+ tf = None
26
+ tf_text = None
27
+
28
+
29
+ @keras_hub_export("keras_hub.tokenizers.UnicodeCodepointTokenizer")
30
+ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
31
+ """A unicode character tokenizer layer.
32
+
33
+ This tokenizer is a vocabulary free tokenizer which tokenizes text as
34
+ unicode character codepoints.
35
+
36
+ Tokenizer outputs can either be padded and truncated with a
37
+ `sequence_length` argument, or left un-truncated. The exact output will
38
+ depend on the rank of the input tensors.
39
+
40
+ If input is a batch of strings (rank > 0):
41
+ By default, the layer will output a `tf.RaggedTensor` where the last
42
+ dimension of the output is ragged. If `sequence_length` is set, the layer
43
+ will output a dense `tf.Tensor` where all inputs have been padded or
44
+ truncated to `sequence_length`.
45
+
46
+ If input is a scalar string (rank == 0):
47
+ By default, the layer will output a dense `tf.Tensor` with static shape
48
+ `[None]`. If `sequence_length` is set, the output will be
49
+ a dense `tf.Tensor` of shape `[sequence_length]`.
50
+
51
+ The output dtype can be controlled via the `dtype` argument, which should be
52
+ an integer type ("int16", "int32", etc.).
53
+
54
+ Args:
55
+ lowercase: If `True`, the input text will be first lowered before
56
+ tokenization.
57
+ sequence_length: If set, the output will be converted to a dense
58
+ tensor and padded/trimmed so all outputs are of sequence_length.
59
+ normalization_form: One of the following string values (None, 'NFC',
60
+ 'NFKC', 'NFD', 'NFKD'). If set will normalize unicode to the given
61
+ form before tokenizing.
62
+ errors: One of ('replace', 'ignore', 'strict'). Specifies the
63
+ `detokenize()` behavior when an invalid codepoint is encountered.
64
+ The value of `'strict'` will cause the tokenizer to produce an
65
+ `InvalidArgument` error on any invalid input formatting. A value of
66
+ `'replace'` will cause the tokenizer to replace any invalid
67
+ formatting in the input with the replacement_char codepoint.
68
+ A value of `'ignore'` will cause the tokenizer to skip any invalid
69
+ formatting in the input and produce no corresponding output
70
+ character.
71
+ replacement_char: The unicode codepoint to use in place of invalid
72
+ codepoints. (U+FFFD) is `65533`. Defaults to `65533`.
73
+ input_encoding: One of ("UTF-8", "UTF-16-BE", or "UTF-32-BE").
74
+ The encoding of the input text. Defaults to `"UTF-8"`.
75
+ output_encoding: One of ("UTF-8", "UTF-16-BE", or "UTF-32-BE").
76
+ The encoding of the output text. Defaults to `"UTF-8"`.
77
+ vocabulary_size: Set the vocabulary `vocabulary_size`,
78
+ by clamping all codepoints to the range [0, vocabulary_size).
79
+ Effectively this will make the `vocabulary_size - 1` id the
80
+ OOV value.
81
+
82
+ Examples:
83
+
84
+ Basic Usage.
85
+ >>> inputs = "Unicode Tokenizer"
86
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
87
+ >>> outputs = tokenizer(inputs)
88
+ >>> np.array(outputs)
89
+ array([117, 110, 105, 99, 111, 100, 101, 32, 116, 111, 107, 101, 110,
90
+ 105, 122, 101, 114], dtype=int32)
91
+
92
+ Ragged outputs.
93
+ >>> inputs = ["पुस्तक", "کتاب"]
94
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
95
+ >>> seq1, seq2 = tokenizer(inputs)
96
+ >>> np.array(seq1)
97
+ array([2346, 2369, 2360, 2381, 2340, 2325], dtype=int32)
98
+ >>> np.array(seq2)
99
+ array([1705, 1578, 1575, 1576], dtype=int32)
100
+
101
+ Dense outputs.
102
+ >>> inputs = ["पुस्तक", "کتاب"]
103
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
104
+ ... sequence_length=8)
105
+ >>> seq1, seq2 = tokenizer(inputs)
106
+ >>> np.array(seq1)
107
+ array([2346, 2369, 2360, 2381, 2340, 2325, 0, 0], dtype=int32)
108
+ >>> np.array(seq2)
109
+ array([1705, 1578, 1575, 1576, 0, 0, 0, 0], dtype=int32)
110
+
111
+ Tokenize, then batch for ragged outputs.
112
+ >>> inputs = ["Book", "पुस्तक", "کتاب"]
113
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
114
+ >>> ds = tf.data.Dataset.from_tensor_slices(inputs)
115
+ >>> ds = ds.map(tokenizer)
116
+ >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(3))
117
+ >>> ds.take(1).get_single_element()
118
+ <tf.RaggedTensor [[98, 111, 111, 107],
119
+ [2346, 2369, 2360, 2381, 2340, 2325],
120
+ [1705, 1578, 1575, 1576]]>
121
+
122
+ Batch, then tokenize for ragged outputs.
123
+ >>> inputs = ["Book", "पुस्तक", "کتاب"]
124
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
125
+ >>> ds = tf.data.Dataset.from_tensor_slices(inputs)
126
+ >>> ds = ds.batch(3).map(tokenizer)
127
+ >>> ds.take(1).get_single_element()
128
+ <tf.RaggedTensor [[98, 111, 111, 107],
129
+ [2346, 2369, 2360, 2381, 2340, 2325],
130
+ [1705, 1578, 1575, 1576]]>
131
+
132
+ Tokenize, then batch for dense outputs (`sequence_length` provided).
133
+ >>> inputs = ["Book", "पुस्तक", "کتاب"]
134
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
135
+ ... sequence_length=5)
136
+ >>> ds = tf.data.Dataset.from_tensor_slices(inputs)
137
+ >>> ds = ds.map(tokenizer)
138
+ >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(3))
139
+ >>> ds.take(1).get_single_element()
140
+ <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
141
+ array([[ 98, 111, 111, 107, 0],
142
+ [2346, 2369, 2360, 2381, 2340],
143
+ [1705, 1578, 1575, 1576, 0]], dtype=int32)>
144
+
145
+ Batch, then tokenize for dense outputs (`sequence_length` provided).
146
+ >>> inputs = ["Book", "पुस्तक", "کتاب"]
147
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
148
+ ... sequence_length=5)
149
+ >>> ds = tf.data.Dataset.from_tensor_slices(inputs)
150
+ >>> ds = ds.batch(3).map(tokenizer)
151
+ >>> ds.take(1).get_single_element()
152
+ <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
153
+ array([[ 98, 111, 111, 107, 0],
154
+ [2346, 2369, 2360, 2381, 2340],
155
+ [1705, 1578, 1575, 1576, 0]], dtype=int32)>
156
+
157
+ Tokenization with truncation.
158
+ >>> inputs = ["I Like to Travel a Lot", "मैं किताबें पढ़ना पसंद करता हूं"]
159
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
160
+ ... sequence_length=5)
161
+ >>> outputs = tokenizer(inputs)
162
+ >>> np.array(outputs)
163
+ array([[ 105, 32, 108, 105, 107],
164
+ [2350, 2376, 2306, 32, 2325]], dtype=int32)
165
+
166
+ Tokenization with vocabulary_size.
167
+ >>> latin_ext_cutoff = 592
168
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
169
+ ... vocabulary_size=latin_ext_cutoff)
170
+ >>> outputs = tokenizer("¿Cómo estás?")
171
+ >>> np.array(outputs)
172
+ array([191, 99, 243, 109, 111, 32, 101, 115, 116, 225, 115, 63],
173
+ dtype=int32)
174
+ >>> outputs = tokenizer("आप कैसे हैं")
175
+ >>> np.array(outputs)
176
+ array([591, 591, 32, 591, 591, 591, 591, 32, 591, 591, 591],
177
+ dtype=int32)
178
+
179
+ Detokenization.
180
+ >>> inputs = tf.constant([110, 105, 110, 106, 97], dtype="int32")
181
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
182
+ >>> outputs = tokenizer.detokenize(inputs)
183
+ >>> np.array(outputs).astype("U")
184
+ array('ninja', dtype='<U5')
185
+
186
+ Detokenization with padding.
187
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
188
+ ... sequence_length=7)
189
+ >>> dataset = tf.data.Dataset.from_tensor_slices(["a b c", "b c", "a"])
190
+ >>> dataset = dataset.map(tokenizer)
191
+ >>> dataset.take(1).get_single_element()
192
+ <tf.Tensor: shape=(7,), dtype=int32,
193
+ numpy=array([97, 32, 98, 32, 99, 0, 0], dtype=int32)>
194
+ >>> detokunbatched = dataset.map(tokenizer.detokenize)
195
+ >>> detokunbatched.take(1).get_single_element()
196
+ <tf.Tensor: shape=(), dtype=string, numpy=b'a b c'>
197
+
198
+ Detokenization with invalid codepoints.
199
+ >>> inputs = tf.constant([110, 105, 10000000, 110, 106, 97])
200
+ >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
201
+ ... errors="replace", replacement_char=88)
202
+ >>> outputs = tokenizer.detokenize(inputs)
203
+ >>> np.array(outputs).astype("U")
204
+ array('niXnja', dtype='<U6')
205
+ """
206
+
207
def __init__(
    self,
    sequence_length=None,
    lowercase=True,
    normalization_form=None,
    errors="replace",
    replacement_char=65533,
    input_encoding="UTF-8",
    output_encoding="UTF-8",
    vocabulary_size=None,
    dtype="int32",
    **kwargs,
) -> None:
    """Validate the tokenizer options and store them on the instance.

    Args:
        sequence_length: Stored as `self.sequence_length`; when truthy,
            `tokenize()` converts its output to a dense tensor whose last
            dimension has this size.
        lowercase: Stored as `self.lowercase`; when truthy, `tokenize()`
            case-folds the input before decoding.
        normalization_form: One of `None`, `"NFC"`, `"NFKC"`, `"NFD"`,
            `"NFKD"`. Only accepted together with
            `input_encoding="UTF-8"`.
        errors: One of `"strict"`, `"replace"`, `"ignore"`. Forwarded to
            the unicode decode/encode ops.
        replacement_char: Codepoint forwarded to the unicode decode/encode
            ops as `replacement_char`.
        input_encoding: Encoding name forwarded to the decode op.
        output_encoding: Encoding name forwarded to the encode op.
        vocabulary_size: Stored as `self._vocabulary_size`; when truthy,
            `tokenize()` clips ids to `[0, vocabulary_size - 1]`.
        dtype: Output dtype; must be an integer dtype.
        **kwargs: Forwarded unchanged to the parent constructor.

    Raises:
        ValueError: If `dtype` is not an integer dtype, if
            `normalization_form` or `errors` is not an accepted value, or
            if `normalization_form` is set while `input_encoding` is not
            `"UTF-8"`.
    """
    if not is_int_dtype(dtype):
        raise ValueError(
            "Output dtype must be an integer type. "
            f"Received: dtype={dtype}"
        )

    # Check normalization_form.
    if normalization_form not in [None, "NFC", "NFKC", "NFD", "NFKD"]:
        raise ValueError(
            '`normalization_form` must be one of None, "NFC", "NFKC", '
            '"NFD", "NFKD". Received: normalization_form='
            f"{normalization_form}"
        )

    # Check errors.
    if errors not in ["strict", "replace", "ignore"]:
        raise ValueError(
            '`errors` must be one of "strict", "replace", "ignore". '
            f"Received: errors={errors}"
        )

    # Check normalization_form matches input_encoding. `tokenize()`
    # normalizes with `tf_text.normalize_utf8`, so any other input encoding
    # is rejected up front. The message is built from adjacent literals so
    # it does not carry a newline and source indentation into the error.
    if normalization_form and input_encoding != "UTF-8":
        raise ValueError(
            "Normalization Forms are Only Supported for Input Encoding "
            f"UTF-8. Received: input_encoding={input_encoding}"
        )

    # NOTE(review): `lowercase` is applied with `tf_text.case_fold_utf8` in
    # `tokenize()` and defaults to True, but there is no matching encoding
    # check here -- confirm whether lowercase with a non-UTF-8
    # `input_encoding` should be rejected as well.

    super().__init__(dtype=dtype, **kwargs)

    self.sequence_length = sequence_length
    self.lowercase = lowercase
    self.normalization_form = normalization_form
    self.errors = errors
    self.replacement_char = replacement_char
    self.input_encoding = input_encoding
    self.output_encoding = output_encoding
    self._vocabulary_size = vocabulary_size
259
+
260
def get_config(self):
    """Return the parent config extended with this tokenizer's options.

    Returns:
        The dict produced by the parent `get_config()`, updated in place
        with every constructor option stored on this instance.
    """
    base_config = super().get_config()
    base_config.update(
        sequence_length=self.sequence_length,
        lowercase=self.lowercase,
        normalization_form=self.normalization_form,
        errors=self.errors,
        replacement_char=self.replacement_char,
        input_encoding=self.input_encoding,
        output_encoding=self.output_encoding,
        # Stored privately because `vocabulary_size` is a method here.
        vocabulary_size=self._vocabulary_size,
    )
    return base_config
275
+
276
def vocabulary_size(self):
    """Return the configured vocabulary size.

    Returns:
        The value stored in `self._vocabulary_size`; `None` means no
        vocabulary size was provided.
    """
    configured_size = self._vocabulary_size
    return configured_size
280
+
281
def get_vocabulary(self):
    """Return a mapping from each single-character token to its id.

    The id of a token is its unicode code point, so the mapping is
    `{chr(i): i}` for every id below the vocabulary size.

    Returns:
        A dict mapping one-character strings to integer ids. When no
        `vocabulary_size` was configured, the mapping covers every code
        point up to `sys.maxunicode` (over a million entries), so the call
        is comparatively expensive in that case.
    """
    import sys

    size = self.vocabulary_size()
    if size is None:
        # `vocabulary_size` defaults to None; `range(None)` would raise
        # TypeError, so fall back to the full unicode code point range.
        size = sys.maxunicode + 1
    return {chr(codepoint): codepoint for codepoint in range(size)}
286
+
287
def tokenize(self, inputs):
    """Decode strings into unicode code point ids.

    Args:
        inputs: A string tensor, or any value accepted by
            `tf.convert_to_tensor`. Rank 0 inputs are handled as a batch
            of one and un-batched again before returning.

    Returns:
        The decoded code points cast to `self.compute_dtype`. The result
        is made dense with a last dimension of `self.sequence_length` when
        that is set, and clipped to `[0, vocabulary_size - 1]` when a
        vocabulary size is set.
    """
    if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
        inputs = tf.convert_to_tensor(inputs)

    # Give scalars a temporary batch axis so the steps below only ever see
    # batched input; it is squeezed away again at the end.
    is_scalar = inputs.shape.rank == 0
    text = tf.expand_dims(inputs, 0) if is_scalar else inputs

    # Optional text clean-up before decoding.
    if self.lowercase:
        text = tf_text.case_fold_utf8(text)
    if self.normalization_form:
        text = tf_text.normalize_utf8(text, self.normalization_form)

    codepoints = tf.cast(
        tf.strings.unicode_decode(
            text,
            errors=self.errors,
            replacement_char=self.replacement_char,
            input_encoding=self.input_encoding,
        ),
        self.compute_dtype,
    )

    if self.sequence_length:
        # Keep the leading dimensions and fix only the last one.
        dense_shape = codepoints.shape.as_list()
        dense_shape[-1] = self.sequence_length
        codepoints = codepoints.to_tensor(shape=dense_shape)

    if is_scalar:
        codepoints = tf.squeeze(codepoints, 0)

    # Optionally clamp the code points into [0, vocabulary_size).
    if self._vocabulary_size:
        highest_id = self._vocabulary_size - 1
        codepoints = tf.clip_by_value(codepoints, 0, highest_id)

    return codepoints
325
+
326
def detokenize(self, inputs):
    """Encode code point ids back into strings.

    Ids equal to 0 are dropped before encoding, so padding added by
    `tokenize()` does not appear in the output.

    Args:
        inputs: Integer ids; converted with `convert_to_ragged_batch`.

    Returns:
        The encoded strings in `self.output_encoding`, with the leading
        axis squeezed away when `convert_to_ragged_batch` reported the
        input as unbatched.
    """
    ragged_ids, was_unbatched, _ = convert_to_ragged_batch(inputs)

    # Remove every 0 id before encoding.
    keep_mask = tf.not_equal(ragged_ids, 0)
    ragged_ids = tf.ragged.boolean_mask(ragged_ids, keep_mask)

    text = tf.strings.unicode_encode(
        ragged_ids,
        errors=self.errors,
        replacement_char=self.replacement_char,
        output_encoding=self.output_encoding,
    )
    return tf.squeeze(text, 0) if was_unbatched else text
338
+
339
def id_to_token(self, id):
    """Convert an integer id to a string token.

    Args:
        id: The integer id, interpreted as a unicode code point. (The name
            shadows the builtin but is kept because it is part of the
            method's signature.)

    Returns:
        The one-character string `chr(id)`.

    Raises:
        ValueError: If `id` is negative or not below the vocabulary size.
            When no `vocabulary_size` was configured, the bound is the
            unicode code point range.
    """
    import sys

    limit = self.vocabulary_size()
    if limit is None:
        # `vocabulary_size` defaults to None; comparing an int against None
        # would raise TypeError, so bound by the unicode range instead.
        limit = sys.maxunicode + 1
    if id < 0 or id >= limit:
        raise ValueError(
            f"`id` must be in range [0, {limit - 1}]. "
            f"Received: {id}"
        )
    return chr(id)
347
+
348
def token_to_id(self, token):
    """Convert a string token to an integer id.

    Args:
        token: A one-character string; it is passed to `ord()`.

    Returns:
        The unicode code point of `token`.

    Raises:
        ValueError: If a `vocabulary_size` is configured and the code
            point is not below it.
    """
    codepoint = ord(token)
    limit = self.vocabulary_size()
    # `vocabulary_size` defaults to None, meaning no upper bound; comparing
    # against None would raise TypeError, so only check a configured bound.
    if limit is not None and codepoint >= limit:
        raise ValueError(
            f"Token {token} is not supported by `UnicodeCodepointTokenizer`."
        )
    return codepoint