@huggingface/transformers 3.1.1 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/README.md +10 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +1062 -183
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +2239 -1232
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +1 -358
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +1 -421
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +1 -358
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +1082 -181
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +11 -16
  16. package/src/backends/onnx.js +2 -7
  17. package/src/base/image_processors_utils.js +3 -1
  18. package/src/configs.js +11 -2
  19. package/src/env.js +1 -1
  20. package/src/models/feature_extractors.js +1 -0
  21. package/src/models/idefics3/image_processing_idefics3.js +24 -13
  22. package/src/models/image_processors.js +1 -0
  23. package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
  24. package/src/models/moonshine/processing_moonshine.js +20 -0
  25. package/src/models/paligemma/processing_paligemma.js +82 -0
  26. package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
  27. package/src/models/phi3_v/processing_phi3_v.js +53 -0
  28. package/src/models/processors.js +3 -0
  29. package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
  30. package/src/models/pyannote/processing_pyannote.js +7 -54
  31. package/src/models.js +233 -35
  32. package/src/ops/registry.js +11 -0
  33. package/src/pipelines.js +30 -0
  34. package/src/tokenizers.js +12 -1
  35. package/src/utils/core.js +39 -9
  36. package/src/utils/hub.js +8 -12
  37. package/src/utils/image.js +40 -0
  38. package/src/utils/tensor.js +51 -1
  39. package/types/backends/onnx.d.ts +2 -2
  40. package/types/backends/onnx.d.ts.map +1 -1
  41. package/types/base/feature_extraction_utils.d.ts +1 -1
  42. package/types/base/feature_extraction_utils.d.ts.map +1 -1
  43. package/types/base/image_processors_utils.d.ts +4 -4
  44. package/types/base/image_processors_utils.d.ts.map +1 -1
  45. package/types/base/processing_utils.d.ts +4 -4
  46. package/types/base/processing_utils.d.ts.map +1 -1
  47. package/types/configs.d.ts +7 -7
  48. package/types/configs.d.ts.map +1 -1
  49. package/types/env.d.ts +1 -1
  50. package/types/env.d.ts.map +1 -1
  51. package/types/generation/configuration_utils.d.ts +2 -2
  52. package/types/generation/logits_process.d.ts +2 -2
  53. package/types/generation/logits_process.d.ts.map +1 -1
  54. package/types/generation/logits_sampler.d.ts.map +1 -1
  55. package/types/generation/parameters.d.ts +5 -5
  56. package/types/generation/stopping_criteria.d.ts +1 -1
  57. package/types/generation/stopping_criteria.d.ts.map +1 -1
  58. package/types/generation/streamers.d.ts +2 -2
  59. package/types/generation/streamers.d.ts.map +1 -1
  60. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
  61. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
  62. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  63. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  64. package/types/models/auto/processing_auto.d.ts +1 -1
  65. package/types/models/auto/processing_auto.d.ts.map +1 -1
  66. package/types/models/clap/feature_extraction_clap.d.ts +1 -1
  67. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  68. package/types/models/detr/image_processing_detr.d.ts +11 -11
  69. package/types/models/detr/image_processing_detr.d.ts.map +1 -1
  70. package/types/models/donut/image_processing_donut.d.ts +1 -1
  71. package/types/models/donut/image_processing_donut.d.ts.map +1 -1
  72. package/types/models/feature_extractors.d.ts +1 -0
  73. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  74. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  75. package/types/models/idefics3/processing_idefics3.d.ts.map +1 -1
  76. package/types/models/image_processors.d.ts +1 -0
  77. package/types/models/janus/image_processing_janus.d.ts +1 -1
  78. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  79. package/types/models/janus/processing_janus.d.ts.map +1 -1
  80. package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
  81. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
  82. package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
  83. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  84. package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
  85. package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
  86. package/types/models/moonshine/processing_moonshine.d.ts +17 -0
  87. package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
  88. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
  89. package/types/models/paligemma/processing_paligemma.d.ts +12 -0
  90. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
  91. package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
  92. package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
  93. package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
  94. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
  95. package/types/models/processors.d.ts +3 -0
  96. package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
  97. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  98. package/types/models/pyannote/processing_pyannote.d.ts +4 -15
  99. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
  100. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  101. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
  102. package/types/models/sam/image_processing_sam.d.ts.map +1 -1
  103. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
  104. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
  105. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
  106. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
  107. package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
  108. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
  109. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
  110. package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
  111. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
  112. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
  113. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
  114. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
  115. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
  116. package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
  117. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  118. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  119. package/types/models/whisper/processing_whisper.d.ts.map +1 -1
  120. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
  121. package/types/models.d.ts +61 -5
  122. package/types/models.d.ts.map +1 -1
  123. package/types/ops/registry.d.ts +1 -0
  124. package/types/ops/registry.d.ts.map +1 -1
  125. package/types/pipelines.d.ts +31 -51
  126. package/types/pipelines.d.ts.map +1 -1
  127. package/types/tokenizers.d.ts +10 -6
  128. package/types/tokenizers.d.ts.map +1 -1
  129. package/types/utils/audio.d.ts.map +1 -1
  130. package/types/utils/constants.d.ts.map +1 -1
  131. package/types/utils/core.d.ts +87 -22
  132. package/types/utils/core.d.ts.map +1 -1
  133. package/types/utils/data-structures.d.ts.map +1 -1
  134. package/types/utils/devices.d.ts.map +1 -1
  135. package/types/utils/dtypes.d.ts.map +1 -1
  136. package/types/utils/generic.d.ts.map +1 -1
  137. package/types/utils/hub.d.ts +3 -3
  138. package/types/utils/hub.d.ts.map +1 -1
  139. package/types/utils/image.d.ts +10 -1
  140. package/types/utils/image.d.ts.map +1 -1
  141. package/types/utils/maths.d.ts +10 -10
  142. package/types/utils/maths.d.ts.map +1 -1
  143. package/types/utils/tensor.d.ts +22 -6
  144. package/types/utils/tensor.d.ts.map +1 -1
package/README.md CHANGED
@@ -47,7 +47,7 @@ npm i @huggingface/transformers
47
47
  Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
48
48
  ```html
49
49
  <script type="module">
50
- import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.1';
50
+ import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.2.0';
51
51
  </script>
52
52
  ```
53
53
 
@@ -155,7 +155,7 @@ Check out the Transformers.js [template](https://huggingface.co/new-space?templa
155
155
 
156
156
 
157
157
 
158
- By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.1/dist/), which should work out-of-the-box. You can customize this as follows:
158
+ By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.2.0/dist/), which should work out-of-the-box. You can customize this as follows:
159
159
 
160
160
  ### Settings
161
161
 
@@ -320,6 +320,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
320
320
  1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
321
321
  1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
322
322
  1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
323
+ 1. **EXAONE** (from LG AI Research) released with the papers [EXAONE 3.0 7.8B Instruction Tuned Language Model](https://arxiv.org/abs/2408.03541) and [EXAONE 3.5: Series of Large Language Models for Real-world Use Cases](https://arxiv.org/abs/2412.04862) by the LG AI Research team.
323
324
  1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
324
325
  1. **FastViT** (from Apple) released with the paper [FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization](https://arxiv.org/abs/2303.14189) by Pavan Kumar Anasosalu Vasu, James Gabriel, Jeff Zhu, Oncel Tuzel and Anurag Ranjan.
325
326
  1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
@@ -337,6 +338,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
337
338
  1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
338
339
  1. **[Hiera](https://huggingface.co/docs/transformers/model_doc/hiera)** (from Meta) released with the paper [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/pdf/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer.
339
340
  1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
341
+ 1. **[I-JEPA](https://huggingface.co/docs/transformers/model_doc/ijepa)** (from Meta) released with the paper [Self-Supervised Learning from Images with a Joint-Embedding Predictive Architecture](https://arxiv.org/abs/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas.
340
342
  1. **[Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3)** (from Hugging Face) released with the paper [Building and better understanding vision-language models: insights and future directions](https://arxiv.org/abs/2408.12637) by Hugo Laurençon, Andrés Marafioti, Victor Sanh, Léo Tronchon.
341
343
  1. **JAIS** (from Core42) released with the paper [Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open Generative Large Language Models](https://arxiv.org/pdf/2308.16149) by Neha Sengupta, Sunil Kumar Sahu, Bokang Jia, Satheesh Katipomu, Haonan Li, Fajri Koto, William Marshall, Gurpreet Gosal, Cynthia Liu, Zhiming Chen, Osama Mohammed Afzal, Samta Kamboj, Onkar Pandit, Rahul Pal, Lalit Pradhan, Zain Muhammad Mujahid, Massa Baali, Xudong Han, Sondos Mahmoud Bsharat, Alham Fikri Aji, Zhiqiang Shen, Zhengzhong Liu, Natalia Vassilieva, Joel Hestness, Andy Hock, Andrew Feldman, Jonathan Lee, Andrew Jackson, Hector Xuguang Ren, Preslav Nakov, Timothy Baldwin, Eric Xing.
342
344
  1. **Janus** (from DeepSeek) released with the paper [Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation](https://arxiv.org/abs/2410.13848) Chengyue Wu, Xiaokang Chen, Zhiyu Wu, Yiyang Ma, Xingchao Liu, Zizheng Pan, Wen Liu, Zhenda Xie, Xingkai Yu, Chong Ruan, Ping Luo.
@@ -365,20 +367,24 @@ You can refine your search by selecting the task you're interested in (e.g., [te
365
367
  1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
366
368
  1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
367
369
  1. **Moondream1** released in the repository [moondream](https://github.com/vikhyat/moondream) by vikhyat.
370
+ 1. **[Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine)** (from Useful Sensors) released with the paper [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden.
368
371
  1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
369
372
  1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
370
373
  1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
371
374
  1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
372
375
  1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
373
- 1. **[OLMo](https://huggingface.co/docs/transformers/master/model_doc/olmo)** (from AI2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
376
+ 1. **[OLMo](https://huggingface.co/docs/transformers/master/model_doc/olmo)** (from Ai2) released with the paper [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi.
377
+ 1. **[OLMo2](https://huggingface.co/docs/transformers/master/model_doc/olmo2)** (from Ai2) released with the blog [OLMo 2: The best fully open language model to date](https://allenai.org/blog/olmo2) by the Ai2 OLMo team.
374
378
  1. **OpenELM** (from Apple) released with the paper [OpenELM: An Efficient Language Model Family with Open-source Training and Inference Framework](https://arxiv.org/abs/2404.14619) by Sachin Mehta, Mohammad Hossein Sekhavat, Qingqing Cao, Maxwell Horton, Yanzi Jin, Chenfan Sun, Iman Mirzadeh, Mahyar Najibi, Dmitry Belenko, Peter Zatloukal, Mohammad Rastegari.
375
379
  1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
376
380
  1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
377
381
  1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
382
+ 1. **[PaliGemma](https://huggingface.co/docs/transformers/main/model_doc/paligemma)** (from Google) released with the papers [PaliGemma: A versatile 3B VLM for transfer](https://arxiv.org/abs/2407.07726) and [PaliGemma 2: A Family of Versatile VLMs for Transfer](https://arxiv.org/abs/2412.03555) by the PaliGemma Google team.
378
383
  1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from IBM) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/abs/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
379
384
  1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from Princeton University, IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
380
385
  1. **[Phi](https://huggingface.co/docs/transformers/main/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
381
- 1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou.
386
+ 1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v2) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou.
387
+ 1. **Phi3V** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v4) by Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Qin Cai, Vishrav Chaudhary, Dong Chen, Dongdong Chen, Weizhu Chen, Yen-Chun Chen, Yi-Ling Chen, Hao Cheng, Parul Chopra, Xiyang Dai, Matthew Dixon, Ronen Eldan, Victor Fragoso, Jianfeng Gao, Mei Gao, Min Gao, Amit Garg, Allie Del Giorno, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Wenxiang Hu, Jamie Huynh, Dan Iter, Sam Ade Jacobs, Mojan Javaheripi, Xin Jin, Nikos Karampatziakis, Piero Kauffmann, Mahoud Khademi, Dongwoo Kim, Young Jin Kim, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Yunsheng Li, Chen Liang, Lars Liden, Xihui Lin, Zeqi Lin, Ce Liu, Liyuan Liu, Mengchen Liu, Weishung Liu, Xiaodong Liu, Chong Luo, Piyush Madan, Ali Mahmoudzadeh, David Majercak, Matt Mazzola, Caio César Teodoro Mendes, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Liliang Ren, Gustavo de Rosa, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Yelong Shen, Swadheen Shukla, Xia Song, Masahiro Tanaka, Andrea Tupini, Praneetha Vaddamanu, Chunyu Wang, Guanhua Wang, Lijuan Wang , Shuohang Wang, Xin Wang, Yu Wang, Rachel Ward, Wen Wen, Philipp Witte, Haiping Wu, Xiaoxia Wu, Michael Wyatt, Bin Xiao, Can Xu, Jiahang Xu, Weijian Xu, Jilong Xue, Sonali Yadav, Fan Yang, Jianwei Yang, Yifan Yang, Ziyi Yang, Donghan Yu, Lu Yuan, Chenruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou.
382
388
  1. **[PVT](https://huggingface.co/docs/transformers/main/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
383
389
  1. **PyAnnote** released in the repository [pyannote/pyannote-audio](https://github.com/pyannote/pyannote-audio) by Hervé Bredin.
384
390
  1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.