spark-nlp 5.5.3__py2.py3-none-any.whl → 6.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spark-nlp might be problematic. Click here for more details.

Files changed (37) hide show
  1. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/METADATA +20 -11
  2. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/RECORD +36 -17
  3. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/WHEEL +1 -1
  4. sparknlp/__init__.py +2 -2
  5. sparknlp/annotator/classifier_dl/__init__.py +4 -0
  6. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  7. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +2 -2
  8. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  9. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  10. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  11. sparknlp/annotator/cleaners/__init__.py +15 -0
  12. sparknlp/annotator/cleaners/cleaner.py +202 -0
  13. sparknlp/annotator/cleaners/extractor.py +191 -0
  14. sparknlp/annotator/cv/__init__.py +9 -1
  15. sparknlp/annotator/cv/gemma3_for_multimodal.py +351 -0
  16. sparknlp/annotator/cv/janus_for_multimodal.py +356 -0
  17. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  18. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  19. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  20. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  21. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  22. sparknlp/annotator/cv/smolvlm_transformer.py +432 -0
  23. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +10 -6
  24. sparknlp/annotator/seq2seq/__init__.py +3 -0
  25. sparknlp/annotator/seq2seq/auto_gguf_model.py +8 -503
  26. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +333 -0
  27. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  28. sparknlp/annotator/seq2seq/llama3_transformer.py +4 -4
  29. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  30. sparknlp/base/image_assembler.py +58 -0
  31. sparknlp/common/properties.py +605 -96
  32. sparknlp/internal/__init__.py +127 -2
  33. sparknlp/reader/enums.py +19 -0
  34. sparknlp/reader/pdf_to_text.py +111 -0
  35. sparknlp/reader/sparknlp_reader.py +222 -14
  36. spark_nlp-5.5.3.dist-info/.uuid +0 -1
  37. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: spark-nlp
3
- Version: 5.5.3
3
+ Version: 6.0.1
4
4
  Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
5
5
  Home-page: https://github.com/JohnSnowLabs/spark-nlp
6
6
  Author: John Snow Labs
@@ -87,7 +87,7 @@ documentation and examples
87
87
 
88
88
  ## Quick Start
89
89
 
90
- This is a quick example of how to use Spark NLP pre-trained pipeline in Python and PySpark:
90
+ This is a quick example of how to use a Spark NLP pre-trained pipeline in Python and PySpark:
91
91
 
92
92
  ```sh
93
93
  $ java -version
@@ -95,7 +95,7 @@ $ java -version
95
95
  $ conda create -n sparknlp python=3.7 -y
96
96
  $ conda activate sparknlp
97
97
  # spark-nlp by default is based on pyspark 3.x
98
- $ pip install spark-nlp==5.5.3 pyspark==3.3.1
98
+ $ pip install spark-nlp==6.0.1 pyspark==3.3.1
99
99
  ```
100
100
 
101
101
  In Python console or Jupyter `Python3` kernel:
@@ -161,10 +161,11 @@ For a quick example of using pipelines and models take a look at our official [d
161
161
 
162
162
  ### Apache Spark Support
163
163
 
164
- Spark NLP *5.5.3* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
164
+ Spark NLP *6.0.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
165
165
 
166
166
  | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
167
167
  |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
168
+ | 6.0.x | YES | YES | YES | YES | YES | YES | NO | NO |
168
169
  | 5.5.x | YES | YES | YES | YES | YES | YES | NO | NO |
169
170
  | 5.4.x | YES | YES | YES | YES | YES | YES | NO | NO |
170
171
  | 5.3.x | YES | YES | YES | YES | YES | YES | NO | NO |
@@ -178,6 +179,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github
178
179
 
179
180
  | Spark NLP | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10| Scala 2.11 | Scala 2.12 |
180
181
  |-----------|------------|------------|------------|------------|------------|------------|------------|
182
+ | 6.0.x | NO | YES | YES | YES | YES | NO | YES |
181
183
  | 5.5.x | NO | YES | YES | YES | YES | NO | YES |
182
184
  | 5.4.x | NO | YES | YES | YES | YES | NO | YES |
183
185
  | 5.3.x | NO | YES | YES | YES | YES | NO | YES |
@@ -189,7 +191,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
189
191
 
190
192
  ### Databricks Support
191
193
 
192
- Spark NLP 5.5.3 has been tested and is compatible with the following runtimes:
194
+ Spark NLP 6.0.1 has been tested and is compatible with the following runtimes:
193
195
 
194
196
  | **CPU** | **GPU** |
195
197
  |--------------------|--------------------|
@@ -206,7 +208,7 @@ We are compatible with older runtimes. For a full list check databricks support
206
208
 
207
209
  ### EMR Support
208
210
 
209
- Spark NLP 5.5.3 has been tested and is compatible with the following EMR releases:
211
+ Spark NLP 6.0.1 has been tested and is compatible with the following EMR releases:
210
212
 
211
213
  | **EMR Release** |
212
214
  |--------------------|
@@ -216,6 +218,13 @@ Spark NLP 5.5.3 has been tested and is compatible with the following EMR release
216
218
  | emr-7.0.0 |
217
219
  | emr-7.1.0 |
218
220
  | emr-7.2.0 |
221
+ | emr-7.3.0 |
222
+ | emr-7.4.0 |
223
+ | emr-7.5.0 |
224
+ | emr-7.6.0 |
225
+ | emr-7.7.0 |
226
+ | emr-7.8.0 |
227
+
219
228
 
220
229
  We are compatible with older EMR releases. For a full list check EMR support in our official [documentation](https://sparknlp.org/docs/en/install#emr-support)
221
230
 
@@ -237,7 +246,7 @@ deployed to Maven central. To add any of our packages as a dependency in your ap
237
246
  from our official documentation.
238
247
 
239
248
  If you are interested, there is a simple SBT project for Spark NLP to guide you on how to use it in your
240
- projects [Spark NLP SBT S5.5.3r](https://github.com/maziyarpanahi/spark-nlp-starter)
249
+ projects [Spark NLP Starter](https://github.com/maziyarpanahi/spark-nlp-starter)
241
250
 
242
251
  ### Python
243
252
 
@@ -246,7 +255,7 @@ Check all available installations for Python in our official [documentation](htt
246
255
 
247
256
  ### Compiled JARs
248
257
 
249
- To compile the jars from source follow [these instructions](https://sparknlp.org/docs/en/compiled#jars) from our official documenation
258
+ To compile the jars from source follow [these instructions](https://sparknlp.org/docs/en/compiled#jars) from our official documentation
250
259
 
251
260
  ## Platform-Specific Instructions
252
261
 
@@ -266,7 +275,7 @@ For detailed instructions on how to use Spark NLP on supported platforms, please
266
275
 
267
276
  Spark NLP library and all the pre-trained models/pipelines can be used entirely offline with no access to the Internet.
268
277
  Please check [these instructions](https://sparknlp.org/docs/en/install#s3-integration) from our official documentation
269
- to use Spark NLP offline
278
+ to use Spark NLP offline.
270
279
 
271
280
  ## Advanced Settings
272
281
 
@@ -282,7 +291,7 @@ In Spark NLP we can define S3 locations to:
282
291
 
283
292
  Please check [these instructions](https://sparknlp.org/docs/en/install#s3-integration) from our official documentation.
284
293
 
285
- ## Document5.5.3
294
+ ## Documentation
286
295
 
287
296
  ### Examples
288
297
 
@@ -315,7 +324,7 @@ the Spark NLP library:
315
324
  keywords = {Spark, Natural language processing, Deep learning, Tensorflow, Cluster},
316
325
  abstract = {Spark NLP is a Natural Language Processing (NLP) library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines that can scale easily in a distributed environment. Spark NLP comes with 1100+ pretrained pipelines and models in more than 192+ languages. It supports nearly all the NLP tasks and modules that can be used seamlessly in a cluster. Downloaded more than 2.7 million times and experiencing 9x growth since January 2020, Spark NLP is used by 54% of healthcare organizations as the world’s most widely used NLP library in the enterprise.}
317
326
  }
318
- }5.5.3
327
+ }
319
328
  ```
320
329
 
321
330
  ## Community support
@@ -3,7 +3,7 @@ com/johnsnowlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
3
3
  com/johnsnowlabs/ml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  com/johnsnowlabs/ml/ai/__init__.py,sha256=YQiK2M7U4d8y5irPy_HB8ae0mSpqS9583MH44pnKJXc,295
5
5
  com/johnsnowlabs/nlp/__init__.py,sha256=DPIVXtONO5xXyOk-HB0-sNiHAcco17NN13zPS_6Uw8c,294
6
- sparknlp/__init__.py,sha256=Wmw9AZuFatQEjZ0WucHWPO4yF4HTsEZOVZ27IaEAbok,13783
6
+ sparknlp/__init__.py,sha256=shc-4QWkDcCU_Sgy4G3UiRpXWIhRXz0d2Z0eEwfEyYc,13783
7
7
  sparknlp/annotation.py,sha256=I5zOxG5vV2RfPZfqN9enT1i4mo6oBcn3Lrzs37QiOiA,5635
8
8
  sparknlp/annotation_audio.py,sha256=iRV_InSVhgvAwSRe9NTbUH9v6OGvTM-FPCpSAKVu0mE,1917
9
9
  sparknlp/annotation_image.py,sha256=xhCe8Ko-77XqWVuuYHFrjKqF6zPd8Z-RY_rmZXNwCXU,2547
@@ -30,13 +30,14 @@ sparknlp/annotator/audio/__init__.py,sha256=dXjtvi5c0aTZFq1Q_JciUd1uFTBVSJoUdcq0
30
30
  sparknlp/annotator/audio/hubert_for_ctc.py,sha256=76PfwPZZvOHU5kfDqLueCFbmqa4W8pMNRGoCvOqjsEA,7859
31
31
  sparknlp/annotator/audio/wav2vec2_for_ctc.py,sha256=K78P1U6vA4O1UufsLYzy0H7arsKNmwPcIV7kzDFsA5Q,6210
32
32
  sparknlp/annotator/audio/whisper_for_ctc.py,sha256=uII51umuohqwnAW0Q7VdxEFyr_j5LMnfpcRlf8TbetA,9800
33
- sparknlp/annotator/classifier_dl/__init__.py,sha256=4v2_3kSWQFFBc_KzaJ0gEC6ANVJpy5tsHa6CJGc4nCw,4005
33
+ sparknlp/annotator/classifier_dl/__init__.py,sha256=lQUdV9ynjn7hG2Wxb37iwc89gfOLC6g_UPgtFaIwm3g,4311
34
+ sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py,sha256=oaV3pTFNCnEpyaML1ydOPOdO40OtC_tOACrcm3IqPgU,5984
34
35
  sparknlp/annotator/classifier_dl/albert_for_question_answering.py,sha256=LG2dL6Fky1T35yXTUZBfIihIIGnkRFQ7ECQ3HRXXEG8,6517
35
36
  sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py,sha256=kWx7f9pcKE2qw319gn8FN0Md5dX38gbmfeoY9gWCLNk,7842
36
37
  sparknlp/annotator/classifier_dl/albert_for_token_classification.py,sha256=5rdsjWnsAVmtP-idU7ATKJ8lkH2rtlKZLnpi4Mq27eI,6839
37
38
  sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py,sha256=_TgV6EiIOiD_djA3fxfoz-o37mzMeKbn6iL2kZ6GzO0,8366
38
39
  sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py,sha256=yqQeDdpLbNOKuSZejZjSAjT8ydYyxsTVf2aFDgSSDfc,8767
39
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py,sha256=Ew_NGBj7F5ApgK3SyQh2HIfjD7ZTqTs0LZEQxjwoyto,5936
40
+ sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py,sha256=vq9Y2d253B6bN3ShKwGq1GVBDCFGKkmz_psShbnx-e8,5930
40
41
  sparknlp/annotator/classifier_dl/bert_for_question_answering.py,sha256=2euY_RAdMPA4IHJXZAd5MkQojFOtFNhB_hSc1iVQ5DQ,6433
41
42
  sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py,sha256=AzD3RQcRuQc0DDTbL6vGiacTtHlZnbAqksNvRQq7EQE,7800
42
43
  sparknlp/annotator/classifier_dl/bert_for_token_classification.py,sha256=uJXoDLPfPWiRmKqtw_3lLBvneIirj87S2JWwfd33zq8,6668
@@ -54,6 +55,7 @@ sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py,sha256=yA
54
55
  sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py,sha256=Cax3LcVLppiHs1dyahsBSq_TLHSwI2-K7LGCZHZNs1I,7926
55
56
  sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py,sha256=y9S83LW0Mfn4fRzopRXFj8l2gb-Nrm1rr9zRftOckJU,6832
56
57
  sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py,sha256=DqQOSr-TutHS6y165QJ-Pg6EEkG9JOFN0FxgyCi5SCg,8485
58
+ sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py,sha256=xRSs2B7YMSfqAHGzR79NzHq-rBEkxUl-pUNiXVxVWuk,6048
57
59
  sparknlp/annotator/classifier_dl/longformer_for_question_answering.py,sha256=VKbOKSTtwdeSsSzB2oKiRlFwSOcpHuMfkvgGM3ofBIo,6553
58
60
  sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py,sha256=_XO3Ufl_wHyUgUIechZ6J1VCE2G2W-FUPZfHmJSfQvk,7932
59
61
  sparknlp/annotator/classifier_dl/longformer_for_token_classification.py,sha256=RmiFuBRhIAoJoQ8Rgcu997-PxBK1hhWuLVlS1qztMyk,6848
@@ -61,24 +63,37 @@ sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py,sha256=w9hHLrQb
61
63
  sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py,sha256=M__giFElL6Q3I88QD6OoXDzdQDk_Zp5sS__Kh_XpLdo,7308
62
64
  sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py,sha256=SgFAJcv7ZE3BmJOehK_CjAaueqaaK6PR33zA5aE9-Ww,6754
63
65
  sparknlp/annotator/classifier_dl/multi_classifier_dl.py,sha256=ylKQzS7ROyeKeiOF4BZiIkQV1sfrnfUUQ9LXFSFK_Vo,16045
66
+ sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py,sha256=SlzkA_fKurWOQDhvWlEBiMUfLgIoaRRglIdENMv7u38,6008
64
67
  sparknlp/annotator/classifier_dl/roberta_for_question_answering.py,sha256=WRxu1uhXnY9C4UHdtJ8qiVGhPSX7sCdSaML0AWHOdJw,6471
65
68
  sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py,sha256=z97uH5WkG8kPX1Y9qtpLwD7egl0kzbVmxtq4xzZgNNI,7857
66
69
  sparknlp/annotator/classifier_dl/roberta_for_token_classification.py,sha256=hvnG31FonfirdLcIy4_bkhbdQalRlqS8x3woScQeRVg,7220
67
70
  sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py,sha256=nP0D_jg8xPFUMP9uGNvmACIqfwAneDddVXbioHLHFJ0,8818
68
71
  sparknlp/annotator/classifier_dl/sentiment_dl.py,sha256=6Z7X3-ykxoaUz6vz-YIXkv2m2_lxIDEwKAd1yHIzcvU,14416
69
72
  sparknlp/annotator/classifier_dl/tapas_for_question_answering.py,sha256=2YBODMDUZT-j5ceOFTixrEkOqrztIM1kU-tsW_wao18,6317
73
+ sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py,sha256=D9Gdw3UbnoamRfS_RXocIuqyZVap8uirX8IpT41RaYU,5600
70
74
  sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py,sha256=t_zCnKGCjDccKNj_2mjRkysOaNCWNBMKXehbuFSphQc,6538
71
75
  sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py,sha256=sudgwa8_QZQzaYvEMSt6J1bDDwyK2Hp1VFhh98P08hY,7930
72
76
  sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py,sha256=ub5mMiZYKP4eBmXRzjkjfv_FFFR8E01XJs0RC__RxPo,6808
73
77
  sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py,sha256=4dBzpPj-VJcZul5hGcyjYkVMQ1PiaXZEGwvEaob3rss,8899
74
78
  sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py,sha256=CI9Ah2lyHkqwDHWGCbkk_gPbQd0NudpC7oXiHtWOucs,7811
75
79
  sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py,sha256=SndQpIfslsSYEOX-myLjpUS6-wVIeDG8MOhJYcu2_7M,6739
80
+ sparknlp/annotator/cleaners/__init__.py,sha256=tqevy1reFBls_EQdvD1f8Hhj5o7s153-NNLCXRoKJtQ,693
81
+ sparknlp/annotator/cleaners/cleaner.py,sha256=r_0ImrtGT-S-ytOknKoP844FVSv0J9YVKQyNrmSBTjs,6839
82
+ sparknlp/annotator/cleaners/extractor.py,sha256=nml8mnOToZYPF5fTp9VWdDfnWTXryLDzp3RWfQoJkWY,5805
76
83
  sparknlp/annotator/coref/__init__.py,sha256=SG8MAaVxQpoYYAsyKaoOlvlHjorDzj3DHt4nnEdBWm8,53
77
84
  sparknlp/annotator/coref/spanbert_coref.py,sha256=AXWJhvVquY2uoApO_Np1fz7_KyJhxnZB4i-xk78sBfc,8407
78
- sparknlp/annotator/cv/__init__.py,sha256=194aJ5N5eE3HOYRzAAdroHTTQ0o1qyCrgyRLddvqBp0,1006
85
+ sparknlp/annotator/cv/__init__.py,sha256=ySHQ_8pumJFESLUtDqvb0X9oX6He-w_-Jw--Z3ASU5w,1473
79
86
  sparknlp/annotator/cv/blip_for_question_answering.py,sha256=At7L5pPBNDR1r-JGLKM5b3dTrq5Ecz9r0M1gToUVZTs,6551
80
87
  sparknlp/annotator/cv/clip_for_zero_shot_classification.py,sha256=_1pLc9BiFrFN10eJPCDJLJT-vdnTSG9OnB25Y_kKJIA,7528
81
88
  sparknlp/annotator/cv/convnext_for_image_classification.py,sha256=KzaAlYW5M2l73zUozzgg8_p14eGDz9k9PYVAUZLN25k,11874
89
+ sparknlp/annotator/cv/gemma3_for_multimodal.py,sha256=dh0KjTJGqpD-yN7d2f2auMbKLwL5w74Rhgai5y0LeHw,13053
90
+ sparknlp/annotator/cv/janus_for_multimodal.py,sha256=-TlAfeZ3A8iMJ23Q05Tx_KObgBfy9-qYAN9gAPCvjbw,14499
91
+ sparknlp/annotator/cv/llava_for_multimodal.py,sha256=kzOcZs08yCnB9AgaogZG28SguGqVUw9sumijhM5YRFU,12064
92
+ sparknlp/annotator/cv/mllama_for_multimodal.py,sha256=e4_bOGycy-gPYrl8en0mOP3eF8p17Xt85nwE5kmez5g,13071
93
+ sparknlp/annotator/cv/paligemma_for_multimodal.py,sha256=nqwGWRG4kc7FJ26DMwwhkN7FnBWGBZsTjL9H-scs69Q,11204
94
+ sparknlp/annotator/cv/phi3_vision_for_multimodal.py,sha256=MPGj07Gi-QCE5Ew5l3_SqUqBIR4Tvhhi1ZVbkuX-ihU,12127
95
+ sparknlp/annotator/cv/qwen2vl_transformer.py,sha256=S2jFwMfh-2iaTl7t8SndH1U1dHSpnlW6E0IQBtw_Xak,12565
96
+ sparknlp/annotator/cv/smolvlm_transformer.py,sha256=JJINJfo_tUvgZ89AG3YqJQ99yjqZpUzJ5vEfISXOeh0,16963
82
97
  sparknlp/annotator/cv/swin_for_image_classification.py,sha256=iZ1KY0GInbQmGzkmuNbds4PGPwCheLXc-Syv2HRmqug,10694
83
98
  sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py,sha256=rEWJte-qN6PI6ML2cGhsZ37wAzjHUtN_WD5pcKAez7M,10167
84
99
  sparknlp/annotator/cv/vit_for_image_classification.py,sha256=D2V3pxAd3rBi1817lxVOqaVvCw4trcVyorQgIPdLNAE,9148
@@ -87,7 +102,7 @@ sparknlp/annotator/dependency/dependency_parser.py,sha256=SxyvHPp8Hs1Xnm5X1nLTMi
87
102
  sparknlp/annotator/dependency/typed_dependency_parser.py,sha256=60vPdYkbFk9MPGegg3m9Uik9cMXpMZd8tBvXG39gNww,12456
88
103
  sparknlp/annotator/embeddings/__init__.py,sha256=KHDCHb8SMlkSGGSu69SfKneUDDUlBdMGdMzDrYp_cis,2408
89
104
  sparknlp/annotator/embeddings/albert_embeddings.py,sha256=6Rd1LIn8oFIpq_ALcJh-RUjPEO7Ht8wsHY6JHSFyMkw,9995
90
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py,sha256=ngqjiXUqkL3xOrmt42bY8pp7azgbIWqXGfbKud1CijM,19981
105
+ sparknlp/annotator/embeddings/auto_gguf_embeddings.py,sha256=IlqkPGOH2lmZvxEyDSGX-G90DtTFOe2Rvujfbg5zvlU,20185
91
106
  sparknlp/annotator/embeddings/bert_embeddings.py,sha256=HVUjkg56kBcpGZCo-fmPG5uatMDF3swW_lnbpy1SgSI,8463
92
107
  sparknlp/annotator/embeddings/bert_sentence_embeddings.py,sha256=NQy9KuXT9aKsTpYCR5RAeoFWI2YqEGorbdYrf_0KKmw,9148
93
108
  sparknlp/annotator/embeddings/bge_embeddings.py,sha256=Y4b6QzRJGc_Z9_R6SYq-P5NxcvI9XzJlBzwCLLHJpRo,8103
@@ -147,17 +162,20 @@ sparknlp/annotator/sentence/sentence_detector_dl.py,sha256=-Osj9Bm9KyZRTAWkOsK9c
147
162
  sparknlp/annotator/sentiment/__init__.py,sha256=Lq3vKaZS1YATLMg0VNXSVtkWL5q5G9taGBvdrvSwnfg,766
148
163
  sparknlp/annotator/sentiment/sentiment_detector.py,sha256=m545NGU0Xzg_PO6_qIfpli1uZj7JQcyFgqe9R6wAPFI,8154
149
164
  sparknlp/annotator/sentiment/vivekn_sentiment.py,sha256=4rpXWDgzU6ddnbrSCp9VdLb2epCc9oZ3c6XcqxEw8nk,9655
150
- sparknlp/annotator/seq2seq/__init__.py,sha256=Fdz1zsxpB6vM2a0sKuGCSMD1ZgqeVqAez0-AtppMGB4,1541
151
- sparknlp/annotator/seq2seq/auto_gguf_model.py,sha256=pTQq3KztHQq3fybdCmXEq5wTlb0t-5ANCfdQ_-7oQRg,38343
165
+ sparknlp/annotator/seq2seq/__init__.py,sha256=4h6taLL4pgzs_pR2uFx9AcLrCqtu8bx3hXNeK1_u_EE,1723
166
+ sparknlp/annotator/seq2seq/auto_gguf_model.py,sha256=PYHfljxtSSTL0lBm1-6ZBEHxkBLZvID63HSuxWv7XS4,11554
167
+ sparknlp/annotator/seq2seq/auto_gguf_vision_model.py,sha256=EYrm8EW7AMq3AoIKPe7Gp6ayBlFpWeg76AsAr4nanqU,15346
152
168
  sparknlp/annotator/seq2seq/bart_transformer.py,sha256=I1flM4yeCzEAKOdQllBC30XuedxVJ7ferkFhZ6gwEbE,18481
169
+ sparknlp/annotator/seq2seq/cohere_transformer.py,sha256=43LZBVazZMgJRCsN7HaYjVYfJ5hRMV95QZyxMtXq-m4,13496
153
170
  sparknlp/annotator/seq2seq/cpm_transformer.py,sha256=0CnBFMlxMu0pD2QZMHyoGtIYgXqfUQm68vr6zEAa6Eg,13290
154
171
  sparknlp/annotator/seq2seq/gpt2_transformer.py,sha256=Oz95R_NRR4tWHu_bW6Ak2832ZILXycp3ify7LfRSi8o,15310
155
172
  sparknlp/annotator/seq2seq/llama2_transformer.py,sha256=3LzTR0VerFdFmOizsrs2Q7HTnjELJ5WtfUgx5XnOqGM,13898
156
- sparknlp/annotator/seq2seq/llama3_transformer.py,sha256=dA3rIEVOLmlnJwhqkYmL_GrrcRVpoUY_i7QIyA5N2jM,14920
173
+ sparknlp/annotator/seq2seq/llama3_transformer.py,sha256=wmhgWQkO__H1vIGnAMjUU14Gtit4qOcE1m9YpM6YkB4,14950
157
174
  sparknlp/annotator/seq2seq/m2m100_transformer.py,sha256=uIL9RZuuryTIdAy9TbJf9wbz6RekhW8S079bJhaB6i4,16116
158
175
  sparknlp/annotator/seq2seq/marian_transformer.py,sha256=mQ4Ylh7ZzXAOue8f-x0gqzfS3vAz3XUdD7eQ2XhcEs4,13781
159
176
  sparknlp/annotator/seq2seq/mistral_transformer.py,sha256=PJegrSQts_58rkt96xaHlqU1fKIaz8hxt7DTPkGS10A,14254
160
177
  sparknlp/annotator/seq2seq/nllb_transformer.py,sha256=hOmdJOgl_-_PxoADrV-tVYmlfFrqNwvn6Vn2RC4siZM,19534
178
+ sparknlp/annotator/seq2seq/olmo_transformer.py,sha256=B_zhYkAfYycw5uBq1tVNPmaKuYtpJOxRC6PArit7XiE,13634
161
179
  sparknlp/annotator/seq2seq/phi2_transformer.py,sha256=WwKCUOH8qGFv62YF63HjuT7bMVldh06gHvaZH3tbSDk,13787
162
180
  sparknlp/annotator/seq2seq/phi3_transformer.py,sha256=arIcw5NDMv3ubBwWz3KYRdLMsspTiEI8vk4s00lyq1c,14293
163
181
  sparknlp/annotator/seq2seq/qwen_transformer.py,sha256=cOpOlz5r_apmVHZgp7uFjybSzVj2yxv8QYlYcGwFyKg,14645
@@ -185,7 +203,7 @@ sparknlp/base/finisher.py,sha256=V4wkMm9Ug09q4zTQc9T9Wr-awmu2Hu-eNaJ039YgZXM,858
185
203
  sparknlp/base/graph_finisher.py,sha256=a8fxk3ei2YQw6s0Y9Yy8oMOF1i1XUrgqaiwVE0VPt4w,4834
186
204
  sparknlp/base/has_recursive_fit.py,sha256=P55rSHLIXhihXWS2bOC_DskcQTc3njieVD1JkjS2bcA,849
187
205
  sparknlp/base/has_recursive_transform.py,sha256=UkGNgo4LMsjQC-Coeefg4bJcg7FoPcPiG382zEa6Ywk,841
188
- sparknlp/base/image_assembler.py,sha256=HH7ZJ-iZCXnBXVXekQLb1ei_HJuVxhYNVb94OrVLmeY,4068
206
+ sparknlp/base/image_assembler.py,sha256=-ylzVaDdjJBDQNkTixsCn7WvFB8cqC3_lPdvdiJu0aM,6168
189
207
  sparknlp/base/light_pipeline.py,sha256=2lOstyyK0o6L3BHPIZWQBpIKtJ7LcSz3Pvgo6eZDs5U,17023
190
208
  sparknlp/base/multi_document_assembler.py,sha256=4htET1fRAeOB6zhsNXsBq5rKZvn-LGD4vrFRjPZeqow,7070
191
209
  sparknlp/base/prompt_assembler.py,sha256=ysU4Vbmnuv2UBHK0JBkYrxiZiJ7_GTcVMip1-QRmheI,11570
@@ -199,12 +217,12 @@ sparknlp/common/annotator_properties.py,sha256=7B1os7pBUfHo6b7IPQAXQ-nir0u3tQLzD
199
217
  sparknlp/common/annotator_type.py,sha256=ash2Ip1IOOiJamPVyy_XQj8Ja_DRHm0b9Vj4Ni75oKM,1225
200
218
  sparknlp/common/coverage_result.py,sha256=No4PSh1HSs3PyRI1zC47x65tWgfirqPI290icHQoXEI,823
201
219
  sparknlp/common/match_strategy.py,sha256=kt1MUPqU1wCwk5qCdYk6jubHbU-5yfAYxb9jjAOrdnY,1678
202
- sparknlp/common/properties.py,sha256=TMUpY0EQ3b-GXO9iuctkKrunLhRYePqu2fbmHfocr2w,23870
220
+ sparknlp/common/properties.py,sha256=vuvF5eUq4H2LxgIhIPsjeS8AF4JJcRDEicmLqyeRgmk,51457
203
221
  sparknlp/common/read_as.py,sha256=imxPGwV7jr4Li_acbo0OAHHRGCBbYv-akzEGaBWEfcY,1226
204
222
  sparknlp/common/recursive_annotator_approach.py,sha256=vqugBw22cE3Ff7PIpRlnYFuOlchgL0nM26D8j-NdpqU,1449
205
223
  sparknlp/common/storage.py,sha256=D91H3p8EIjNspjqAYu6ephRpCUtdcAir4_PrAbkIQWE,4842
206
224
  sparknlp/common/utils.py,sha256=Yne6yYcwKxhOZC-U4qfYoDhWUP_6BIaAjI5X_P_df1E,1306
207
- sparknlp/internal/__init__.py,sha256=BttGS21n2-LGjx8udi7f4_nNt_BeUnfif9WpeZchuFE,34502
225
+ sparknlp/internal/__init__.py,sha256=YtsUXuuHzv4lATbepu7BhWJEc7Vo65OtEgphxEHOa5Q,39168
208
226
  sparknlp/internal/annotator_java_ml.py,sha256=UGPoThG0rGXUOXGSQnDzEDW81Mu1s5RPF29v7DFyE3c,1187
209
227
  sparknlp/internal/annotator_transformer.py,sha256=fXmc2IWXGybqZpbEU9obmbdBYPc798y42zvSB4tqV9U,1448
210
228
  sparknlp/internal/extended_java_wrapper.py,sha256=hwP0133-hDiDf5sBF-P3MtUsuuDj1PpQbtGZQIRwzfk,2240
@@ -217,7 +235,9 @@ sparknlp/pretrained/pretrained_pipeline.py,sha256=lquxiaABuA68Rmu7csamJPqBoRJqMU
217
235
  sparknlp/pretrained/resource_downloader.py,sha256=8_-rpvO2LsX_Lq4wMPif2ca3RlJZWEabt8pDm2xymiI,7806
218
236
  sparknlp/pretrained/utils.py,sha256=T1MrvW_DaWk_jcOjVLOea0NMFE9w8fe0ZT_5urZ_nEY,1099
219
237
  sparknlp/reader/__init__.py,sha256=-Toj3AIBki-zXPpV8ezFTI2LX1yP_rK2bhpoa8nBkTw,685
220
- sparknlp/reader/sparknlp_reader.py,sha256=cMliB2zDcmhxp44mu8aRcm5nFK2BXeFCuGgVUkhI8YQ,3825
238
+ sparknlp/reader/enums.py,sha256=MNGug9oJ1BBLM1Pbske13kAabalDzHa2kucF5xzFpHs,770
239
+ sparknlp/reader/pdf_to_text.py,sha256=o2-ZqioR3-apGDo5WCb0_I0sEQr6O-CGMfMb4W4YSss,3892
240
+ sparknlp/reader/sparknlp_reader.py,sha256=BEKfT9JaOWlA2ddsMNiC-pVRrM9Ad_4J4-Ur3iCNKH0,38218
221
241
  sparknlp/training/__init__.py,sha256=qREi9u-5Vc2VjpL6-XZsyvu5jSEIdIhowW7_kKaqMqo,852
222
242
  sparknlp/training/conll.py,sha256=wKBiSTrjc6mjsl7Nyt6B8f4yXsDJkZb-sn8iOjix9cE,6961
223
243
  sparknlp/training/conllu.py,sha256=8r3i-tmyrLsyk1DtZ9uo2mMDCWb1yw2Y5W6UsV13MkY,4953
@@ -248,8 +268,7 @@ sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py,sha256=R4yHFN3
248
268
  sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py,sha256=EoCSdcIjqQ3wv13MAuuWrKV8wyVBP0SbOEW41omHlR0,23189
249
269
  sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py,sha256=k5CQ7gKV6HZbZMB8cKLUJuZxoZWlP_DFWdZ--aIDwsc,2356
250
270
  sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py,sha256=pAxjWhjazSX8Vg0MFqJiuRVw1IbnQNSs-8Xp26L4nko,870
251
- spark_nlp-5.5.3.dist-info/.uuid,sha256=1f6hF51aIuv9yCvh31NU9lOpS34NE-h3a0Et7R9yR6A,36
252
- spark_nlp-5.5.3.dist-info/METADATA,sha256=rZJcS1xIcl3Vota-hC2wHauvrHO45e9c8Y86MjVt4go,19156
253
- spark_nlp-5.5.3.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
254
- spark_nlp-5.5.3.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
255
- spark_nlp-5.5.3.dist-info/RECORD,,
271
+ spark_nlp-6.0.1.dist-info/METADATA,sha256=kkkusDiCwvbTAsSSdkBaImpm8p2RxYSHEeKLHHMJb5k,19577
272
+ spark_nlp-6.0.1.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
273
+ spark_nlp-6.0.1.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
274
+ spark_nlp-6.0.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.38.4)
2
+ Generator: setuptools (75.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py2-none-any
5
5
  Tag: py3-none-any
sparknlp/__init__.py CHANGED
@@ -132,7 +132,7 @@ def start(gpu=False,
132
132
  The initiated Spark session.
133
133
 
134
134
  """
135
- current_version = "5.5.3"
135
+ current_version = "6.0.1"
136
136
 
137
137
  if params is None:
138
138
  params = {}
@@ -316,4 +316,4 @@ def version():
316
316
  str
317
317
  The current Spark NLP version.
318
318
  """
319
- return '5.5.3'
319
+ return '6.0.1'
@@ -55,3 +55,7 @@ from sparknlp.annotator.classifier_dl.mpnet_for_token_classification import *
55
55
  from sparknlp.annotator.classifier_dl.albert_for_zero_shot_classification import *
56
56
  from sparknlp.annotator.classifier_dl.camembert_for_zero_shot_classification import *
57
57
  from sparknlp.annotator.classifier_dl.bert_for_multiple_choice import *
58
+ from sparknlp.annotator.classifier_dl.xlm_roberta_for_multiple_choice import *
59
+ from sparknlp.annotator.classifier_dl.roberta_for_multiple_choice import *
60
+ from sparknlp.annotator.classifier_dl.distilbert_for_multiple_choice import *
61
+ from sparknlp.annotator.classifier_dl.albert_for_multiple_choice import *
@@ -0,0 +1,161 @@
1
+ # Copyright 2017-2024 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from sparknlp.common import *
16
+
17
+ class AlbertForMultipleChoice(AnnotatorModel,
18
+ HasCaseSensitiveProperties,
19
+ HasBatchedAnnotate,
20
+ HasEngine,
21
+ HasMaxSentenceLengthLimit):
22
+ """AlbertForMultipleChoice can load ALBERT Models with a multiple choice classification head on top
23
+ (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
24
+
25
+ Pretrained models can be loaded with :meth:`.pretrained` of the companion
26
+ object:
27
+
28
+ >>> spanClassifier = AlbertForMultipleChoice.pretrained() \\
29
+ ... .setInputCols(["document_question", "document_context"]) \\
30
+ ... .setOutputCol("answer")
31
+
32
+ The default model is ``"albert_base_uncased_multiple_choice"``, if no name is
33
+ provided.
34
+
35
+ For available pretrained models please see the `Models Hub
36
+ <https://sparknlp.org/models?task=Multiple+Choice>`__.
37
+
38
+ To see which models are compatible and how to import them see
39
+ `Import Transformers into Spark NLP 🚀
40
+ <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
41
+
42
+ ====================== ======================
43
+ Input Annotation types Output Annotation type
44
+ ====================== ======================
45
+ ``DOCUMENT, DOCUMENT`` ``CHUNK``
46
+ ====================== ======================
47
+
48
+ Parameters
49
+ ----------
50
+ batchSize
51
+ Batch size. Large values allows faster processing but requires more
52
+ memory, by default 8
53
+ caseSensitive
54
+ Whether to ignore case in tokens for embeddings matching, by default
55
+ False
56
+ maxSentenceLength
57
+ Max sentence length to process, by default 512
58
+
59
+ Examples
60
+ --------
61
+ >>> import sparknlp
62
+ >>> from sparknlp.base import *
63
+ >>> from sparknlp.annotator import *
64
+ >>> from pyspark.ml import Pipeline
65
+ >>> documentAssembler = MultiDocumentAssembler() \\
66
+ ... .setInputCols(["question", "context"]) \\
67
+ ... .setOutputCols(["document_question", "document_context"])
68
+ >>> questionAnswering = AlbertForMultipleChoice.pretrained() \\
69
+ ... .setInputCols(["document_question", "document_context"]) \\
70
+ ... .setOutputCol("answer") \\
71
+ ... .setCaseSensitive(False)
72
+ >>> pipeline = Pipeline().setStages([
73
+ ... documentAssembler,
74
+ ... questionAnswering
75
+ ... ])
76
+ >>> data = spark.createDataFrame([["The Eiffel Tower is located in which country??", "Germany, France, Italy"]]).toDF("question", "context")
77
+ >>> result = pipeline.fit(data).transform(data)
78
+ >>> result.select("answer.result").show(truncate=False)
79
+ +--------------------+
80
+ |result |
81
+ +--------------------+
82
+ |[France] |
83
+ +--------------------+
84
+ """
85
+ name = "AlbertForMultipleChoice"
86
+
87
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
88
+
89
+ outputAnnotatorType = AnnotatorType.CHUNK
90
+
91
+ choicesDelimiter = Param(Params._dummy(),
92
+ "choicesDelimiter",
93
+ "Delimiter character use to split the choices",
94
+ TypeConverters.toString)
95
+
96
+ def setChoicesDelimiter(self, value):
97
+ """Sets delimiter character use to split the choices
98
+
99
+ Parameters
100
+ ----------
101
+ value : string
102
+ Delimiter character use to split the choices
103
+ """
104
+ return self._set(caseSensitive=value)
105
+
106
+ @keyword_only
107
+ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForMultipleChoice",
108
+ java_model=None):
109
+ super(AlbertForMultipleChoice, self).__init__(
110
+ classname=classname,
111
+ java_model=java_model
112
+ )
113
+ self._setDefault(
114
+ batchSize=4,
115
+ maxSentenceLength=512,
116
+ caseSensitive=False,
117
+ choicesDelimiter = ","
118
+ )
119
+
120
+ @staticmethod
121
+ def loadSavedModel(folder, spark_session):
122
+ """Loads a locally saved model.
123
+
124
+ Parameters
125
+ ----------
126
+ folder : str
127
+ Folder of the saved model
128
+ spark_session : pyspark.sql.SparkSession
129
+ The current SparkSession
130
+
131
+ Returns
132
+ -------
133
+ BertForQuestionAnswering
134
+ The restored model
135
+ """
136
+ from sparknlp.internal import _AlbertMultipleChoiceLoader
137
+ jModel = _AlbertMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj
138
+ return AlbertForMultipleChoice(java_model=jModel)
139
+
140
+ @staticmethod
141
+ def pretrained(name="albert_base_uncased_multiple_choice", lang="en", remote_loc=None):
142
+ """Downloads and loads a pretrained model.
143
+
144
+ Parameters
145
+ ----------
146
+ name : str, optional
147
+ Name of the pretrained model, by default
148
+ "bert_base_uncased_multiple_choice"
149
+ lang : str, optional
150
+ Language of the pretrained model, by default "en"
151
+ remote_loc : str, optional
152
+ Optional remote address of the resource, by default None. Will use
153
+ Spark NLPs repositories otherwise.
154
+
155
+ Returns
156
+ -------
157
+ BertForQuestionAnswering
158
+ The restored model
159
+ """
160
+ from sparknlp.pretrained import ResourceDownloader
161
+ return ResourceDownloader.downloadModel(AlbertForMultipleChoice, name, lang, remote_loc)
@@ -130,7 +130,7 @@ class BertForMultipleChoice(AnnotatorModel,
130
130
 
131
131
  Returns
132
132
  -------
133
- BertForQuestionAnswering
133
+ BertForMultipleChoice
134
134
  The restored model
135
135
  """
136
136
  from sparknlp.internal import _BertMultipleChoiceLoader
@@ -154,7 +154,7 @@ class BertForMultipleChoice(AnnotatorModel,
154
154
 
155
155
  Returns
156
156
  -------
157
- BertForQuestionAnswering
157
+ BertForMultipleChoice
158
158
  The restored model
159
159
  """
160
160
  from sparknlp.pretrained import ResourceDownloader
@@ -0,0 +1,161 @@
1
+ # Copyright 2017-2024 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from sparknlp.common import *
16
+
17
class DistilBertForMultipleChoice(AnnotatorModel,
                                  HasCaseSensitiveProperties,
                                  HasBatchedAnnotate,
                                  HasEngine,
                                  HasMaxSentenceLengthLimit):
    """DistilBertForMultipleChoice can load DistilBert Models with a multiple choice classification head on top
    (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spanClassifier = DistilBertForMultipleChoice.pretrained() \\
    ...     .setInputCols(["document_question", "document_context"]) \\
    ...     .setOutputCol("answer")

    The default model is ``"distilbert_base_uncased_multiple_choice"``, if no
    name is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Multiple+Choice>`__.

    To see which models are compatible and how to import them see
    `Import Transformers into Spark NLP 🚀
    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, DOCUMENT`` ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Batch size. Large values allows faster processing but requires more
        memory, by default 4
    caseSensitive
        Whether to ignore case in tokens for embeddings matching, by default
        False
    maxSentenceLength
        Max sentence length to process, by default 512

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = MultiDocumentAssembler() \\
    ...     .setInputCols(["question", "context"]) \\
    ...     .setOutputCols(["document_question", "document_context"])
    >>> questionAnswering = DistilBertForMultipleChoice.pretrained() \\
    ...     .setInputCols(["document_question", "document_context"]) \\
    ...     .setOutputCol("answer") \\
    ...     .setCaseSensitive(False)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     questionAnswering
    ... ])
    >>> data = spark.createDataFrame([["The Eiffel Tower is located in which country??", "Germany, France, Italy"]]).toDF("question", "context")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("answer.result").show(truncate=False)
    +--------------------+
    |result              |
    +--------------------+
    |[France]            |
    +--------------------+
    """
    name = "DistilBertForMultipleChoice"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    choicesDelimiter = Param(Params._dummy(),
                             "choicesDelimiter",
                             "Delimiter character use to split the choices",
                             TypeConverters.toString)

    def setChoicesDelimiter(self, value):
        """Sets delimiter character use to split the choices

        Parameters
        ----------
        value : string
            Delimiter character use to split the choices
        """
        # Fixed: previously this set caseSensitive instead of choicesDelimiter,
        # silently clobbering the case-sensitivity flag.
        return self._set(choicesDelimiter=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForMultipleChoice",
                 java_model=None):
        super(DistilBertForMultipleChoice, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            batchSize=4,
            maxSentenceLength=512,
            caseSensitive=False,
            choicesDelimiter=","
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        DistilBertForMultipleChoice
            The restored model
        """
        from sparknlp.internal import _DistilBertMultipleChoiceLoader
        jModel = _DistilBertMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj
        return DistilBertForMultipleChoice(java_model=jModel)

    @staticmethod
    def pretrained(name="distilbert_base_uncased_multiple_choice", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default
            "distilbert_base_uncased_multiple_choice"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        DistilBertForMultipleChoice
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(DistilBertForMultipleChoice, name, lang, remote_loc)