spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/annotator/spell_check/norvig_sweeting.py CHANGED
@@ -29,9 +29,6 @@ class NorvigSweetingApproach(AnnotatorApproach):
 
  For instantiated/pretrained models, see :class:`.NorvigSweetingModel`.
 
- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.
-
  ====================== ======================
  Input Annotation types Output Annotation type
  ====================== ======================
@@ -270,11 +267,11 @@ class NorvigSweetingModel(AnnotatorModel):
 
  The default model is ``"spellcheck_norvig"``, if no name is provided. For
  available pretrained models please see the `Models Hub
- <https://nlp.johnsnowlabs.com/models?task=Spell+Check>`__.
+ <https://sparknlp.org/models?task=Spell+Check>`__.
 
 
- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.
+ For extended examples of usage, see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
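For orientation, a minimal usage sketch of the pretrained model referenced in this hunk (not part of the diff; the input sentence and column names are illustrative):

# Sketch: load the default "spellcheck_norvig" model described in the docstring above.
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, NorvigSweetingModel
from pyspark.ml import Pipeline

spark = sparknlp.start()
document = DocumentAssembler().setInputCol("text").setOutputCol("document")
token = Tokenizer().setInputCols(["document"]).setOutputCol("token")
spell = NorvigSweetingModel.pretrained().setInputCols(["token"]).setOutputCol("checked")
data = spark.createDataFrame([["somtimes i wrrite words wrongg"]], ["text"])
result = Pipeline(stages=[document, token, spell]).fit(data).transform(data)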
sparknlp/annotator/spell_check/symmetric_delete.py CHANGED
@@ -212,7 +212,7 @@ class SymmetricDeleteModel(AnnotatorModel):
 
  The default model is ``"spellcheck_sd"``, if no name is provided. For
  available pretrained models please see the `Models Hub
- <https://nlp.johnsnowlabs.com/models?task=Spell+Check>`__.
+ <https://sparknlp.org/models?task=Spell+Check>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
sparknlp/annotator/stemmer.py CHANGED
@@ -19,8 +19,8 @@ class Stemmer(AnnotatorModel):
  """Returns hard-stems out of words with the objective of retrieving the
  meaningful part of the word.
 
- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+ For extended examples of usage, see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
@@ -77,4 +77,3 @@ class Stemmer(AnnotatorModel):
  self._setDefault(
      language="english"
  )
-
sparknlp/annotator/stop_words_cleaner.py CHANGED
@@ -34,10 +34,10 @@ class StopWordsCleaner(AnnotatorModel):
  This will load the default pretrained model ``"stopwords_en"``.
 
  For available pretrained models please see the `Models Hub
- <https://nlp.johnsnowlabs.com/models?task=Stop+Words+Removal>`__.
+ <https://sparknlp.org/models?task=Stop+Words+Removal>`__.
 
- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+ For extended examples of usage, see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
@@ -188,4 +188,3 @@ class StopWordsCleaner(AnnotatorModel):
  """
  from sparknlp.pretrained import ResourceDownloader
  return ResourceDownloader.downloadModel(StopWordsCleaner, name, lang, remote_loc)
-
sparknlp/annotator/tf_ner_dl_graph_builder.py CHANGED
@@ -66,7 +66,7 @@ class TFNerDLGraphBuilder(Estimator, DefaultParamsWritable, DefaultParamsReadabl
 
  Parameters
  ----------
- *value : str
+ *value : List[str]
      Input columns for the annotator
  """
  if type(value[0]) == str or type(value[0]) == list:
sparknlp/annotator/token/__init__.py CHANGED
@@ -16,5 +16,4 @@
  from sparknlp.annotator.token.chunk_tokenizer import *
  from sparknlp.annotator.token.recursive_tokenizer import *
  from sparknlp.annotator.token.regex_tokenizer import *
- from sparknlp.annotator.token.token2_chunk import *
  from sparknlp.annotator.token.tokenizer import *
sparknlp/annotator/token/recursive_tokenizer.py CHANGED
@@ -28,8 +28,8 @@ class RecursiveTokenizer(AnnotatorApproach):
  - ``infixes``: Strings that will be split when found at the middle of token.
  - ``whitelist``: Whitelist of strings not to split
 
- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/7.Context_Spell_Checker.ipynb>`__.
+ For extended examples of usage, see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
@@ -203,4 +203,3 @@ class RecursiveTokenizerModel(AnnotatorModel):
  classname=classname,
  java_model=java_model
  )
-
sparknlp/annotator/token/tokenizer.py CHANGED
@@ -27,8 +27,8 @@ class Tokenizer(AnnotatorApproach):
  Identifies tokens with tokenization open standards. A few rules will help
  customizing it if defaults do not fit user needs.
 
- For extended examples of usage see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+ For extended examples of usage see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
@@ -559,4 +559,3 @@ class TokenizerModel(AnnotatorModel):
  """
  from sparknlp.pretrained import ResourceDownloader
  return ResourceDownloader.downloadModel(TokenizerModel, name, lang, remote_loc)
-
sparknlp/annotator/ws/word_segmenter.py CHANGED
@@ -20,11 +20,27 @@ class WordSegmenterApproach(AnnotatorApproach):
  """Trains a WordSegmenter which tokenizes non-english or non-whitespace
  separated texts.
 
- Many languages are not whitespace separated and their sentences are a
- concatenation of many symbols, like Korean, Japanese or Chinese. Without
- understanding the language, splitting the words into their corresponding
- tokens is impossible. The WordSegmenter is trained to understand these
- languages and split them into semantically correct parts.
+ Many languages are not whitespace separated and their sentences are a concatenation
+ of many symbols, like Korean, Japanese or Chinese. Without understanding the
+ language, splitting the words into their corresponding tokens is impossible. The
+ WordSegmenter is trained to understand these languages and split them into
+ semantically correct parts.
+
+ This annotator is based on the paper Chinese Word Segmentation as Character Tagging
+ [1]. Word segmentation is treated as a tagging problem. Each character is tagged
+ as one of four different labels: LL (left boundary), RR (right boundary), MM (middle)
+ and LR (word by itself). The label depends on the position of the word in the
+ sentence. LL tagged words will combine with the word on the right. Likewise, RR
+ tagged words combine with words on the left. MM tagged words are treated as the
+ middle of the word and combine with either side. LR tagged words are words by
+ themselves.
+
+ Example (from [1], Example 3(a) (raw), 3(b) (tagged), 3(c) (translation)):
+ - 上海 计划 到 本 世纪 末 实现 人均 国内 生产 总值 五千 美元
+ - 上/LL 海/RR 计/LL 划/RR 到/LR 本/LR 世/LL 纪/RR 末/LR 实/LL 现/RR 人/LL 均/RR
+   国/LL 内/RR 生/LL 产/RR 总/LL 值/RR 五/LL 千/RR 美/LL 元/RR
+ - Shanghai plans to reach the goal of 5,000 dollars in per capita GDP by the end
+   of the century.
 
  For instantiated/pretrained models, see :class:`.WordSegmenterModel`.
 
@@ -37,8 +53,17 @@ class WordSegmenterApproach(AnnotatorApproach):
  The helper class :class:`.POS` might be useful to read training data into
  data frames.
 
- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/tree/master/jupyter/annotation/chinese/word_segmentation>`__.
+ For extended examples of usage, see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/chinese/word_segmentation>`__.
+
+ References
+ ----------
+
+ `[1] <https://aclanthology.org/O03-4002.pdf>`__ Xue, Nianwen. “Chinese Word
+ Segmentation as Character Tagging.” International Journal of Computational
+ Linguistics & Chinese Language Processing, Volume 8, Number 1, February 2003:
+ Special Issue on Word Formation and Chinese Language Processing, 2003, pp. 29-48.
+ ACLWeb, https://aclanthology.org/O03-4002.
 
  ====================== ======================
  Input Annotation types Output Annotation type
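To connect the character-tagging scheme and the :class:`.POS` helper mentioned above, a minimal training sketch (not part of the diff; the training-file path and iteration count are placeholders):

# Sketch: train a WordSegmenterApproach on character-tagged data read with the POS helper.
# "chinese_train.utf8" is a placeholder path; POS.readDataset defaults to the "tags"
# and "document" output columns used below.
import sparknlp
from sparknlp.training import POS
from sparknlp.annotator import WordSegmenterApproach

spark = sparknlp.start()
train = POS().readDataset(spark, "chinese_train.utf8")
segmenter = WordSegmenterApproach() \
    .setInputCols(["document"]) \
    .setOutputCol("token") \
    .setPosColumn("tags") \
    .setNIterations(5)
model = segmenter.fit(train)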
@@ -282,10 +307,10 @@ class WordSegmenterModel(AnnotatorModel):
 
  The default model is ``"wordseg_pku"``, default language is ``"zh"``, if no
  values are provided. For available pretrained models please see the `Models
- Hub <https://nlp.johnsnowlabs.com/models?task=Word+Segmentation>`__.
+ Hub <https://sparknlp.org/models?task=Word+Segmentation>`__.
 
- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/chinese/word_segmentation/words_segmenter_demo.ipynb>`__.
+ For extended examples of usage, see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/jupyter/annotation/chinese/word_segmentation/words_segmenter_demo.ipynb>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
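And the inference counterpart with the default pretrained model named in this hunk (a sketch, not from the diff; the input sentence reuses the docstring example above):

# Sketch: segment Chinese text with the default "wordseg_pku" model.
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import WordSegmenterModel
from pyspark.ml import Pipeline

spark = sparknlp.start()
document = DocumentAssembler().setInputCol("text").setOutputCol("document")
segmenter = WordSegmenterModel.pretrained().setInputCols(["document"]).setOutputCol("token")
data = spark.createDataFrame([["上海计划到本世纪末实现人均国内生产总值五千美元"]], ["text"])
tokens = Pipeline(stages=[document, segmenter]).fit(data).transform(data)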
sparknlp/base/__init__.py CHANGED
@@ -12,13 +12,12 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  """Module of base Spark NLP annotators."""
-
- from sparknlp.base.chunk2_doc import *
  from sparknlp.base.doc2_chunk import *
  from sparknlp.base.document_assembler import *
  from sparknlp.base.multi_document_assembler import *
  from sparknlp.base.embeddings_finisher import *
  from sparknlp.base.finisher import *
+ from sparknlp.base.gguf_ranking_finisher import *
  from sparknlp.base.graph_finisher import *
  from sparknlp.base.has_recursive_fit import *
  from sparknlp.base.has_recursive_transform import *
@@ -28,4 +27,4 @@ from sparknlp.base.token_assembler import *
  from sparknlp.base.image_assembler import *
  from sparknlp.base.audio_assembler import *
  from sparknlp.base.table_assembler import *
-
+ from sparknlp.base.prompt_assembler import *
sparknlp/base/doc2_chunk.py CHANGED
@@ -29,9 +29,6 @@ class Doc2Chunk(AnnotatorTransformer, AnnotatorProperties):
  ``StringType`` or ``ArrayType[StringType]`` (using setIsArray). Useful for
  annotators that require a CHUNK type input.
 
- For more extended examples on document pre-processing see the
- `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
-
  ====================== ======================
  Input Annotation types Output Annotation type
  ====================== ======================
sparknlp/base/document_assembler.py CHANGED
@@ -24,13 +24,13 @@ class DocumentAssembler(AnnotatorTransformer):
  """Prepares data into a format that is processable by Spark NLP.
 
  This is the entry point for every Spark NLP pipeline. The
- `DocumentAssembler` can read either a ``String`` column or an
- ``Array[String]``. Additionally, :meth:`.setCleanupMode` can be used to
- pre-process the text (Default: ``disabled``). For possible options please
- refer the parameters section.
+ `DocumentAssembler` reads ``String`` columns. Additionally,
+ :meth:`.setCleanupMode` can be used to pre-process the
+ text (Default: ``disabled``). For possible options please refer the
+ parameters section.
 
  For more extended examples on document pre-processing see the
- `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+ `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
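A short sketch of the narrowed input contract documented in this hunk (not part of the diff; column names are illustrative):

# Sketch: DocumentAssembler reads a single String column; "shrink" is one of the
# cleanup modes listed in the parameters section.
from sparknlp.base import DocumentAssembler

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document") \
    .setCleanupMode("shrink")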
sparknlp/base/embeddings_finisher.py CHANGED
@@ -34,7 +34,8 @@ class EmbeddingsFinisher(AnnotatorTransformer):
  require a ``featureCol``.
 
  For more extended examples see the
- `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/5.1_Text_classification_examples_in_SparkML_SparkNLP.ipynb>`__.
+ `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb
+ >`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
@@ -127,7 +128,8 @@ class EmbeddingsFinisher(AnnotatorTransformer):
  super(EmbeddingsFinisher, self).__init__(classname="com.johnsnowlabs.nlp.EmbeddingsFinisher")
  self._setDefault(
      cleanAnnotations=False,
-     outputAsVector=False
+     outputAsVector=False,
+     outputCols=[]
  )
 
  @keyword_only
@@ -187,3 +189,13 @@ class EmbeddingsFinisher(AnnotatorTransformer):
 
  return self._set(outputAsVector=value)
 
+ def getInputCols(self):
+     """Gets input columns name of annotations."""
+     return self.getOrDefault(self.inputCols)
+
+ def getOutputCols(self):
+     """Gets output columns name of annotations."""
+     if len(self.getOrDefault(self.outputCols)) == 0:
+         return ["finished_" + input_col for input_col in self.getInputCols()]
+     else:
+         return self.getOrDefault(self.outputCols)
sparknlp/base/finisher.py CHANGED
@@ -25,7 +25,8 @@ class Finisher(AnnotatorTransformer):
  outputs annotation(s) values into ``String``.
 
  For more extended examples on document pre-processing see the
- `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+ `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb
+ >`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type
@@ -97,7 +98,6 @@ class Finisher(AnnotatorTransformer):
  includeMetadata = Param(Params._dummy(), "includeMetadata", "annotation metadata format", typeConverter=TypeConverters.toBoolean)
  outputAsArray = Param(Params._dummy(), "outputAsArray", "finisher generates an Array with the results instead of string", typeConverter=TypeConverters.toBoolean)
  parseEmbeddingsVectors = Param(Params._dummy(), "parseEmbeddingsVectors", "whether to include embeddings vectors in the process", typeConverter=TypeConverters.toBoolean)
-
  name = "Finisher"
 
  @keyword_only
@@ -109,7 +109,8 @@ class Finisher(AnnotatorTransformer):
  outputAsArray=True,
  parseEmbeddingsVectors=False,
  valueSplitSymbol="#",
- annotationSplitSymbol="@"
+ annotationSplitSymbol="@",
+ outputCols=[]
  )
 
  @keyword_only
@@ -122,7 +123,7 @@ class Finisher(AnnotatorTransformer):
 
  Parameters
  ----------
- *value : str
+ *value : List[str]
      Input columns for the annotator
  """
  if len(value) == 1 and type(value[0]) == list:
@@ -204,3 +205,13 @@ class Finisher(AnnotatorTransformer):
  """
  return self._set(parseEmbeddingsVectors=value)
 
+ def getInputCols(self):
+     """Gets input columns name of annotations."""
+     return self.getOrDefault(self.inputCols)
+
+ def getOutputCols(self):
+     """Gets output columns name of annotations."""
+     if len(self.getOrDefault(self.outputCols)) == 0:
+         return ["finished_" + input_col for input_col in self.getInputCols()]
+     else:
+         return self.getOrDefault(self.outputCols)
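Both finisher hunks above add the same getOutputCols defaulting; a sketch of the behaviour they introduce (not part of the diff; column names are illustrative):

# Sketch: with outputCols left at its new default of [], output names are derived
# from the input columns with a "finished_" prefix.
from sparknlp.base import Finisher

finisher = Finisher().setInputCols(["token", "ner_chunk"])
print(finisher.getOutputCols())  # ['finished_token', 'finished_ner_chunk']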
sparknlp/base/gguf_ranking_finisher.py ADDED
@@ -0,0 +1,234 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the GGUFRankingFinisher."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class GGUFRankingFinisher(AnnotatorTransformer):
+     """Finisher for AutoGGUFReranker outputs that provides ranking capabilities
+     including top-k selection, sorting by relevance score, and score normalization.
+
+     This finisher processes the output of AutoGGUFReranker, which contains documents with
+     relevance scores in their metadata. It provides several options for post-processing:
+
+     - Top-k selection: Select only the top k documents by relevance score
+     - Score thresholding: Filter documents by minimum relevance score
+     - Min-max scaling: Normalize relevance scores to 0-1 range
+     - Sorting: Sort documents by relevance score in descending order
+     - Ranking: Add rank information to document metadata
+
+     The finisher preserves the document annotation structure while adding ranking information
+     to the metadata and optionally filtering/sorting the documents.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/finisher/gguf_ranking_finisher_example.py>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     inputCols
+         Name of input annotation columns containing reranked documents
+     outputCol
+         Name of output annotation column containing ranked documents, by default "ranked_documents"
+     topK
+         Maximum number of top documents to return based on relevance score (-1 for no limit), by default -1
+     minRelevanceScore
+         Minimum relevance score threshold for filtering documents, by default Double.MinValue
+     minMaxScaling
+         Whether to apply min-max scaling to normalize relevance scores to 0-1 range, by default False
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> reranker = AutoGGUFReranker.pretrained() \\
+     ...     .setInputCols("document") \\
+     ...     .setOutputCol("reranked_documents") \\
+     ...     .setQuery("A man is eating pasta.")
+     >>> finisher = GGUFRankingFinisher() \\
+     ...     .setInputCols("reranked_documents") \\
+     ...     .setOutputCol("ranked_documents") \\
+     ...     .setTopK(3) \\
+     ...     .setMinMaxScaling(True)
+     >>> pipeline = Pipeline().setStages([documentAssembler, reranker, finisher])
+     >>> data = spark.createDataFrame([
+     ...     ("A man is eating food.",),
+     ...     ("A man is eating a piece of bread.",),
+     ...     ("The girl is carrying a baby.",),
+     ...     ("A man is riding a horse.",)
+     ... ], ["text"])
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("ranked_documents").show(truncate=False)
+     # Documents will be sorted by relevance with rank information in metadata
+     """
+
+     name = "GGUFRankingFinisher"
+
+     inputCols = Param(Params._dummy(),
+                       "inputCols",
+                       "Name of input annotation columns containing reranked documents",
+                       typeConverter=TypeConverters.toListString)
+
+     outputCol = Param(Params._dummy(),
+                       "outputCol",
+                       "Name of output annotation column containing ranked documents",
+                       typeConverter=TypeConverters.toListString)
+
+     topK = Param(Params._dummy(),
+                  "topK",
+                  "Maximum number of top documents to return based on relevance score (-1 for no limit)",
+                  typeConverter=TypeConverters.toInt)
+
+     minRelevanceScore = Param(Params._dummy(),
+                               "minRelevanceScore",
+                               "Minimum relevance score threshold for filtering documents",
+                               typeConverter=TypeConverters.toFloat)
+
+     minMaxScaling = Param(Params._dummy(),
+                           "minMaxScaling",
+                           "Whether to apply min-max scaling to normalize relevance scores to 0-1 range",
+                           typeConverter=TypeConverters.toBoolean)
+
+     @keyword_only
+     def __init__(self):
+         super(GGUFRankingFinisher, self).__init__(
+             classname="com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher")
+         self._setDefault(
+             topK=-1,
+             minRelevanceScore=float('-inf'),  # Equivalent to Double.MinValue
+             minMaxScaling=False,
+             outputCol=["ranked_documents"]
+         )
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setInputCols(self, *value):
+         """Sets input annotation column names.
+
+         Parameters
+         ----------
+         value : List[str]
+             Input annotation column names containing reranked documents
+         """
+         if len(value) == 1 and isinstance(value[0], list):
+             return self._set(inputCols=value[0])
+         else:
+             return self._set(inputCols=list(value))
+
+     def getInputCols(self):
+         """Gets input annotation column names.
+
+         Returns
+         -------
+         List[str]
+             Input annotation column names
+         """
+         return self.getOrDefault(self.inputCols)
+
+     def setOutputCol(self, value):
+         """Sets output annotation column name.
+
+         Parameters
+         ----------
+         value : str
+             Output annotation column name
+         """
+         return self._set(outputCol=[value])
+
+     def getOutputCol(self):
+         """Gets output annotation column name.
+
+         Returns
+         -------
+         str
+             Output annotation column name
+         """
+         output_cols = self.getOrDefault(self.outputCol)
+         return output_cols[0] if output_cols else "ranked_documents"
+
+     def setTopK(self, value):
+         """Sets maximum number of top documents to return.
+
+         Parameters
+         ----------
+         value : int
+             Maximum number of top documents to return (-1 for no limit)
+         """
+         return self._set(topK=value)
+
+     def getTopK(self):
+         """Gets maximum number of top documents to return.
+
+         Returns
+         -------
+         int
+             Maximum number of top documents to return
+         """
+         return self.getOrDefault(self.topK)
+
+     def setMinRelevanceScore(self, value):
+         """Sets minimum relevance score threshold.
+
+         Parameters
+         ----------
+         value : float
+             Minimum relevance score threshold
+         """
+         return self._set(minRelevanceScore=value)
+
+     def getMinRelevanceScore(self):
+         """Gets minimum relevance score threshold.
+
+         Returns
+         -------
+         float
+             Minimum relevance score threshold
+         """
+         return self.getOrDefault(self.minRelevanceScore)
+
+     def setMinMaxScaling(self, value):
+         """Sets whether to apply min-max scaling.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to apply min-max scaling to normalize scores
+         """
+         return self._set(minMaxScaling=value)
+
+     def getMinMaxScaling(self):
+         """Gets whether to apply min-max scaling.
+
+         Returns
+         -------
+         bool
+             Whether min-max scaling is enabled
+         """
+         return self.getOrDefault(self.minMaxScaling)
sparknlp/base/image_assembler.py CHANGED
@@ -15,6 +15,8 @@
 
  from pyspark import keyword_only
  from pyspark.ml.param import TypeConverters, Params, Param
+ from pyspark.sql import SparkSession, DataFrame
+ from pyspark.sql.functions import regexp_replace, col
 
  from sparknlp.common import AnnotatorType
  from sparknlp.internal import AnnotatorTransformer
@@ -65,6 +67,7 @@ class ImageAssembler(AnnotatorTransformer):
  outputAnnotatorType = AnnotatorType.IMAGE
 
  inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
+ textCol = Param(Params._dummy(), "textCol", "text column name", typeConverter=TypeConverters.toString)
  outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
  name = 'ImageAssembler'
 
@@ -101,3 +104,69 @@ class ImageAssembler(AnnotatorTransformer):
  def getOutputCol(self):
      """Gets output column name of annotations."""
      return self.getOrDefault(self.outputCol)
+
+ def setTextCol(self, value):
+     """Sets an optional text column name.
+
+     Parameters
+     ----------
+     value : str
+         Name of an optional input text column
+     """
+     return self._set(inputCol=value)
+
+ @classmethod
+ def loadImagesAsBytes(cls, spark: SparkSession, path: str):
+     """
+     Loads images from a given path and returns them as raw bytes, instead of the default
+     OpenCV-compatible format. Supported image types include JPEG, PNG, GIF, and BMP.
+
+     Multimodal inference with llama.cpp requires raw bytes as input.
+
+     Parameters
+     ----------
+     spark : SparkSession
+         The active SparkSession.
+     path : str
+         The path to the images. Supported image types are JPEG, PNG, GIF, and BMP.
+
+     Returns
+     -------
+     DataFrame
+         A DataFrame containing the images as raw bytes along with their metadata.
+     """
+
+     # Replace the path separator in the `origin` field and `path` column, so that they match
+     def replace_path(column_name: str):
+         return regexp_replace(col(column_name), ":///", ":/")
+
+     # Load the images as metadata with the default Spark image format
+     data = (
+         spark.read.format("image")
+         .option("dropInvalid", True)
+         .load(path)
+         .withColumn(
+             "image", col("image").withField("origin", replace_path("image.origin"))
+         )
+     )
+
+     # Load the images as raw binary files
+     image_bytes = (
+         spark.read.format("binaryFile")
+         .option("pathGlobFilter", "*.{jpeg,jpg,png,gif,bmp,JPEG,JPG,PNG,GIF,BMP}")
+         .option("dropInvalid", True)
+         .load(path)
+         .withColumn("path", replace_path("path"))
+     )
+
+     # Join the two datasets on the file path
+     df_joined = data.join(
+         image_bytes, data["image.origin"] == image_bytes["path"], "inner"
+     )
+
+     # Replace the `data` field of the `image` column with raw bytes
+     df_image_replaced = df_joined.withColumn(
+         "image", df_joined["image"].withField("data", df_joined["content"])
+     )
+
+     return df_image_replaced
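A usage sketch combining the new helper with ImageAssembler (not part of the diff; the image directory is a placeholder):

# Sketch: load images as raw bytes (needed for the llama.cpp-based multimodal annotators)
# and assemble them; "path/to/images" is a placeholder directory.
import sparknlp
from sparknlp.base import ImageAssembler

spark = sparknlp.start()
image_df = ImageAssembler.loadImagesAsBytes(spark, "path/to/images")
imageAssembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler")
assembled = imageAssembler.transform(image_df)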