spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
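Note on the layout change visible in the listing above: the single-file modules of 2.6.3rc1 (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py, ...) are removed and replaced by packages (sparknlp/annotator/, sparknlp/base/, sparknlp/common/, ...). A minimal sketch of what that means for imports, assuming spark-nlp 6.2.1 and pyspark are installed and that the new package __init__ files re-export the annotator classes (inferred from the listing, not verified here):

    import sparknlp
    from sparknlp.base import DocumentAssembler            # now provided by the sparknlp/base/ package
    from sparknlp.annotator import Tokenizer               # re-exported via sparknlp/annotator/__init__.py
    from sparknlp.annotator.audio.wav2vec2_for_ctc import Wav2Vec2ForCTC  # new fully qualified path

    spark = sparknlp.start()       # starts a SparkSession with the Spark NLP jar attached
    print(sparknlp.version())      # expected to report 6.2.1 for this wheel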
sparknlp/annotator/audio/wav2vec2_for_ctc.py
@@ -0,0 +1,161 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Contains classes concerning Wav2Vec2ForCTC."""
+
+ from sparknlp.common import *
+
+
+ class Wav2Vec2ForCTC(AnnotatorModel,
+                      HasBatchedAnnotateAudio,
+                      HasAudioFeatureProperties,
+                      HasEngine):
+     """Wav2Vec2 Model with a language modeling head on top for Connectionist Temporal
+     Classification (CTC). Wav2Vec2 was proposed in wav2vec 2.0: A Framework for
+     Self-Supervised Learning of Speech Representations by Alexei Baevski, Henry Zhou,
+     Abdelrahman Mohamed, Michael Auli.
+
+     The annotator takes audio files and transcribes them as text. The audio needs to be
+     provided pre-processed as an array of floats.
+
+     Note that this annotator is currently not supported on Apple Silicon processors such
+     as the M1. This is due to the processor not supporting instructions for XLA.
+
+     Pretrained models can be loaded with ``pretrained`` of the companion object:
+
+     >>> speechToText = Wav2Vec2ForCTC.pretrained() \\
+     ...     .setInputCols(["audio_assembler"]) \\
+     ...     .setOutputCol("text")
+
+
+     The default model is ``"asr_wav2vec2_base_960h"``, if no name is provided.
+
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models>`__.
+
+     To see which models are compatible and how to import them see
+     https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended
+     examples, see
+     `Wav2Vec2ForCTCTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTCTestSpec.scala>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``AUDIO``              ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+
+     batchSize
+         Size of each batch, by default 2
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> audioAssembler = AudioAssembler() \\
+     ...     .setInputCol("audio_content") \\
+     ...     .setOutputCol("audio_assembler")
+     >>> speechToText = Wav2Vec2ForCTC \\
+     ...     .pretrained() \\
+     ...     .setInputCols(["audio_assembler"]) \\
+     ...     .setOutputCol("text")
+     >>> pipeline = Pipeline().setStages([audioAssembler, speechToText])
+     >>> processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")
+     >>> result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)
+     >>> result.select("text.result").show(truncate = False)
+     +------------------------------------------------------------------------------------------+
+     |result                                                                                    |
+     +------------------------------------------------------------------------------------------+
+     |[MISTER QUILTER IS THE APOSTLE OF THE MIDLE CLASES AND WE ARE GLAD TO WELCOME HIS GOSPEL ]|
+     +------------------------------------------------------------------------------------------+
+     """
+     name = "Wav2Vec2ForCTC"
+
+     inputAnnotatorTypes = [AnnotatorType.AUDIO]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with "
+                              "config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.audio.Wav2Vec2ForCTC",
+                  java_model=None):
+         super(Wav2Vec2ForCTC, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=2
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         Wav2Vec2ForCTC
+             The restored model
+         """
+         from sparknlp.internal import _Wav2Vec2ForCTC
+         jModel = _Wav2Vec2ForCTC(folder, spark_session._jsparkSession)._java_obj
+         return Wav2Vec2ForCTC(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="asr_wav2vec2_base_960h", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "asr_wav2vec2_base_960h"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         Wav2Vec2ForCTC
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(Wav2Vec2ForCTC, name, lang, remote_loc)
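The docstring example above feeds a rawFloats variable that it never constructs. A hedged sketch of one way to build it, assuming librosa is available and resampling to the 16 kHz mono audio the base Wav2Vec2 models expect (the file name is hypothetical):

    import librosa

    def load_audio_floats(path, sample_rate=16000):
        """Load an audio file as a flat list of Python floats at the given sample rate."""
        waveform, _ = librosa.load(path, sr=sample_rate, mono=True)
        return [float(sample) for sample in waveform]

    rawFloats = load_audio_floats("speech_sample.wav")  # hypothetical input file
    processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")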
sparknlp/annotator/audio/whisper_for_ctc.py
@@ -0,0 +1,251 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Contains classes concerning WhisperForCTC."""
+
+ from sparknlp.common import *
+
+
+ class WhisperForCTC(AnnotatorModel,
+                     HasBatchedAnnotateAudio,
+                     HasAudioFeatureProperties,
+                     HasEngine, HasGeneratorProperties):
+     """Whisper Model with a language modeling head on top for Connectionist Temporal Classification
+     (CTC).
+
+     Whisper is an automatic speech recognition (ASR) system trained on 680,000 hours of
+     multilingual and multitask supervised data collected from the web. It can transcribe in multiple
+     languages, as well as translate from those languages into English.
+
+     The audio needs to be provided pre-processed as an array of floats.
+
+     Note that at the moment, this annotator only supports greedy search, and only Spark versions
+     3.4 and up are supported.
+
+     For multilingual models, the language and the task (transcribe or translate) can be set with
+     ``setLanguage`` and ``setTask``.
+
+     Pretrained models can be loaded with ``pretrained`` of the companion object:
+
+     .. code-block:: python
+
+         speechToText = WhisperForCTC.pretrained() \\
+             .setInputCols(["audio_assembler"]) \\
+             .setOutputCol("text")
+
+
+     The default model is ``"asr_whisper_tiny_opt"``, if no name is provided.
+
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models>`__.
+
+     To see which models are compatible and how to import them see
+     https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended
+     examples, see
+     `WhisperForCTCTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala>`__.
+
+     **References:**
+
+     `Robust Speech Recognition via Large-Scale Weak Supervision <https://arxiv.org/abs/2212.04356>`__
+
+     **Paper Abstract:**
+
+     *We study the capabilities of speech processing systems trained simply to predict large
+     amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual
+     and multitask supervision, the resulting models generalize well to standard benchmarks and are
+     often competitive with prior fully supervised results but in a zero-shot transfer setting
+     without the need for any fine-tuning. When compared to humans, the models approach their
+     accuracy and robustness. We are releasing models and inference code to serve as a foundation
+     for further work on robust speech processing.*
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``AUDIO``              ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     task
+         The formatted task for the audio. Either `<|translate|>` or `<|transcribe|>`.
+     language
+         The language for the audio, formatted to e.g. `<|en|>`. Check the model description for
+         supported languages.
+     isMultilingual
+         Whether the model is multilingual
+     minOutputLength
+         Minimum length of the sequence to be generated
+     maxOutputLength
+         Maximum length of output text
+     doSample
+         Whether or not to use sampling; use greedy decoding otherwise
+     temperature
+         The value used to modulate the next token probabilities
+     topK
+         The number of highest probability vocabulary tokens to keep for top-k-filtering
+     topP
+         If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are
+         kept for generation
+     repetitionPenalty
+         The parameter for repetition penalty. 1.0 means no penalty.
+         See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details
+     noRepeatNgramSize
+         If set to int > 0, all ngrams of that size can only occur once
+     beamSize
+         The number of beams for beam search
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> audioAssembler = AudioAssembler() \\
+     ...     .setInputCol("audio_content") \\
+     ...     .setOutputCol("audio_assembler")
+     >>> speechToText = WhisperForCTC.pretrained() \\
+     ...     .setInputCols(["audio_assembler"]) \\
+     ...     .setOutputCol("text")
+     >>> pipeline = Pipeline().setStages([audioAssembler, speechToText])
+     >>> processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")
+     >>> result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)
+     >>> result.select("text.result").show(truncate = False)
+     +------------------------------------------------------------------------------------------+
+     |result                                                                                    |
+     +------------------------------------------------------------------------------------------+
+     |[ Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.]|
+     +------------------------------------------------------------------------------------------+
+     """
+     name = "WhisperForCTC"
+
+     inputAnnotatorTypes = [AnnotatorType.AUDIO]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with "
+                              "config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     language = Param(Params._dummy(), "language", "Optional parameter to set the language for the transcription.",
+                      typeConverter=TypeConverters.toString)
+
+     isMultilingual = Param(Params._dummy(), "isMultilingual", "Whether the model is multilingual.",
+                            typeConverter=TypeConverters.toBoolean)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def getLanguage(self):
+         """Gets the language for the transcription."""
+         return self.getOrDefault(self.language)
+
+     def getIsMultilingual(self):
+         """Gets whether the model is multilingual."""
+         return self.getOrDefault(self.isMultilingual)
+
+     def setLanguage(self, value):
+         """Sets the language for the audio, formatted to e.g. `<|en|>`. Check the model description for
+         supported languages.
+
+         Parameters
+         ----------
+         value : String
+             Formatted language code
+         """
+         return self._call_java("setLanguage", value)
+
+     def setTask(self, value):
+         """Sets the formatted task for the audio. Either `<|translate|>` or `<|transcribe|>`.
+
+         Only multilingual models can do translation.
+
+         Parameters
+         ----------
+         value : String
+             Formatted task
+         """
+         return self._call_java("setTask", value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.audio.WhisperForCTC",
+                  java_model=None):
+         super(WhisperForCTC, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             minOutputLength=0,
+             maxOutputLength=448,
+             doSample=False,
+             temperature=1.0,
+             topK=1,
+             topP=1.0,
+             repetitionPenalty=1.0,
+             noRepeatNgramSize=0,
+             batchSize=2,
+             beamSize=1,
+             nReturnSequences=1,
+             isMultilingual=True,
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         WhisperForCTC
+             The restored model
+         """
+         from sparknlp.internal import _WhisperForCTC
+         jModel = _WhisperForCTC(folder, spark_session._jsparkSession)._java_obj
+         return WhisperForCTC(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="asr_whisper_tiny_opt", lang="xx", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "asr_whisper_tiny_opt"
+         lang : str, optional
+             Language of the pretrained model, by default "xx"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         WhisperForCTC
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(WhisperForCTC, name, lang, remote_loc)
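The hunk above also adds setLanguage and setTask for the multilingual checkpoints. A minimal sketch of a speech-translation pipeline built on the same audio_content column as in the example, using the documented default model; the `<|de|>` source language is only an illustration and depends on what the chosen model supports:

    from sparknlp.base import AudioAssembler
    from sparknlp.annotator import WhisperForCTC
    from pyspark.ml import Pipeline

    audioAssembler = AudioAssembler() \
        .setInputCol("audio_content") \
        .setOutputCol("audio_assembler")

    speechTranslate = WhisperForCTC.pretrained("asr_whisper_tiny_opt", "xx") \
        .setInputCols(["audio_assembler"]) \
        .setOutputCol("text") \
        .setLanguage("<|de|>") \
        .setTask("<|translate|>")

    pipeline = Pipeline().setStages([audioAssembler, speechTranslate])
    result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)
    result.select("text.result").show(truncate=False)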
sparknlp/annotator/chunk2_doc.py
@@ -0,0 +1,85 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for Chunk2Doc."""
+
+ from pyspark import keyword_only
+
+ from sparknlp.common import AnnotatorProperties
+ from sparknlp.common.annotator_type import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties):
+     """Converts a ``CHUNK`` type column back into ``DOCUMENT``.
+
+     Useful when trying to re-tokenize or do further analysis on a ``CHUNK`` result.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``CHUNK``              ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.pretrained import PretrainedPipeline
+
+     Location entities are extracted and converted back into ``DOCUMENT`` type for
+     further processing.
+
+     >>> data = spark.createDataFrame([[1, "New York and New Jersey aren't that far apart actually."]]).toDF("id", "text")
+
+     Define a pretrained pipeline that extracts Named Entities, amongst other things,
+     and apply ``Chunk2Doc`` on it.
+
+     >>> pipeline = PretrainedPipeline("explain_document_dl")
+     >>> chunkToDoc = Chunk2Doc().setInputCols("entities").setOutputCol("chunkConverted")
+     >>> explainResult = pipeline.transform(data)
+
+     Show results.
+
+     >>> result = chunkToDoc.transform(explainResult)
+     >>> result.selectExpr("explode(chunkConverted)").show(truncate=False)
+     +------------------------------------------------------------------------------+
+     |col                                                                           |
+     +------------------------------------------------------------------------------+
+     |[document, 0, 7, New York, [entity -> LOC, sentence -> 0, chunk -> 0], []]    |
+     |[document, 13, 22, New Jersey, [entity -> LOC, sentence -> 0, chunk -> 1], []]|
+     +------------------------------------------------------------------------------+
+
+     See Also
+     --------
+     Doc2Chunk : for converting `DOCUMENT` annotations to `CHUNK`
+     """
+
+     name = "Chunk2Doc"
+
+     inputAnnotatorTypes = [AnnotatorType.CHUNK]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     @keyword_only
+     def __init__(self):
+         super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunk2Doc")
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
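Chunk2Doc is documented above as useful for re-tokenizing a ``CHUNK`` result. A short hedged sketch of that follow-up step, continuing from the result DataFrame of the docstring example (column names as in that example):

    from sparknlp.annotator import Tokenizer

    chunkTokenizer = Tokenizer() \
        .setInputCols(["chunkConverted"]) \
        .setOutputCol("chunkToken")

    # Tokenizer is an approach, so it is fit first and then applied to the converted chunks.
    tokenized = chunkTokenizer.fit(result).transform(result)
    tokenized.selectExpr("explode(chunkToken.result)").show(truncate=False)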
sparknlp/annotator/chunker.py
@@ -0,0 +1,137 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the Chunker."""
+ from sparknlp.common import *
+
+
+ class Chunker(AnnotatorModel):
+     """This annotator matches a pattern of part-of-speech tags in order to
+     return meaningful phrases from the document. Extracted part-of-speech tags are
+     mapped onto the sentence, which can then be parsed by regular expressions.
+     The part-of-speech tags are wrapped by angle brackets ``<>`` to be easily
+     distinguishable in the text itself.
+
+     This example sentence will result in the form:
+
+     .. code-block:: none
+
+         "Peter Pipers employees are picking pecks of pickled peppers."
+         "<NNP><NNP><NNS><VBP><VBG><NNS><IN><JJ><NNS><.>"
+
+
+     To then extract these tags, ``regexParsers`` need to be set with e.g.:
+
+     >>> chunker = Chunker() \\
+     ...     .setInputCols(["sentence", "pos"]) \\
+     ...     .setOutputCol("chunk") \\
+     ...     .setRegexParsers(["<NNP>+", "<NNS>+"])
+
+     When defining the regular expressions, tags enclosed in angle brackets are
+     treated as groups, so here specifically ``"<NNP>+"`` means 1 or more nouns
+     in succession.
+
+     For more extended examples see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, POS``      ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     regexParsers
+         An array of grammar based chunk parsers
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols("document") \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> POSTag = PerceptronModel.pretrained() \\
+     ...     .setInputCols("document", "token") \\
+     ...     .setOutputCol("pos")
+     >>> chunker = Chunker() \\
+     ...     .setInputCols("sentence", "pos") \\
+     ...     .setOutputCol("chunk") \\
+     ...     .setRegexParsers(["<NNP>+", "<NNS>+"])
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...       documentAssembler,
+     ...       sentence,
+     ...       tokenizer,
+     ...       POSTag,
+     ...       chunker
+     ...     ])
+     >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(chunk) as result").show(truncate=False)
+     +-------------------------------------------------------------+
+     |result                                                       |
+     +-------------------------------------------------------------+
+     |[chunk, 0, 11, Peter Pipers, [sentence -> 0, chunk -> 0], []]|
+     |[chunk, 13, 21, employees, [sentence -> 0, chunk -> 1], []]  |
+     |[chunk, 35, 39, pecks, [sentence -> 0, chunk -> 2], []]      |
+     |[chunk, 52, 58, peppers, [sentence -> 0, chunk -> 3], []]    |
+     +-------------------------------------------------------------+
+
+     See Also
+     --------
+     PerceptronModel : for Part-Of-Speech tagging
+     """
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     regexParsers = Param(Params._dummy(),
+                          "regexParsers",
+                          "an array of grammar based chunk parsers",
+                          typeConverter=TypeConverters.toListString)
+
+     name = "Chunker"
+
+     @keyword_only
+     def __init__(self):
+         super(Chunker, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunker")
+
+     def setRegexParsers(self, value):
+         """Sets an array of grammar based chunk parsers.
+
+         POS classes should be enclosed in angle brackets, then treated as
+         groups.
+
+         Parameters
+         ----------
+         value : List[str]
+             Array of grammar based chunk parsers
+
+
+         Examples
+         --------
+         >>> chunker = Chunker() \\
+         ...     .setInputCols("sentence", "pos") \\
+         ...     .setOutputCol("chunk") \\
+         ...     .setRegexParsers(["<NNP>+", "<NNS>+"])
+         """
+         return self._set(regexParsers=value)
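Because the angle-bracketed POS tags act as regex groups, the parser patterns accept the usual quantifiers. A hedged sketch of a noun-phrase style grammar reusing the pipeline from the Chunker example above; whether it matches anything useful depends on the tagger's output:

    nounPhraseChunker = Chunker() \
        .setInputCols(["sentence", "pos"]) \
        .setOutputCol("noun_phrase") \
        .setRegexParsers(["<DT>?<JJ>*<NN>+", "<NNP>+"])  # optional determiner, any adjectives, one or more nouns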