spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the two versions exactly as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
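The listing above also shows the old monolithic modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py, ...) being removed in favor of the new sparknlp.annotator, sparknlp.base, and sparknlp.common packages. As a rough sketch of what that means for user code, assuming the new package __init__.py files re-export their submodules (as the wildcard imports in the hunks below suggest), package-level imports keep working and individual annotators can also be imported from their new submodules:

    # Package-level imports, as used in the docstring examples below.
    from sparknlp.base import *        # DocumentAssembler, LightPipeline, ...
    from sparknlp.annotator import *   # Tokenizer, SentenceDetectorDLModel, ...

    # Or target a specific submodule directly (module path taken from the listing above).
    from sparknlp.annotator.sentence.sentence_detector_dl import SentenceDetectorDLModel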
sparknlp/annotator/sentence/sentence_detector_dl.py
@@ -0,0 +1,467 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for SentenceDetectorDL."""
+
+ from sparknlp.common import *
+
+
+ class SentenceDetectorDLApproach(AnnotatorApproach):
+     """Trains an annotator that detects sentence boundaries using a deep
+     learning approach.
+
+     Currently, only the CNN model is supported for training, but in the future
+     the architecture of the model can be set with :meth:`.setModel`.
+
+     For pretrained models see :class:`.SentenceDetectorDLModel`.
+
+     Each extracted sentence can be returned in an Array or exploded to separate
+     rows, if ``explodeSentences`` is set to ``True``.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     modelArchitecture
+         Model architecture (CNN)
+     impossiblePenultimates
+         Impossible penultimates - list of strings which a sentence can't end
+         with
+     validationSplit
+         Choose the proportion of training dataset to be validated against the
+         model on each Epoch. The value should be between 0.0 and 1.0 and by
+         default it is 0.0 and off.
+     epochsNumber
+         Number of epochs for the optimization process
+     outputLogsPath
+         Path to folder where logs will be saved. If no path is specified, no
+         logs are generated
+     explodeSentences
+         Whether to explode each sentence into a different row, for better
+         parallelization. Defaults to False.
+
+     References
+     ----------
+     The default model ``"cnn"`` is based on the paper `Deep-EOS: General-Purpose
+     Neural Networks for Sentence Boundary Detection (2019, Stefan Schweter,
+     Sajawel Ahmed)
+     <https://konvens.org/proceedings/2019/papers/KONVENS2019_paper_41.pdf>`__
+     using a CNN architecture. We also modified the original implementation a
+     little bit to cover broken sentences and some impossible end of line chars.
+
+     Examples
+     --------
+     The training process needs data, where each data point is a sentence.
+     In this example the ``train.txt`` file has the form of::
+
+         ...
+         Slightly more moderate language would make our present situation – namely the lack of progress – a little easier.
+         His political successors now have great responsibilities to history and to the heritage of values bequeathed to them by Nelson Mandela.
+         ...
+
+     where each line is one sentence.
+
+     Training can then be started like so:
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> trainingData = spark.read.text("train.txt").toDF("text")
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentenceDetector = SentenceDetectorDLApproach() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentences") \\
+     ...     .setEpochsNumber(100)
+     >>> pipeline = Pipeline().setStages([documentAssembler, sentenceDetector])
+     >>> model = pipeline.fit(trainingData)
+     """
+
+     name = "SentenceDetectorDLApproach"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     modelArchitecture = Param(Params._dummy(),
+         "modelArchitecture",
+         "Model architecture (CNN)",
+         typeConverter=TypeConverters.toString)
+
+     impossiblePenultimates = Param(Params._dummy(),
+         "impossiblePenultimates",
+         "Impossible penultimates - list of strings which a sentence can't end with",
+         typeConverter=TypeConverters.toListString)
+
+     validationSplit = Param(Params._dummy(),
+         "validationSplit",
+         "Choose the proportion of training dataset to be validated against the model on each "
+         "Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
+         TypeConverters.toFloat)
+
+     epochsNumber = Param(Params._dummy(),
+         "epochsNumber",
+         "Number of epochs for the optimization process",
+         TypeConverters.toInt)
+
+     outputLogsPath = Param(Params._dummy(),
+         "outputLogsPath",
+         "Path to folder where logs will be saved. If no path is specified, no logs are generated",
+         TypeConverters.toString)
+
+     explodeSentences = Param(Params._dummy(),
+         "explodeSentences",
+         "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
+         TypeConverters.toBoolean)
+
+     def setModel(self, model_architecture):
+         """Sets the Model architecture. Currently only ``"cnn"`` is available.
+
+         Parameters
+         ----------
+         model_architecture : str
+             Model architecture
+         """
+         return self._set(modelArchitecture=model_architecture)
+
+     def setValidationSplit(self, validation_split):
+         """Sets the proportion of training dataset to be validated against the
+         model on each Epoch, by default it is 0.0 and off. The value should be
+         between 0.0 and 1.0.
+
+         Parameters
+         ----------
+         validation_split : float
+             Proportion of training dataset to be validated
+         """
+         return self._set(validationSplit=validation_split)
+
+     def setEpochsNumber(self, epochs_number):
+         """Sets number of epochs to train.
+
+         Parameters
+         ----------
+         epochs_number : int
+             Number of epochs
+         """
+         return self._set(epochsNumber=epochs_number)
+
+     def setOutputLogsPath(self, output_logs_path):
+         """Sets folder path to save training logs.
+
+         Parameters
+         ----------
+         output_logs_path : str
+             Folder path to save training logs
+         """
+         return self._set(outputLogsPath=output_logs_path)
+
+     def setImpossiblePenultimates(self, impossible_penultimates):
+         """Sets impossible penultimates - list of strings which a sentence can't
+         end with.
+
+         Parameters
+         ----------
+         impossible_penultimates : List[str]
+             List of strings which a sentence can't end with
+
+         """
+         return self._set(impossiblePenultimates=impossible_penultimates)
+
+     def setExplodeSentences(self, value):
+         """Sets whether to explode each sentence into a different row, for
+         better parallelization, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to explode each sentence into a different row
+         """
+         return self._set(explodeSentences=value)
+
+     def _create_model(self, java_model):
+         return SentenceDetectorDLModel(java_model=java_model)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLApproach"):
+         super(SentenceDetectorDLApproach, self).__init__(classname=classname)
+
+
+ class SentenceDetectorDLModel(AnnotatorModel, HasEngine):
+     """Annotator that detects sentence boundaries using a deep learning approach.
+
+     Instantiated Model of the :class:`.SentenceDetectorDLApproach`.
+     Detects sentence boundaries using a deep learning approach.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> sentenceDL = SentenceDetectorDLModel.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentencesDL")
+
+     The default model is ``"sentence_detector_dl"``, if no name is provided. For
+     available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Sentence+Detection>`__.
+
+     Each extracted sentence can be returned in an Array or exploded to separate
+     rows, if ``explodeSentences`` is set to ``true``.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/multilingual/SentenceDetectorDL.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     modelArchitecture
+         Model architecture (CNN)
+     explodeSentences
+         whether to explode each sentence into a different row, for better
+         parallelization. Defaults to false.
+     customBounds
+         characters used to explicitly mark sentence bounds, by default []
+     useCustomBoundsOnly
+         Only utilize custom bounds in sentence detection, by default False
+     splitLength
+         length at which sentences will be forcibly split
+     minLength
+         Set the minimum allowed length for each sentence, by default 0
+     maxLength
+         Set the maximum allowed length for each sentence, by default 99999
+     impossiblePenultimates
+         Impossible penultimates - list of strings which a sentence can't end
+         with
+
+     Examples
+     --------
+     In this example, the normal `SentenceDetector` is compared to the
+     `SentenceDetectorDLModel`. In a pipeline, `SentenceDetectorDLModel` can be
+     used as a replacement for the `SentenceDetector`.
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentences")
+     >>> sentenceDL = SentenceDetectorDLModel \\
+     ...     .pretrained("sentence_detector_dl", "en") \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentencesDL")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     sentenceDL
+     ... ])
+     >>> data = spark.createDataFrame([[\"\"\"John loves Mary.Mary loves Peter
+     ... Peter loves Helen .Helen loves John;
+     ... Total: four people involved.\"\"\"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(sentences.result) as sentences").show(truncate=False)
+     +----------------------------------------------------------+
+     |sentences |
+     +----------------------------------------------------------+
+     |John loves Mary.Mary loves Peter\\n Peter loves Helen .|
+     |Helen loves John; |
+     |Total: four people involved. |
+     +----------------------------------------------------------+
+     >>> result.selectExpr("explode(sentencesDL.result) as sentencesDL").show(truncate=False)
+     +----------------------------+
+     |sentencesDL                 |
+     +----------------------------+
+     |John loves Mary.            |
+     |Mary loves Peter            |
+     |Peter loves Helen .         |
+     |Helen loves John;           |
+     |Total: four people involved.|
+     +----------------------------+
+     """
+     name = "SentenceDetectorDLModel"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     modelArchitecture = Param(Params._dummy(), "modelArchitecture", "Model architecture (CNN)",
+         typeConverter=TypeConverters.toString)
+
+     explodeSentences = Param(Params._dummy(),
+         "explodeSentences",
+         "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
+         TypeConverters.toBoolean)
+
+     customBounds = Param(Params._dummy(),
+         "customBounds",
+         "characters used to explicitly mark sentence bounds",
+         typeConverter=TypeConverters.toListString)
+
+     useCustomBoundsOnly = Param(Params._dummy(),
+         "useCustomBoundsOnly",
+         "Only utilize custom bounds in sentence detection",
+         typeConverter=TypeConverters.toBoolean)
+
+     splitLength = Param(Params._dummy(),
+         "splitLength",
+         "length at which sentences will be forcibly split.",
+         typeConverter=TypeConverters.toInt)
+
+     minLength = Param(Params._dummy(),
+         "minLength",
+         "Set the minimum allowed length for each sentence.",
+         typeConverter=TypeConverters.toInt)
+
+     maxLength = Param(Params._dummy(),
+         "maxLength",
+         "Set the maximum allowed length for each sentence",
+         typeConverter=TypeConverters.toInt)
+
+     impossiblePenultimates = Param(Params._dummy(),
+         "impossiblePenultimates",
+         "Impossible penultimates - list of strings which a sentence can't end with",
+         typeConverter=TypeConverters.toListString)
+
+     def setModel(self, modelArchitecture):
+         """Sets the Model architecture. Currently only ``"cnn"`` is available.
+
+         Parameters
+         ----------
+         modelArchitecture : str
+             Model architecture
+         """
+         return self._set(modelArchitecture=modelArchitecture)
+
+     def setExplodeSentences(self, value):
+         """Sets whether to explode each sentence into a different row, for
+         better parallelization, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to explode each sentence into a different row
+         """
+         return self._set(explodeSentences=value)
+
+     def setCustomBounds(self, value):
+         """Sets characters used to explicitly mark sentence bounds, by default
+         [].
+
+         Parameters
+         ----------
+         value : List[str]
+             Characters used to explicitly mark sentence bounds
+         """
+         return self._set(customBounds=value)
+
+     def setUseCustomBoundsOnly(self, value):
+         """Sets whether to only utilize custom bounds in sentence detection, by
+         default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to only utilize custom bounds
+         """
+         return self._set(useCustomBoundsOnly=value)
+
+     def setSplitLength(self, value):
+         """Sets length at which sentences will be forcibly split.
+
+         Parameters
+         ----------
+         value : int
+             Length at which sentences will be forcibly split.
+         """
+         return self._set(splitLength=value)
+
+     def setMinLength(self, value):
+         """Sets minimum allowed length for each sentence, by default 0
+
+         Parameters
+         ----------
+         value : int
+             Minimum allowed length for each sentence
+         """
+         return self._set(minLength=value)
+
+     def setMaxLength(self, value):
+         """Sets the maximum allowed length for each sentence, by default
+         99999
+
+         Parameters
+         ----------
+         value : int
+             Maximum allowed length for each sentence
+         """
+         return self._set(maxLength=value)
+
+     def setImpossiblePenultimates(self, impossible_penultimates):
+         """Sets impossible penultimates - list of strings which a sentence can't
+         end with.
+
+         Parameters
+         ----------
+         impossible_penultimates : List[str]
+             List of strings which a sentence can't end with
+
+         """
+         return self._set(impossiblePenultimates=impossible_penultimates)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLModel",
+                  java_model=None):
+         super(SentenceDetectorDLModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             useCustomBoundsOnly=False,
+             customBounds=[],
+             explodeSentences=False,
+             minLength=0,
+             maxLength=99999
+         )
+
+     @staticmethod
+     def pretrained(name="sentence_detector_dl", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "sentence_detector_dl"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         SentenceDetectorDLModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(SentenceDetectorDLModel, name, lang, remote_loc)
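For reference, here is a minimal end-to-end sketch that exercises the pretrained model and the optional length/explode setters documented above. It is not part of the diff: it assumes a Spark session started with sparknlp.start(), network access to download the default sentence_detector_dl model, and made-up column names and input text.

    import sparknlp
    from sparknlp.base import DocumentAssembler, LightPipeline
    from sparknlp.annotator import SentenceDetectorDLModel
    from pyspark.ml import Pipeline

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # Pretrained CNN model, with the optional parameters from the class above.
    sentenceDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en") \
        .setInputCols(["document"]) \
        .setOutputCol("sentences") \
        .setMinLength(3) \
        .setMaxLength(200) \
        .setExplodeSentences(True)

    pipeline = Pipeline().setStages([documentAssembler, sentenceDL])

    # No training happens for a pretrained model, so fitting on an empty
    # DataFrame is enough to obtain a PipelineModel for LightPipeline use.
    empty = spark.createDataFrame([[""]]).toDF("text")
    light = LightPipeline(pipeline.fit(empty))
    print(light.annotate("John loves Mary.Mary loves Peter")["sentences"])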
sparknlp/annotator/sentiment/__init__.py
@@ -0,0 +1,17 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Module of annotators for sentiment analysis."""
+ from sparknlp.annotator.sentiment.sentiment_detector import *
+ from sparknlp.annotator.sentiment.vivekn_sentiment import *
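Given the two wildcard imports above, the sentiment annotators should be importable either from the subpackage or from their concrete submodules. A small sketch, not part of the diff, assuming vivekn_sentiment exports ViveknSentimentApproach as referenced in the See Also section of the next hunk:

    from sparknlp.annotator.sentiment import SentimentDetector, ViveknSentimentApproach
    # ...which is equivalent to importing from the concrete submodule:
    from sparknlp.annotator.sentiment.sentiment_detector import SentimentDetector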
sparknlp/annotator/sentiment/sentiment_detector.py
@@ -0,0 +1,208 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the SentimentDetector."""
+
+ from sparknlp.common import *
+
+
+ class SentimentDetector(AnnotatorApproach):
+     """Trains a rule based sentiment detector, which calculates a score based on
+     predefined keywords.
+
+     A dictionary of predefined sentiment keywords must be provided with
+     :meth:`.setDictionary`, where each line is a word delimited to its class
+     (either ``positive`` or ``negative``). The dictionary can be set in the form
+     of a delimited text file.
+
+     By default, the sentiment score will be assigned labels ``"positive"`` if
+     the score is ``>= 0``, else ``"negative"``.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dictionary-sentiment/sentiment.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN, DOCUMENT``    ``SENTIMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     dictionary
+         path for dictionary to sentiment analysis
+
+     Examples
+     --------
+     In this example, the dictionary ``default-sentiment-dict.txt`` has the form
+     of::
+
+         ...
+         cool,positive
+         superb,positive
+         bad,negative
+         uninspired,negative
+         ...
+
+     where each sentiment keyword is delimited by ``","``.
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> lemmatizer = Lemmatizer() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("lemma") \\
+     ...     .setDictionary("lemmas_small.txt", "->", "\\t")
+     >>> sentimentDetector = SentimentDetector() \\
+     ...     .setInputCols(["lemma", "document"]) \\
+     ...     .setOutputCol("sentimentScore") \\
+     ...     .setDictionary("default-sentiment-dict.txt", ",", ReadAs.TEXT)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     lemmatizer,
+     ...     sentimentDetector,
+     ... ])
+     >>> data = spark.createDataFrame([
+     ...     ["The staff of the restaurant is nice"],
+     ...     ["I recommend others to avoid because it is too expensive"]
+     ... ]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("sentimentScore.result").show(truncate=False)
+     +----------+
+     |result    |
+     +----------+
+     |[positive]|
+     |[negative]|
+     +----------+
+
+     See Also
+     --------
+     ViveknSentimentApproach : for an alternative approach to sentiment extraction
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTIMENT
+
+     dictionary = Param(Params._dummy(),
+         "dictionary",
+         "path for dictionary to sentiment analysis",
+         typeConverter=TypeConverters.identity)
+
+     positiveMultiplier = Param(Params._dummy(),
+         "positiveMultiplier",
+         "multiplier for positive sentiments. Defaults 1.0",
+         typeConverter=TypeConverters.toFloat)
+
+     negativeMultiplier = Param(Params._dummy(),
+         "negativeMultiplier",
+         "multiplier for negative sentiments. Defaults -1.0",
+         typeConverter=TypeConverters.toFloat)
+
+     incrementMultiplier = Param(Params._dummy(),
+         "incrementMultiplier",
+         "multiplier for increment sentiments. Defaults 2.0",
+         typeConverter=TypeConverters.toFloat)
+
+     decrementMultiplier = Param(Params._dummy(),
+         "decrementMultiplier",
+         "multiplier for decrement sentiments. Defaults -2.0",
+         typeConverter=TypeConverters.toFloat)
+
+     reverseMultiplier = Param(Params._dummy(),
+         "reverseMultiplier",
+         "multiplier for revert sentiments. Defaults -1.0",
+         typeConverter=TypeConverters.toFloat)
+
+     enableScore = Param(Params._dummy(),
+         "enableScore",
+         "if true, score will show as the double value, else will output string \"positive\" or \"negative\". Defaults false",
+         typeConverter=TypeConverters.toBoolean)
+
+     def __init__(self):
+         super(SentimentDetector, self).__init__(
+             classname="com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector")
+         self._setDefault(positiveMultiplier=1.0, negativeMultiplier=-1.0, incrementMultiplier=2.0,
+             decrementMultiplier=-2.0, reverseMultiplier=-1.0, enableScore=False)
+
+     def setDictionary(self, path, delimiter, read_as=ReadAs.TEXT, options={'format': 'text'}):
+         """Sets path for dictionary to sentiment analysis
+
+         Parameters
+         ----------
+         path : str
+             Path to dictionary file
+         delimiter : str
+             Delimiter for entries
+         read_as : str, optional
+             How to read the resource, by default ReadAs.TEXT
+         options : dict, optional
+             Options for reading the resource, by default {'format': 'text'}
+         """
+         opts = options.copy()
+         if "delimiter" not in opts:
+             opts["delimiter"] = delimiter
+         return self._set(dictionary=ExternalResource(path, read_as, opts))
+
+     def _create_model(self, java_model):
+         return SentimentDetectorModel(java_model=java_model)
+
+
+ class SentimentDetectorModel(AnnotatorModel):
+     """Rule based sentiment detector, which calculates a score based on
+     predefined keywords.
+
+     This is the instantiated model of the :class:`.SentimentDetector`. For
+     training your own model, please see the documentation of that class.
+
+     By default, the sentiment score will be assigned labels ``"positive"`` if
+     the score is ``>= 0``, else ``"negative"``.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dictionary-sentiment/sentiment.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN, DOCUMENT``    ``SENTIMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+     """
+     name = "SentimentDetectorModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTIMENT
+
+     positiveMultiplier = Param(Params._dummy(),
+         "positiveMultiplier",
+         "multiplier for positive sentiments. Defaults 1.0",
+         typeConverter=TypeConverters.toFloat)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel",
+                  java_model=None):
+         super(SentimentDetectorModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
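As a short follow-up to the docstring example above: fitting is what turns the SentimentDetector approach into a SentimentDetectorModel stage (via _create_model), and the resulting PipelineModel can be reused on new rows without refitting. A minimal sketch, reusing the pipeline, data, and column names from that example; the extra review text is made up.

    model = pipeline.fit(data)               # builds the SentimentDetectorModel stage
    print(type(model.stages[-1]).__name__)   # SentimentDetectorModel (last stage in this pipeline)

    # Apply the fitted pipeline to previously unseen text.
    new_data = spark.createDataFrame([["The desserts were superb"]]).toDF("text")
    model.transform(new_data).selectExpr("sentimentScore.result").show(truncate=False)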