spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
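The listing shows that the old flat modules (`sparknlp/annotator.py`, `sparknlp/base.py`, `sparknlp/common.py`, ...) are removed and replaced by sub-packages (`sparknlp/annotator/...`, `sparknlp/base/...`, `sparknlp/common/...`). Below is a minimal import sketch, under the assumption that the new package `__init__` files (e.g. `sparknlp/annotator/__init__.py +93 -0`) re-export the same public names the old flat modules did, so 2.x-style imports keep resolving in 6.2.1:

```python
# Minimal import sketch (assumption: the new sub-packages re-export the public
# names of the old flat modules, so 2.x-style imports still work in 6.2.1).
import sparknlp
from sparknlp.base import DocumentAssembler            # was sparknlp/base.py, now sparknlp/base/
from sparknlp.annotator import Tokenizer, NerDLModel   # was sparknlp/annotator.py, now sparknlp/annotator/
from sparknlp.annotator.ner.ner_dl import NerDLApproach  # new fully qualified module path (see diff below)

spark = sparknlp.start()   # starts a Spark session with the Spark NLP jar attached
print(sparknlp.version())
```

The diff that follows is for `sparknlp/annotator/ner/ner_dl.py` (entry 146 above, +591 lines).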
sparknlp/annotator/ner/ner_dl.py
@@ -0,0 +1,591 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for NerDL."""
+
+ import sys
+
+ from sparknlp.annotator.param import EvaluationDLParams
+ from sparknlp.common import *
+ from sparknlp.annotator.ner.ner_approach import NerApproach
+
+
+ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
+     """This Named Entity Recognition annotator allows you to train a generic NER
+     model based on neural networks.
+
+     The architecture of the neural network is a Char CNNs - BiLSTM - CRF that
+     achieves state-of-the-art results on most datasets.
+
+     For instantiated/pretrained models, see :class:`.NerDLModel`.
+
+     The training data should be a labeled Spark Dataset, in the format of
+     :class:`.CoNLL` 2003 IOB with `Annotation` type columns. The data should
+     have columns of type ``DOCUMENT, TOKEN, WORD_EMBEDDINGS`` and an additional
+     label column of annotator type ``NAMED_ENTITY``.
+
+     Excluding the label, this can be done with, for example:
+
+     - a SentenceDetector,
+     - a Tokenizer and
+     - a WordEmbeddingsModel (any embeddings can be chosen, e.g. BertEmbeddings
+       for BERT based embeddings).
+
+     By default, this approach collects all data points into memory for training. For larger datasets, use
+     ``setEnableMemoryOptimizer(True)``. This will optimize memory usage during training at the cost
+     of speed. Note that this annotator will use as much memory as the largest partition of the
+     input dataset, so we recommend repartitioning it into reasonably sized batches.
+
+     Setting a test dataset to monitor model metrics can be done with
+     ``.setTestDataset``. The method expects a path to a parquet file containing a
+     dataframe that has the same required columns as the training dataframe. The
+     pre-processing steps for the training dataframe should also be applied to the test
+     dataframe. The following example will show how to create the test dataset with a
+     CoNLL dataset:
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> embeddings = WordEmbeddingsModel \\
+     ...     .pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> preProcessingPipeline = Pipeline().setStages([documentAssembler, embeddings])
+     >>> conll = CoNLL()
+     >>> (train, test) = conll \\
+     ...     .readDataset(spark, "src/test/resources/conll2003/eng.train") \\
+     ...     .randomSplit([0.8, 0.2])
+     >>> preProcessingPipeline \\
+     ...     .fit(test) \\
+     ...     .transform(test) \\
+     ...     .write \\
+     ...     .mode("overwrite") \\
+     ...     .parquet("test_data")
+     >>> tagger = NerDLApproach() \\
+     ...     .setInputCols(["document", "token", "embeddings"]) \\
+     ...     .setLabelColumn("label") \\
+     ...     .setOutputCol("ner") \\
+     ...     .setTestDataset("test_data")
+
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner>`__.
+
+     ==================================== ======================
+     Input Annotation types               Output Annotation type
+     ==================================== ======================
+     ``DOCUMENT, TOKEN, WORD_EMBEDDINGS`` ``NAMED_ENTITY``
+     ==================================== ======================
+
+     Parameters
+     ----------
+     labelColumn
+         Column with label per each token
+     entities
+         Entities to recognize
+     minEpochs
+         Minimum number of epochs to train, by default 0
+     maxEpochs
+         Maximum number of epochs to train, by default 50
+     verbose
+         Level of verbosity during training, by default 2
+     randomSeed
+         Random seed
+     lr
+         Learning Rate, by default 0.001
+     po
+         Learning rate decay coefficient. Real Learning Rate = lr / (1 + po *
+         epoch), by default 0.005
+     batchSize
+         Batch size, by default 8
+     dropout
+         Dropout coefficient, by default 0.5
+     graphFolder
+         Folder path that contains external graph files
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     useContrib
+         Whether to use contrib LSTM Cells. Not compatible with Windows. Might
+         slightly improve accuracy
+     validationSplit
+         Choose the proportion of the training dataset to be validated against the
+         model on each epoch. The value should be between 0.0 and 1.0; by default
+         it is 0.0, which disables validation
+     evaluationLogExtended
+         Whether validation logs should be extended, by default False.
+     testDataset
+         Path to a parquet file of a test dataset. If set, it is used to calculate
+         statistics during training.
+     includeConfidence
+         Whether to include confidence scores in annotation metadata, by default
+         False
+     includeAllConfidenceScores
+         Whether to include all confidence scores in annotation metadata or just
+         the score of the predicted tag, by default False
+     enableOutputLogs
+         Whether to use stdout in addition to Spark logs, by default False
+     outputLogsPath
+         Folder path to save training logs
+     enableMemoryOptimizer
+         Whether to optimize for large datasets or not. Enabling this option can
+         slow down training, by default False
+     useBestModel
+         Whether to restore and use the model that has achieved the best performance
+         at the end of the training.
+     bestModelMetric
+         Whether to check F1 Micro-average or F1 Macro-average as a final metric for the best model
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from sparknlp.training import *
+     >>> from pyspark.ml import Pipeline
+
+     This CoNLL dataset already includes a sentence, token and label
+     column with their respective annotator types. If a custom dataset is used,
+     these need to be defined with, for example:
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+
+     Then the training can start
+
+     >>> embeddings = BertEmbeddings.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> nerTagger = NerDLApproach() \\
+     ...     .setInputCols(["sentence", "token", "embeddings"]) \\
+     ...     .setLabelColumn("label") \\
+     ...     .setOutputCol("ner") \\
+     ...     .setMaxEpochs(1) \\
+     ...     .setRandomSeed(0) \\
+     ...     .setVerbose(0)
+     >>> pipeline = Pipeline().setStages([
+     ...     embeddings,
+     ...     nerTagger
+     ... ])
+
+     We use the sentences, tokens, and labels from the CoNLL dataset.
+
+     >>> conll = CoNLL()
+     >>> trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
+     >>> pipelineModel = pipeline.fit(trainingData)
+
+     See Also
+     --------
+     NerCrfApproach : for a generic CRF approach
+     NerConverter : to further process the results
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.WORD_EMBEDDINGS]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     lr = Param(Params._dummy(), "lr", "Learning Rate", TypeConverters.toFloat)
+
+     po = Param(Params._dummy(), "po", "Learning rate decay coefficient. Real Learning Rate = lr / (1 + po * epoch)",
+                TypeConverters.toFloat)
+
+     batchSize = Param(Params._dummy(), "batchSize", "Batch size", TypeConverters.toInt)
+
+     dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat)
+
+     graphFolder = Param(Params._dummy(), "graphFolder", "Folder path that contain external graph files",
+                         TypeConverters.toString)
+
+     configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     useContrib = Param(Params._dummy(), "useContrib",
+                        "whether to use contrib LSTM Cells. Not compatible with Windows. Might slightly improve accuracy.",
+                        TypeConverters.toBoolean)
+
+     includeConfidence = Param(Params._dummy(), "includeConfidence",
+                               "whether to include confidence scores in annotation metadata",
+                               TypeConverters.toBoolean)
+
+     includeAllConfidenceScores = Param(Params._dummy(), "includeAllConfidenceScores",
+                                        "whether to include all confidence scores in annotation metadata or just the score of the predicted tag",
+                                        TypeConverters.toBoolean)
+
+     enableMemoryOptimizer = Param(Params._dummy(), "enableMemoryOptimizer",
+                                   "Whether to optimize for large datasets or not. Enabling this option can slow down training.",
+                                   TypeConverters.toBoolean)
+
+     useBestModel = Param(Params._dummy(), "useBestModel",
+                          "Whether to restore and use the model that has achieved the best performance at the end of the training.",
+                          TypeConverters.toBoolean)
+
+     bestModelMetric = Param(Params._dummy(), "bestModelMetric",
+                             "Whether to check F1 Micro-average or F1 Macro-average as a final metric for the best model.",
+                             TypeConverters.toString)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setGraphFolder(self, p):
+         """Sets folder path that contains external graph files.
+
+         Parameters
+         ----------
+         p : str
+             Folder path that contains external graph files
+         """
+         return self._set(graphFolder=p)
+
+     def setUseContrib(self, v):
+         """Sets whether to use contrib LSTM Cells. Not compatible with Windows.
+         Might slightly improve accuracy.
+
+         Parameters
+         ----------
+         v : bool
+             Whether to use contrib LSTM Cells
+
+         Raises
+         ------
+         Exception
+             Windows not supported to use contrib
+         """
+         if v and sys.platform == 'win32':
+             raise Exception("Windows not supported to use contrib")
+         return self._set(useContrib=v)
+
+     def setLr(self, v):
+         """Sets Learning Rate, by default 0.001.
+
+         Parameters
+         ----------
+         v : float
+             Learning Rate
+         """
+         self._set(lr=v)
+         return self
+
+     def setPo(self, v):
+         """Sets Learning rate decay coefficient, by default 0.005.
+
+         Real Learning Rate is lr / (1 + po * epoch).
+
+         Parameters
+         ----------
+         v : float
+             Learning rate decay coefficient
+         """
+         self._set(po=v)
+         return self
+
+     def setBatchSize(self, v):
+         """Sets batch size, by default 8.
+
+         Parameters
+         ----------
+         v : int
+             Batch size
+         """
+         self._set(batchSize=v)
+         return self
+
+     def setDropout(self, v):
+         """Sets dropout coefficient, by default 0.5.
+
+         Parameters
+         ----------
+         v : float
+             Dropout coefficient
+         """
+         self._set(dropout=v)
+         return self
+
+     def setIncludeConfidence(self, value):
+         """Sets whether to include confidence scores in annotation metadata, by
+         default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include the confidence value in the output.
+         """
+         return self._set(includeConfidence=value)
+
+     def setIncludeAllConfidenceScores(self, value):
+         """Sets whether to include all confidence scores in annotation metadata
+         or just the score of the predicted tag, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include all confidence scores in annotation metadata or
+             just the score of the predicted tag
+         """
+         return self._set(includeAllConfidenceScores=value)
+
+     def setEnableMemoryOptimizer(self, value):
+         """Sets whether to optimize for large datasets or not, by default False.
+         Enabling this option can slow down training.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to optimize for large datasets
+         """
+         return self._set(enableMemoryOptimizer=value)
+
+     def setUseBestModel(self, value):
+         """Sets whether to restore and use the model that has achieved the best performance at the end of the training.
+         The monitored metric is F1 on the test dataset if ``testDataset`` is set; otherwise the validation split is used, and if neither is set, the training loss is monitored.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to restore and use the model that has achieved the best performance at the end of the training.
+         """
+         return self._set(useBestModel=value)
+
+     def setBestModelMetric(self, value):
+         """Sets whether to check F1 Micro-average or F1 Macro-average as the final metric for the best model, used when ``setUseBestModel`` is True.
+
+         Parameters
+         ----------
+         value : str
+             Whether to check F1 Micro-average or F1 Macro-average as a final metric for the best model
+         """
+         return self._set(bestModelMetric=value)
+
+     def _create_model(self, java_model):
+         return NerDLModel(java_model=java_model)
+
+     @keyword_only
+     def __init__(self):
+         super(NerDLApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLApproach")
+         uc = False if sys.platform == 'win32' else True
+         self._setDefault(
+             minEpochs=0,
+             maxEpochs=50,
+             lr=float(0.001),
+             po=float(0.005),
+             batchSize=8,
+             dropout=float(0.5),
+             verbose=2,
+             useContrib=uc,
+             validationSplit=float(0.0),
+             evaluationLogExtended=False,
+             includeConfidence=False,
+             includeAllConfidenceScores=False,
+             enableOutputLogs=False,
+             enableMemoryOptimizer=False,
+             useBestModel=False,
+             bestModelMetric="f1_micro"
+         )
+
+
+ class NerDLModel(AnnotatorModel, HasStorageRef, HasBatchedAnnotate, HasEngine):
+     """This Named Entity Recognition annotator is a generic NER model based on
+     neural networks.
+
+     The neural network architecture is Char CNNs - BiLSTM - CRF, which achieves
+     state-of-the-art results on most datasets.
+
+     This is the instantiated model of the :class:`.NerDLApproach`. For training
+     your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> nerModel = NerDLModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token", "embeddings"]) \\
+     ...     .setOutputCol("ner")
+
+
+     The default model is ``"ner_dl"``, if no name is provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
+     Additionally, pretrained pipelines are available for this module, see
+     `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
+
+     Note that some pretrained models require specific types of embeddings,
+     depending on which they were trained on. For example, the default model
+     ``"ner_dl"`` requires the WordEmbeddings ``"glove_100d"``.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb>`__.
+
+     ==================================== ======================
+     Input Annotation types               Output Annotation type
+     ==================================== ======================
+     ``DOCUMENT, TOKEN, WORD_EMBEDDINGS`` ``NAMED_ENTITY``
+     ==================================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     includeConfidence
+         Whether to include confidence scores in annotation metadata, by default
+         False
+     includeAllConfidenceScores
+         Whether to include all confidence scores in annotation metadata or just
+         the score of the predicted tag, by default False
+     classes
+         Tags used to train this NerDLModel
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+
+     First extract the prerequisites for the NerDLModel
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = WordEmbeddingsModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("bert")
+
+     Then NER can be extracted
+
+     >>> nerTagger = NerDLModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token", "bert"]) \\
+     ...     .setOutputCol("ner")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     tokenizer,
+     ...     embeddings,
+     ...     nerTagger
+     ... ])
+     >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("ner.result").show(truncate=False)
+     +------------------------------------+
+     |result                              |
+     +------------------------------------+
+     |[B-ORG, O, O, B-PER, O, O, B-LOC, O]|
+     +------------------------------------+
+
+     See Also
+     --------
+     NerCrfModel : for a generic CRF approach
+     NerConverter : to further process the results
+     """
+     name = "NerDLModel"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.WORD_EMBEDDINGS]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel", java_model=None):
+         super(NerDLModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             includeConfidence=False,
+             includeAllConfidenceScores=False,
+             batchSize=8
+         )
+
+     configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+     includeConfidence = Param(Params._dummy(), "includeConfidence",
+                               "whether to include confidence scores in annotation metadata",
+                               TypeConverters.toBoolean)
+     includeAllConfidenceScores = Param(Params._dummy(), "includeAllConfidenceScores",
+                                        "whether to include all confidence scores in annotation metadata or just the score of the predicted tag",
+                                        TypeConverters.toBoolean)
+     classes = Param(Params._dummy(), "classes",
+                     "get the tags used to train this NerDLModel",
+                     TypeConverters.toListString)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setIncludeConfidence(self, value):
+         """Sets whether to include confidence scores in annotation metadata, by
+         default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include the confidence value in the output.
+         """
+         return self._set(includeConfidence=value)
+
+     def setIncludeAllConfidenceScores(self, value):
+         """Sets whether to include all confidence scores in annotation metadata
+         or just the score of the predicted tag, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include all confidence scores in annotation metadata or
+             just the score of the predicted tag
+         """
+         return self._set(includeAllConfidenceScores=value)
+
+     @staticmethod
+     def pretrained(name="ner_dl", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "ner_dl"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         NerDLModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(NerDLModel, name, lang, remote_loc)
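
The docstrings above describe ``setTestDataset``, ``setEnableMemoryOptimizer``, ``setUseBestModel`` and ``setBestModelMetric``, but the doctest only exercises the basic training flow. The following is a minimal, hedged sketch that combines them, assuming a local CoNLL file at ``eng.train`` (placeholder path) and using only setters defined or referenced in the diff above:

```python
# Hedged sketch: wiring the evaluation-related options documented above into the
# doctest's training pipeline. File paths ("eng.train", "test_data") are placeholders.
import sparknlp
from sparknlp.annotator import WordEmbeddingsModel, NerDLApproach
from sparknlp.training import CoNLL
from pyspark.ml import Pipeline

spark = sparknlp.start()

# CoNLL.readDataset already yields document, sentence, token and label columns.
conll = CoNLL()
train, test = conll.readDataset(spark, "eng.train").randomSplit([0.8, 0.2])

embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")

# The test split must go through the same pre-processing as the training data
# before it is written out for setTestDataset (see the NerDLApproach docstring).
Pipeline(stages=[embeddings]).fit(test).transform(test) \
    .write.mode("overwrite").parquet("test_data")

ner = NerDLApproach() \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("label") \
    .setOutputCol("ner") \
    .setMaxEpochs(5) \
    .setEnableMemoryOptimizer(True) \
    .setTestDataset("test_data") \
    .setUseBestModel(True) \
    .setBestModelMetric("f1_micro")

ner_model = Pipeline(stages=[embeddings, ner]).fit(train)
```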