spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
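
Taken together, the file list shows a complete restructuring of the Python package between 2.6.3rc1 and 6.2.1: the old monolithic modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py, sparknlp/internal.py, sparknlp/pretrained.py, sparknlp/storage.py, sparknlp/training.py) are removed in favor of the corresponding sparknlp.* subpackages, and the stale .pyc/__pycache__ artifacts are dropped from the wheel. A minimal sketch of what the new layout means for imports, assuming (as the added __init__.py files suggest) that the subpackages re-export the same public classes the old modules did:

    import sparknlp
    from sparknlp.base import DocumentAssembler      # same top-level path as before
    from sparknlp.annotator import NerCrfApproach    # now backed by sparknlp/annotator/ner/ner_crf.py
    from sparknlp.annotator.ner import NerCrfModel   # direct submodule import (assumption: re-exported here)

The hunk below shows one of those new files in full: sparknlp/annotator/ner/ner_crf.py (+397 -0 in the list above).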
sparknlp/annotator/ner/ner_crf.py
@@ -0,0 +1,397 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for NerCrf."""
+
+ from sparknlp.common import *
+ from sparknlp.annotator.ner.ner_approach import NerApproach
+
+
+ class NerCrfApproach(AnnotatorApproach, NerApproach):
+     """Algorithm for training a Named Entity Recognition model.
+
+     For instantiated/pretrained models, see :class:`.NerCrfModel`.
+
+     This Named Entity Recognition annotator allows a generic model to be
+     trained by utilizing a CRF machine learning algorithm. The training data
+     should be a labeled Spark Dataset, e.g. :class:`.CoNLL` 2003 IOB with
+     `Annotation` type columns. The data should have columns of type
+     ``DOCUMENT, TOKEN, POS, WORD_EMBEDDINGS`` and an additional label column
+     of annotator type ``NAMED_ENTITY``.
+
+     Excluding the label, these columns can be produced with, for example:
+
+     - a :class:`.SentenceDetector`,
+     - a :class:`.Tokenizer`,
+     - a :class:`.PerceptronModel` and
+     - a :class:`.WordEmbeddingsModel`.
+
+     Optionally the user can provide an entity dictionary file with
+     :meth:`.setExternalFeatures` for better accuracy.
+
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
+
+     ========================================= ======================
+     Input Annotation types                    Output Annotation type
+     ========================================= ======================
+     ``DOCUMENT, TOKEN, POS, WORD_EMBEDDINGS`` ``NAMED_ENTITY``
+     ========================================= ======================
+
+     Parameters
+     ----------
+     labelColumn
+         Column with one label per token
+     entities
+         Entities to recognize
+     minEpochs
+         Minimum number of epochs to train, by default 0
+     maxEpochs
+         Maximum number of epochs to train, by default 1000
+     verbose
+         Level of verbosity during training, by default 4
+     randomSeed
+         Random seed
+     l2
+         L2 regularization coefficient, by default 1.0
+     c0
+         c0 param defining decay speed for gradient, by default 2250000
+     lossEps
+         If the relative epoch improvement is less than this epsilon, training
+         is stopped, by default 0.001
+     minW
+         Features with weights less than this param value will be filtered
+     includeConfidence
+         Whether to include confidence scores in annotation metadata, by
+         default False
+     externalFeatures
+         Additional dictionary paths to use as features
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from sparknlp.training import *
+     >>> from pyspark.ml import Pipeline
+
+     This CoNLL dataset already includes sentence, token, POS tag and label
+     columns with their respective annotator types. If a custom dataset is
+     used, these need to be defined with, for example:
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> posTagger = PerceptronModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("pos")
+
+     Then training can start:
+
+     >>> embeddings = WordEmbeddingsModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("embeddings") \\
+     ...     .setCaseSensitive(False)
+     >>> nerTagger = NerCrfApproach() \\
+     ...     .setInputCols(["sentence", "token", "pos", "embeddings"]) \\
+     ...     .setLabelColumn("label") \\
+     ...     .setMinEpochs(1) \\
+     ...     .setMaxEpochs(3) \\
+     ...     .setOutputCol("ner")
+     >>> pipeline = Pipeline().setStages([
+     ...     embeddings,
+     ...     nerTagger
+     ... ])
+
+     We use the sentences, tokens, POS tags and labels from the CoNLL dataset.
+
+     >>> conll = CoNLL()
+     >>> trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
+     >>> pipelineModel = pipeline.fit(trainingData)
+
+     See Also
+     --------
+     NerDLApproach : for a deep learning based approach
+     NerConverter : to further process the results
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.WORD_EMBEDDINGS]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     l2 = Param(Params._dummy(), "l2", "L2 regularization coefficient", TypeConverters.toFloat)
+
+     c0 = Param(Params._dummy(), "c0", "c0 param defining decay speed for gradient", TypeConverters.toInt)
+
+     lossEps = Param(Params._dummy(), "lossEps", "If the relative epoch improvement is less than eps, training is stopped",
+                     TypeConverters.toFloat)
+
+     minW = Param(Params._dummy(), "minW", "Features with weights less than this param value will be filtered",
+                  TypeConverters.toFloat)
+
+     includeConfidence = Param(Params._dummy(), "includeConfidence",
+                               "Whether to include confidence scores in annotation metadata",
+                               TypeConverters.toBoolean)
+
+     externalFeatures = Param(Params._dummy(), "externalFeatures", "Additional dictionary paths to use as features",
+                              TypeConverters.identity)
+
+     verbose = Param(Params._dummy(), "verbose", "Level of verbosity during training", TypeConverters.toInt)
+
+     def setL2(self, l2value):
+         """Sets the L2 regularization coefficient, by default 1.0.
+
+         Parameters
+         ----------
+         l2value : float
+             L2 regularization coefficient
+         """
+         return self._set(l2=l2value)
+
+     def setC0(self, c0value):
+         """Sets the c0 param defining decay speed for gradient, by default
+         2250000.
+
+         Parameters
+         ----------
+         c0value : int
+             c0 param defining decay speed for gradient
+         """
+         return self._set(c0=c0value)
+
+     def setLossEps(self, eps):
+         """Sets the loss epsilon. If the relative epoch improvement is less
+         than this value, training is stopped. By default 0.001.
+
+         Parameters
+         ----------
+         eps : float
+             The threshold
+         """
+         return self._set(lossEps=eps)
+
+     def setMinW(self, w):
+         """Sets the minimum weight value.
+
+         Features with weights less than this param value will be filtered.
+
+         Parameters
+         ----------
+         w : float
+             Minimum weight value
+         """
+         return self._set(minW=w)
+
+     def setExternalFeatures(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
+         """Sets additional dictionary paths to use as features.
+
+         Parameters
+         ----------
+         path : str
+             Path to the source files
+         delimiter : str
+             Delimiter for the dictionary file. Can also be set in `options`.
+         read_as : str, optional
+             How to read the file, by default ReadAs.TEXT
+         options : dict, optional
+             Options to read the resource, by default {"format": "text"}
+         """
+         opts = options.copy()
+         if "delimiter" not in opts:
+             opts["delimiter"] = delimiter
+         return self._set(externalFeatures=ExternalResource(path, read_as, opts))
+
+     def setIncludeConfidence(self, b):
+         """Sets whether to include confidence scores in annotation metadata,
+         by default False.
+
+         Parameters
+         ----------
+         b : bool
+             Whether to include the confidence value in the output.
+         """
+         return self._set(includeConfidence=b)
+
+     def setVerbose(self, verboseValue):
+         """Sets the level of verbosity during training.
+
+         Parameters
+         ----------
+         verboseValue : int
+             Level of verbosity
+         """
+         return self._set(verbose=verboseValue)
+
+     def _create_model(self, java_model):
+         return NerCrfModel(java_model=java_model)
+
+     @keyword_only
+     def __init__(self):
+         super(NerCrfApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach")
+         self._setDefault(
+             minEpochs=0,
+             maxEpochs=1000,
+             l2=float(1),
+             c0=2250000,
+             lossEps=float(1e-3),
+             verbose=4,
+             includeConfidence=False
+         )
+
+
+ class NerCrfModel(AnnotatorModel):
+     """Extracts Named Entities based on a CRF model.
+
+     This Named Entity Recognition annotator allows a generic model to be
+     trained by utilizing a CRF machine learning algorithm. The data should
+     have columns of type ``DOCUMENT, TOKEN, POS, WORD_EMBEDDINGS``. These can
+     be extracted with, for example,
+
+     - a SentenceDetector,
+     - a Tokenizer and
+     - a PerceptronModel.
+
+     This is the instantiated model of the :class:`.NerCrfApproach`. For
+     training your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> nerTagger = NerCrfModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token", "word_embeddings", "pos"]) \\
+     ...     .setOutputCol("ner")
+
+
+     The default model is ``"ner_crf"``, if no name is provided. For available
+     pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
+
+     ========================================= ======================
+     Input Annotation types                    Output Annotation type
+     ========================================= ======================
+     ``DOCUMENT, TOKEN, POS, WORD_EMBEDDINGS`` ``NAMED_ENTITY``
+     ========================================= ======================
+
+     Parameters
+     ----------
+     includeConfidence
+         Whether to include confidence scores in annotation metadata, by
+         default False
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+
+     First extract the prerequisites for the NerCrfModel:
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = WordEmbeddingsModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("word_embeddings")
+     >>> posTagger = PerceptronModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("pos")
+
+     Then NER can be extracted:
+
+     >>> nerTagger = NerCrfModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token", "word_embeddings", "pos"]) \\
+     ...     .setOutputCol("ner")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     tokenizer,
+     ...     embeddings,
+     ...     posTagger,
+     ...     nerTagger
+     ... ])
+     >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("ner.result").show(truncate=False)
+     +------------------------------------+
+     |result                              |
+     +------------------------------------+
+     |[I-ORG, O, O, I-PER, O, O, I-LOC, O]|
+     +------------------------------------+
+
+     See Also
+     --------
+     NerDLModel : for a deep learning based approach
+     NerConverter : to further process the results
+     """
+     name = "NerCrfModel"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.WORD_EMBEDDINGS]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     includeConfidence = Param(Params._dummy(), "includeConfidence",
+                               "Whether to include confidence scores in annotation metadata",
+                               TypeConverters.toBoolean)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfModel", java_model=None):
+         super(NerCrfModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     def setIncludeConfidence(self, b):
+         """Sets whether to include confidence scores in annotation metadata,
+         by default False.
+
+         Parameters
+         ----------
+         b : bool
+             Whether to include the confidence value in the output.
+         """
+         return self._set(includeConfidence=b)
+
+     @staticmethod
+     def pretrained(name="ner_crf", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "ner_crf"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         NerCrfModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(NerCrfModel, name, lang, remote_loc)
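
Beyond the doctests above, a fitted pipeline containing a NerCrfModel can be persisted with standard PySpark ML I/O. A minimal sketch, reusing the pipelineModel fitted in the NerCrfApproach example (the save path is illustrative):

    from pyspark.ml import PipelineModel

    # Persist all fitted stages, including the trained CRF model
    pipelineModel.write().overwrite().save("/tmp/ner_crf_pipeline")

    # Reload later without re-training and reuse for inference
    restored = PipelineModel.load("/tmp/ner_crf_pipeline")
    restored.transform(trainingData).select("ner.result").show(truncate=False)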