spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,299 @@
1
+ import os
2
+ import re
3
+
4
+ from sparknlp.internal import _ResourceHelper_moveFile
5
+ from sparknlp.training._tf_graph_builders.ner_dl.create_graph import create_graph
6
+
7
+
8
class WrongTFVersion(Exception):
    """Raised when the installed TensorFlow version is neither 1.15 nor 2.x."""
10
+
11
+
12
class TensorflowAddonsNeeded(Exception):
    """Raised when TensorFlow 2.x is present but tensorflow_addons is missing."""
14
+
15
+
16
class TFGraphBuilder:
    """Generic base class for building TensorFlow graphs for Spark NLP annotators
    ('ner_dl', 'generic_classifier', 'assertion_dl', 'relation_extraction').

    A builder is configured with a dict of user-supplied build parameters.
    Subclasses describe one model family by overriding
    :meth:`get_model_build_params` (defaults; ``None`` marks a required
    parameter), :meth:`get_model_build_param_explanations`, and, when
    automatic filename generation is supported, :meth:`get_model_filename`.

    Examples
    --------
    >>> from sparknlp.training.tfgraphs import tf_graph
    >>>
    >>> tf_graph.get_models()
    """

    def __init__(self, build_params):
        # Raw user-supplied parameters. Model defaults are merged in lazily
        # by get_build_param()/get_build_params_with_defaults().
        self.__build_params = build_params

    def supports_auto_file_name(self):
        """Return True when the builder can derive the output filename itself."""
        return False

    def get_model_filename(self):
        """Return the auto-generated graph filename (subclass responsibility)."""
        raise NotImplementedError("Not implemented.")

    def check_build_params(self):
        """Validate user parameters against the model's requirements.

        Raises
        ------
        Exception
            If a required parameter (default ``None``) was not supplied.
        """
        build_params = self.get_build_params()
        required_params = self.get_model_build_params()

        for req_param, default in required_params.items():
            # A None default marks the parameter as mandatory.
            if req_param not in build_params and default is None:
                raise Exception(f"You need to specify a value for {req_param} in the build parameters.")

    def get_build_params(self):
        """Return the stored build-parameter dict."""
        return self.__build_params

    def get_build_params_with_defaults(self):
        """Return the build params with model defaults filled in for absent keys.

        NOTE: this mutates (and returns) the stored dict rather than a copy,
        so subsequent get_build_params() calls also see the merged defaults.
        """
        build_params = self.get_build_params()
        req_build_params = self.get_model_build_params()

        for req_param, default in req_build_params.items():
            if req_param not in build_params and default is not None:
                build_params[req_param] = default

        return build_params

    def get_build_param(self, build_param):
        """Return a single parameter value, falling back to the model default.

        Raises
        ------
        Exception
            If the parameter was not supplied and has no non-None default.
        """
        build_params = self.get_build_params()

        if build_param in build_params:
            return build_params[build_param]

        required_params = self.get_model_build_params()

        if build_param in required_params and required_params[build_param] is not None:
            return required_params[build_param]

        raise Exception(f"No value for {build_param} found.")

    def get_model_build_params(self):
        """Return the model's parameter defaults (``None`` = required, no default)."""
        return {}

    def get_model_build_param_explanations(self):
        """Return a human-readable description for each build parameter."""
        return {}
77
+
78
+
79
class NerTFGraphBuilder(TFGraphBuilder):
    """Builds the TF graphs consumed by NerDLApproach / MedicalNerApproach.

    Examples
    --------
    >>> from sparknlp.training.tfgraphs import tf_graph
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> tf_graph.build("ner_dl",
    ...                build_params={"embeddings_dim": 200, "nchars": 83, "ntags": 12},
    ...                model_location="./medical_ner_graphs", model_filename="auto")
    >>> nerTagger = MedicalNerApproach() \\
    ...     .setInputCols(["sentence", "token", "embeddings"]) \\
    ...     .setLabelColumn("label") \\
    ...     .setOutputCol("ner") \\
    ...     .setMaxEpochs(2) \\
    ...     .setBatchSize(64) \\
    ...     .setRandomSeed(0) \\
    ...     .setVerbose(1) \\
    ...     .setValidationSplit(0.2) \\
    ...     .setEvaluationLogExtended(True) \\
    ...     .setEnableOutputLogs(True) \\
    ...     .setIncludeConfidence(True) \\
    ...     .setOutputLogsPath('ner_logs') \\
    ...     .setGraphFolder('medical_ner_graphs') \\
    ...     .setEnableMemoryOptimizer(True)
    """

    def supports_auto_file_name(self):
        """The output filename can be derived from the graph dimensions."""
        return True

    def get_model_filename(self):
        """Return the canonical filename, e.g. ``blstm_12_200_128_83.pb``."""
        return "blstm_{}_{}_{}_{}.pb".format(
            self.get_build_param("ntags"),
            self.get_build_param("embeddings_dim"),
            self.get_build_param("lstm_size"),
            self.get_build_param("nchars"),
        )

    def get_model_build_params(self):
        # "ntags" has no default and is therefore required.
        return {
            "ntags": None,
            "embeddings_dim": 200,
            "nchars": 100,
            "lstm_size": 128,
            "gpu_device": 0
        }

    def get_model_build_param_explanations(self):
        return {
            "ntags": "Number of tags.",
            "embeddings_dim": "Embeddings dimension.",
            "nchars": "Number of chars.",
            "gpu_device": "Device for training.",
            "lstm_size": "Number of LSTM units."
        }

    def build(self, model_location, model_filename):
        """Create the NER graph at *model_location* under *model_filename*.

        A remote URI (anything matching ``scheme://``) is handled by building
        into a local temporary directory first and then moving the file.
        """
        # Previously the create_graph(...) call was duplicated in both
        # branches; build once into the effective location instead.
        is_remote = re.match(r'(\w+)://.*', model_location) is not None
        build_location = "/tmp/nerModel" if is_remote else model_location

        create_graph(
            model_location=build_location,
            model_filename=model_filename,
            ntags=self.get_build_param("ntags"),
            embeddings_dim=self.get_build_param("embeddings_dim"),
            nchars=self.get_build_param("nchars"),
            lstm_size=self.get_build_param("lstm_size"),
            gpu_device=self.get_build_param("gpu_device"),
            is_medical=False,
        )

        if is_remote:
            file_location = os.path.join(build_location, model_filename)
            _ResourceHelper_moveFile(file_location, model_location).apply()
166
+
167
+
168
class TFGraphBuilderFactory:
    """Factory that creates the TensorFlow graphs for supported Spark NLP models
    (ner_dl, generic_classifier, assertion_dl, relation_extraction).
    """

    # Registry of supported model names -> builder classes.
    __model_builders = {
        "ner_dl": NerTFGraphBuilder
    }

    @staticmethod
    def get_models():
        """Return the list of model names this factory can build graphs for.

        Examples
        --------
        >>> from sparknlp.training.tfgraphs import tf_graph
        >>> tf_graph.get_models()
        """
        return list(TFGraphBuilderFactory.__model_builders.keys())

    @staticmethod
    def build(model_name, build_params, model_location, model_filename="auto"):
        """Create and export the TF graph for *model_name*.

        Parameters
        ----------
        model_name: str
            The name of the tf model that you want to build. Available models:
            ner_dl, generic_classifier, assertion_dl and relation_extraction.
        build_params: dict
            Configuration params to build the tf graph for the specific model.
        model_location: str
            Path where the model will be saved.
        model_filename: str
            Name of the .pb file. If "auto", the filename will be generated.

        Examples
        --------
        >>> from sparknlp.training.tfgraphs import tf_graph
        >>> tf_graph.build("ner_dl", build_params={"embeddings_dim": 200, "nchars": 83,"ntags": 12},model_location="./ner_graphs",model_filename="auto")
        """
        # Validate the TensorFlow installation: 1.15 or 2.x is required, and
        # TF 2.x additionally needs tensorflow_addons for contrib ops.
        try:
            import tensorflow as tf

            if tf.__version__[0] == '2':
                try:
                    import tensorflow_addons
                except ModuleNotFoundError:
                    raise TensorflowAddonsNeeded()

            if not (tf.__version__.startswith("1.15") or tf.__version__[0] == '2'):
                raise WrongTFVersion()

        except WrongTFVersion:
            # BUGFIX: print the version string; `tf.version` is a module object.
            print(tf.__version__)
            raise Exception("Tensorflow 2.xx or 1.15 is required to build model graphs.")

        except ModuleNotFoundError:
            raise Exception("You need to install Tensorflow 2.xx or 1.15 to be able to build model graphs")

        except TensorflowAddonsNeeded:
            raise Exception("You need to install tensorflow_addons to be able to generate graphs in Tensorflow 2.x")

        if model_name not in TFGraphBuilderFactory.__model_builders:
            raise Exception(f"Can't build a graph for {model_name}: model not supported.")

        model = TFGraphBuilderFactory.__model_builders[model_name](build_params)
        model.check_build_params()

        if model_filename == "auto":
            if not model.supports_auto_file_name():
                msg = f"""
                {model_name} doesn't support automatic filename generation, please specify the filename of the
                output graph
                """.strip()
                raise Exception(msg)
            else:
                model_filename = model.get_model_filename()

            model.build(model_location, model_filename)
        else:
            if re.match(r'(\w+)://.*', model_location):
                # Remote URI: build into a local temp dir, then move the file.
                tmp_location = "/tmp/relationModel"
                model.build(tmp_location, model_filename)

                file_location = os.path.join(tmp_location, model_filename)
                _ResourceHelper_moveFile(file_location, model_location).apply()
            else:
                model.build(model_location, model_filename)

        print("{} graph exported to {}/{}".format(model_name, model_location, model_filename))

    @staticmethod
    def print_model_params(model_name):
        """Print a table of the build parameters accepted by *model_name*,
        with the required flag, default value and description of each.

        Parameters
        ----------
        model_name: str
            The name of the tf model. Available models: ner_dl,
            generic_classifier, assertion_dl and relation_extraction.

        Examples
        --------
        >>> from sparknlp.training import tf_graph
        >>> tf_graph.print_model_params("ner_dl")
        """
        if model_name not in TFGraphBuilderFactory.get_models():
            raise Exception(f"Model {model_name} not supported.")

        model = TFGraphBuilderFactory.__model_builders[model_name]({})
        model_params = model.get_model_build_params()
        model_params_descr = model.get_model_build_param_explanations()

        print(f"{model_name} parameters.")
        print("{:<20} {:<10} {:<20} {}".format("Parameter", "Required", "Default value", "Description"))
        for param, default in model_params.items():
            if isinstance(default, (list, tuple)):
                default_value = "[" + ", ".join(map(str, default)) + "]"
            else:
                default_value = default

            print("{:<20} {:<10} {:<20} {}".format(
                param,
                "yes" if default_value is None else "no",
                default_value if default_value is not None else "-",
                model_params_descr.get(param, "")
            ))
@@ -0,0 +1,41 @@
1
+ import tensorflow.compat.v1 as tf
2
+
3
+ from .ner_model import NerModel
4
+
5
+
6
def create_graph(
        model_location,
        ntags,
        embeddings_dim,
        nchars,
        lstm_size=128,
        model_filename=None,
        gpu_device=0,
        is_medical=False
):
    """Build a char-CNN/BLSTM NER graph and serialize it as a .pb file.

    Parameters
    ----------
    model_location : str
        Directory the graph file is written to. ``dbfs:/`` paths are
        rewritten to their ``/dbfs/`` local mount before writing.
    ntags : int
        Number of output tags.
    embeddings_dim : int
        Dimension of the pre-trained word embeddings.
    nchars : int
        Size of the character vocabulary.
    lstm_size : int
        Number of LSTM units (default 128).
    model_filename : str or None
        Output filename; when None it is derived from the graph dimensions
        (``blstm_<ntags>_<embeddings_dim>_<lstm_size>_<nchars>.pb``).
    gpu_device : int
        GPU device index handed to NerModel for op placement.
    is_medical : bool
        Selects healthcare-specific inference/training node names.
    """
    # Graph construction relies on TF1 semantics (sessions, collections).
    tf.disable_v2_behavior()
    tf.enable_v2_tensorshape()
    tf.reset_default_graph()

    if model_filename is None:
        # Encode the graph dimensions in the filename, e.g. blstm_12_200_128_83.pb
        model_filename = 'blstm' + '_{}_{}_{}_{}'.format(ntags, embeddings_dim, lstm_size, nchars) + '.pb'

    with tf.Session() as session:
        # NOTE(review): NerModel is given session=None and appears to create
        # its own session (ner.session is used below), so this outer `with`
        # session seems unused — confirm whether it is still required.
        ner = NerModel(session=None, use_gpu_device=gpu_device)
        ner.add_cnn_char_repr(nchars, 25, 30)
        ner.add_bilstm_char_repr(nchars, 25, 30)
        ner.add_pretrained_word_embeddings(embeddings_dim)
        ner.add_context_repr(ntags, lstm_size, 3)
        # Node names differ between the healthcare and open-source graphs.
        ner.add_inference_layer(True, "predictions" if is_medical else "cond_2/Merge")
        ner.add_training_op(5, "train" if is_medical else None)
        ner.init_variables()
        tf.train.Saver()

        if model_location.startswith("dbfs:"):
            graph_location = model_location.replace("dbfs:/", "/dbfs/")
        else:
            graph_location = model_location

        # Write the model's own session graph as a binary protobuf (as_text=False).
        tf.io.write_graph(ner.session.graph, graph_location, model_filename, False)
        ner.close()
        session.close()
@@ -0,0 +1,78 @@
1
+ import random
2
+ import string
3
+
4
+ import numpy as np
5
+
6
+
7
class DatasetEncoder:
    """Encodes (word, tag) sentences into NER training features.

    Each sentence must be a sequence of ``(word, tag)`` tuples. Words are
    resolved to (possibly sub-word) embedding pieces by the supplied
    resolver; tags and characters are mapped to integer ids, with the tag
    table grown on the fly as new tags are seen.
    """

    def __init__(self, embeddings_resolver, tag2id=None, piece_tag='[X]'):
        # Character ids start at 1, one per char of string.printable.
        self.char2id = {char: idx + 1 for idx, char in enumerate(string.printable)}
        self.tag2id = {'O': 0} if tag2id is None else tag2id
        self.embeddings_resolver = embeddings_resolver
        self.piece_tag = piece_tag

    def shuffle(self):
        # NOTE(review): relies on a `self.sentences` attribute that this class
        # never assigns — presumably set externally; confirm before calling.
        random.shuffle(self.sentences)

    @staticmethod
    def normalize(word):
        """Trim surrounding whitespace and lower-case *word*."""
        return word.strip().lower()

    def get_char_indexes(self, word):
        """Map each character of *word* to its id (unknown chars share the last id)."""
        unknown_id = len(self.char2id) - 1
        return [self.char2id.get(char, unknown_id) for char in word]

    def encode(self, sentences, output=False):
        """Yield one feature dict per non-empty sentence in *sentences*."""
        for sentence in sentences:
            resolved = self.embeddings_resolver.resolve_sentence(
                [word for (word, _) in sentence])

            words, tags, tag_ids = [], [], []
            char_ids, starts, vectors = [], [], []

            word_idx = 0
            for piece in resolved:
                words.append(piece.piece)

                if piece.is_word_start:
                    # Word-start pieces consume the next (word, tag) pair.
                    assert word_idx < len(sentence), \
                        'i = {} is more or equal than length of {}, during zip with {}'.format(word_idx,
                                                                                               sentence,
                                                                                               resolved)
                    tag = sentence[word_idx][1]
                    word_idx += 1
                else:
                    # Continuation pieces get the dedicated piece tag.
                    tag = self.piece_tag

                # Grow the tag table on first sight of a new tag.
                tag_id = self.tag2id.setdefault(tag, len(self.tag2id))

                tags.append(tag)
                tag_ids.append(tag_id)
                vectors.append(piece.vector)
                starts.append(piece.is_word_start)
                char_ids.append(self.get_char_indexes(piece.piece))

            if sentence:
                yield {
                    "words": words,
                    "tags": tags,
                    "char_ids": char_ids,
                    "tag_ids": tag_ids,
                    "is_word_start": starts,
                    "word_embeddings": np.array(vectors, dtype=np.float16)
                }