spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
File without changes
@@ -0,0 +1,277 @@
1
+ import os
2
+ import re
3
+
4
+ from sparknlp.internal import _ResourceHelper_moveFile
5
+ from sparknlp.training._tf_graph_builders.ner_dl.create_graph import create_graph
6
+
7
+
8
class WrongTFVersion(Exception):
    """Raised internally when the installed TensorFlow is not a 1.15.x release."""
10
+
11
+
12
class TFGraphBuilder:
    """
    Generic base class for creating the TensorFlow graphs of the 'ner_dl',
    'generic_classifier', 'assertion_dl' and 'relation_extraction' annotators
    in Spark NLP for Healthcare.

    Subclasses describe their parameters via :meth:`get_model_build_params`,
    where a ``None`` default marks a required parameter.

    Examples
    --------
    >>> from sparknlp.training.tfgraphs import tf_graph_1x
    >>> tf_graph_1x.get_models()

    """

    def __init__(self, build_params):
        self.__build_params = build_params

    def supports_auto_file_name(self):
        # Subclasses that can derive a file name from their params override this.
        return False

    def get_model_filename(self):
        raise Exception("Not implemented.")

    def check_build_params(self):
        """Raise if a required parameter (default ``None``) was not supplied."""
        supplied = self.get_build_params()
        for name, default in self.get_model_build_params().items():
            if name not in supplied and default is None:
                raise Exception(f"You need to specify a value for {name} in the build parameters.")

    def get_build_params(self):
        # Returns the stored dict itself (callers may observe later mutations).
        return self.__build_params

    def get_build_params_with_defaults(self):
        """Return the build params, filling in defaults in place.

        Note: this mutates (and returns) the stored build-params dict.
        """
        params = self.get_build_params()
        for name, default in self.get_model_build_params().items():
            if name not in params and default is not None:
                params[name] = default
        return params

    def get_build_param(self, build_param):
        """Return the supplied value for *build_param*, else its default.

        Raises if the parameter is unknown or required-but-missing.
        """
        params = self.get_build_params()
        if build_param in params:
            return params[build_param]

        defaults = self.get_model_build_params()
        if defaults.get(build_param) is not None:
            return defaults[build_param]

        raise Exception(f"No value for {build_param} found.")

    def get_model_build_params(self):
        # Mapping of parameter name -> default value; None means "required".
        return {}

    def get_model_build_param_explanations(self):
        # Mapping of parameter name -> human-readable description.
        return {}
73
+
74
class NerTFGraphBuilder(TFGraphBuilder):
    """
    Class to build the TF graphs for MedicalNerApproach.

    Examples
    --------
    >>> from sparknlp.training.tfgraphs import tf_graph_1x
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> tf_graph_1x.build("ner_dl", build_params={"embeddings_dim": 200, "nchars": 83,"ntags": 12},model_location="./ner_graphs",model_filename="auto")
    >>> nerTagger = NerDLApproach()\\
    >>>     .setInputCols(["sentence", "token", "embeddings"])\\
    >>>     .setLabelColumn("label")\\
    >>>     .setOutputCol("ner")\\
    >>>     .setGraphFolder('medical_ner_graphs')

    """

    def supports_auto_file_name(self):
        # The output file name can be derived from the build params.
        return True

    def get_model_filename(self):
        """Return the canonical graph file name derived from the build params."""
        return "blstm_{}_{}_{}_{}.pb".format(
            self.get_build_param("ntags"),
            self.get_build_param("embeddings_dim"),
            self.get_build_param("lstm_size"),
            self.get_build_param("nchars"),
        )

    def get_model_build_params(self):
        # None marks a required parameter with no default (see base class).
        return {
            "ntags": None,
            "embeddings_dim": 200,
            "nchars": 100,
            "lstm_size": 128,
            "gpu_device": 0
        }

    def get_model_build_param_explanations(self):
        return {
            "ntags": "Number of tags.",
            "embeddings_dim": "Embeddings dimension.",
            "nchars": "Number of chars.",
            "gpu_device": "Device for training.",
            "lstm_size": "Number of LSTM units."
        }

    def build(self, model_location, model_filename):
        """Create the NER graph at *model_location*.

        Parameters
        ----------
        model_location: str
            Local directory or remote URI (``scheme://...``) for the graph.
        model_filename: str
            Name of the output .pb file.
        """
        # A scheme prefix (e.g. hdfs://, dbfs://) means we cannot write there
        # directly: build into a local staging directory, then move the file.
        # (Previously the create_graph(...) call was duplicated in both
        # branches; it is now issued once with only the target switched.)
        is_remote = re.match(r'(\w+)://.*', model_location) is not None
        build_location = "/tmp/nerModel" if is_remote else model_location

        create_graph(
            model_location=build_location,
            model_filename=model_filename,
            ntags=self.get_build_param("ntags"),
            embeddings_dim=self.get_build_param("embeddings_dim"),
            nchars=self.get_build_param("nchars"),
            lstm_size=self.get_build_param("lstm_size"),
            gpu_device=self.get_build_param("gpu_device"),
            is_medical=False,
        )

        if is_remote:
            file_location = os.path.join(build_location, model_filename)
            _ResourceHelper_moveFile(file_location, model_location).apply()
163
+
164
+
165
class TFGraphBuilderFactory:
    """
    Factory class that creates the TensorFlow graphs available here (ner_dl).
    """

    # Registry of model name -> builder class.
    __model_builders = {
        "ner_dl": NerTFGraphBuilder
    }

    @staticmethod
    def get_models():
        """
        Return the names of the available TF models.

        Examples
        --------
        >>> from sparknlp.training.tfgraphs import tf_graph_1x
        >>> tf_graph_1x.get_models()

        """
        return list(TFGraphBuilderFactory.__model_builders.keys())

    @staticmethod
    def build(model_name, build_params, model_location, model_filename="auto"):
        """
        Create the TF graph and export it to ``model_location``.

        Parameters
        ----------
        model_name: str
            The name of the tf model that you want to build. Model available: ner_dl
        build_params: dict
            Configuration params to build the tf graph for the specific model.
        model_location: str
            Path where the model will be saved.
        model_filename: str
            Name of the .pb file. If set to "auto" the filename will be generated.

        Examples
        --------
        >>> from sparknlp.training.tfgraphs import tf_graph_1x
        >>> tf_graph_1x.build("ner_dl", build_params={"embeddings_dim": 200, "nchars": 83,"ntags": 12},model_location="./ner_graphs",model_filename="auto")

        """
        try:
            import tensorflow as tf

            # Graph building relies on TF1-style APIs; only 1.15.x is supported.
            if not tf.__version__.startswith("1.15"):
                raise WrongTFVersion()

        except WrongTFVersion:
            # BUGFIX: was `print(tf.version)`, which prints the `tensorflow.version`
            # module object instead of the installed version string.
            print(tf.__version__)
            raise Exception("Tensorflow v1.15 is required to build model graphs.")

        except ModuleNotFoundError:
            raise Exception("You need to install Tensorflow 1.15 to be able to build model graphs")

        if model_name not in TFGraphBuilderFactory.__model_builders:
            raise Exception(f"Can't build a graph for {model_name}: model not supported.")

        model = TFGraphBuilderFactory.__model_builders[model_name](build_params)
        model.check_build_params()

        if model_filename == "auto":
            if not model.supports_auto_file_name():
                # BUGFIX: the message was built from an indented triple-quoted
                # string, leaving a newline and a run of spaces in the middle.
                msg = (f"{model_name} doesn't support automatic filename generation, "
                       "please specify the filename of the output graph")
                raise Exception(msg)
            else:
                model_filename = model.get_model_filename()

        model.build(model_location, model_filename)
        print("{} graph exported to {}/{}".format(model_name, model_location, model_filename))

    @staticmethod
    def print_model_params(model_name):
        """
        Print the build parameters accepted by *model_name*: required flag,
        default value and description for each parameter.

        Parameters
        ----------
        model_name: str
            The name of the tf model. Model available: ner_dl

        Examples
        --------
        >>> from sparknlp.training.tfgraphs import tf_graph
        >>> tf_graph.print_model_params("ner_dl")

        """
        if model_name not in TFGraphBuilderFactory.get_models():
            raise Exception(f"Model {model_name} not supported.")

        model = TFGraphBuilderFactory.__model_builders[model_name]({})
        model_params = model.get_model_build_params()
        model_params_descr = model.get_model_build_param_explanations()

        print(f"{model_name} parameters.")
        print("{:<20} {:<10} {:<20} {}".format("Parameter", "Required", "Default value", "Description"))
        for param in model_params:
            # isinstance instead of `type(...) in [list, tuple]` (handles subclasses).
            if isinstance(model_params[param], (list, tuple)):
                default_value = "[" + ", ".join(map(str, model_params[param])) + "]"
            else:
                default_value = model_params[param]

            print("{:<20} {:<10} {:<20} {}".format(
                param,
                "yes" if default_value is None else "no",
                default_value if default_value is not None else "-",
                model_params_descr.get(param, "")
            ))
@@ -0,0 +1,34 @@
1
+ import tensorflow.compat.v1 as tf
2
+
3
+ from .ner_model import NerModel
4
+
5
+
6
def create_graph(
    model_location,
    ntags,
    embeddings_dim,
    nchars,
    lstm_size=128,
    model_filename=None,
    gpu_device=0,
    is_medical=False
):
    """Build a BLSTM-based NER TensorFlow graph and serialize it to disk.

    Parameters
    ----------
    model_location : str
        Directory where the serialized graph file is written.
    ntags : int
        Number of output tags.
    embeddings_dim : int
        Dimension of the pretrained word embeddings.
    nchars : int
        Size of the character vocabulary.
    lstm_size : int
        Number of LSTM units (default 128).
    model_filename : str or None
        Output file name; if None a name is derived from the graph shape.
    gpu_device : int
        GPU device index passed to NerModel.
    is_medical : bool
        When True, names the inference/training ops "predictions"/"train";
        otherwise the ops keep NerModel's defaults.
    """
    # Force TF1 graph-mode semantics even under a TF2 runtime.
    tf.disable_v2_behavior()
    tf.reset_default_graph()

    if model_filename is None:
        # Encode the graph shape into the name: blstm_<ntags>_<dim>_<lstm>_<nchars>.pb
        model_filename = 'blstm' + '_{}_{}_{}_{}'.format(ntags, embeddings_dim, lstm_size, nchars) + '.pb'

    with tf.Session() as session:
        # NOTE(review): session=None here, and the graph written below comes
        # from ner.session — NerModel appears to manage its own session;
        # confirm against NerModel before relying on `session`.
        ner = NerModel(session=None, use_gpu_device=gpu_device)
        ner.add_cnn_char_repr(nchars, 25, 30)
        ner.add_bilstm_char_repr(nchars, 25, 30)
        ner.add_pretrained_word_embeddings(embeddings_dim)
        ner.add_context_repr(ntags, lstm_size, 3)
        ner.add_inference_layer(True, "predictions" if is_medical else None)
        ner.add_training_op(5, "train" if is_medical else None)
        ner.init_variables()
        # NOTE(review): `saver` is never used afterwards; constructing a
        # tf.train.Saver may add save/restore ops to the default graph before
        # serialization — confirm before removing this line.
        saver = tf.train.Saver()
        # Final False = write the graph as binary protobuf, not text.
        tf.io.write_graph(ner.session.graph, model_location, model_filename, False)
        ner.close()
        session.close()
@@ -0,0 +1,78 @@
1
+ import random
2
+ import string
3
+
4
+ import numpy as np
5
+
6
+
7
class DatasetEncoder:
    """Encodes tagged sentences into numeric features for NER training.

    Each input sentence must be a sequence of ``(word, tag)`` tuples; words
    are expanded into sub-word pieces by *embeddings_resolver* and non-initial
    pieces receive *piece_tag*.
    """

    def __init__(self, embeddings_resolver, tag2id=None, piece_tag='[X]'):
        # char2id assigns ids 1..N over string.printable; 0 stays unused.
        self.char2id = {char: idx + 1 for idx, char in enumerate(string.printable)}
        # Avoid a shared mutable default for tag2id; 'O' is always id 0.
        self.tag2id = {'O': 0} if tag2id is None else tag2id
        self.embeddings_resolver = embeddings_resolver
        self.piece_tag = piece_tag

    def shuffle(self):
        # NOTE(review): self.sentences is never assigned anywhere in this
        # class, so calling shuffle() raises AttributeError unless a caller
        # sets it externally — confirm intent.
        random.shuffle(self.sentences)

    @staticmethod
    def normalize(word):
        """Return the stripped, lower-cased form of *word*."""
        return word.strip().lower()

    def get_char_indexes(self, word):
        """Map each character of *word* to its id in ``char2id``."""
        # Unknown characters fall back to the highest assigned id.
        fallback = len(self.char2id) - 1
        return [self.char2id.get(char, fallback) for char in word]

    def encode(self, sentences, output=False):
        """Yield one feature dict per non-empty sentence.

        Side effect: unseen tags are added to ``self.tag2id`` as they appear.
        (`output` is accepted for interface compatibility but unused here.)
        """
        for sentence in sentences:
            tokens = [word for (word, tag) in sentence]
            pieces = self.embeddings_resolver.resolve_sentence(tokens)

            words, tags, tag_ids = [], [], []
            char_ids, is_word_start, embeddings = [], [], []
            # Index of the next original word whose tag we have not consumed.
            next_word = 0

            for piece in pieces:
                words.append(piece.piece)

                if piece.is_word_start:
                    # Word-initial pieces take the next tag from the sentence.
                    assert next_word < len(sentence), \
                        'i = {} is more or equal than length of {}, during zip with {}'.format(next_word,
                                                                                               sentence,
                                                                                               pieces)
                    tag = sentence[next_word][1]
                    next_word += 1
                else:
                    # Continuation pieces get the dedicated sub-word tag.
                    tag = self.piece_tag

                # Register the tag on first sight; re-registering is a no-op.
                tag_id = self.tag2id.get(tag, len(self.tag2id))
                self.tag2id[tag] = tag_id

                tags.append(tag)
                tag_ids.append(tag_id)
                embeddings.append(piece.vector)
                is_word_start.append(piece.is_word_start)
                char_ids.append(self.get_char_indexes(piece.piece))

            if len(sentence) > 0:
                yield {
                    "words": words,
                    "tags": tags,
                    "char_ids": char_ids,
                    "tag_ids": tag_ids,
                    "is_word_start": is_word_start,
                    "word_embeddings": np.array(embeddings, dtype=np.float16)
                }