spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
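
Note the removals near the bottom of the list (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, and the 2.x .pyc artifacts): the flat single-module layout of 2.6.x was split into the sparknlp.annotator, sparknlp.base, and sparknlp.common subpackages added above, whose __init__.py files re-export the public classes, so 2.x-style imports should still resolve against the 6.2.1 wheel. A minimal sanity check, assuming the new wheel is installed locally:

>>> import sparknlp
>>> from sparknlp.base import DocumentAssembler   # formerly defined in sparknlp/base.py
>>> from sparknlp.annotator import Tokenizer      # formerly defined in sparknlp/annotator.py
>>> sparknlp.version()
'6.2.1'

Three representative hunks from the new classifier_dl modules (entries 37-39 in the list above, matched by their +175/+193/+168 line counts) follow.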
sparknlp/annotator/classifier_dl/deberta_for_token_classification.py
@@ -0,0 +1,175 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for DeBertaForTokenClassification."""
+
+ from sparknlp.common import *
+
+
+ class DeBertaForTokenClassification(AnnotatorModel,
+                                     HasCaseSensitiveProperties,
+                                     HasBatchedAnnotate,
+                                     HasEngine,
+                                     HasMaxSentenceLengthLimit):
+     """DeBertaForTokenClassification can load DeBERTa v2 & v3 models with a token
+     classification head on top (a linear layer on top of the hidden-states
+     output), e.g. for Named-Entity-Recognition (NER) tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = DeBertaForTokenClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+
+     The default model is ``"deberta_v3_xsmall_token_classifier_conll03"``, if no name is
+     provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``NAMED_ENTITY``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Larger values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> tokenClassifier = DeBertaForTokenClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label") \\
+     ...     .setCaseSensitive(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     tokenClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["John Lenon was born in London and lived in Paris. My name is Sarah and I live in London"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("label.result").show(truncate=False)
+     +------------------------------------------------------------------------------------+
+     |result                                                                              |
+     +------------------------------------------------------------------------------------+
+     |[B-PER, I-PER, O, O, O, B-LOC, O, O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O, B-LOC]|
+     +------------------------------------------------------------------------------------+
+     """
+     name = "DeBertaForTokenClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DeBertaForTokenClassification",
+                  java_model=None):
+         super(DeBertaForTokenClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         DeBertaForTokenClassification
+             The restored model
+         """
+         from sparknlp.internal import _DeBertTokenClassifierLoader
+         jModel = _DeBertTokenClassifierLoader(folder, spark_session._jsparkSession)._java_obj
+         return DeBertaForTokenClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="deberta_v3_xsmall_token_classifier_conll03", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "deberta_v3_xsmall_token_classifier_conll03"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         DeBertaForTokenClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(DeBertaForTokenClassification, name, lang, remote_loc)
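
The loadSavedModel path above complements pretrained: it imports a model exported locally (for example, a DeBERTa token classifier converted from Hugging Face per the discussion linked in the docstring) instead of downloading one from the Models Hub. A minimal sketch; the folder paths are hypothetical and spark is an active SparkSession:

>>> from sparknlp.annotator import DeBertaForTokenClassification
>>> tokenClassifier = DeBertaForTokenClassification.loadSavedModel("/models/deberta_ner", spark) \
...     .setInputCols(["token", "document"]) \
...     .setOutputCol("label")
>>> # persist the imported model using standard Spark ML persistence
>>> tokenClassifier.write().overwrite().save("/models/deberta_ner_spark_nlp")

Once saved this way, the annotator can be reloaded with DeBertaForTokenClassification.load(...) like any other Spark ML stage.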
sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py
@@ -0,0 +1,193 @@
+ # Copyright 2017-2023 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for DeBertaForZeroShotClassification."""
+
+ from sparknlp.common import *
+
+
+ class DeBertaForZeroShotClassification(AnnotatorModel,
+                                        HasCaseSensitiveProperties,
+                                        HasBatchedAnnotate,
+                                        HasClassifierActivationProperties,
+                                        HasCandidateLabelsProperties,
+                                        HasEngine,
+                                        HasMaxSentenceLengthLimit):
+     """DeBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language
+     inference) tasks. Equivalent of `DeBertaForSequenceClassification` models, but these models don't require a
+     hardcoded number of potential classes; they can be chosen at runtime. This is usually slower, but much more
+     flexible.
+     Any combination of sequences and labels can be passed, and each combination will be posed as a premise/hypothesis
+     pair and fed to the pretrained model.
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+     >>> sequenceClassifier = DeBertaForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+     The default model is ``"deberta_base_zero_shot_classifier_mnli_anli_v3"``, if no name is
+     provided.
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Text+Classification>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CATEGORY``
+     ====================== ======================
+     Parameters
+     ----------
+     batchSize
+         Batch size. Larger values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     coalesceSentences
+         Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+         class per document by averaging probabilities in all sentences, by
+         default False
+     activation
+         Whether to calculate logits via Softmax or Sigmoid, by default
+         `"softmax"`.
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> sequenceClassifier = DeBertaForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label") \\
+     ...     .setCaseSensitive(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     sequenceClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["I loved this movie when I was a child."], ["It was pretty boring."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("label.result").show(truncate=False)
+     +------+
+     |result|
+     +------+
+     |[pos] |
+     |[neg] |
+     +------+
+     """
+     name = "DeBertaForZeroShotClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CATEGORY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setCoalesceSentences(self, value):
+         """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging
+         probabilities in all sentences. Due to the max sequence length limit in almost all transformer models such as
+         DeBERTa (512 tokens), this parameter helps to feed all the sentences into the model and average all the
+         probabilities for the entire document instead of per sentence. (Default: False)
+         Parameters
+         ----------
+         value : bool
+             If the output of all sentences will be averaged to one output
+         """
+         return self._set(coalesceSentences=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DeBertaForZeroShotClassification",
+                  java_model=None):
+         super(DeBertaForZeroShotClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True,
+             coalesceSentences=False,
+             activation="softmax"
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         Returns
+         -------
+         DeBertaForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.internal import _DeBertaForZeroShotClassification
+         jModel = _DeBertaForZeroShotClassification(folder, spark_session._jsparkSession)._java_obj
+         return DeBertaForZeroShotClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="deberta_base_zero_shot_classifier_mnli_anli_v3", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "deberta_base_zero_shot_classifier_mnli_anli_v3"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+         Returns
+         -------
+         DeBertaForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(DeBertaForZeroShotClassification, name, lang, remote_loc)
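
Since DeBertaForZeroShotClassification mixes in HasCandidateLabelsProperties (see the base-class list in the hunk above), the label set is supplied at runtime rather than baked into the model; that is what "chosen at runtime" means in the docstring, and it is the step the docstring example omits. A hedged sketch, with purely illustrative candidate labels:

>>> zeroShotClassifier = DeBertaForZeroShotClassification.pretrained() \
...     .setInputCols(["token", "document"]) \
...     .setOutputCol("label") \
...     .setCandidateLabels(["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"])

Each input text is then scored against every candidate label as a premise/hypothesis pair, and the best label under the configured activation is returned.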
sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py
@@ -0,0 +1,168 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from sparknlp.common import *
+
+
+ class DistilBertForQuestionAnswering(AnnotatorModel,
+                                      HasCaseSensitiveProperties,
+                                      HasBatchedAnnotate,
+                                      HasEngine,
+                                      HasMaxSentenceLengthLimit):
+     """DistilBertForQuestionAnswering can load DistilBERT models with a span classification head on top for extractive
+     question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
+     logits and span end logits).
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> spanClassifier = DistilBertForQuestionAnswering.pretrained() \\
+     ...     .setInputCols(["document_question", "document_context"]) \\
+     ...     .setOutputCol("answer")
+
+     The default model is ``"distilbert_base_cased_qa_squad2"``, if no name is
+     provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Question+Answering>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, DOCUMENT`` ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Larger values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         False
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = MultiDocumentAssembler() \\
+     ...     .setInputCols(["question", "context"]) \\
+     ...     .setOutputCols(["document_question", "document_context"])
+     >>> spanClassifier = DistilBertForQuestionAnswering.pretrained() \\
+     ...     .setInputCols(["document_question", "document_context"]) \\
+     ...     .setOutputCol("answer") \\
+     ...     .setCaseSensitive(False)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     spanClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("answer.result").show(truncate=False)
+     +--------------------+
+     |result              |
+     +--------------------+
+     |[Clara]             |
+     +--------------------+
+     """
+     name = "DistilBertForQuestionAnswering"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForQuestionAnswering",
+                  java_model=None):
+         super(DistilBertForQuestionAnswering, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=False
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         DistilBertForQuestionAnswering
+             The restored model
+         """
+         from sparknlp.internal import _DistilBertQuestionAnsweringLoader
+         jModel = _DistilBertQuestionAnsweringLoader(folder, spark_session._jsparkSession)._java_obj
+         return DistilBertForQuestionAnswering(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="distilbert_base_cased_qa_squad2", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "distilbert_base_cased_qa_squad2"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         DistilBertForQuestionAnswering
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(DistilBertForQuestionAnswering, name, lang, remote_loc)
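
For interactive, one-off question answering, the fitted pipeline can also be wrapped in a LightPipeline so no DataFrame has to be built per query. A minimal sketch, assuming the two-argument fullAnnotate(question, context) form that LightPipeline exposes for multi-document pipelines:

>>> from sparknlp.base import LightPipeline
>>> model = pipeline.fit(data)
>>> light = LightPipeline(model)
>>> light.fullAnnotate("What's my name?", "My name is Clara and I live in Berkeley.")[0]["answer"]

This returns the answer annotations directly as Python objects, which suits interactive use and small batches better than transform().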