spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,194 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for XlmRoBertaSentenceEmbeddings."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
+ class XlmRoBertaSentenceEmbeddings(AnnotatorModel,
20
+ HasEmbeddingsProperties,
21
+ HasCaseSensitiveProperties,
22
+ HasStorageRef,
23
+ HasBatchedAnnotate,
24
+ HasEngine,
25
+ HasMaxSentenceLengthLimit):
26
+ """Sentence-level embeddings using XLM-RoBERTa. The XLM-RoBERTa model was proposed in Unsupervised Cross-lingual
27
+ Representation Learning at Scale by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary,
28
+ Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based
29
+ on Facebook's RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of
30
+ filtered CommonCrawl data.
31
+
32
+ Pretrained models can be loaded with :meth:`.pretrained` of the companion
33
+ object:
34
+
35
+ >>> embeddings = XlmRoBertaSentenceEmbeddings.pretrained() \\
36
+ ... .setInputCols(["sentence"]) \\
37
+ ... .setOutputCol("sentence_embeddings")
38
+
39
+
40
+ The default model is ``"sent_xlm_roberta_base"``, if no name is provided.
41
+
42
+ For available pretrained models please see the
43
+ `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
44
+
45
+ ====================== =======================
46
+ Input Annotation types Output Annotation type
47
+ ====================== =======================
48
+ ``DOCUMENT`` ``SENTENCE_EMBEDDINGS``
49
+ ====================== =======================
50
+
51
+ Parameters
52
+ ----------
53
+ batchSize
54
+ Size of every batch, by default 8
55
+ caseSensitive
56
+ Whether to ignore case in tokens for embeddings matching, by default
57
+ True
58
+ dimension
59
+ Number of embedding dimensions, by default 768
60
+ maxSentenceLength
61
+ Max sentence length to process, by default 128
62
+ configProtoBytes
63
+ ConfigProto from tensorflow, serialized into byte array.
64
+
65
+ References
66
+ ----------
67
+ `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/pdf/1911.02116.pdf>`__
68
+
69
+ **Paper abstract:**
70
+
71
+ *This paper shows that pretraining multilingual language models at scale leads to significant performance gains
72
+ for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one
73
+ hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R,
74
+ significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8%
75
+ average accuracy on XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs
76
+ particularly well on low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over
77
+ the previous XLM model. We also present a detailed empirical evaluation of the key factors that are required to
78
+ achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the
79
+ performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of
80
+ multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong
81
+ monolingual models on the GLUE and XNLI benchmarks. We will make XLM-R code, data, and models publicly available.*
82
+
83
+ Examples
84
+ --------
85
+ >>> import sparknlp
86
+ >>> from sparknlp.base import *
87
+ >>> from sparknlp.annotator import *
88
+ >>> from pyspark.ml import Pipeline
89
+ >>> documentAssembler = DocumentAssembler() \\
90
+ ... .setInputCol("text") \\
91
+ ... .setOutputCol("document")
92
+ >>> sentence = SentenceDetector() \\
93
+ ... .setInputCols(["document"]) \\
94
+ ... .setOutputCol("sentence")
95
+ >>> embeddings = XlmRoBertaSentenceEmbeddings.pretrained() \\
96
+ ... .setInputCols(["sentence"]) \\
97
+ ... .setOutputCol("sentence_embeddings")
98
+ >>> embeddingsFinisher = EmbeddingsFinisher() \\
99
+ ... .setInputCols(["sentence_embeddings"]) \\
100
+ ... .setOutputCols("finished_embeddings") \\
101
+ ... .setOutputAsVector(True)
102
+ >>> pipeline = Pipeline().setStages([
103
+ ... documentAssembler,
104
+ ... sentence,
105
+ ... embeddings,
106
+ ... embeddingsFinisher
107
+ ... ])
108
+ >>> data = spark.createDataFrame([["John loves apples. Mary loves oranges. John loves Mary."]]).toDF("text")
109
+ >>> result = pipeline.fit(data).transform(data)
110
+ >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
111
+ +--------------------------------------------------------------------------------+
112
+ | result|
113
+ +--------------------------------------------------------------------------------+
114
+ |[-0.8951074481010437,0.13753940165042877,0.3108254075050354,-1.65693199634552...|
115
+ |[-0.6180210709571838,-0.12179657071828842,-0.191165953874588,-1.4497021436691...|
116
+ |[-0.822715163230896,0.7568016648292542,-0.1165061742067337,-1.59048593044281,...|
117
+ +--------------------------------------------------------------------------------+
118
+ """
119
+
120
+ name = "XlmRoBertaSentenceEmbeddings"
121
+
122
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
123
+
124
+ outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
125
+
126
+ configProtoBytes = Param(Params._dummy(),
127
+ "configProtoBytes",
128
+ "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
129
+ TypeConverters.toListInt)
130
+
131
+ def setConfigProtoBytes(self, b):
132
+ """Sets configProto from tensorflow, serialized into byte array.
133
+
134
+ Parameters
135
+ ----------
136
+ b : List[int]
137
+ ConfigProto from tensorflow, serialized into byte array
138
+ """
139
+ return self._set(configProtoBytes=b)
140
+
141
+ @keyword_only
142
+ def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.XlmRoBertaSentenceEmbeddings", java_model=None):
143
+ super(XlmRoBertaSentenceEmbeddings, self).__init__(
144
+ classname=classname,
145
+ java_model=java_model
146
+ )
147
+ self._setDefault(
148
+ dimension=768,
149
+ batchSize=8,
150
+ maxSentenceLength=128,
151
+ caseSensitive=True
152
+ )
153
+
154
+ @staticmethod
155
+ def loadSavedModel(folder, spark_session):
156
+ """Loads a locally saved model.
157
+
158
+ Parameters
159
+ ----------
160
+ folder : str
161
+ Folder of the saved model
162
+ spark_session : pyspark.sql.SparkSession
163
+ The current SparkSession
164
+
165
+ Returns
166
+ -------
167
+ XlmRoBertaSentenceEmbeddings
168
+ The restored model
169
+ """
170
+ from sparknlp.internal import _XlmRoBertaSentenceLoader
171
+ jModel = _XlmRoBertaSentenceLoader(folder, spark_session._jsparkSession)._java_obj
172
+ return XlmRoBertaSentenceEmbeddings(java_model=jModel)
173
+
174
+ @staticmethod
175
+ def pretrained(name="sent_xlm_roberta_base", lang="xx", remote_loc=None):
176
+ """Downloads and loads a pretrained model.
177
+
178
+ Parameters
179
+ ----------
180
+ name : str, optional
181
+ Name of the pretrained model, by default "sent_xlm_roberta_base"
182
+ lang : str, optional
183
+ Language of the pretrained model, by default "xx"
184
+ remote_loc : str, optional
185
+ Optional remote address of the resource, by default None. Will use
186
+ Spark NLPs repositories otherwise.
187
+
188
+ Returns
189
+ -------
190
+ XlmRoBertaSentenceEmbeddings
191
+ The restored model
192
+ """
193
+ from sparknlp.pretrained import ResourceDownloader
194
+ return ResourceDownloader.downloadModel(XlmRoBertaSentenceEmbeddings, name, lang, remote_loc)
@@ -0,0 +1,227 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for XlnetEmbeddings."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
class XlnetEmbeddings(AnnotatorModel,
                      HasEmbeddingsProperties,
                      HasCaseSensitiveProperties,
                      HasStorageRef,
                      HasBatchedAnnotate,
                      HasEngine,
                      HasMaxSentenceLengthLimit):
    """Word embeddings produced by XLNet: Generalized Autoregressive
    Pretraining for Language Understanding.

    XLNet is an unsupervised language-representation method built on a novel
    generalized permutation language-modeling objective. It uses
    Transformer-XL as its backbone, which makes it particularly strong on
    language tasks involving long context, and it achieves state-of-the-art
    results on downstream tasks such as question answering, natural language
    inference, sentiment analysis, and document ranking.

    These word embeddings are the outputs generated by the XLNet models:

    - ``"xlnet_large_cased"`` (`XLNet-Large
      <https://storage.googleapis.com/xlnet/released_models/cased_L-24_H-1024_A-16.zip>`__):
      24-layer, 1024-hidden, 16-heads

    - ``"xlnet_base_cased"`` (`XLNet-Base
      <https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip>`__):
      12-layer, 768-hidden, 12-heads. This model is trained on full data
      (different from the one in the paper).

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = XlnetEmbeddings.pretrained() \\
    ...     .setInputCols(["sentence", "token"]) \\
    ...     .setOutputCol("embeddings")

    If no name is provided, the default model is ``"xlnet_base_cased"``.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_xlnet.ipynb>`__.
    To see which models are compatible and how to import them see
    `Import Transformers into Spark NLP 🚀
    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Size of every batch, by default 8
    dimension
        Number of embedding dimensions, by default 768
    caseSensitive
        Whether to ignore case in tokens for embeddings matching, by default
        True
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    maxSentenceLength
        Max sentence length to process, by default 128

    Notes
    -----
    Compared to word-embedding modules that only perform lookups, this module
    is very computationally expensive; using an accelerator is recommended.

    References
    ----------
    `XLNet: Generalized Autoregressive Pretraining for Language Understanding
    <https://arxiv.org/abs/1906.08237>`__

    https://github.com/zihangdai/xlnet

    **Paper abstract:**

    *With the capability of modeling bidirectional contexts, denoising
    autoencoding based pretraining like BERT achieves better performance than
    pretraining approaches based on autoregressive language modeling. However,
    relying on corrupting the input with masks, BERT neglects dependency between
    the masked positions and suffers from a pretrain-finetune discrepancy. In
    light of these pros and cons, we propose XLNet, a generalized autoregressive
    pretraining method that (1) enables learning bidirectional contexts by
    maximizing the expected likelihood over all permutations of the
    factorization order and (2) overcomes the limitations of BERT thanks to its
    autoregressive formulation. Furthermore, XLNet integrates ideas from
    Transformer-XL, the state-of-the-art autoregressive model, into pretraining.
    Empirically, under comparable experiment settings, XLNet outperforms BERT on
    20 tasks, often by a large margin, including question answering, natural
    language inference, sentiment analysis, and document ranking.*

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = XlnetEmbeddings.pretrained() \\
    ...     .setInputCols(["token", "document"]) \\
    ...     .setOutputCol("embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True) \\
    ...     .setCleanAnnotations(False)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     embeddings,
    ...     embeddingsFinisher
    ... ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[-0.6287205219268799,-0.4865287244319916,-0.186111718416214,0.234187275171279...|
    |[-1.1967450380325317,0.2746637463569641,0.9481253027915955,0.3431355059146881...|
    |[-1.0777631998062134,-2.092679977416992,-1.5331977605819702,-1.11190271377563...|
    |[-0.8349916934967041,-0.45627787709236145,-0.7890847325325012,-1.028069257736...|
    |[-0.134845569729805,-0.11672890186309814,0.4945235550403595,-0.66587203741073...|
    +--------------------------------------------------------------------------------+
    """

    name = "XlnetEmbeddings"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    configProtoBytes = Param(
        Params._dummy(),
        "configProtoBytes",
        "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
        TypeConverters.toListInt)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings", java_model=None):
        """Internal constructor; prefer :meth:`.pretrained` or :meth:`.loadSavedModel`."""
        super(XlnetEmbeddings, self).__init__(classname=classname, java_model=java_model)
        # Defaults mirror the Scala annotator: base-model hidden size and limits.
        self._setDefault(batchSize=8, dimension=768, maxSentenceLength=128, caseSensitive=True)

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        XlnetEmbeddings
            The restored model
        """
        # Local import to avoid a circular dependency at module import time.
        from sparknlp.internal import _XlnetLoader
        loader = _XlnetLoader(folder, spark_session._jsparkSession)
        return XlnetEmbeddings(java_model=loader._java_obj)

    @staticmethod
    def pretrained(name="xlnet_base_cased", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "xlnet_base_cased"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        XlnetEmbeddings
            The restored model
        """
        # Local import to avoid a circular dependency at module import time.
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(XlnetEmbeddings, name, lang, remote_loc)
@@ -0,0 +1,16 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Module of annotators for entity extraction."""
16
+ from sparknlp.annotator.er.entity_ruler import *