spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
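The listing shows the flat 2.6.x modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py) removed in favour of the sparknlp.annotator, sparknlp.base and sparknlp.common subpackages. Below is a minimal sketch of what code written against the old import surface looks like under 6.2.1, assuming the new subpackages keep re-exporting the same class names at the same paths (the two file diffs that follow are consistent with this, but it has not been checked for every annotator):

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, Normalizer
from pyspark.ml import Pipeline

# sparknlp.start() exists in both 2.6.x and 6.x and returns a SparkSession
# with the matching Spark NLP jar on the classpath.
spark = sparknlp.start()

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")

pipeline = Pipeline(stages=[documentAssembler, tokenizer, normalizer])
data = spark.createDataFrame([["The import paths above are unchanged."]]).toDF("text")
pipeline.fit(data).transform(data).select("normalized.result").show(truncate=False)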
sparknlp/annotator/embeddings/distil_bert_embeddings.py
@@ -0,0 +1,221 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for DistilBertEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class DistilBertEmbeddings(AnnotatorModel,
+                            HasEmbeddingsProperties,
+                            HasCaseSensitiveProperties,
+                            HasStorageRef,
+                            HasBatchedAnnotate,
+                            HasEngine,
+                            HasMaxSentenceLengthLimit):
+     """DistilBERT is a small, fast, cheap and light Transformer model trained by
+     distilling BERT base. It has 40% less parameters than ``bert-base-uncased``,
+     runs 60% faster while preserving over 95% of BERT's performances as measured
+     on the GLUE language understanding benchmark.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = DistilBertEmbeddings.pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings")
+
+
+     The default model is ``"distilbert_base_cased"``, if no name is provided.
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBERT.ipynb>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 768
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         False
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     Notes
+     -----
+     - DistilBERT doesn't have ``token_type_ids``, you don't need to
+       indicate which token belongs to which segment. Just separate your segments
+       with the separation token ``tokenizer.sep_token`` (or ``[SEP]``).
+     - DistilBERT doesn't have options to select the input positions
+       (``position_ids`` input). This could be added if necessary though,
+       just let us know if you need this option.
+
+     References
+     ----------
+     The DistilBERT model was proposed in the paper
+     `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and
+     lighter <https://arxiv.org/abs/1910.01108>`__.
+
+     **Paper Abstract:**
+
+     *As Transfer Learning from large-scale pre-trained models becomes more
+     prevalent in Natural Language Processing (NLP), operating these
+     large models in on-the-edge and/or under constrained computational
+     training or inference budgets remains challenging. In this work, we
+     propose a method to pre-train a smaller general-purpose language
+     representation model, called DistilBERT, which can then be
+     fine-tuned with good performances on a wide range of tasks like its
+     larger counterparts. While most prior work investigated the use of
+     distillation for building task-specific models, we leverage
+     knowledge distillation during the pretraining phase and show that it
+     is possible to reduce the size of a BERT model by 40%, while
+     retaining 97% of its language understanding capabilities and being
+     60% faster. To leverage the inductive biases learned by larger
+     models during pretraining, we introduce a triple loss combining
+     language modeling, distillation and cosine-distance losses. Our
+     smaller, faster and lighter model is cheaper to pre-train and we
+     demonstrate its capabilities for on-device computations in a
+     proof-of-concept experiment and a comparative on-device study.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = DistilBertEmbeddings.pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings") \\
+     ...     .setCaseSensitive(True)
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True) \\
+     ...     .setCleanAnnotations(False)
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...         documentAssembler,
+     ...         tokenizer,
+     ...         embeddings,
+     ...         embeddingsFinisher
+     ...     ])
+     >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.1127224713563919,-0.1982710212469101,0.5360898375511169,-0.272536993026733...|
+     |[0.35534414649009705,0.13215228915214539,0.40981462597846985,0.14036104083061...|
+     |[0.328085333108902,-0.06269335001707077,-0.017595693469047546,-0.024373905733...|
+     |[0.15617232024669647,0.2967822253704071,0.22324979305267334,-0.04568954557180...|
+     |[0.45411425828933716,0.01173491682857275,0.190129816532135,0.1178255230188369...|
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "DistilBertEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings", java_model=None):
+         super(DistilBertEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=768,
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=False
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         DistilBertEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _DistilBertLoader
+         jModel = _DistilBertLoader(folder, spark_session._jsparkSession)._java_obj
+         return DistilBertEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="distilbert_base_cased", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "distilbert_base_cased"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLPs repositories otherwise.
+
+         Returns
+         -------
+         DistilBertEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(DistilBertEmbeddings, name, lang, remote_loc)
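Besides pretrained(), the loadSavedModel helper above takes a folder containing an exported model plus the active SparkSession. A minimal sketch of how it might be wired up; the folder path is a placeholder and is assumed to hold a model exported in the Spark NLP saved-model layout (for example via the Hugging Face notebook linked in the docstring):

import sparknlp
from sparknlp.annotator import DistilBertEmbeddings

spark = sparknlp.start()

# Load from a local export instead of downloading; the path is hypothetical.
embeddings = DistilBertEmbeddings.loadSavedModel("/tmp/exported_distilbert", spark) \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setCaseSensitive(True)

# Once loaded, the annotator behaves like the pretrained one: it can be placed
# in a Pipeline or persisted with the standard Spark ML writer.
embeddings.write().overwrite().save("/tmp/distilbert_spark_nlp")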
sparknlp/annotator/embeddings/doc2vec.py
@@ -0,0 +1,352 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for Doc2Vec."""
+
+ from sparknlp.common import *
+
+
+ class Doc2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperties):
+     """Trains a Word2Vec model that creates vector representations of words in a
+     text corpus.
+
+     The algorithm first constructs a vocabulary from the corpus and then learns
+     vector representation of words in the vocabulary. The vector representation
+     can be used as features in natural language processing and machine learning
+     algorithms.
+
+     We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
+     implementation and a hierarchical softmax method to train the model. The
+     variable names in the implementation match the original C implementation.
+
+     For instantiated/pretrained models, see :class:`.Doc2VecModel`.
+
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models>`__.
+
+     ====================== =======================
+     Input Annotation types Output Annotation type
+     ====================== =======================
+     ``TOKEN``              ``SENTENCE_EMBEDDINGS``
+     ====================== =======================
+
+     Parameters
+     ----------
+     vectorSize
+         The dimension of codes after transforming from words (> 0), by default
+         100
+     windowSize
+         The window size (context words from [-window, window]) (> 0), by default
+         5
+     numPartitions
+         Number of partitions for sentences of words (> 0), by default 1
+     minCount
+         The minimum number of times a token must appear to be included in the
+         word2vec model's vocabulary (>= 0), by default 1
+     maxSentenceLength
+         The window size (Maximum length (in words) of each sentence in the input
+         data. Any sentence longer than this threshold will be divided into
+         chunks up to the size (> 0), by default 1000
+     stepSize
+         Step size (learning rate) to be used for each iteration of optimization
+         (> 0), by default 0.025
+     maxIter
+         Maximum number of iterations (>= 0), by default 1
+     seed
+         Random seed, by default 44
+
+
+     References
+     ----------
+     For the original C implementation, see https://code.google.com/p/word2vec/
+
+     For the research paper, see `Efficient Estimation of Word Representations in
+     Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
+     Representations of Words and Phrases and their Compositionality
+     <https://arxiv.org/pdf/1310.4546v1.pdf>`__.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = Doc2VecApproach() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...         documentAssembler,
+     ...         tokenizer,
+     ...         embeddings
+     ...     ])
+     >>> path = "sherlockholmes.txt"
+     >>> dataset = spark.read.text(path).toDF("text")
+     >>> pipelineModel = pipeline.fit(dataset)
+     """
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+     vectorSize = Param(Params._dummy(),
+                        "vectorSize",
+                        "the dimension of codes after transforming from words (> 0)",
+                        typeConverter=TypeConverters.toInt)
+
+     windowSize = Param(Params._dummy(),
+                        "windowSize",
+                        "the window size (context words from [-window, window]) (> 0)",
+                        typeConverter=TypeConverters.toInt)
+
+     numPartitions = Param(Params._dummy(),
+                           "numPartitions",
+                           "number of partitions for sentences of words (> 0)",
+                           typeConverter=TypeConverters.toInt)
+
+     minCount = Param(Params._dummy(),
+                      "minCount",
+                      "the minimum number of times a token must " +
+                      "appear to be included in the word2vec model's vocabulary (>= 0)",
+                      typeConverter=TypeConverters.toInt)
+
+     maxSentenceLength = Param(Params._dummy(),
+                               "maxSentenceLength",
+                               "the window size (Maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will " +
+                               "be divided into chunks up to the size (> 0)",
+                               typeConverter=TypeConverters.toInt)
+
+     stepSize = Param(Params._dummy(),
+                      "stepSize",
+                      "Step size (learning rate) to be used for each iteration of optimization (> 0)",
+                      typeConverter=TypeConverters.toFloat)
+
+     maxIter = Param(Params._dummy(),
+                     "maxIter",
+                     "maximum number of iterations (>= 0)",
+                     typeConverter=TypeConverters.toInt)
+
+     seed = Param(Params._dummy(),
+                  "seed",
+                  "Random seed",
+                  typeConverter=TypeConverters.toInt)
+
+     def setVectorSize(self, vectorSize):
+         """
+         Sets vector size (default: 100).
+         """
+         return self._set(vectorSize=vectorSize)
+
+     def setWindowSize(self, windowSize):
+         """
+         Sets window size (default: 5).
+         """
+         return self._set(windowSize=windowSize)
+
+     def setStepSize(self, stepSize):
+         """
+         Sets initial learning rate (default: 0.025).
+         """
+         return self._set(stepSize=stepSize)
+
+     def setNumPartitions(self, numPartitions):
+         """
+         Sets number of partitions (default: 1). Use a small number for
+         accuracy.
+         """
+         return self._set(numPartitions=numPartitions)
+
+     def setMaxIter(self, numIterations):
+         """
+         Sets number of iterations (default: 1), which should be smaller
+         than or equal to number of partitions.
+         """
+         return self._set(maxIter=numIterations)
+
+     def setSeed(self, seed):
+         """
+         Sets random seed.
+         """
+         return self._set(seed=seed)
+
+     def setMinCount(self, minCount):
+         """
+         Sets minCount, the minimum number of times a token must appear
+         to be included in the word2vec model's vocabulary (default: 5).
+         """
+         return self._set(minCount=minCount)
+
+     def setMaxSentenceLength(self, maxSentenceLength):
+         """
+         Maximum length (in words) of each sentence in the input data.
+         Any sentence longer than this threshold will be divided into
+         chunks up to the size (> 0)
+         """
+         return self._set(maxSentenceLength=maxSentenceLength)
+
+     @keyword_only
+     def __init__(self):
+         super(Doc2VecApproach, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.Doc2VecApproach")
+         self._setDefault(
+             vectorSize=100,
+             windowSize=5,
+             numPartitions=1,
+             minCount=1,
+             maxSentenceLength=1000,
+             stepSize=0.025,
+             maxIter=1,
+             seed=44
+         )
+
+     def _create_model(self, java_model):
+         return Doc2VecModel(java_model=java_model)
+
+
+ class Doc2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
+     """Word2Vec model that creates vector representations of words in a text
+     corpus.
+
+     The algorithm first constructs a vocabulary from the corpus and then learns
+     vector representation of words in the vocabulary. The vector representation
+     can be used as features in natural language processing and machine learning
+     algorithms.
+
+     We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
+     implementation and a hierarchical softmax method to train the model. The
+     variable names in the implementation match the original C implementation.
+
+     This is the instantiated model of the :class:`.Doc2VecApproach`. For
+     training your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = Doc2VecModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("embeddings")
+
+     The default model is `"doc2vec_gigaword_300"`, if no name is provided.
+
+     ====================== =======================
+     Input Annotation types Output Annotation type
+     ====================== =======================
+     ``TOKEN``              ``SENTENCE_EMBEDDINGS``
+     ====================== =======================
+
+     Parameters
+     ----------
+     vectorSize
+         The dimension of codes after transforming from words (> 0) , by default
+         100
+
+     References
+     ----------
+     For the original C implementation, see https://code.google.com/p/word2vec/
+
+     For the research paper, see `Efficient Estimation of Word Representations in
+     Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
+     Representations of Words and Phrases and their Compositionality
+     <https://arxiv.org/pdf/1310.4546v1.pdf>`__.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = Doc2VecModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...|
+     +--------------------------------------------------------------------------------+
+     """
+     name = "Doc2VecModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+     vectorSize = Param(Params._dummy(),
+                        "vectorSize",
+                        "the dimension of codes after transforming from words (> 0)",
+                        typeConverter=TypeConverters.toInt)
+
+     def setVectorSize(self, vectorSize):
+         """
+         Sets vector size (default: 100).
+         """
+         return self._set(vectorSize=vectorSize)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.Doc2VecModel", java_model=None):
+         super(Doc2VecModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             vectorSize=100
+         )
+
+     @staticmethod
+     def pretrained(name="doc2vec_gigaword_300", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "doc2vec_wiki"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLPs repositories otherwise.
+
+         Returns
+         -------
+         Doc2VecModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(Doc2VecModel, name, lang, remote_loc)
+
+     def getVectors(self):
+         """
+         Returns the vector representation of the words as a dataframe
+         with two fields, word and vector.
+         """
+         return self._call_java("getVectors")
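getVectors has no usage example in the docstring above. A short sketch, assuming the pipelineModel fitted in the Doc2VecApproach example (stages[2] is the fitted Doc2VecModel because the pipeline stages were documentAssembler, tokenizer, embeddings):

# Reach the fitted Doc2VecModel inside the PipelineModel and inspect its vocabulary.
doc2vec_model = pipelineModel.stages[2]

vectors = doc2vec_model.getVectors()  # DataFrame with columns: word, vector
vectors.orderBy("word").show(5, truncate=80)

# The fitted stage can also be saved on its own and reloaded later.
doc2vec_model.write().overwrite().save("/tmp/doc2vec_sherlock")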