spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/classifier_dl/bert_for_token_classification.py
@@ -0,0 +1,177 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for BertForTokenClassification."""
+
+ from sparknlp.common import *
+
+
+ class BertForTokenClassification(AnnotatorModel,
+                                  HasCaseSensitiveProperties,
+                                  HasBatchedAnnotate,
+                                  HasEngine,
+                                  HasMaxSentenceLengthLimit):
+     """BertForTokenClassification can load Bert Models with a token
+     classification head on top (a linear layer on top of the hidden-states
+     output), e.g. for Named-Entity-Recognition (NER) tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> tokenClassifier = BertForTokenClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+
+     The default model is ``"bert_base_token_classifier_conll03"``, if no name is
+     provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``NAMED_ENTITY``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Larger values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> tokenClassifier = BertForTokenClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label") \\
+     ...     .setCaseSensitive(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     tokenClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["John Lennon was born in London and lived in Paris. My name is Sarah and I live in London"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("label.result").show(truncate=False)
+     +------------------------------------------------------------------------------------+
+     |result                                                                              |
+     +------------------------------------------------------------------------------------+
+     |[B-PER, I-PER, O, O, O, B-LOC, O, O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O, B-LOC]|
+     +------------------------------------------------------------------------------------+
+     """
+     name = "BertForTokenClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType: AnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.BertForTokenClassification",
+                  java_model=None):
+         super(BertForTokenClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         BertForTokenClassification
+             The restored model
+         """
+         from sparknlp.internal import _BertTokenClassifierLoader
+         jModel = _BertTokenClassifierLoader(folder, spark_session._jsparkSession)._java_obj
+         return BertForTokenClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="bert_base_token_classifier_conll03", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "bert_base_token_classifier_conll03"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         BertForTokenClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(BertForTokenClassification, name, lang, remote_loc)
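The `loadSavedModel` / `pretrained` pair above is the standard import path shared by the transformer annotators added in this release. A minimal usage sketch combining it with the inherited Spark ML persistence (the folder paths are placeholders, not taken from this diff):

import sparknlp
from sparknlp.annotator import BertForTokenClassification

spark = sparknlp.start()

# Import a locally exported model once ("/tmp/exported_bert_ner" is hypothetical),
# configuring it like any other annotator.
tokenClassifier = BertForTokenClassification.loadSavedModel("/tmp/exported_bert_ner", spark) \
    .setInputCols(["token", "document"]) \
    .setOutputCol("label") \
    .setCaseSensitive(True) \
    .setMaxSentenceLength(128)

# Persist the imported annotator; later runs can skip the import step and
# restore it directly with the inherited Spark ML loader.
tokenClassifier.write().overwrite().save("/tmp/bert_ner_spark_nlp")
reloaded = BertForTokenClassification.load("/tmp/bert_ner_spark_nlp")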
sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py
@@ -0,0 +1,212 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for BertForZeroShotClassification."""
+
+ from sparknlp.common import *
+
+
+ class BertForZeroShotClassification(AnnotatorModel,
+                                     HasCaseSensitiveProperties,
+                                     HasBatchedAnnotate,
+                                     HasClassifierActivationProperties,
+                                     HasCandidateLabelsProperties,
+                                     HasEngine,
+                                     HasMaxSentenceLengthLimit):
+     """BertForZeroShotClassification uses a `ModelForSequenceClassification` trained on NLI (natural language
+     inference) tasks. It is the equivalent of `BertForSequenceClassification`, except that these models don't
+     require a hardcoded number of potential classes; the candidate labels can be chosen at runtime. This usually
+     makes them slower, but much more flexible.
+
+     Note that the model will loop through all provided labels, so the more labels you have, the
+     longer this process will take.
+
+     Any combination of sequences and labels can be passed, and each combination will be posed as a
+     premise/hypothesis pair and fed to the pretrained model.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> sequenceClassifier = BertForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+
+     The default model is ``"bert_zero_shot_classifier_mnli"``, if no name is
+     provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Text+Classification>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CATEGORY``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Larger values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     coalesceSentences
+         Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+         class per document by averaging probabilities in all sentences, by
+         default False
+     activation
+         Whether to calculate logits via Softmax or Sigmoid, by default
+         `"softmax"`.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> sequenceClassifier = BertForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label") \\
+     ...     .setCaseSensitive(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     sequenceClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["I loved this movie when I was a child."], ["It was pretty boring."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("label.result").show(truncate=False)
+     +------+
+     |result|
+     +------+
+     |[pos] |
+     |[neg] |
+     +------+
+     """
+     name = "BertForZeroShotClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CATEGORY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setCoalesceSentences(self, value):
+         """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging
+         probabilities in all sentences. Due to the max sequence length limit in almost all transformer models such as
+         BERT (512 tokens), this parameter helps feed all the sentences into the model and average all the
+         probabilities for the entire document instead of per sentence. (Default: false)
+
+         Parameters
+         ----------
+         value : bool
+             Whether the outputs of all sentences will be averaged to a single output
+         """
+         return self._set(coalesceSentences=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.BertForZeroShotClassification",
+                  java_model=None):
+         super(BertForZeroShotClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True,
+             coalesceSentences=False,
+             activation="softmax",
+             multilabel=False
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         BertForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.internal import _BertZeroShotClassifierLoader
+         jModel = _BertZeroShotClassifierLoader(folder, spark_session._jsparkSession)._java_obj
+         return BertForZeroShotClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="bert_zero_shot_classifier_mnli", lang="xx", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "bert_zero_shot_classifier_mnli"
+         lang : str, optional
+             Language of the pretrained model, by default "xx"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         BertForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(BertForZeroShotClassification, name, lang, remote_loc)
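Note that the docstring example above never supplies the labels that make this annotator zero-shot. A minimal sketch of the runtime-label workflow, assuming `setCandidateLabels` from the `HasCandidateLabelsProperties` base listed in the class signature (the label strings are illustrative):

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, BertForZeroShotClassification
from pyspark.ml import Pipeline

spark = sparknlp.start()

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# Candidate labels are chosen at runtime; no retraining, no hardcoded class count.
zeroShot = BertForZeroShotClassification.pretrained() \
    .setInputCols(["token", "document"]) \
    .setOutputCol("label") \
    .setCandidateLabels(["sports", "politics", "technology"])

pipeline = Pipeline().setStages([documentAssembler, tokenizer, zeroShot])
data = spark.createDataFrame([["The match ended in a dramatic penalty shootout."]]).toDF("text")
pipeline.fit(data).transform(data).select("label.result").show(truncate=False)

Each additional candidate label adds one premise/hypothesis pass per sequence, which is the slowdown the docstring warns about.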
sparknlp/annotator/classifier_dl/camembert_for_question_answering.py
@@ -0,0 +1,168 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from sparknlp.common import *
+
+
+ class CamemBertForQuestionAnswering(AnnotatorModel,
+                                     HasCaseSensitiveProperties,
+                                     HasBatchedAnnotate,
+                                     HasEngine,
+                                     HasMaxSentenceLengthLimit):
+     """CamemBertForQuestionAnswering can load CamemBERT models with a span classification head on top for extractive
+     question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
+     logits and span end logits).
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> spanClassifier = CamemBertForQuestionAnswering.pretrained() \\
+     ...     .setInputCols(["document_question", "document_context"]) \\
+     ...     .setOutputCol("answer")
+
+     The default model is ``"camembert_base_qa_fquad"``, if no name is
+     provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Question+Answering>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, DOCUMENT`` ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Larger values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         False
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = MultiDocumentAssembler() \\
+     ...     .setInputCols(["question", "context"]) \\
+     ...     .setOutputCols(["document_question", "document_context"])
+     >>> spanClassifier = CamemBertForQuestionAnswering.pretrained() \\
+     ...     .setInputCols(["document_question", "document_context"]) \\
+     ...     .setOutputCol("answer") \\
+     ...     .setCaseSensitive(False)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     spanClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("answer.result").show(truncate=False)
+     +--------------------+
+     |result              |
+     +--------------------+
+     |[Clara]             |
+     +--------------------+
+     """
+     name = "CamemBertForQuestionAnswering"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.CamemBertForQuestionAnswering",
+                  java_model=None):
+         super(CamemBertForQuestionAnswering, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=False
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         CamemBertForQuestionAnswering
+             The restored model
+         """
+         from sparknlp.internal import _CamemBertQuestionAnsweringLoader
+         jModel = _CamemBertQuestionAnsweringLoader(folder, spark_session._jsparkSession)._java_obj
+         return CamemBertForQuestionAnswering(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="camembert_base_qa_fquad", lang="fr", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "camembert_base_qa_fquad"
+         lang : str, optional
+             Language of the pretrained model, by default "fr"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         CamemBertForQuestionAnswering
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(CamemBertForQuestionAnswering, name, lang, remote_loc)
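For single-row question answering, the fitted pipeline from the docstring can also be wrapped in a `LightPipeline` to avoid DataFrame overhead. A sketch under the assumption that `fullAnnotate` accepts the question and the context as two positional strings when a two-input `MultiDocumentAssembler` is used:

import sparknlp
from sparknlp.base import MultiDocumentAssembler, LightPipeline
from sparknlp.annotator import CamemBertForQuestionAnswering
from pyspark.ml import Pipeline

spark = sparknlp.start()

documentAssembler = MultiDocumentAssembler() \
    .setInputCols(["question", "context"]) \
    .setOutputCols(["document_question", "document_context"])

spanClassifier = CamemBertForQuestionAnswering.pretrained() \
    .setInputCols(["document_question", "document_context"]) \
    .setOutputCol("answer")

pipeline = Pipeline().setStages([documentAssembler, spanClassifier])

# Fit on an empty two-column frame; nothing in these stages is trainable.
empty = spark.createDataFrame([["", ""]]).toDF("question", "context")
light = LightPipeline(pipeline.fit(empty))

result = light.fullAnnotate("What's my name?", "My name is Clara and I live in Berkeley.")
print(result[0]["answer"][0].result)  # expected: "Clara"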