spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
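For orientation, this is a jump of several major versions. Which version is actually installed can be verified at runtime; a minimal sketch using the package's own `version()` helper (the printed value assumes the newer wheel):

>>> import sparknlp
>>> sparknlp.version()
'6.2.1'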
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py
@@ -0,0 +1,205 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for CamemBertForSequenceClassification."""
+
+ from sparknlp.common import *
+
+
+ class CamemBertForSequenceClassification(AnnotatorModel,
+                                          HasCaseSensitiveProperties,
+                                          HasBatchedAnnotate,
+                                          HasClassifierActivationProperties,
+                                          HasEngine,
+                                          HasMaxSentenceLengthLimit):
+     """CamemBertForSequenceClassification can load CamemBERT models with a sequence
+     classification/regression head on top (a linear layer on top of the pooled output),
+     e.g. for multi-class document classification tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> sequence_classifier = CamemBertForSequenceClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+
+     The default model is ``"camembert_base_sequence_classifier_allocine"``, if no
+     name is provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Text+Classification>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CATEGORY``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     coalesceSentences
+         Instead of 1 class per sentence (if inputCols is `sentence`) output
+         1 class per document by averaging probabilities in all sentences, by
+         default False.
+     activation
+         Whether to calculate logits via Softmax or Sigmoid, by default
+         `"softmax"`.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> sequenceClassifier = CamemBertForSequenceClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label") \\
+     ...     .setCaseSensitive(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     sequenceClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["j'ai adoré ce film lorsque j'étais enfant."], ["Je déteste ça."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("label.result").show(truncate=False)
+     +------+
+     |result|
+     +------+
+     |[pos] |
+     |[neg] |
+     +------+
+     """
+     name = "CamemBertForSequenceClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CATEGORY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is 'sentence') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setCoalesceSentences(self, value):
+         """Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+         class per document by averaging probabilities in all sentences, by default False.
+
+         Due to the max sequence length limit in almost all transformer models such as BERT
+         (512 tokens), this parameter helps feed all the sentences into the model and
+         average all the probabilities for the entire document instead of
+         per-sentence probabilities.
+
+         Parameters
+         ----------
+         value : bool
+             Whether the outputs of all sentences will be averaged into one output
+         """
+         return self._set(coalesceSentences=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.CamemBertForSequenceClassification",
+                  java_model=None):
+         super(CamemBertForSequenceClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True,
+             coalesceSentences=False,
+             activation="softmax"
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         CamemBertForSequenceClassification
+             The restored model
+         """
+         from sparknlp.internal import _CamemBertForSequenceClassificationLoader
+         jModel = _CamemBertForSequenceClassificationLoader(folder, spark_session._jsparkSession)._java_obj
+         return CamemBertForSequenceClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="camembert_base_sequence_classifier_allocine", lang="fr", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "camembert_base_sequence_classifier_allocine"
+         lang : str, optional
+             Language of the pretrained model, by default "fr"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         CamemBertForSequenceClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(CamemBertForSequenceClassification, name, lang, remote_loc)
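Reviewer note: the `loadSavedModel` / `pretrained` pattern shown in this file is shared by all three CamemBERT classifiers in this diff. A minimal sketch of the usual import round trip, assuming an already exported model directory (the `/tmp` paths and the active `spark` session are illustrative):

>>> from sparknlp.annotator import CamemBertForSequenceClassification
>>> sequenceClassifier = CamemBertForSequenceClassification.loadSavedModel("/tmp/exported_camembert", spark)
>>> sequenceClassifier.write().overwrite().save("/tmp/camembert_seq_cls_spark_nlp")
>>> restored = CamemBertForSequenceClassification.load("/tmp/camembert_seq_cls_spark_nlp")
>>> restored = restored.setInputCols(["token", "document"]).setOutputCol("label")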
sparknlp/annotator/classifier_dl/camembert_for_token_classification.py
@@ -0,0 +1,173 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for CamemBertForTokenClassification."""
+ from sparknlp.common import *
+
+
+ class CamemBertForTokenClassification(AnnotatorModel,
+                                       HasCaseSensitiveProperties,
+                                       HasBatchedAnnotate,
+                                       HasEngine,
+                                       HasMaxSentenceLengthLimit):
+     """CamemBertForTokenClassification can load CamemBERT models with a token
+     classification head on top (a linear layer on top of the hidden-states
+     output), e.g. for Named-Entity-Recognition (NER) tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> token_classifier = CamemBertForTokenClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+
+     The default model is ``"camembert_base_token_classifier_wikiner"``, if no
+     name is provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``NAMED_ENTITY``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> tokenClassifier = CamemBertForTokenClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label") \\
+     ...     .setCaseSensitive(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     tokenClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["george washington est allé à washington"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("label.result").show(truncate=False)
+     +------------------------------+
+     |result                        |
+     +------------------------------+
+     |[I-PER, I-PER, O, O, O, I-LOC]|
+     +------------------------------+
+     """
+     name = "CamemBertForTokenClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.CamemBertForTokenClassification",
+                  java_model=None):
+         super(CamemBertForTokenClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         CamemBertForTokenClassification
+             The restored model
+         """
+         from sparknlp.internal import _CamemBertForTokenClassificationLoader
+         jModel = _CamemBertForTokenClassificationLoader(folder, spark_session._jsparkSession)._java_obj
+         return CamemBertForTokenClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="camembert_base_token_classifier_wikiner", lang="fr", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "camembert_base_token_classifier_wikiner"
+         lang : str, optional
+             Language of the pretrained model, by default "fr"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         CamemBertForTokenClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(CamemBertForTokenClassification, name, lang, remote_loc)
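Reviewer note: for quick, driver-local inference on small inputs, the fitted pipeline from the example above can be wrapped in a LightPipeline instead of running a full DataFrame transform; a short sketch assuming the `pipeline` and `data` objects from the docstring example (the expected labels mirror the docstring output):

>>> from sparknlp.base import LightPipeline
>>> light = LightPipeline(pipeline.fit(data))
>>> light.annotate("george washington est allé à washington")["label"]
['I-PER', 'I-PER', 'O', 'O', 'O', 'I-LOC']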
sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py
@@ -0,0 +1,202 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for CamemBertForZeroShotClassification."""
+
+ from sparknlp.common import *
+
+
+ class CamemBertForZeroShotClassification(AnnotatorModel,
+                                          HasCaseSensitiveProperties,
+                                          HasBatchedAnnotate,
+                                          HasClassifierActivationProperties,
+                                          HasCandidateLabelsProperties,
+                                          HasEngine,
+                                          HasMaxSentenceLengthLimit):
+     """CamemBertForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language
+     inference) tasks. Equivalent of `CamemBertForSequenceClassification` models, but these models don't require a
+     hardcoded number of potential classes; the classes can be chosen at runtime. This usually makes them slower, but
+     much more flexible.
+     Any combination of sequences and labels can be passed, and each combination will be posed as a premise/hypothesis
+     pair and passed to the pretrained model.
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+     >>> sequenceClassifier = CamemBertForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+     The default model is ``"camembert_zero_shot_classifier_xnli_onnx"``, if no name is
+     provided.
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Text+Classification>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CATEGORY``
+     ====================== ======================
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     coalesceSentences
+         Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+         class per document by averaging probabilities in all sentences, by
+         default False.
+     activation
+         Whether to calculate logits via Softmax or Sigmoid, by default
+         `"softmax"`.
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> sequenceClassifier = CamemBertForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("multi_class") \\
+     ...     .setCaseSensitive(True) \\
+     ...     .setCandidateLabels(["sport", "politique", "science"])
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     sequenceClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["L'équipe de France joue aujourd'hui au Parc des Princes"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("multi_class.result").show(truncate=False)
+     +-------+
+     |result |
+     +-------+
+     |[sport]|
+     +-------+
+     """
+     name = "CamemBertForZeroShotClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CATEGORY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is 'sentence') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setCoalesceSentences(self, value):
+         """Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+         class per document by averaging probabilities in all sentences, by default False.
+
+         Due to the max sequence length limit in almost all transformer models such as BERT
+         (512 tokens), this parameter helps feed all the sentences into the model and
+         average all the probabilities for the entire document instead of
+         per-sentence probabilities.
+
+         Parameters
+         ----------
+         value : bool
+             Whether the outputs of all sentences will be averaged into one output
+         """
+         return self._set(coalesceSentences=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.CamemBertForZeroShotClassification",
+                  java_model=None):
+         super(CamemBertForZeroShotClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True,
+             coalesceSentences=False,
+             activation="softmax"
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         CamemBertForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.internal import _CamemBertForZeroShotClassificationLoader
+         jModel = _CamemBertForZeroShotClassificationLoader(folder, spark_session._jsparkSession)._java_obj
+         return CamemBertForZeroShotClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="camembert_zero_shot_classifier_xnli_onnx", lang="fr", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "camembert_zero_shot_classifier_xnli_onnx"
+         lang : str, optional
+             Language of the pretrained model, by default "fr"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         CamemBertForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(CamemBertForZeroShotClassification, name, lang, remote_loc)
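Reviewer note: the runtime flexibility described in the zero-shot docstring comes from `candidateLabels` being an ordinary Param, so the same downloaded model can be reused with a different label set without reloading; a brief sketch reusing the `sequenceClassifier`, `pipeline`, and `data` objects from the example above (the new label set is illustrative):

>>> sequenceClassifier = sequenceClassifier.setCandidateLabels(["cinéma", "musique", "football"])
>>> pipeline.fit(data).transform(data).select("multi_class.result").show(truncate=False)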