spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,176 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for XlnetForTokenClassification."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
class XlnetForTokenClassification(AnnotatorModel,
                                  HasCaseSensitiveProperties,
                                  HasBatchedAnnotate,
                                  HasEngine,
                                  HasMaxSentenceLengthLimit):
    """Loads XLNet models that carry a token-classification head (a linear
    layer on top of the hidden-states output), e.g. for Named-Entity
    Recognition (NER) tasks.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> token_classifier = XlnetForTokenClassification.pretrained() \\
    ...     .setInputCols(["token", "document"]) \\
    ...     .setOutputCol("label")

    The default model is ``"xlnet_base_token_classifier_conll03"``, if no name
    is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.

    To see which models are compatible and how to import them see
    `Import Transformers into Spark NLP 🚀
    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, TOKEN``    ``NAMED_ENTITY``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Batch size. Large values allows faster processing but requires more
        memory, by default 8
    caseSensitive
        Whether to ignore case in tokens for embeddings matching, by default
        True
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    maxSentenceLength
        Max sentence length to process, by default 128

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> tokenClassifier = XlnetForTokenClassification.pretrained() \\
    ...     .setInputCols(["token", "document"]) \\
    ...     .setOutputCol("label") \\
    ...     .setCaseSensitive(True)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     tokenClassifier
    ... ])
    >>> data = spark.createDataFrame([["John Lenon was born in London and lived in Paris. My name is Sarah and I live in London"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("label.result").show(truncate=False)
    +------------------------------------------------------------------------------------+
    |result                                                                              |
    +------------------------------------------------------------------------------------+
    |[B-PER, I-PER, O, O, O, B-LOC, O, O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O, B-LOC]|
    +------------------------------------------------------------------------------------+
    """

    name = "XlnetForTokenClassification"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.NAMED_ENTITY

    configProtoBytes = Param(
        Params._dummy(),
        "configProtoBytes",
        "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
        TypeConverters.toListInt)

    def getClasses(self):
        """Return the labels used to train this model."""
        # Delegates to the wrapped JVM annotator.
        return self._call_java("getClasses")

    def setConfigProtoBytes(self, b):
        """Set the tensorflow ConfigProto, serialized into a byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    @keyword_only
    def __init__(self,
                 classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlnetForTokenClassification",
                 java_model=None):
        super(XlnetForTokenClassification, self).__init__(
            classname=classname,
            java_model=java_model
        )
        # Defaults mirror the documented parameter defaults above.
        self._setDefault(batchSize=8, maxSentenceLength=128, caseSensitive=True)

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Load a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        XlnetForTokenClassification
            The restored model
        """
        from sparknlp.internal import _XlnetTokenClassifierLoader
        loader = _XlnetTokenClassifierLoader(folder, spark_session._jsparkSession)
        return XlnetForTokenClassification(java_model=loader._java_obj)

    @staticmethod
    def pretrained(name="xlnet_base_token_classifier_conll03", lang="en", remote_loc=None):
        """Download and load a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default
            "xlnet_base_token_classifier_conll03"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        XlnetForTokenClassification
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(XlnetForTokenClassification, name, lang, remote_loc)
@@ -0,0 +1,15 @@
1
+ # Copyright 2017-2025 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from sparknlp.annotator.cleaners.extractor import *
15
+ from sparknlp.annotator.cleaners.cleaner import *
@@ -0,0 +1,202 @@
1
+ # Copyright 2017-2025 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for Cleaner."""
15
+ from sparknlp.annotator import MarianTransformer
16
+ from sparknlp.common import *
17
+
18
class Cleaner(MarianTransformer):
    """Annotator exposing configurable text-cleaning operations.

    The cleaning operation applied is selected via ``cleanerMode``; the
    remaining parameters tune individual operations (prefix/postfix patterns,
    whitespace, bullets, dashes, casing, etc.).
    """

    name = "Cleaner"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.CHUNK

    # --- string parameters -------------------------------------------------

    encoding = Param(
        Params._dummy(), "encoding",
        "The encoding to be used for decoding the byte string (default is utf-8)",
        typeConverter=TypeConverters.toString)

    cleanPrefixPattern = Param(
        Params._dummy(), "cleanPrefixPattern",
        "The pattern for the prefix. Can be a simple string or a regex pattern.",
        typeConverter=TypeConverters.toString)

    cleanPostfixPattern = Param(
        Params._dummy(), "cleanPostfixPattern",
        "The pattern for the postfix. Can be a simple string or a regex pattern.",
        typeConverter=TypeConverters.toString)

    cleanerMode = Param(
        Params._dummy(), "cleanerMode",
        "possible values: clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets, clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes",
        typeConverter=TypeConverters.toString)

    # --- boolean parameters ------------------------------------------------

    extraWhitespace = Param(
        Params._dummy(), "extraWhitespace",
        "Whether to remove extra whitespace.",
        typeConverter=TypeConverters.toBoolean)

    dashes = Param(
        Params._dummy(), "dashes",
        "Whether to handle dashes in text.",
        typeConverter=TypeConverters.toBoolean)

    bullets = Param(
        Params._dummy(), "bullets",
        "Whether to handle bullets in text.",
        typeConverter=TypeConverters.toBoolean)

    trailingPunctuation = Param(
        Params._dummy(), "trailingPunctuation",
        "Whether to remove trailing punctuation from text.",
        typeConverter=TypeConverters.toBoolean)

    lowercase = Param(
        Params._dummy(), "lowercase",
        "Whether to convert text to lowercase.",
        typeConverter=TypeConverters.toBoolean)

    ignoreCase = Param(
        Params._dummy(), "ignoreCase",
        "If true, ignores case in the pattern.",
        typeConverter=TypeConverters.toBoolean)

    strip = Param(
        Params._dummy(), "strip",
        "If true, removes leading or trailing whitespace from the cleaned string.",
        typeConverter=TypeConverters.toBoolean)

    # --- setters -----------------------------------------------------------

    def setEncoding(self, value):
        """Set the encoding used for decoding the byte string (utf-8 by default).

        Parameters
        ----------
        value : str
            The encoding to be used for decoding the byte string
        """
        return self._set(encoding=value)

    def setCleanPrefixPattern(self, value):
        """Set the prefix pattern; a plain string or a regex pattern.

        Parameters
        ----------
        value : str
            The pattern for the prefix
        """
        return self._set(cleanPrefixPattern=value)

    def setCleanPostfixPattern(self, value):
        """Set the postfix pattern; a plain string or a regex pattern.

        Parameters
        ----------
        value : str
            The pattern for the postfix
        """
        return self._set(cleanPostfixPattern=value)

    def setCleanerMode(self, value):
        """Select the cleaning operation to perform.

        Possible values: clean, bytes_string_to_string,
        clean_non_ascii_chars, clean_ordered_bullets, clean_postfix,
        clean_prefix, remove_punctuation, replace_unicode_quotes.

        Parameters
        ----------
        value : str
            The mode for cleaning operations
        """
        return self._set(cleanerMode=value)

    def setExtraWhitespace(self, value):
        """Set whether extra whitespace is removed.

        Parameters
        ----------
        value : bool
            Whether to remove extra whitespace
        """
        return self._set(extraWhitespace=value)

    def setDashes(self, value):
        """Set whether dashes are handled in text.

        Parameters
        ----------
        value : bool
            Whether to handle dashes in text
        """
        return self._set(dashes=value)

    def setBullets(self, value):
        """Set whether bullets are handled in text.

        Parameters
        ----------
        value : bool
            Whether to handle bullets in text
        """
        return self._set(bullets=value)

    def setTrailingPunctuation(self, value):
        """Set whether trailing punctuation is removed from text.

        Parameters
        ----------
        value : bool
            Whether to remove trailing punctuation from text
        """
        return self._set(trailingPunctuation=value)

    def setLowercase(self, value):
        """Set whether text is converted to lowercase.

        Parameters
        ----------
        value : bool
            Whether to convert text to lowercase
        """
        return self._set(lowercase=value)

    def setIgnoreCase(self, value):
        """Set whether case is ignored in the pattern.

        Parameters
        ----------
        value : bool
            If true, ignores case in the pattern
        """
        return self._set(ignoreCase=value)

    def setStrip(self, value):
        """Set whether leading/trailing whitespace is stripped from the result.

        Parameters
        ----------
        value : bool
            If true, removes leading or trailing whitespace from the cleaned
            string
        """
        return self._set(strip=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Cleaner", java_model=None):
        super(Cleaner, self).__init__(
            classname=classname,
            java_model=java_model
        )
@@ -0,0 +1,191 @@
1
+ # Copyright 2017-2025 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for Extractor."""
15
+ from sparknlp.common import *
16
+
17
class Extractor(AnnotatorModel):
    """Extracts chunks from ``DOCUMENT`` annotations using configurable
    regex patterns for entities such as email timestamps, email addresses,
    IP addresses, MAPI IDs, US phone numbers and image URLs.

    The extraction behavior is selected with :meth:`setExtractorMode`;
    the pattern parameters below customize what each mode matches.

    Input Annotator Types: ``DOCUMENT``

    Output Annotator Type: ``CHUNK``
    """

    name = "Extractor"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    emailDateTimeTzPattern = Param(
        Params._dummy(),
        "emailDateTimeTzPattern",
        "Specifies the date-time pattern for email timestamps, including time zone formatting.",
        typeConverter=TypeConverters.toString
    )

    emailAddress = Param(
        Params._dummy(),
        "emailAddress",
        "Specifies the pattern for email addresses.",
        typeConverter=TypeConverters.toString
    )

    ipAddressPattern = Param(
        Params._dummy(),
        "ipAddressPattern",
        "Specifies the pattern for IP addresses.",
        typeConverter=TypeConverters.toString
    )

    ipAddressNamePattern = Param(
        Params._dummy(),
        "ipAddressNamePattern",
        "Specifies the pattern for IP addresses with names.",
        typeConverter=TypeConverters.toString
    )

    mapiIdPattern = Param(
        Params._dummy(),
        "mapiIdPattern",
        "Specifies the pattern for MAPI IDs.",
        typeConverter=TypeConverters.toString
    )

    usPhoneNumbersPattern = Param(
        Params._dummy(),
        "usPhoneNumbersPattern",
        "Specifies the pattern for US phone numbers.",
        typeConverter=TypeConverters.toString
    )

    imageUrlPattern = Param(
        Params._dummy(),
        "imageUrlPattern",
        "Specifies the pattern for image URLs.",
        typeConverter=TypeConverters.toString
    )

    textPattern = Param(
        Params._dummy(),
        "textPattern",
        "Specifies the pattern for text after and before.",
        typeConverter=TypeConverters.toString
    )

    extractorMode = Param(
        Params._dummy(),
        "extractorMode",
        "possible values: " +
        "email_date, email_address, ip_address, ip_address_name, mapi_id, us_phone_numbers, image_urls, bullets, text_after, text_before",
        typeConverter=TypeConverters.toString
    )

    index = Param(
        Params._dummy(),
        "index",
        "Specifies the index of the pattern to extract in text after or before",
        typeConverter=TypeConverters.toInt
    )

    def setEmailDateTimeTzPattern(self, value):
        """Sets the date-time pattern for email timestamps, including time zone formatting.

        Parameters
        ----------
        value : str
            Specifies the date-time pattern for email timestamps, including time zone formatting.
        """
        return self._set(emailDateTimeTzPattern=value)

    def setEmailAddress(self, value):
        """Sets the pattern for email addresses.

        Parameters
        ----------
        value : str
            Specifies the pattern for email addresses.
        """
        return self._set(emailAddress=value)

    def setIpAddressPattern(self, value):
        """Sets the pattern for IP addresses.

        Parameters
        ----------
        value : str
            Specifies the pattern for IP addresses.
        """
        return self._set(ipAddressPattern=value)

    def setIpAddressNamePattern(self, value):
        """Sets the pattern for IP addresses with names.

        Parameters
        ----------
        value : str
            Specifies the pattern for IP addresses with names.
        """
        return self._set(ipAddressNamePattern=value)

    def setMapiIdPattern(self, value):
        """Sets the pattern for MAPI IDs.

        Parameters
        ----------
        value : str
            Specifies the pattern for MAPI IDs.
        """
        return self._set(mapiIdPattern=value)

    def setUsPhoneNumbersPattern(self, value):
        """Sets the pattern for US phone numbers.

        Parameters
        ----------
        value : str
            Specifies the pattern for US phone numbers.
        """
        return self._set(usPhoneNumbersPattern=value)

    def setImageUrlPattern(self, value):
        """Sets the pattern for image URLs.

        Parameters
        ----------
        value : str
            Specifies the pattern for image URLs.
        """
        return self._set(imageUrlPattern=value)

    def setTextPattern(self, value):
        """Sets the pattern for text after and before.

        Parameters
        ----------
        value : str
            Specifies the pattern for text after and before.
        """
        return self._set(textPattern=value)

    def setExtractorMode(self, value):
        """Sets the extraction mode to apply.

        Possible values: ``email_date``, ``email_address``, ``ip_address``,
        ``ip_address_name``, ``mapi_id``, ``us_phone_numbers``,
        ``image_urls``, ``bullets``, ``text_after``, ``text_before``.

        Parameters
        ----------
        value : str
            The extraction mode to use.
        """
        return self._set(extractorMode=value)

    def setIndex(self, value):
        """Sets the index of the pattern to extract in text after or before.

        Parameters
        ----------
        value : int
            Specifies the index of the pattern to extract in text after or before.
        """
        return self._set(index=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Extractor", java_model=None):
        """Construct the annotator, binding it to its JVM-side implementation class."""
        super(Extractor, self).__init__(
            classname=classname,
            java_model=java_model
        )
@@ -0,0 +1 @@
1
+ from sparknlp.annotator.coref.spanbert_coref import *