spark-nlp 2.6.3rc1 → 6.2.1 (py2.py3-none-any.whl)

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
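
The headline change across this version range is structural: the flat 2.6.x modules (sparknlp/annotator.py, base.py, common.py, embeddings.py, pretrained.py, all removed above) are split into per-task subpackages under sparknlp/annotator/, sparknlp/base/, and sparknlp/common/. A minimal sketch of what this means for import paths follows; the deep module paths are taken from the file list above, while the claim that the top-level packages still re-export every annotator is an assumption based on the re-exporting __init__.py hunks shown below.

    import sparknlp

    # Flat-style imports, as in 2.6.x, assumed to still resolve through the
    # package __init__ re-exports (see the sentence/__init__.py hunk below):
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import SentenceDetector, PerceptronModel

    # New in the split layout: explicit per-module paths from the file list above.
    from sparknlp.annotator.sentence.sentence_detector import SentenceDetector
    from sparknlp.annotator.pos.perceptron import PerceptronApproach, PerceptronModel

    spark = sparknlp.start()  # Spark session with the matching Spark NLP jar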
sparknlp/annotator/pos/perceptron.py
@@ -0,0 +1,263 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the Perceptron Annotator."""
+
+ from sparknlp.common import *
+
+
+ class PerceptronApproach(AnnotatorApproach):
+     """Trains an averaged Perceptron model to tag words' part-of-speech. Sets a
+     POS tag to each word within a sentence.
+
+     For pretrained models please see the :class:`.PerceptronModel`.
+
+     The training data needs to be in a Spark DataFrame, where the column needs
+     to consist of Annotations of type ``POS``. The `Annotation` needs to have
+     member ``result`` set to the POS tag and a ``"word"`` entry mapping to its
+     word inside the ``metadata`` member. This DataFrame for training can easily
+     be created by the helper class :class:`.POS`.
+
+
+     >>> POS().readDataset(spark, datasetPath) \\
+     ...     .selectExpr("explode(tags) as tags").show(truncate=False)
+     +---------------------------------------------+
+     |tags                                         |
+     +---------------------------------------------+
+     |[pos, 0, 5, NNP, [word -> Pierre], []]       |
+     |[pos, 7, 12, NNP, [word -> Vinken], []]      |
+     |[pos, 14, 14, ,, [word -> ,], []]            |
+     |[pos, 31, 34, MD, [word -> will], []]        |
+     |[pos, 36, 39, VB, [word -> join], []]        |
+     |[pos, 41, 43, DT, [word -> the], []]         |
+     |[pos, 45, 49, NN, [word -> board], []]       |
+     ...
+
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/french/Train-Perceptron-French.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN, DOCUMENT``    ``POS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     posCol
+         Column name for Array of POS tags that match tokens
+     nIterations
+         Number of iterations in training, converges to better accuracy, by
+         default 5
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from sparknlp.training import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> datasetPath = "src/test/resources/anc-pos-corpus-small/test-training.txt"
+     >>> trainingPerceptronDF = POS().readDataset(spark, datasetPath)
+     >>> trainedPos = PerceptronApproach() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("pos") \\
+     ...     .setPosColumn("tags") \\
+     ...     .fit(trainingPerceptronDF)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     tokenizer,
+     ...     trainedPos
+     ... ])
+     >>> data = spark.createDataFrame([["To be or not to be, is this the question?"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("pos.result").show(truncate=False)
+     +--------------------------------------------------+
+     |result                                            |
+     +--------------------------------------------------+
+     |[NNP, NNP, CD, JJ, NNP, NNP, ,, MD, VB, DT, CD, .]|
+     +--------------------------------------------------+
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.POS
+
+     posCol = Param(Params._dummy(),
+                    "posCol",
+                    "column of Array of POS tags that match tokens",
+                    typeConverter=TypeConverters.toString)
+
+     nIterations = Param(Params._dummy(),
+                         "nIterations",
+                         "Number of iterations in training, converges to better accuracy",
+                         typeConverter=TypeConverters.toInt)
+
+     @keyword_only
+     def __init__(self):
+         super(PerceptronApproach, self).__init__(
+             classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach")
+         self._setDefault(
+             nIterations=5
+         )
+
+     def setPosColumn(self, value):
+         """Sets column name for Array of POS tags that match tokens.
+
+         Parameters
+         ----------
+         value : str
+             Name of column for Array of POS tags
+         """
+         return self._set(posCol=value)
+
+     def setIterations(self, value):
+         """Sets number of iterations in training, by default 5.
+
+         Parameters
+         ----------
+         value : int
+             Number of iterations in training
+         """
+         return self._set(nIterations=value)
+
+     def getNIterations(self):
+         """Gets number of iterations in training, by default 5.
+
+         Returns
+         -------
+         int
+             Number of iterations in training
+         """
+         return self.getOrDefault(self.nIterations)
+
+     def _create_model(self, java_model):
+         return PerceptronModel(java_model=java_model)
+
+
+ class PerceptronModel(AnnotatorModel):
+     """Averaged Perceptron model to tag words' part-of-speech. Sets a POS tag to
+     each word within a sentence.
+
+     This is the instantiated model of the :class:`.PerceptronApproach`. For
+     training your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> posTagger = PerceptronModel.pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("pos")
+
+
+     The default model is ``"pos_anc"``, if no name is provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Part+of+Speech+Tagging>`__.
+     Additionally, pretrained pipelines are available for this module, see
+     `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/french/Train-Perceptron-French.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN, DOCUMENT``    ``POS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> posTagger = PerceptronModel.pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("pos")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     posTagger
+     ... ])
+     >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(pos) as pos").show(truncate=False)
+     +-------------------------------------------+
+     |pos                                        |
+     +-------------------------------------------+
+     |[pos, 0, 4, NNP, [word -> Peter], []]      |
+     |[pos, 6, 11, NNP, [word -> Pipers], []]    |
+     |[pos, 13, 21, NNS, [word -> employees], []]|
+     |[pos, 23, 25, VBP, [word -> are], []]      |
+     |[pos, 27, 33, VBG, [word -> picking], []]  |
+     |[pos, 35, 39, NNS, [word -> pecks], []]    |
+     |[pos, 41, 42, IN, [word -> of], []]        |
+     |[pos, 44, 50, JJ, [word -> pickled], []]   |
+     |[pos, 52, 58, NNS, [word -> peppers], []]  |
+     +-------------------------------------------+
+     """
+     name = "PerceptronModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.POS
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronModel", java_model=None):
+         super(PerceptronModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     @staticmethod
+     def pretrained(name="pos_anc", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "pos_anc"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         PerceptronModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(PerceptronModel, name, lang, remote_loc)
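
The docstrings above show batch inference over a DataFrame. For single-string inference, the LightPipeline class added in this release range (sparknlp/base/light_pipeline.py in the file list) avoids the DataFrame round trip. A minimal sketch, assuming the long-standing LightPipeline.annotate() API and a running Spark NLP session; the expected tags are taken from the PerceptronModel docstring table above:

    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler, LightPipeline
    from sparknlp.annotator import Tokenizer, PerceptronModel

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    posTagger = PerceptronModel.pretrained().setInputCols(["document", "token"]).setOutputCol("pos")

    pipeline = Pipeline().setStages([documentAssembler, tokenizer, posTagger])
    empty = spark.createDataFrame([[""]]).toDF("text")  # no training needed; fit on an empty frame
    light = LightPipeline(pipeline.fit(empty))

    # Keys of the returned dict follow the setOutputCol names above.
    print(light.annotate("Peter Pipers employees are picking pecks of pickled peppers")["pos"])
    # expected, per the docstring table: ['NNP', 'NNP', 'NNS', 'VBP', 'VBG', 'NNS', 'IN', 'JJ', 'NNS']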
sparknlp/annotator/sentence/__init__.py
@@ -0,0 +1,17 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Module of annotators for sentence detection."""
+ from sparknlp.annotator.sentence.sentence_detector import *
+ from sparknlp.annotator.sentence.sentence_detector_dl import *
sparknlp/annotator/sentence/sentence_detector.py
@@ -0,0 +1,290 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the SentenceDetector."""
+
+ from sparknlp.common import *
+
+
+ class SentenceDetectorParams:
+     """Base class for SentenceDetector parameters.
+     """
+
+     useAbbreviations = Param(Params._dummy(),
+                              "useAbbreviations",
+                              "whether to apply abbreviations at sentence detection",
+                              typeConverter=TypeConverters.toBoolean)
+
+     customBounds = Param(Params._dummy(),
+                          "customBounds",
+                          "characters used to explicitly mark sentence bounds",
+                          typeConverter=TypeConverters.toListString)
+
+     useCustomBoundsOnly = Param(Params._dummy(),
+                                 "useCustomBoundsOnly",
+                                 "Only utilize custom bounds in sentence detection",
+                                 typeConverter=TypeConverters.toBoolean)
+
+     customBoundsStrategy = Param(Params._dummy(),
+                                  "customBoundsStrategy",
+                                  "How to return matched custom bounds",
+                                  typeConverter=TypeConverters.toString)
+
+     explodeSentences = Param(Params._dummy(),
+                              "explodeSentences",
+                              "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
+                              typeConverter=TypeConverters.toBoolean)
+
+     splitLength = Param(Params._dummy(),
+                         "splitLength",
+                         "length at which sentences will be forcibly split.",
+                         typeConverter=TypeConverters.toInt)
+
+     minLength = Param(Params._dummy(),
+                       "minLength",
+                       "Set the minimum allowed length for each sentence.",
+                       typeConverter=TypeConverters.toInt)
+
+     maxLength = Param(Params._dummy(),
+                       "maxLength",
+                       "Set the maximum allowed length for each sentence",
+                       typeConverter=TypeConverters.toInt)
+
+
+ class SentenceDetector(AnnotatorModel, SentenceDetectorParams):
+     """Annotator that detects sentence boundaries using regular expressions.
+
+     The following rules are checked when detecting sentence boundaries:
+
+     1. Lists ("(i), (ii)", "(a), (b)", "1., 2.")
+     2. Numbers
+     3. Abbreviations
+     4. Punctuation
+     5. Multiple Periods
+     6. Geo-Locations/Coordinates ("N°. 1026.253.553.")
+     7. Ellipsis ("...")
+     8. In-between punctuation
+     9. Quotation marks
+     10. Exclamation Points
+     11. Basic Breakers (".", ";")
+
+     For the explicit regular expressions used for detection, refer to the source of
+     `PragmaticContentFormatter <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala>`__.
+
+     To add additional custom bounds, the parameter ``customBounds`` can be set with an array:
+
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence") \\
+     ...     .setCustomBounds(["\\n\\n"])
+
+     If only the custom bounds should be used, then the parameter ``useCustomBoundsOnly`` should be set to ``True``.
+
+     Each extracted sentence can be returned in an Array or exploded to separate rows,
+     if ``explodeSentences`` is set to ``True``.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/sentence-detection/SentenceDetector_advanced_examples.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     useAbbreviations
+         whether to apply abbreviations at sentence detection, by default True
+     customBounds
+         characters used to explicitly mark sentence bounds, by default []
+     useCustomBoundsOnly
+         Only utilize custom bounds in sentence detection, by default False
+     customBoundsStrategy
+         Sets how to return matched custom bounds, by default "none".
+
+         Will have no effect if no custom bounds are used.
+         Possible values are:
+
+         - "none" - Will not return the matched bound
+         - "prepend" - Prepends a sentence break to the match
+         - "append" - Appends a sentence break to the match
+     explodeSentences
+         whether to explode each sentence into a different row, for better
+         parallelization, by default False
+     splitLength
+         length at which sentences will be forcibly split
+     minLength
+         Set the minimum allowed length for each sentence, by default 0
+     maxLength
+         Set the maximum allowed length for each sentence, by default 99999
+     detectLists
+         whether to detect lists during sentence detection, by default True
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence") \\
+     ...     .setCustomBounds(["\\n\\n"])
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence
+     ... ])
+     >>> data = spark.createDataFrame([["This is my first sentence. This my second.\\n\\nHow about a third?"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(sentence) as sentences").show(truncate=False)
+     +------------------------------------------------------------------+
+     |sentences                                                         |
+     +------------------------------------------------------------------+
+     |[document, 0, 25, This is my first sentence., [sentence -> 0], []]|
+     |[document, 27, 41, This my second., [sentence -> 1], []]          |
+     |[document, 43, 60, How about a third?, [sentence -> 2], []]       |
+     +------------------------------------------------------------------+
+     """
+
+     name = 'SentenceDetector'
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     # this one is exclusive to this detector
+     detectLists = Param(Params._dummy(),
+                         "detectLists",
+                         "whether to detect lists during sentence detection",
+                         typeConverter=TypeConverters.toBoolean)
+
+     def setCustomBounds(self, value):
+         """Sets characters used to explicitly mark sentence bounds, by default
+         [].
+
+         Parameters
+         ----------
+         value : List[str]
+             Characters used to explicitly mark sentence bounds
+         """
+         return self._set(customBounds=value)
+
+     def setCustomBoundsStrategy(self, value):
+         """Sets how to return matched custom bounds, by default "none".
+
+         Will have no effect if no custom bounds are used.
+         Possible values are:
+
+         - "none" - Will not return the matched bound
+         - "prepend" - Prepends a sentence break to the match
+         - "append" - Appends a sentence break to the match
+
+         Parameters
+         ----------
+         value : str
+             Strategy to use
+         """
+         return self._set(customBoundsStrategy=value)
+
+     def setUseAbbreviations(self, value):
+         """Sets whether to apply abbreviations at sentence detection, by default
+         True.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to apply abbreviations at sentence detection
+         """
+         return self._set(useAbbreviations=value)
+
+     def setDetectLists(self, value):
+         """Sets whether to detect lists during sentence detection, by default True.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to detect lists during sentence detection
+         """
+         return self._set(detectLists=value)
+
+     def setUseCustomBoundsOnly(self, value):
+         """Sets whether to only utilize custom bounds in sentence detection, by
+         default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to only utilize custom bounds
+         """
+         return self._set(useCustomBoundsOnly=value)
+
+     def setExplodeSentences(self, value):
+         """Sets whether to explode each sentence into a different row, for
+         better parallelization, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to explode each sentence into a different row
+         """
+         return self._set(explodeSentences=value)
+
+     def setSplitLength(self, value):
+         """Sets length at which sentences will be forcibly split.
+
+         Parameters
+         ----------
+         value : int
+             Length at which sentences will be forcibly split.
+         """
+         return self._set(splitLength=value)
+
+     def setMinLength(self, value):
+         """Sets minimum allowed length for each sentence, by default 0.
+
+         Parameters
+         ----------
+         value : int
+             Minimum allowed length for each sentence
+         """
+         return self._set(minLength=value)
+
+     def setMaxLength(self, value):
+         """Sets the maximum allowed length for each sentence, by default
+         99999.
+
+         Parameters
+         ----------
+         value : int
+             Maximum allowed length for each sentence
+         """
+         return self._set(maxLength=value)
+
+     @keyword_only
+     def __init__(self):
+         super(SentenceDetector, self).__init__(
+             classname="com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector")
+         self._setDefault(
+             useAbbreviations=True,
+             detectLists=True,
+             useCustomBoundsOnly=False,
+             customBounds=[],
+             customBoundsStrategy="none",
+             explodeSentences=False,
+             minLength=0,
+             maxLength=99999
+         )
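
The docstring above describes the customBoundsStrategy values ("none", "prepend", "append") only in prose. A minimal sketch of how the three knobs combine, using only setters defined in the hunk above; the input text and output are illustrative, assuming a running Spark NLP session:

    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import SentenceDetector

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    sentence = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence") \
        .setCustomBounds(["\n\n"]) \
        .setUseCustomBoundsOnly(True) \
        .setCustomBoundsStrategy("append")  # "none" drops the matched bound;
                                            # "prepend" attaches it to the next sentence

    pipeline = Pipeline().setStages([documentAssembler, sentence])
    data = spark.createDataFrame([["First paragraph.\n\nSecond paragraph."]]).toDF("text")
    pipeline.fit(data).transform(data).selectExpr("sentence.result").show(truncate=False)

With "append", the matched "\n\n" bound stays at the end of the first sentence; with the default "none", the two paragraphs come back without the separator.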