spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/token/recursive_tokenizer.py
@@ -0,0 +1,205 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the RecursiveTokenizer."""
+
+ from sparknlp.common import *
+
+
+ class RecursiveTokenizer(AnnotatorApproach):
+     """Tokenizes raw text recursively based on a handful of definable rules.
+
+     Unlike the Tokenizer, the RecursiveTokenizer operates based on these array
+     string parameters only:
+
+     - ``prefixes``: Strings that will be split when found at the beginning of
+       a token.
+     - ``suffixes``: Strings that will be split when found at the end of a
+       token.
+     - ``infixes``: Strings that will be split when found in the middle of a
+       token.
+     - ``whitelist``: Whitelist of strings not to split
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     prefixes
+         Strings to be considered independent tokens when found at the beginning
+         of a word, by default ["'", '"', '(', '[', '\\n']
+     suffixes
+         Strings to be considered independent tokens when found at the end of a
+         word, by default ['.', ':', '%', ',', ';', '?', "'", '"', ')', ']',
+         '\\n', '!', "'s"]
+     infixes
+         Strings to be considered independent tokens when found in the middle of
+         a word, by default ['\\n', '(', ')']
+     whitelist
+         Strings to be considered as single tokens, by default ["it\'s",
+         "that\'s", "there\'s", "he\'s", "she\'s", "what\'s", "let\'s", "who\'s",
+         "It\'s", "That\'s", "There\'s", "He\'s", "She\'s", "What\'s", "Let\'s",
+         "Who\'s"]
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = RecursiveTokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer
+     ... ])
+     >>> data = spark.createDataFrame([["One, after the Other, (and) again. PO, QAM,"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("token.result").show(truncate=False)
+     +------------------------------------------------------------------+
+     |result                                                            |
+     +------------------------------------------------------------------+
+     |[One, ,, after, the, Other, ,, (, and, ), again, ., PO, ,, QAM, ,]|
+     +------------------------------------------------------------------+
+     """
+     name = 'RecursiveTokenizer'
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     prefixes = Param(Params._dummy(),
+                      "prefixes",
+                      "strings to be considered independent tokens when found at the beginning of a word",
+                      typeConverter=TypeConverters.toListString)
+
+     suffixes = Param(Params._dummy(),
+                      "suffixes",
+                      "strings to be considered independent tokens when found at the end of a word",
+                      typeConverter=TypeConverters.toListString)
+
+     infixes = Param(Params._dummy(),
+                     "infixes",
+                     "strings to be considered independent tokens when found in the middle of a word",
+                     typeConverter=TypeConverters.toListString)
+
+     whitelist = Param(Params._dummy(),
+                       "whitelist",
+                       "strings to be considered as single tokens",
+                       typeConverter=TypeConverters.toListString)
+
+     def setPrefixes(self, p):
+         """Sets strings to be considered independent tokens when found at the
+         beginning of a word, by default ["'", '"', '(', '[', '\\n'].
+
+         Parameters
+         ----------
+         p : List[str]
+             Strings to be considered independent tokens when found at the
+             beginning of a word
+         """
+         return self._set(prefixes=p)
+
+     def setSuffixes(self, s):
+         """Sets strings to be considered independent tokens when found at the
+         end of a word, by default ['.', ':', '%', ',', ';', '?', "'", '"', ')',
+         ']', '\\n', '!', "'s"].
+
+         Parameters
+         ----------
+         s : List[str]
+             Strings to be considered independent tokens when found at the end
+             of a word
+         """
+         return self._set(suffixes=s)
+
+     def setInfixes(self, i):
+         """Sets strings to be considered independent tokens when found in the
+         middle of a word, by default ['\\n', '(', ')'].
+
+         Parameters
+         ----------
+         i : List[str]
+             Strings to be considered independent tokens when found in the
+             middle of a word
+         """
+         return self._set(infixes=i)
+
+     def setWhitelist(self, w):
+         """Sets strings to be considered as single tokens, by default ["it\'s",
+         "that\'s", "there\'s", "he\'s", "she\'s", "what\'s", "let\'s", "who\'s",
+         "It\'s", "That\'s", "There\'s", "He\'s", "She\'s", "What\'s", "Let\'s",
+         "Who\'s"].
+
+         Parameters
+         ----------
+         w : List[str]
+             Strings to be considered as single tokens
+         """
+         return self._set(whitelist=w)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer"):
+         super(RecursiveTokenizer, self).__init__(classname=classname)
+         self._setDefault(
+             prefixes=["'", "\"", "(", "[", "\n"],
+             infixes=["\n", "(", ")"],
+             suffixes=[".", ":", "%", ",", ";", "?", "'", "\"", ")", "]", "\n", "!", "'s"],
+             whitelist=["it's", "that's", "there's", "he's", "she's", "what's", "let's", "who's",
+                        "It's", "That's", "There's", "He's", "She's", "What's", "Let's", "Who's"]
+         )
+
+     def _create_model(self, java_model):
+         return RecursiveTokenizerModel(java_model=java_model)
+
+
+ class RecursiveTokenizerModel(AnnotatorModel):
+     """Instantiated model of the RecursiveTokenizer.
+
+     This is the instantiated model of the :class:`.RecursiveTokenizer`.
+     For training your own model, please see the documentation of that class.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+     """
+     name = 'RecursiveTokenizerModel'
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizerModel", java_model=None):
+         super(RecursiveTokenizerModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
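
The RecursiveTokenizer added above is configured entirely through its four list parameters. A minimal usage sketch follows, assuming a running Spark session named spark with Spark NLP loaded; the extra suffix "'ll" and whitelist entry "we'll" are illustrative values, not library defaults:

    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import RecursiveTokenizer
    from pyspark.ml import Pipeline

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # Extend the default suffixes and protect a contraction from splitting.
    # "'ll" and "we'll" are hypothetical additions for illustration only.
    tokenizer = RecursiveTokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token") \
        .setSuffixes([".", ",", ";", "?", "!", "'s", "'ll"]) \
        .setWhitelist(["we'll", "it's", "that's"])

    pipeline = Pipeline().setStages([documentAssembler, tokenizer])
    data = spark.createDataFrame([["We'll meet again, won't we?"]]).toDF("text")
    pipeline.fit(data).transform(data).select("token.result").show(truncate=False)

With this configuration "we'll" survives as a single token, while the customized suffixes are still split off the ends of words.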
sparknlp/annotator/token/regex_tokenizer.py
@@ -0,0 +1,208 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the RegexTokenizer."""
+
+
+ from sparknlp.common import *
+
+
+ class RegexTokenizer(AnnotatorModel):
+     """A tokenizer that splits text by a regex pattern.
+
+     The pattern needs to be set with :meth:`.setPattern`, which defines the
+     delimiting pattern, i.e. how the tokens should be split. By default this
+     pattern is ``\\s+``, which means that tokens are split by one or more
+     whitespace characters.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     minLength
+         Minimum allowed length for each token, by default 1
+     maxLength
+         Maximum allowed length for each token
+     toLowercase
+         Whether to convert all characters to lowercase before tokenizing, by
+         default False
+     pattern
+         Regex pattern used for tokenizing, by default ``\\s+``
+     positionalMask
+         Whether to use a positional mask to guarantee the incremental
+         progression of the tokenization, by default False
+     trimWhitespace
+         Whether to remove whitespaces from identified tokens, by default False
+     preservePosition
+         Whether to preserve initial indexes before eventual whitespace removal
+         in tokens, by default True
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> regexTokenizer = RegexTokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("regexToken") \\
+     ...     .setToLowercase(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     regexTokenizer
+     ... ])
+     >>> data = spark.createDataFrame([["This is my first sentence.\\nThis is my second."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("regexToken.result").show(truncate=False)
+     +-------------------------------------------------------+
+     |result                                                 |
+     +-------------------------------------------------------+
+     |[this, is, my, first, sentence., this, is, my, second.]|
+     +-------------------------------------------------------+
+     """
+
+     name = "RegexTokenizer"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     @keyword_only
+     def __init__(self):
+         super(RegexTokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexTokenizer")
+         self._setDefault(
+             inputCols=["document"],
+             outputCol="regexToken",
+             toLowercase=False,
+             minLength=1,
+             pattern="\\s+",
+             positionalMask=False,
+             trimWhitespace=False,
+             preservePosition=True
+         )
+
+     minLength = Param(Params._dummy(),
+                       "minLength",
+                       "Set the minimum allowed length for each token",
+                       typeConverter=TypeConverters.toInt)
+
+     maxLength = Param(Params._dummy(),
+                       "maxLength",
+                       "Set the maximum allowed length for each token",
+                       typeConverter=TypeConverters.toInt)
+
+     toLowercase = Param(Params._dummy(),
+                         "toLowercase",
+                         "Indicates whether to convert all characters to lowercase before tokenizing.",
+                         typeConverter=TypeConverters.toBoolean)
+
+     pattern = Param(Params._dummy(),
+                     "pattern",
+                     "Regex pattern used for tokenizing. Defaults to \\s+",
+                     typeConverter=TypeConverters.toString)
+
+     positionalMask = Param(Params._dummy(),
+                            "positionalMask",
+                            "Whether to use a positional mask to guarantee the incremental progression of the tokenization.",
+                            typeConverter=TypeConverters.toBoolean)
+
+     trimWhitespace = Param(Params._dummy(),
+                            "trimWhitespace",
+                            "Whether to remove whitespaces from identified tokens.",
+                            typeConverter=TypeConverters.toBoolean)
+
+     preservePosition = Param(Params._dummy(),
+                              "preservePosition",
+                              "Whether to preserve initial indexes before eventual whitespace removal in tokens.",
+                              typeConverter=TypeConverters.toBoolean)
+
+     def setMinLength(self, value):
+         """Sets the minimum allowed length for each token, by default 1.
+
+         Parameters
+         ----------
+         value : int
+             Minimum allowed length for each token
+         """
+         return self._set(minLength=value)
+
+     def setMaxLength(self, value):
+         """Sets the maximum allowed length for each token.
+
+         Parameters
+         ----------
+         value : int
+             Maximum allowed length for each token
+         """
+         return self._set(maxLength=value)
+
+     def setToLowercase(self, value):
+         """Sets whether to convert all characters to lowercase before
+         tokenizing, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to convert all characters to lowercase before tokenizing
+         """
+         return self._set(toLowercase=value)
+
+     def setPattern(self, value):
+         """Sets the regex pattern used for tokenizing, by default ``\\s+``.
+
+         Parameters
+         ----------
+         value : str
+             Regex pattern used for tokenizing
+         """
+         return self._set(pattern=value)
+
+     def setPositionalMask(self, value):
+         """Sets whether to use a positional mask to guarantee the incremental
+         progression of the tokenization, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to use a positional mask
+         """
+         return self._set(positionalMask=value)
+
+     def setTrimWhitespace(self, value):
+         """Sets whether to remove whitespaces from identified tokens, by
+         default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to remove whitespaces from identified tokens
+         """
+         return self._set(trimWhitespace=value)
+
+     def setPreservePosition(self, value):
+         """Sets whether to preserve initial indexes before eventual whitespace
+         removal in tokens, by default True.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to preserve initial indexes
+         """
+         return self._set(preservePosition=value)
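
Since RegexTokenizer is an AnnotatorModel rather than an approach, no training happens at fit time; the regex set with setPattern acts purely as the token delimiter. A short sketch, assuming a running Spark session named spark with Spark NLP loaded, that splits on commas and semicolons instead of whitespace:

    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import RegexTokenizer
    from pyspark.ml import Pipeline

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # Treat commas and semicolons (with any surrounding spaces) as the
    # delimiter and drop leftover whitespace from the resulting tokens.
    regexTokenizer = RegexTokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("regexToken") \
        .setPattern("\\s*[,;]\\s*") \
        .setTrimWhitespace(True)

    pipeline = Pipeline().setStages([documentAssembler, regexTokenizer])
    data = spark.createDataFrame([["alpha, beta; gamma"]]).toDF("text")
    pipeline.fit(data).transform(data).select("regexToken.result").show(truncate=False)

With these (illustrative) settings the example row tokenizes to [alpha, beta, gamma].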