spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff compares publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
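
The headline change across this version jump is structural: the monolithic flat modules from 2.6.3rc1 (sparknlp/annotator.py, sparknlp/base.py, and the rest listed near the end of the table) are removed and replaced by the sparknlp.annotator and sparknlp.base packages with one module per annotator. A minimal sketch of what this means for imports, assuming spark-nlp 6.2.1 is installed; the subpackage __init__ files re-export their classes, so the flat 2.6.x import style is expected to keep resolving:

# A minimal sketch of the reorganized layout (assumes spark-nlp 6.2.1 is
# installed; the subpackages re-export everything through sparknlp.annotator,
# so 2.6.x-style flat imports should keep working).
from sparknlp.annotator import Lemmatizer                  # flat, 2.6.x-style import
from sparknlp.annotator.lemmatizer import Lemmatizer as L  # explicit per-module path

assert Lemmatizer is L  # both names resolve to the same class via the re-exports

Three of the new files are reproduced in the hunks below.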
sparknlp/annotator/lemmatizer.py
@@ -0,0 +1,250 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the Lemmatizer."""
+ from sparknlp.common import *
+
+
+ class Lemmatizer(AnnotatorApproach):
+     """Class to find lemmas out of words with the objective of returning a base
+     dictionary word.
+
+     Retrieves the significant part of a word. A dictionary of predefined lemmas
+     must be provided with :meth:`.setDictionary`.
+
+     For instantiated/pretrained models, see :class:`.LemmatizerModel`.
+
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     dictionary
+         Lemmatizer external dictionary.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+
+     In this example, the lemma dictionary ``lemmas_small.txt`` has the form of::
+
+         ...
+         pick -> pick picks picking picked
+         peck -> peck pecking pecked pecks
+         pickle -> pickle pickles pickled pickling
+         pepper -> pepper peppers peppered peppering
+         ...
+
+     where each key is delimited by ``->`` and values are delimited by ``\\t``.
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentenceDetector = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> lemmatizer = Lemmatizer() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("lemma") \\
+     ...     .setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\\t")
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...         documentAssembler,
+     ...         sentenceDetector,
+     ...         tokenizer,
+     ...         lemmatizer
+     ...     ])
+     >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]]) \\
+     ...     .toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("lemma.result").show(truncate=False)
+     +------------------------------------------------------------------+
+     |result                                                            |
+     +------------------------------------------------------------------+
+     |[Peter, Pipers, employees, are, pick, peck, of, pickle, pepper, .]|
+     +------------------------------------------------------------------+
+     """
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     dictionary = Param(Params._dummy(),
+                        "dictionary",
+                        "lemmatizer external dictionary." +
+                        " needs 'keyDelimiter' and 'valueDelimiter' in options for parsing target text",
+                        typeConverter=TypeConverters.identity)
+
+     formCol = Param(Params._dummy(),
+                     "formCol",
+                     "Column that corresponds to CoNLLU(formCol=) output",
+                     typeConverter=TypeConverters.toString)
+
+     lemmaCol = Param(Params._dummy(),
+                      "lemmaCol",
+                      "Column that corresponds to CoNLLU(lemmaCol=) output",
+                      typeConverter=TypeConverters.toString)
+
+     @keyword_only
+     def __init__(self):
+         super(Lemmatizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Lemmatizer")
+         self._setDefault(
+             formCol="form",
+             lemmaCol="lemma"
+         )
+
+     def _create_model(self, java_model):
+         return LemmatizerModel(java_model=java_model)
+
+     def setFormCol(self, value):
+         """Sets the name of the column that corresponds to CoNLLU(formCol=) output.
+
+         Parameters
+         ----------
+         value : str
+             Name of the column for the array of form tokens
+         """
+         return self._set(formCol=value)
+
+     def setLemmaCol(self, value):
+         """Sets the name of the column that corresponds to CoNLLU(lemmaCol=) output.
+
+         Parameters
+         ----------
+         value : str
+             Name of the column for the array of lemma tokens
+         """
+         return self._set(lemmaCol=value)
+
+     def setDictionary(self, path, key_delimiter, value_delimiter, read_as=ReadAs.TEXT,
+                       options={"format": "text"}):
+         """Sets the external dictionary for the lemmatizer.
+
+         Parameters
+         ----------
+         path : str
+             Path to the source files
+         key_delimiter : str
+             Delimiter for the key
+         value_delimiter : str
+             Delimiter for the values
+         read_as : str, optional
+             How to read the file, by default ReadAs.TEXT
+         options : dict, optional
+             Options to read the resource, by default {"format": "text"}
+
+         Examples
+         --------
+         Here, each key in the file is delimited by ``"->"`` and the values
+         are delimited by ``\\t``::
+
+             ...
+             pick -> pick picks picking picked
+             peck -> peck pecking pecked pecks
+             pickle -> pickle pickles pickled pickling
+             pepper -> pepper peppers peppered peppering
+             ...
+
+         This file can then be parsed with
+
+         >>> lemmatizer = Lemmatizer() \\
+         ...     .setInputCols(["token"]) \\
+         ...     .setOutputCol("lemma") \\
+         ...     .setDictionary("lemmas_small.txt", "->", "\\t")
+         """
+         opts = options.copy()
+         if "keyDelimiter" not in opts:
+             opts["keyDelimiter"] = key_delimiter
+         if "valueDelimiter" not in opts:
+             opts["valueDelimiter"] = value_delimiter
+         return self._set(dictionary=ExternalResource(path, read_as, opts))
+
+
+ class LemmatizerModel(AnnotatorModel):
+     """Instantiated Model of the Lemmatizer.
+
+     This is the instantiated model of the :class:`.Lemmatizer`.
+     For training your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> lemmatizer = LemmatizerModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("lemma")
+
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+
+     Examples
+     --------
+     The lemmatizer from the example of the :class:`.Lemmatizer` can be replaced
+     with:
+
+     >>> lemmatizer = LemmatizerModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("lemma")
+     """
+     name = "LemmatizerModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.LemmatizerModel", java_model=None):
+         super(LemmatizerModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     @staticmethod
+     def pretrained(name="lemma_antbnc", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "lemma_antbnc"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         LemmatizerModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(LemmatizerModel, name, lang, remote_loc)
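
For readers comparing the two APIs, here is a hedged, self-contained sketch of the pretrained path documented above, run end to end with a LightPipeline (assumes sparknlp.start() can provision a Spark session and that the lemma_antbnc model can be downloaded):

import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import Tokenizer, LemmatizerModel
from pyspark.ml import Pipeline

spark = sparknlp.start()  # local Spark session with Spark NLP on the classpath

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
lemmatizer = LemmatizerModel.pretrained("lemma_antbnc", "en") \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

pipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])
model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))  # no training needed

# LightPipeline annotates small inputs on the driver, skipping distributed overhead.
light = LightPipeline(model)
print(light.annotate("Peter Pipers employees are picking pecks of pickled peppers.")["lemma"])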
sparknlp/annotator/matcher/__init__.py
@@ -0,0 +1,20 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Module of annotators for text matching."""
+ from sparknlp.annotator.matcher.big_text_matcher import *
+ from sparknlp.annotator.matcher.date_matcher import *
+ from sparknlp.annotator.matcher.multi_date_matcher import *
+ from sparknlp.annotator.matcher.regex_matcher import *
+ from sparknlp.annotator.matcher.text_matcher import *
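
The wildcard imports mean each matcher should be reachable from the subpackage as well as from sparknlp.annotator. A small sketch (the output format value is illustrative):

# A small sketch of the new matcher subpackage (paths assume 6.2.1).
from sparknlp.annotator.matcher import DateMatcher  # re-exported by the __init__ above

dateMatcher = DateMatcher() \
    .setInputCols(["document"]) \
    .setOutputCol("date") \
    .setOutputFormat("yyyy/MM/dd")  # illustrative output format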
sparknlp/annotator/matcher/big_text_matcher.py
@@ -0,0 +1,272 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the BigTextMatcher."""
+
+ from sparknlp.common import *
+ from sparknlp.annotator.matcher.text_matcher import TextMatcherModel
+
+
+ class BigTextMatcher(AnnotatorApproach, HasStorage):
+     """Annotator to match exact phrases (by token) provided in a file against a
+     Document.
+
+     A text file of predefined phrases must be provided with ``setStoragePath``.
+
+     In contrast to the normal ``TextMatcher``, the ``BigTextMatcher`` is
+     designed for large corpora.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     entities
+         ExternalResource for entities
+     caseSensitive
+         whether to ignore case in index lookups, by default True
+     mergeOverlapping
+         whether to merge overlapping matched chunks, by default False
+     tokenizer
+         TokenizerModel to use to tokenize input file for building a Trie
+
+     Examples
+     --------
+     In this example, the entities file is of the form::
+
+         ...
+         dolore magna aliqua
+         lorem ipsum dolor. sit
+         laborum
+         ...
+
+     where each line represents an entity phrase to be extracted.
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols("document") \\
+     ...     .setOutputCol("token")
+     >>> data = spark.createDataFrame([["Hello dolore magna aliqua. Lorem ipsum dolor. sit in laborum"]]).toDF("text")
+     >>> entityExtractor = BigTextMatcher() \\
+     ...     .setInputCols("document", "token") \\
+     ...     .setStoragePath("src/test/resources/entity-extractor/test-phrases.txt", ReadAs.TEXT) \\
+     ...     .setOutputCol("entity") \\
+     ...     .setCaseSensitive(False)
+     >>> pipeline = Pipeline().setStages([documentAssembler, tokenizer, entityExtractor])
+     >>> results = pipeline.fit(data).transform(data)
+     >>> results.selectExpr("explode(entity)").show(truncate=False)
+     +--------------------------------------------------------------------+
+     |col                                                                 |
+     +--------------------------------------------------------------------+
+     |[chunk, 6, 24, dolore magna aliqua, [sentence -> 0, chunk -> 0], []]|
+     |[chunk, 53, 59, laborum, [sentence -> 0, chunk -> 1], []]           |
+     +--------------------------------------------------------------------+
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     entities = Param(Params._dummy(),
+                      "entities",
+                      "ExternalResource for entities",
+                      typeConverter=TypeConverters.identity)
+
+     caseSensitive = Param(Params._dummy(),
+                           "caseSensitive",
+                           "whether to ignore case in index lookups",
+                           typeConverter=TypeConverters.toBoolean)
+
+     mergeOverlapping = Param(Params._dummy(),
+                              "mergeOverlapping",
+                              "whether to merge overlapping matched chunks. Defaults to false",
+                              typeConverter=TypeConverters.toBoolean)
+
+     tokenizer = Param(Params._dummy(),
+                       "tokenizer",
+                       "TokenizerModel to use to tokenize input file for building a Trie",
+                       typeConverter=TypeConverters.identity)
+
+     @keyword_only
+     def __init__(self):
+         super(BigTextMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.btm.BigTextMatcher")
+         self._setDefault(caseSensitive=True)
+         self._setDefault(mergeOverlapping=False)
+
+     def _create_model(self, java_model):
+         return TextMatcherModel(java_model=java_model)
+
+     def setEntities(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
+         """Sets the ExternalResource for entities.
+
+         Parameters
+         ----------
+         path : str
+             Path to the resource
+         read_as : str, optional
+             How to read the resource, by default ReadAs.TEXT
+         options : dict, optional
+             Options for reading the resource, by default {"format": "text"}
+         """
+         return self._set(entities=ExternalResource(path, read_as, options.copy()))
+
+     def setCaseSensitive(self, b):
+         """Sets whether to ignore case in index lookups, by default True.
+
+         Parameters
+         ----------
+         b : bool
+             Whether to ignore case in index lookups
+         """
+         return self._set(caseSensitive=b)
+
+     def setMergeOverlapping(self, b):
+         """Sets whether to merge overlapping matched chunks, by default False.
+
+         Parameters
+         ----------
+         b : bool
+             Whether to merge overlapping matched chunks
+
+         """
+         return self._set(mergeOverlapping=b)
+
+     def setTokenizer(self, tokenizer_model):
+         """Sets the TokenizerModel to use to tokenize the input file for
+         building a Trie.
+
+         Parameters
+         ----------
+         tokenizer_model : :class:`TokenizerModel <sparknlp.annotator.TokenizerModel>`
+             TokenizerModel to use to tokenize the input file
+
+         """
+         tokenizer_model._transfer_params_to_java()
+         return self._set(tokenizer=tokenizer_model._java_obj)
+
+
+ class BigTextMatcherModel(AnnotatorModel, HasStorageModel):
+     """Instantiated model of the BigTextMatcher.
+
+     This is the instantiated model of the :class:`.BigTextMatcher`.
+     For training your own model, please see the documentation of that class.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     caseSensitive
+         Whether to ignore case in index lookups
+     mergeOverlapping
+         Whether to merge overlapping matched chunks, by default False
+     searchTrie
+         SearchTrie
+     """
+     name = "BigTextMatcherModel"
+     databases = ['TMVOCAB', 'TMEDGES', 'TMNODES']
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     caseSensitive = Param(Params._dummy(),
+                           "caseSensitive",
+                           "whether to ignore case in index lookups",
+                           typeConverter=TypeConverters.toBoolean)
+
+     mergeOverlapping = Param(Params._dummy(),
+                              "mergeOverlapping",
+                              "whether to merge overlapping matched chunks. Defaults to false",
+                              typeConverter=TypeConverters.toBoolean)
+
+     searchTrie = Param(Params._dummy(),
+                        "searchTrie",
+                        "searchTrie",
+                        typeConverter=TypeConverters.identity)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.btm.TextMatcherModel", java_model=None):
+         super(BigTextMatcherModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     def setMergeOverlapping(self, b):
+         """Sets whether to merge overlapping matched chunks, by default False.
+
+         Parameters
+         ----------
+         b : bool
+             Whether to merge overlapping matched chunks
+         """
+         return self._set(mergeOverlapping=b)
+
+     def setCaseSensitive(self, v):
+         """Sets whether to ignore case in index lookups.
+
+         Parameters
+         ----------
+         v : bool
+             Whether to ignore case in index lookups
+         """
+         return self._set(caseSensitive=v)
+
+     @staticmethod
+     def pretrained(name, lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str
+             Name of the pretrained model
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         TextMatcherModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
+
+     @staticmethod
+     def loadStorage(path, spark, storage_ref):
+         """Loads the model from storage.
+
+         Parameters
+         ----------
+         path : str
+             Path to the model
+         spark : :class:`pyspark.sql.SparkSession`
+             The current SparkSession
+         storage_ref : str
+             Identifiers for the model parameters
+         """
+         HasStorageModel.loadStorages(path, spark, storage_ref, BigTextMatcherModel.databases)
+
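
setTokenizer above ships without a docstring example; the following hedged sketch shows one way a fitted TokenizerModel could be supplied for building the Trie (assumes spark-nlp 6.2.1; "entity-phrases.txt" is a placeholder path):

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, BigTextMatcher
from sparknlp.common import ReadAs

spark = sparknlp.start()
data = spark.createDataFrame([["Hello dolore magna aliqua"]]).toDF("text")
docs = DocumentAssembler().setInputCol("text").setOutputCol("document").transform(data)

# Tokenizer is an Estimator; fitting it yields the TokenizerModel that
# BigTextMatcher reuses to tokenize the phrase file for its Trie.
tokenizer_model = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token") \
    .fit(docs)

matcher = BigTextMatcher() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("entity") \
    .setStoragePath("entity-phrases.txt", ReadAs.TEXT) \
    .setTokenizer(tokenizer_model)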