spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/internal.py DELETED
@@ -1,288 +0,0 @@
- from abc import ABC
-
- from pyspark import SparkContext, keyword_only
- from pyspark.ml import PipelineModel
- from pyspark.ml.wrapper import JavaWrapper, JavaTransformer, JavaEstimator, JavaModel
- from pyspark.ml.util import JavaMLWritable, JavaMLReadable, JavaMLReader
- from pyspark.sql.dataframe import DataFrame
- from pyspark.ml.param.shared import Params
- import re
-
-
- # Helper class used to generate the getters for all params
- class ParamsGettersSetters(Params):
-     getter_attrs = []
-
-     def __init__(self):
-         super(ParamsGettersSetters, self).__init__()
-         for param in self.params:
-             param_name = param.name
-             fg_attr = "get" + re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), param_name)
-             fs_attr = "set" + re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), param_name)
-             # Generates getter and setter only if not exists
-             try:
-                 getattr(self, fg_attr)
-             except AttributeError:
-                 setattr(self, fg_attr, self.getParamValue(param_name))
-             try:
-                 getattr(self, fs_attr)
-             except AttributeError:
-                 setattr(self, fs_attr, self.setParamValue(param_name))
-
-     def getParamValue(self, paramName):
-         def r():
-             try:
-                 return self.getOrDefault(paramName)
-             except KeyError:
-                 return None
-         return r
-
-     def setParamValue(self, paramName):
-         def r(v):
-             self.set(self.getParam(paramName), v)
-             return self
-         return r
-
-
- class AnnotatorJavaMLReadable(JavaMLReadable):
-     @classmethod
-     def read(cls):
-         """Returns an MLReader instance for this class."""
-         return AnnotatorJavaMLReader(cls())
-
-
- class AnnotatorJavaMLReader(JavaMLReader):
-     @classmethod
-     def _java_loader_class(cls, clazz):
-         if hasattr(clazz, '_java_class_name') and clazz._java_class_name is not None:
-             return clazz._java_class_name
-         else:
-             return JavaMLReader._java_loader_class(clazz)
-
-
- class AnnotatorTransformer(JavaTransformer, AnnotatorJavaMLReadable, JavaMLWritable, ParamsGettersSetters):
-     @keyword_only
-     def __init__(self, classname):
-         super(AnnotatorTransformer, self).__init__()
-         kwargs = self._input_kwargs
-         if 'classname' in kwargs:
-             kwargs.pop('classname')
-         self.setParams(**kwargs)
-         self.__class__._java_class_name = classname
-         self._java_obj = self._new_java_obj(classname, self.uid)
-
-
- class RecursiveEstimator(JavaEstimator, ABC):
-
-     def _fit_java(self, dataset, pipeline=None):
-         self._transfer_params_to_java()
-         if pipeline:
-             return self._java_obj.recursiveFit(dataset._jdf, pipeline._to_java())
-         else:
-             return self._java_obj.fit(dataset._jdf)
-
-     def _fit(self, dataset, pipeline=None):
-         java_model = self._fit_java(dataset, pipeline)
-         model = self._create_model(java_model)
-         return self._copyValues(model)
-
-     def fit(self, dataset, params=None, pipeline=None):
-         if params is None:
-             params = dict()
-         if isinstance(params, (list, tuple)):
-             models = [None] * len(params)
-             for index, model in self.fitMultiple(dataset, params):
-                 models[index] = model
-             return models
-         elif isinstance(params, dict):
-             if params:
-                 return self.copy(params)._fit(dataset, pipeline=pipeline)
-             else:
-                 return self._fit(dataset, pipeline=pipeline)
-         else:
-             raise ValueError("Params must be either a param map or a list/tuple of param maps, "
-                              "but got %s." % type(params))
-
-
- class RecursiveTransformer(JavaModel):
-
-     def _transform_recursive(self, dataset, recursive_pipeline):
-         self._transfer_params_to_java()
-         return DataFrame(self._java_obj.recursiveTransform(dataset._jdf, recursive_pipeline._to_java()), dataset.sql_ctx)
-
-     def transform_recursive(self, dataset, recursive_pipeline, params=None):
-         if params is None:
-             params = dict()
-         if isinstance(params, dict):
-             if params:
-                 return self.copy(params)._transform_recursive(dataset, recursive_pipeline)
-             else:
-                 return self._transform_recursive(dataset, recursive_pipeline)
-         else:
-             raise ValueError("Params must be a param map but got %s." % type(params))
-
-
- class ExtendedJavaWrapper(JavaWrapper):
-     def __init__(self, java_obj, *args):
-         super(ExtendedJavaWrapper, self).__init__(java_obj)
-         self.sc = SparkContext._active_spark_context
-         self._java_obj = self.new_java_obj(java_obj, *args)
-         self.java_obj = self._java_obj
-
-     def __del__(self):
-         pass
-
-     def apply(self):
-         return self._java_obj
-
-     def new_java_obj(self, java_class, *args):
-         return self._new_java_obj(java_class, *args)
-
-     def new_java_array(self, pylist, java_class):
-         """
-         ToDo: Inspired from spark 2.0. Review if spark changes
-         """
-         java_array = self.sc._gateway.new_array(java_class, len(pylist))
-         for i in range(len(pylist)):
-             java_array[i] = pylist[i]
-         return java_array
-
-     def new_java_array_string(self, pylist):
-         java_array = self._new_java_array(pylist, self.sc._gateway.jvm.java.lang.String)
-         return java_array
-
-     def new_java_array_integer(self, pylist):
-         java_array = self._new_java_array(pylist, self.sc._gateway.jvm.java.lang.Integer)
-         return java_array
-
-
- class _RegexRule(ExtendedJavaWrapper):
-     def __init__(self, rule, identifier):
-         super(_RegexRule, self).__init__("com.johnsnowlabs.nlp.util.regex.RegexRule", rule, identifier)
-
-
- class _ExternalResource(ExtendedJavaWrapper):
-     def __init__(self, path, read_as, options):
-         super(_ExternalResource, self).__init__("com.johnsnowlabs.nlp.util.io.ExternalResource.fromJava", path, read_as, options)
-
-
- class _ConfigLoaderGetter(ExtendedJavaWrapper):
-     def __init__(self):
-         super(_ConfigLoaderGetter, self).__init__("com.johnsnowlabs.util.ConfigLoader.getConfigPath")
-
-
- class _DownloadModel(ExtendedJavaWrapper):
-     def __init__(self, reader, name, language, remote_loc, validator):
-         super(_DownloadModel, self).__init__("com.johnsnowlabs.nlp.pretrained."+validator+".downloadModel", reader, name, language, remote_loc)
-
-
- class _DownloadPipeline(ExtendedJavaWrapper):
-     def __init__(self, name, language, remote_loc):
-         super(_DownloadPipeline, self).__init__("com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline", name, language, remote_loc)
-
-
- class _ClearCache(ExtendedJavaWrapper):
-     def __init__(self, name, language, remote_loc):
-         super(_ClearCache, self).__init__("com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.clearCache", name, language, remote_loc)
-
-
- class _GetResourceSize(ExtendedJavaWrapper):
-     def __init__(self, name, language, remote_loc):
-         super(_GetResourceSize, self).__init__(
-             "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize", name, language, remote_loc)
-
-
- class _ShowUnCategorizedResources(ExtendedJavaWrapper):
-     def __init__(self):
-         super(_ShowUnCategorizedResources, self).__init__(
-             "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.showUnCategorizedResources")
-
-
- class _ShowPublicPipelines(ExtendedJavaWrapper):
-     def __init__(self):
-         super(_ShowPublicPipelines, self).__init__(
-             "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.showPublicPipelines")
-
-
- class _ShowPublicModels(ExtendedJavaWrapper):
-     def __init__(self):
-         super(_ShowPublicModels, self).__init__(
-             "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.showPublicModels")
-
-
- # predefined pipelines
- class _DownloadPredefinedPipeline(ExtendedJavaWrapper):
-     def __init__(self, java_path):
-         super(_DownloadPredefinedPipeline, self).__init__(java_path)
-
-
- class _LightPipeline(ExtendedJavaWrapper):
-     def __init__(self, pipelineModel, parse_embeddings):
-         super(_LightPipeline, self).__init__("com.johnsnowlabs.nlp.LightPipeline", pipelineModel._to_java(), parse_embeddings)
-
- # ==================
- # Utils
- # ==================
-
-
- class _StorageHelper(ExtendedJavaWrapper):
-     def __init__(self, path, spark, database, storage_ref, within_storage):
-         super(_StorageHelper, self).__init__("com.johnsnowlabs.storage.StorageHelper.load", path, spark._jsparkSession, database, storage_ref, within_storage)
-
-
- class _CoNLLGeneratorExport(ExtendedJavaWrapper):
-     def __init__(self, spark, target, pipeline, output_path):
-         if type(pipeline) == PipelineModel:
-             pipeline = pipeline._to_java()
-         if type(target) == DataFrame:
-             super(_CoNLLGeneratorExport, self).__init__("com.johnsnowlabs.util.CoNLLGenerator.exportConllFiles", target._jdf, pipeline, output_path)
-         else:
-             super(_CoNLLGeneratorExport, self).__init__("com.johnsnowlabs.util.CoNLLGenerator.exportConllFiles", spark._jsparkSession, target, pipeline, output_path)
-
-     def __init__(self, dataframe, output_path):
-         super(_CoNLLGeneratorExport, self).__init__("com.johnsnowlabs.util.CoNLLGenerator.exportConllFiles", dataframe, output_path)
-
-
- class _EmbeddingsOverallCoverage(ExtendedJavaWrapper):
-     def __init__(self, dataset, embeddings_col):
-         super(_EmbeddingsOverallCoverage, self).__init__("com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel.overallCoverage", dataset._jdf, embeddings_col)
-
-
- class _EmbeddingsCoverageColumn(ExtendedJavaWrapper):
-     def __init__(self, dataset, embeddings_col, output_col):
-         super(_EmbeddingsCoverageColumn, self).__init__("com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel.withCoverageColumn", dataset._jdf, embeddings_col, output_col)
-
-
- class _CoverageResult(ExtendedJavaWrapper):
-     def __init__(self, covered, total, percentage):
-         super(_CoverageResult, self).__init__("com.johnsnowlabs.nlp.embeddings.CoverageResult", covered, total, percentage)
-
-
- class _BertLoader(ExtendedJavaWrapper):
-     def __init__(self, path, jspark):
-         super(_BertLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.BertEmbeddings.loadSavedModel", path, jspark)
-
-
- class _BertSentenceLoader(ExtendedJavaWrapper):
-     def __init__(self, path, jspark):
-         super(_BertSentenceLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings.loadSavedModel", path, jspark)
-
-
- class _USELoader(ExtendedJavaWrapper):
-     def __init__(self, path, jspark):
-         super(_USELoader, self).__init__("com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder.loadSavedModel", path, jspark)
-
-
- class _ElmoLoader(ExtendedJavaWrapper):
-     def __init__(self, path, jspark):
-         super(_ElmoLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings.loadSavedModel", path, jspark)
-
-
- class _AlbertLoader(ExtendedJavaWrapper):
-     def __init__(self, path, jspark):
-         super(_AlbertLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.AlbertEmbeddings.loadSavedModel", path, jspark)
-
-
- class _XlnetLoader(ExtendedJavaWrapper):
-     def __init__(self, path, jspark):
-         super(_XlnetLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings.loadSavedModel", path, jspark)
sparknlp/internal.pyc DELETED
Binary file
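For orientation, the core trick in the removed internal.py is in ParamsGettersSetters above: it derives camelCase accessor names from each Spark param name via a re.sub call. A minimal standalone sketch of that name mangling (illustrative only; the helper function name is hypothetical, not part of either package version):

import re

def accessor_names(param_name):
    # "batch_size" -> ("getBatchSize", "setBatchSize"); already-camelCase names just get capitalized.
    camel = re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), param_name)
    return "get" + camel, "set" + camel

print(accessor_names("batch_size"))     # ('getBatchSize', 'setBatchSize')
print(accessor_names("caseSensitive"))  # ('getCaseSensitive', 'setCaseSensitive')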
sparknlp/pretrained.py DELETED
@@ -1,123 +0,0 @@
- import sparknlp.internal as _internal
- import threading
- import time
- from pyspark.sql import DataFrame
- from sparknlp.annotator import *
- from sparknlp.base import LightPipeline
- from pyspark.ml import PipelineModel
-
-
- def printProgress(stop):
-     states = [' | ', ' / ', ' — ', ' \\ ']
-     nextc = 0
-     while True:
-         sys.stdout.write('\r[{}]'.format(states[nextc]))
-         sys.stdout.flush()
-         time.sleep(2.5)
-         nextc = nextc + 1 if nextc < 3 else 0
-         if stop():
-             sys.stdout.write('\r[{}]'.format('OK!'))
-             sys.stdout.flush()
-             break
-
-     sys.stdout.write('\n')
-     return
-
-
- class ResourceDownloader(object):
-
-     @staticmethod
-     def downloadModel(reader, name, language, remote_loc=None, j_dwn='PythonResourceDownloader'):
-         print(name + " download started this may take some time.")
-         file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
-         if file_size == "-1":
-             print("Can not find the model to download please check the name!")
-         else:
-             print("Approximate size to download " + file_size)
-             stop_threads = False
-             t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
-             t1.start()
-             try:
-                 j_obj = _internal._DownloadModel(reader.name, name, language, remote_loc, j_dwn).apply()
-             finally:
-                 stop_threads = True
-                 t1.join()
-
-             return reader(classname=None, java_model=j_obj)
-
-     @staticmethod
-     def downloadPipeline(name, language, remote_loc=None):
-         print(name + " download started this may take some time.")
-         file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
-         if file_size == "-1":
-             print("Can not find the model to download please check the name!")
-         else:
-             print("Approx size to download " + file_size)
-             stop_threads = False
-             t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
-             t1.start()
-             try:
-                 j_obj = _internal._DownloadPipeline(name, language, remote_loc).apply()
-                 jmodel = PipelineModel._from_java(j_obj)
-             finally:
-                 stop_threads = True
-                 t1.join()
-
-             return jmodel
-
-     @staticmethod
-     def clearCache(name, language, remote_loc=None):
-         _internal._ClearCache(name, language, remote_loc).apply()
-
-     @staticmethod
-     def showPublicModels():
-         print("test")
-         _internal._ShowPublicModels().apply()
-
-     @staticmethod
-     def showPublicPipelines():
-         _internal._ShowPublicPipelines().apply()
-
-
-     @staticmethod
-     def showUnCategorizedResources():
-         _internal._ShowUnCategorizedResources().apply()
-
-
- class PretrainedPipeline:
-
-     def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
-         if not disk_location:
-             self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
-         else:
-             self.model = PipelineModel.load(disk_location)
-         self.light_model = LightPipeline(self.model, parse_embeddings)
-
-     @staticmethod
-     def from_disk(path, parse_embeddings=False):
-         return PretrainedPipeline(None, None, None, parse_embeddings, path)
-
-     def annotate(self, target, column=None):
-         if type(target) is DataFrame:
-             if not column:
-                 raise Exception("annotate() column arg needed when targeting a DataFrame")
-             return self.model.transform(target.withColumnRenamed(column, "text"))
-         elif type(target) is list or type(target) is str:
-             pipeline = self.light_model
-             return pipeline.annotate(target)
-         else:
-             raise Exception("target must be either a spark DataFrame, a list of strings or a string")
-
-     def fullAnnotate(self, target, column=None):
-         if type(target) is DataFrame:
-             if not column:
-                 raise Exception("annotate() column arg needed when targeting a DataFrame")
-             return self.model.transform(target.withColumnRenamed(column, "text"))
-         elif type(target) is list or type(target) is str:
-             pipeline = self.light_model
-             return pipeline.fullAnnotate(target)
-         else:
-             raise Exception("target must be either a spark DataFrame, a list of strings or a string")
-
-     def transform(self, data):
-         return self.model.transform(data)
sparknlp/pretrained.pyc DELETED
Binary file
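For context, the removed PretrainedPipeline routed plain strings (and lists of strings) through LightPipeline.annotate, while DataFrames went through PipelineModel.transform after renaming the given column to "text". A hypothetical usage sketch of that 2.x API (the pipeline name "explain_document_dl" and an already-started Spark NLP session are assumptions):

from sparknlp.pretrained import PretrainedPipeline

# Assumes sparknlp.start() (or an equivalent Spark session with Spark NLP loaded) is active
# and that "explain_document_dl" is an available pretrained English pipeline.
pipeline = PretrainedPipeline("explain_document_dl", lang="en")

# Strings take the LightPipeline path and return a dict of annotation results.
result = pipeline.annotate("Spark NLP ships pretrained pipelines.")

# DataFrames need the input column named so it can be renamed to "text":
# annotated_df = pipeline.annotate(reviews_df, column="review")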
sparknlp/storage.py DELETED
@@ -1,32 +0,0 @@
- import sparknlp.internal as _internal
-
- from pyspark.ml.param import Params
- from pyspark import keyword_only
- import sys
- import threading
- import time
- import sparknlp.pretrained as _pretrained
-
-
- # DONT REMOVE THIS IMPORT
- from sparknlp.annotator import WordEmbeddingsModel
- ####
-
-
- class RocksDBConnection:
-     def __init__(self, connection):
-         self.jconnection = connection
-
-
- class StorageHelper:
-     @classmethod
-     def load(cls, path, spark_session, database):
-         print("Loading started this may take some time")
-         stop_threads = False
-         t1 = threading.Thread(target=_pretrained.printProgress, args=(lambda: stop_threads,))
-         t1.start()
-         jembeddings = _internal._StorageHelper(path, spark_session, database).apply()
-         stop_threads = True
-         t1.join()
-         print("Loading done")
-         return RocksDBConnection(jembeddings)
sparknlp/storage.pyc DELETED
Binary file
sparknlp/training.py DELETED
@@ -1,62 +0,0 @@
- from sparknlp.internal import ExtendedJavaWrapper
- from sparknlp.common import ExternalResource, ReadAs
- from pyspark.sql import SparkSession, DataFrame
-
-
- class CoNLL(ExtendedJavaWrapper):
-     def __init__(self,
-                  documentCol = 'document',
-                  sentenceCol = 'sentence',
-                  tokenCol = 'token',
-                  posCol = 'pos',
-                  conllLabelIndex = 3,
-                  conllPosIndex = 1,
-                  textCol = 'text',
-                  labelCol = 'label',
-                  explodeSentences = True,
-                  ):
-         super(CoNLL, self).__init__("com.johnsnowlabs.nlp.training.CoNLL",
-                                     documentCol,
-                                     sentenceCol,
-                                     tokenCol,
-                                     posCol,
-                                     conllLabelIndex,
-                                     conllPosIndex,
-                                     textCol,
-                                     labelCol,
-                                     explodeSentences)
-
-     def readDataset(self, spark, path, read_as=ReadAs.TEXT):
-
-         # ToDo Replace with std pyspark
-         jSession = spark._jsparkSession
-
-         jdf = self._java_obj.readDataset(jSession, path, read_as)
-         return DataFrame(jdf, spark._wrapped)
-
-
- class POS(ExtendedJavaWrapper):
-     def __init__(self):
-         super(POS, self).__init__("com.johnsnowlabs.nlp.training.POS")
-
-     def readDataset(self, spark, path, delimiter="|", outputPosCol="tags", outputDocumentCol="document", outputTextCol="text"):
-
-         # ToDo Replace with std pyspark
-         jSession = spark._jsparkSession
-
-         jdf = self._java_obj.readDataset(jSession, path, delimiter, outputPosCol, outputDocumentCol, outputTextCol)
-         return DataFrame(jdf, spark._wrapped)
-
-
- class PubTator(ExtendedJavaWrapper):
-     def __init__(self):
-         super(PubTator, self).__init__("com.johnsnowlabs.nlp.training.PubTator")
-
-     def readDataset(self, spark, path):
-
-         # ToDo Replace with std pyspark
-         jSession = spark._jsparkSession
-
-         jdf = self._java_obj.readDataset(jSession, path)
-         return DataFrame(jdf, spark._wrapped)
-
sparknlp/training.pyc DELETED
Binary file
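Similarly, the removed training.py readers each took a Spark session plus a path and returned a DataFrame. A hypothetical sketch of how the CoNLL reader was invoked (the session variable and the training-file path are assumptions):

from sparknlp.training import CoNLL

# Assumes spark = sparknlp.start() (or an equivalent session) and a CoNLL-2003-format file on disk.
training_data = CoNLL().readDataset(spark, "./eng.train")
training_data.selectExpr("text", "label.result").show(3, truncate=60)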
sparknlp/util.pyc DELETED
Binary file