spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,467 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Package that contains classes for integration with Comet."""
16
+
17
+ try:
18
+ import comet_ml
19
+ except AttributeError:
20
+ # Python 3.6
21
+ comet_ml = None
22
+ except ModuleNotFoundError:
23
+ # Python 3.7+
24
+ comet_ml = None
25
+
26
+ import threading
27
+ import time
28
+ import os
29
+
30
+
31
class CometLogger:
    """Logger class for Comet integration

    `Comet <https://www.comet.ml/>`__ is a meta machine learning platform
    designed to help AI practitioners and teams build reliable machine learning
    models for real-world applications by streamlining the machine learning
    model lifecycle. By leveraging Comet, users can track, compare, explain and
    reproduce their machine learning experiments.

    To log a Spark NLP annotator, it will need an "outputLogPath" parameter, as the
    CometLogger reads the log file generated during the training process.

    For more examples see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/logging/Comet_SparkNLP_Integration.ipynb>`__.

    Parameters
    ----------
    workspace : str, optional
        Name of the workspace in Comet, by default None
    project_name : str, optional
        Name of the project in Comet, by default None
    comet_mode : str, optional
        Mode of logging, by default None. If set to "offline" then offline mode
        will be used, otherwise online.
    experiment_id : str, optional
        Id of the experiment, if it is reused, by default None
    tags : List[str], optional
        List of tags for the experiment, by default None

    Attributes
    ----------
    experiment : comet_ml.Experiment
        Object representing the Comet experiment

    Raises
    ------
    ImportError
        If the package comet-ml is not installed

    Examples
    --------
    Metrics while training an annotator can be logged with for example:

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from sparknlp.logging.comet import CometLogger
    >>> spark = sparknlp.start()

    To run an online experiment, the logger is defined like so.

    >>> OUTPUT_LOG_PATH = "./run"
    >>> logger = CometLogger()

    Then the experiment can start like so

    >>> document = DocumentAssembler() \\
    ...     .setInputCol("text")\\
    ...     .setOutputCol("document")
    >>> embds = UniversalSentenceEncoder.pretrained() \\
    ...     .setInputCols("document") \\
    ...     .setOutputCol("sentence_embeddings")
    >>> multiClassifier = MultiClassifierDLApproach() \\
    ...     .setInputCols("sentence_embeddings") \\
    ...     .setOutputCol("category") \\
    ...     .setLabelColumn("labels") \\
    ...     .setBatchSize(128) \\
    ...     .setLr(1e-3) \\
    ...     .setThreshold(0.5) \\
    ...     .setShufflePerEpoch(False) \\
    ...     .setEnableOutputLogs(True) \\
    ...     .setOutputLogsPath(OUTPUT_LOG_PATH) \\
    ...     .setMaxEpochs(1)
    >>> logger.monitor(logdir=OUTPUT_LOG_PATH, model=multiClassifier)
    >>> trainDataset = spark.createDataFrame(
    ...     [("Nice.", ["positive"]), ("That's bad.", ["negative"])],
    ...     schema=["text", "labels"],
    ... )
    >>> pipeline = Pipeline(stages=[document, embds, multiClassifier])
    >>> pipeline.fit(trainDataset)
    >>> logger.end()

    If you are using a jupyter notebook, it is possible to display the live web
    interface with

    >>> logger.experiment.display(tab='charts')
    """

    def __init__(
        self,
        workspace=None,
        project_name=None,
        comet_mode=None,
        experiment_id=None,
        tags=None,
        **experiment_kwargs,
    ):
        if comet_ml is None:
            raise ImportError(
                "`comet_ml` is not installed. Please install it with `pip install comet-ml`."
            )

        self.comet_mode = comet_mode
        self.workspace = workspace
        self.project_name = project_name
        self.experiment_id = experiment_id
        self.experiment_kwargs = experiment_kwargs

        self.experiment = self._get_experiment(
            self.comet_mode,
            self.workspace,
            self.project_name,
            self.experiment_id,
            **self.experiment_kwargs,
        )
        self.experiment.log_other("Created from", "SparkNLP")
        if tags is not None:
            self.experiment.add_tags(tags)

        # State for the background log-file watcher started by monitor().
        self._watch_file = False
        self._monitor_thread_timeout = 5
        self.thread = None

    def _get_experiment(
        self,
        mode,
        workspace=None,
        project_name=None,
        experiment_id=None,
        **experiment_kwargs,
    ):
        """Creates the comet_ml experiment object.

        Offline vs. online is selected by ``mode``; an existing experiment is
        resumed when ``experiment_id`` is given, otherwise a new one is created.
        """
        if mode == "offline":
            if experiment_id is not None:
                return comet_ml.ExistingOfflineExperiment(
                    previous_experiment=experiment_id,
                    workspace=workspace,
                    project_name=project_name,
                    **experiment_kwargs,
                )

            return comet_ml.OfflineExperiment(
                workspace=workspace,
                project_name=project_name,
                **experiment_kwargs,
            )

        else:
            if experiment_id is not None:
                return comet_ml.ExistingExperiment(
                    previous_experiment=experiment_id,
                    workspace=workspace,
                    project_name=project_name,
                    **experiment_kwargs,
                )

            return comet_ml.Experiment(
                workspace=workspace,
                project_name=project_name,
                **experiment_kwargs,
            )

    def log_pipeline_parameters(self, pipeline, stages=None):
        """Iterates over the different stages in a pyspark PipelineModel object
        and logs the parameters to Comet.

        Parameters
        ----------
        pipeline : pyspark.ml.PipelineModel
            PipelineModel object
        stages : List[str], optional
            Names of the stages of the pipeline to include, by default None (logs all)

        Examples
        --------
        The pipeline model contains the annotators of Spark NLP, that were
        fitted to a dataframe.

        >>> logger.log_pipeline_parameters(pipeline_model)
        """
        self.experiment.log_other("pipeline_uid", pipeline.uid)
        if stages is None:
            stages = [s.name for s in pipeline.stages]

        for stage in pipeline.stages:
            if stage.name not in stages:
                continue

            params = stage.extractParamMap()
            for param, param_value in params.items():
                self.experiment.log_parameter(f"{stage.name}-{param.name}", param_value)

    def log_visualization(self, html, name="viz.html"):
        """Uploads a NER visualization from Spark NLP Display to comet.

        Parameters
        ----------
        html : str
            HTML of the spark NLP Display visualization
        name : str, optional
            Name for the visualization in comet, by default "viz.html"

        Examples
        --------
        This example has NER chunks (NER extracted by e.g. :class:`.NerDLModel`
        and converted by a :class:`.NerConverter`) extracted in the colum
        "ner_chunk".

        >>> from sparknlp_display import NerVisualizer
        >>> logger = CometLogger()
        >>> for idx, result in enumerate(results.collect()):
        ...     viz = NerVisualizer().display(
        ...         result=result,
        ...         label_col='ner_chunk',
        ...         document_col='document',
        ...         return_html=True
        ...     )
        ...     logger.log_visualization(viz, name=f'viz-{idx}.html')
        """
        self.log_asset_data(html, name)

    def log_metrics(self, metrics, step=None, epoch=None, prefix=None):
        """Submits logs of an evaluation metrics.

        Parameters
        ----------
        metrics : dict
            Dictionary with key value pairs corresponding to the measured metric
            and its value
        step : int, optional
            Used to associate a specific step, by default None
        epoch : int, optional
            Used to associate a specific epoch, by default None
        prefix : str, optional
            Name prefix for this metric, by default None. This can be used to
            identify for example different features by name.

        Examples
        --------
        In this example, sklearn is used to retrieve the metrics.

        >>> from sklearn.preprocessing import MultiLabelBinarizer
        >>> from sklearn.metrics import classification_report
        >>> prediction = model.transform(testDataset)
        >>> preds_df = prediction.select('labels', 'category.result').toPandas()

        >>> mlb = MultiLabelBinarizer()
        >>> y_true = mlb.fit_transform(preds_df['labels'])
        >>> y_pred = mlb.fit_transform(preds_df['result'])
        >>> report = classification_report(y_true, y_pred, output_dict=True)

        Iterate over the report and log the metrics:

        >>> for key, value in report.items():
        ...     logger.log_metrics(value, prefix=key)
        >>> logger.end()

        If you are using Spark NLP in a notebook, then you can display the
        metrics directly with

        >>> logger.experiment.display(tab='metrics')
        """
        self.experiment.log_metrics(metrics, step=step, epoch=epoch, prefix=prefix)

    def log_parameters(self, parameters, step=None):
        """Logs a dictionary (or dictionary-like object) of multiple parameters.

        Parameters
        ----------
        parameters : dict
            Parameters in a key : value form
        step : int, optional
            Used to associate a specific step, by default None
        """
        self.experiment.log_parameters(parameters, step=step)

    def log_completed_run(self, log_file_path):
        """Submit logs of training metrics after a run has completed.

        Parameters
        ----------
        log_file_path : str
            Path to log file containing training metrics
        """
        with open(log_file_path, "r") as f:
            stats = f.read().splitlines()

        self._parse_log_entry(stats)
        self.experiment.log_other("log_file_path", log_file_path)

    def log_asset(self, asset_path, metadata=None, step=None):
        """Uploads an asset to comet.

        Parameters
        ----------
        asset_path : str
            Path to the asset
        metadata : str, optional
            Some additional data to attach to the asset. Must be a
            JSON-encodable dict, by default None
        step : int, optional
            Used to associate a specific step, by default None
        """
        self.experiment.log_asset(asset_path, metadata=metadata, step=step)

    def log_asset_data(self, asset, name, overwrite=False, metadata=None, step=None):
        """Uploads the data given to comet (str, binary, or JSON).

        Parameters
        ----------
        asset : str or bytes or dict
            Data to be saved as asset
        name : str
            A custom file name to be displayed
        overwrite : bool, optional
            If True will overwrite all existing assets with the same name, by
            default False
        metadata : dict, optional
            Some additional data to attach to the asset data.
            Must be a JSON-encodable dict, by default None
        step : int, optional
            Used to associate a specific step, by default None
        """
        self.experiment.log_asset_data(
            asset, name, overwrite=overwrite, metadata=metadata, step=step
        )

    def monitor(self, logdir, model, interval=10):
        """Monitors the training of the model and submits logs to comet, given
        by an interval.

        To log a Spark NLP annotator, it will need an "outputLogPath" parameter, as the
        CometLogger reads the log file generated during the training process.

        If you are not able to monitor the live training, you can still log the training
        at the end with :meth:`.log_completed_run`.

        Parameters
        ----------
        logdir : str
            Path to the output of the logs
        model : AnnotatorApproach
            Annotator to monitor
        interval : int, optional
            Interval for refreshing, by default 10
        """
        self._watch_file = True
        self.experiment.log_other("model_uid", model.uid)
        self.thread = threading.Thread(
            target=self._monitor_log_file,
            args=(
                os.path.join(logdir, f"{model.uid}.log"),
                interval,
            ),
            # Daemon thread: if the user never calls end(), the watcher must
            # not block interpreter shutdown (it otherwise loops forever).
            daemon=True,
        )
        self.thread.start()

    def _file_watcher(self, filename, interval):
        """Generator that yields lines from the model log file.

        Parameters
        ----------
        filename : str
            Path to model log file
        interval : int
            Time (seconds) to wait in between checking for file updates

        Yields
        ------
        str
            A single line from the file
        """
        # Context manager guarantees the file handle is released even if the
        # generator is abandoned or an exception propagates.
        with open(filename) as fp:
            line = ""
            while self._watch_file:
                partial_line = fp.readline()
                if len(partial_line) != 0:
                    line += partial_line
                    if line.endswith("\n"):
                        yield line
                        line = ""
                else:
                    time.sleep(interval)

    def _monitor_log_file(self, filename, interval):
        """Thread target: waits for the log file, then streams and parses it."""
        # Wait for file to be created:
        while not os.path.exists(filename) and self._watch_file:
            time.sleep(interval)

        # end() may have been called while we were still waiting.
        if not self._watch_file:
            return

        watcher = self._file_watcher(filename, interval)
        for line in watcher:
            lines = line.split("\n")
            self._parse_log_entry(lines)

    def _convert_log_entry_to_dict(self, log_entries):
        """Turns ``["name: value", ...]`` entries into ``{name: float(value)}``."""
        output_dict = {}
        for entry in log_entries:
            # maxsplit=1 so a stray colon in the value part cannot cause an
            # unpacking error; the value must still parse as a float.
            key, value = entry.strip().split(":", 1)
            output_dict[key.strip()] = float(value)

        return output_dict

    def _parse_run_metrics(self, parts):
        """Extracts ``(metrics_dict, epoch)`` from a split "Epoch i/n" log line."""
        epoch_str, ratio = parts[0].split(" ", 1)
        epoch, total = ratio.split("/", 1)

        # parts[1] is the elapsed-time field; the metrics follow it.
        metrics = parts[2:]
        formatted_metrics = self._convert_log_entry_to_dict(metrics)

        return formatted_metrics, epoch

    def _parse_run_parameters(self, parts):
        """Extracts the run parameters from a split "Training started" log line."""
        parameters = parts[2:]
        formatted_parameters = self._convert_log_entry_to_dict(parameters)
        return formatted_parameters

    def _parse_log_entry(self, lines):
        """Parses raw log lines and forwards parameters/metrics to Comet."""
        for line in lines:
            # Split on the " - " field separator (not a bare "-"), so negative
            # metric values and hyphenated tokens do not shatter the fields.
            parts = line.split(" - ")
            if line.startswith("Training started"):
                parameters = self._parse_run_parameters(parts)
                self.log_parameters(parameters)

            elif line.startswith("Epoch"):
                metrics, epoch = self._parse_run_metrics(parts)
                self.log_metrics(metrics, step=int(epoch), epoch=int(epoch))

    def end(self):
        """Ends the experiment and the logger. Submits all outstanding logs to
        comet.
        """
        self._watch_file = False
        # Join the watcher before closing the experiment so the last parsed
        # log lines are submitted rather than dropped.
        if self.thread:
            self.thread.join(timeout=self._monitor_thread_timeout)
        self.experiment.end()
@@ -0,0 +1,16 @@
1
+ # Copyright 2017-2025 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Module to read various types of documents into chunks"""
15
+ from sparknlp.partition.partition import *
16
+ from sparknlp.partition.partition_transformer import *