spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/base/doc2_chunk.py
@@ -0,0 +1,169 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for Doc2Chunk."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+
+ from sparknlp.internal import AnnotatorTransformer
+
+ from sparknlp.common import AnnotatorProperties, AnnotatorType
+
+
+ class Doc2Chunk(AnnotatorTransformer, AnnotatorProperties):
+     """Converts ``DOCUMENT`` type annotations into ``CHUNK`` type with the
+     contents of a ``chunkCol``.
+
+     Chunk text must be contained within the input ``DOCUMENT``. The chunk
+     column may be either ``StringType`` or ``ArrayType[StringType]`` (set with
+     :meth:`.setIsArray`). Useful for annotators that require a ``CHUNK`` type
+     input.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     chunkCol
+         Column that contains the string. Must be part of DOCUMENT
+     startCol
+         Column that has a reference of where the chunk begins
+     startColByTokenIndex
+         Whether start column is prepended by whitespace tokens
+     isArray
+         Whether the chunkCol is an array of strings, by default False
+     failOnMissing
+         Whether to fail the job if a chunk is not found within the document.
+         Return empty otherwise
+     lowerCase
+         Whether to lowercase the text for matching
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.common import *
+     >>> from sparknlp.annotator import *
+     >>> from sparknlp.training import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
+     >>> chunkAssembler = Doc2Chunk() \\
+     ...     .setInputCols("document") \\
+     ...     .setChunkCol("target") \\
+     ...     .setOutputCol("chunk") \\
+     ...     .setIsArray(True)
+     >>> data = spark.createDataFrame([[
+     ...     "Spark NLP is an open-source text processing library for advanced natural language processing.",
+     ...     ["Spark NLP", "text processing library", "natural language processing"]
+     ... ]]).toDF("text", "target")
+     >>> pipeline = Pipeline().setStages([documentAssembler, chunkAssembler]).fit(data)
+     >>> result = pipeline.transform(data)
+     >>> result.selectExpr("chunk.result", "chunk.annotatorType").show(truncate=False)
+     +-----------------------------------------------------------------+---------------------+
+     |result                                                           |annotatorType        |
+     +-----------------------------------------------------------------+---------------------+
+     |[Spark NLP, text processing library, natural language processing]|[chunk, chunk, chunk]|
+     +-----------------------------------------------------------------+---------------------+
+
+     See Also
+     --------
+     Chunk2Doc : for converting `CHUNK` annotations to `DOCUMENT`
+     """
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     chunkCol = Param(Params._dummy(), "chunkCol", "column that contains string. Must be part of DOCUMENT", typeConverter=TypeConverters.toString)
+     startCol = Param(Params._dummy(), "startCol", "column that has a reference of where chunk begins", typeConverter=TypeConverters.toString)
+     startColByTokenIndex = Param(Params._dummy(), "startColByTokenIndex", "whether start col is by whitespace tokens", typeConverter=TypeConverters.toBoolean)
+     isArray = Param(Params._dummy(), "isArray", "whether the chunkCol is an array of strings", typeConverter=TypeConverters.toBoolean)
+     failOnMissing = Param(Params._dummy(), "failOnMissing", "whether to fail the job if a chunk is not found within document. return empty otherwise", typeConverter=TypeConverters.toBoolean)
+     lowerCase = Param(Params._dummy(), "lowerCase", "whether to lowercase the text for matching", typeConverter=TypeConverters.toBoolean)
+     name = "Doc2Chunk"
+
+     @keyword_only
+     def __init__(self):
+         super(Doc2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.Doc2Chunk")
+         self._setDefault(
+             isArray=False
+         )
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setChunkCol(self, value):
+         """Sets the column that contains the string. Must be part of DOCUMENT.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Chunk Column
+         """
+         return self._set(chunkCol=value)
+
+     def setIsArray(self, value):
+         """Sets whether the chunkCol is an array of strings.
+
+         Parameters
+         ----------
+         value : bool
+             Whether the chunkCol is an array of strings
+         """
+         return self._set(isArray=value)
+
+     def setStartCol(self, value):
+         """Sets the column that has a reference of where the chunk begins.
+
+         Parameters
+         ----------
+         value : str
+             Name of the reference column
+         """
+         return self._set(startCol=value)
+
+     def setStartColByTokenIndex(self, value):
+         """Sets whether the start column is prepended by whitespace tokens.
+
+         Parameters
+         ----------
+         value : bool
+             Whether the start column is prepended by whitespace tokens
+         """
+         return self._set(startColByTokenIndex=value)
+
+     def setFailOnMissing(self, value):
+         """Sets whether to fail the job if a chunk is not found within the
+         document. Return empty otherwise.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to fail the job on missing chunks
+         """
+         return self._set(failOnMissing=value)
+
+     def setLowerCase(self, value):
+         """Sets whether to lowercase the text for matching.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to lowercase the text for matching
+         """
+         return self._set(lowerCase=value)
+
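The example in the hunk above covers the ``ArrayType`` variant of ``Doc2Chunk``. For contrast, a minimal sketch of the single-string variant, assuming an active SparkSession named ``spark`` and an illustrative ``target`` column (``isArray`` stays at its default ``False``):

>>> import sparknlp
>>> from sparknlp.base import DocumentAssembler, Doc2Chunk
>>> from pyspark.ml import Pipeline
>>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
>>> chunkAssembler = Doc2Chunk() \
...     .setInputCols("document") \
...     .setChunkCol("target") \
...     .setOutputCol("chunk")
>>> data = spark.createDataFrame(
...     [["Spark NLP is an open-source text processing library.", "Spark NLP"]]
... ).toDF("text", "target")
>>> pipeline = Pipeline().setStages([documentAssembler, chunkAssembler]).fit(data)
>>> pipeline.transform(data).selectExpr("chunk.result").show(truncate=False)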
sparknlp/base/document_assembler.py
@@ -0,0 +1,164 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the DocumentAssembler."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class DocumentAssembler(AnnotatorTransformer):
+     """Prepares data into a format that is processable by Spark NLP.
+
+     This is the entry point for every Spark NLP pipeline. The
+     `DocumentAssembler` reads ``String`` columns. Additionally,
+     :meth:`.setCleanupMode` can be used to pre-process the
+     text (Default: ``disabled``). For possible options please refer to the
+     Parameters section.
+
+     For more extended examples on document pre-processing see the
+     `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``NONE``               ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     inputCol
+         Input column name
+     outputCol
+         Output column name
+     idCol
+         Name of String type column for row id.
+     metadataCol
+         Name of Map type column with metadata information
+     cleanupMode
+         How to clean up the document, by default ``disabled``.
+         Possible values: ``disabled, inplace, inplace_full, shrink, shrink_full,
+         each, each_full, delete_full``
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from pyspark.ml import Pipeline
+     >>> data = spark.createDataFrame([["Spark NLP is an open-source text processing library."]]).toDF("text")
+     >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
+     >>> result = documentAssembler.transform(data)
+     >>> result.select("document").show(truncate=False)
+     +----------------------------------------------------------------------------------------------+
+     |document                                                                                      |
+     +----------------------------------------------------------------------------------------------+
+     |[[document, 0, 51, Spark NLP is an open-source text processing library., [sentence -> 0], []]]|
+     +----------------------------------------------------------------------------------------------+
+     >>> result.select("document").printSchema()
+     root
+      |-- document: array (nullable = true)
+      |    |-- element: struct (containsNull = true)
+      |    |    |-- annotatorType: string (nullable = true)
+      |    |    |-- begin: integer (nullable = false)
+      |    |    |-- end: integer (nullable = false)
+      |    |    |-- result: string (nullable = true)
+      |    |    |-- metadata: map (nullable = true)
+      |    |    |    |-- key: string
+      |    |    |    |-- value: string (valueContainsNull = true)
+      |    |    |-- embeddings: array (nullable = true)
+      |    |    |    |-- element: float (containsNull = false)
+     """
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
+     outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
+     idCol = Param(Params._dummy(), "idCol", "column for setting an id to such string in row", typeConverter=TypeConverters.toString)
+     metadataCol = Param(Params._dummy(), "metadataCol", "String to String map column to use as metadata", typeConverter=TypeConverters.toString)
+     cleanupMode = Param(Params._dummy(), "cleanupMode", "possible values: disabled, inplace, inplace_full, shrink, shrink_full, each, each_full, delete_full", typeConverter=TypeConverters.toString)
+     name = 'DocumentAssembler'
+
+     @keyword_only
+     def __init__(self):
+         super(DocumentAssembler, self).__init__(classname="com.johnsnowlabs.nlp.DocumentAssembler")
+         self._setDefault(outputCol="document", cleanupMode='disabled')
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setInputCol(self, value):
+         """Sets input column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the input column
+         """
+         return self._set(inputCol=value)
+
+     def setOutputCol(self, value):
+         """Sets output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Output Column
+         """
+         return self._set(outputCol=value)
+
+     def setIdCol(self, value):
+         """Sets name of String type column for row id.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Id Column
+         """
+         return self._set(idCol=value)
+
+     def setMetadataCol(self, value):
+         """Sets name for Map type column with metadata information.
+
+         Parameters
+         ----------
+         value : str
+             Name of the metadata column
+         """
+         return self._set(metadataCol=value)
+
+     def setCleanupMode(self, value):
+         """Sets how to clean up the document, by default disabled.
+         Possible values: ``disabled, inplace, inplace_full, shrink, shrink_full,
+         each, each_full, delete_full``
+
+         Parameters
+         ----------
+         value : str
+             Cleanup mode
+         """
+         if value.strip().lower() not in ['disabled', 'inplace', 'inplace_full', 'shrink', 'shrink_full', 'each', 'each_full', 'delete_full']:
+             raise Exception("Cleanup mode possible values: disabled, inplace, inplace_full, shrink, shrink_full, each, each_full, delete_full")
+         return self._set(cleanupMode=value)
+
+     def getOutputCol(self):
+         """Gets output column name of annotations."""
+         return self.getOrDefault(self.outputCol)
+
+     # def getInputCol(self):
+     #     """Gets current column names of input annotations."""
+     #     return self.getOrDefault(self.inputCol)
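The example in the hunk above leaves ``cleanupMode`` at ``disabled``. A short sketch of ``setCleanupMode`` with the ``shrink`` mode, assuming an active SparkSession ``spark`` (per the mode list above; ``shrink`` is expected to collapse new lines, tabs, and repeated spaces):

>>> from sparknlp.base import DocumentAssembler
>>> documentAssembler = DocumentAssembler() \
...     .setInputCol("text") \
...     .setOutputCol("document") \
...     .setCleanupMode("shrink")
>>> data = spark.createDataFrame([["Spark NLP  is an \n\n open-source\tlibrary."]]).toDF("text")
>>> documentAssembler.transform(data).selectExpr("document.result").show(truncate=False)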
sparknlp/base/embeddings_finisher.py
@@ -0,0 +1,201 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the EmbeddingsFinisher."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class EmbeddingsFinisher(AnnotatorTransformer):
+     """Extracts embeddings from Annotations into a more easily usable form.
+
+     This is useful, for example, for:
+
+     - WordEmbeddings,
+     - Transformer based embeddings such as BertEmbeddings,
+     - SentenceEmbeddings and
+     - ChunkEmbeddings, etc.
+
+     By using ``EmbeddingsFinisher`` you can easily transform your embeddings
+     into arrays of floats or Vectors which are compatible with Spark ML
+     functions such as LDA, K-means, Random Forest classifier or any other
+     function that requires a ``featuresCol``.
+
+     For more extended examples see the
+     `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``EMBEDDINGS``         ``NONE``
+     ====================== ======================
+
+     Parameters
+     ----------
+     inputCols
+         Names of input annotation columns containing embeddings
+     outputCols
+         Names of finished output columns
+     cleanAnnotations
+         Whether to remove all the existing annotation columns, by default False
+     outputAsVector
+         Whether to output the embeddings as Vectors instead of arrays,
+         by default False
+
+     Examples
+     --------
+     First extract embeddings.
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols("document") \\
+     ...     .setOutputCol("token")
+     >>> normalizer = Normalizer() \\
+     ...     .setInputCols("token") \\
+     ...     .setOutputCol("normalized")
+     >>> stopwordsCleaner = StopWordsCleaner() \\
+     ...     .setInputCols("normalized") \\
+     ...     .setOutputCol("cleanTokens") \\
+     ...     .setCaseSensitive(False)
+     >>> gloveEmbeddings = WordEmbeddingsModel.pretrained() \\
+     ...     .setInputCols("document", "cleanTokens") \\
+     ...     .setOutputCol("embeddings") \\
+     ...     .setCaseSensitive(False)
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols("embeddings") \\
+     ...     .setOutputCols("finished_sentence_embeddings") \\
+     ...     .setOutputAsVector(True) \\
+     ...     .setCleanAnnotations(False)
+     >>> data = spark.createDataFrame([["Spark NLP is an open-source text processing library."]]) \\
+     ...     .toDF("text")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     normalizer,
+     ...     stopwordsCleaner,
+     ...     gloveEmbeddings,
+     ...     embeddingsFinisher
+     ... ]).fit(data)
+     >>> result = pipeline.transform(data)
+
+     Show results.
+
+     >>> resultWithSize = result.selectExpr("explode(finished_sentence_embeddings) as embeddings")
+     >>> resultWithSize.show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                      embeddings|
+     +--------------------------------------------------------------------------------+
+     |[0.1619900017976761,0.045552998781204224,-0.03229299932718277,-0.685609996318...|
+     |[-0.42416998744010925,1.1378999948501587,-0.5717899799346924,-0.5078899860382...|
+     |[0.08621499687433243,-0.15772999823093414,-0.06067200005054474,0.395359992980...|
+     |[-0.4970499873161316,0.7164199948310852,0.40119001269340515,-0.05761000141501...|
+     |[-0.08170200139284134,0.7159299850463867,-0.20677000284194946,0.0295659992843...|
+     +--------------------------------------------------------------------------------+
+
+     See Also
+     --------
+     Finisher : for finishing Strings
+     """
+
+     inputCols = Param(Params._dummy(), "inputCols", "name of input annotation cols containing embeddings", typeConverter=TypeConverters.toListString)
+     outputCols = Param(Params._dummy(), "outputCols", "name of EmbeddingsFinisher output cols", typeConverter=TypeConverters.toListString)
+     cleanAnnotations = Param(Params._dummy(), "cleanAnnotations", "whether to remove all the existing annotation columns", typeConverter=TypeConverters.toBoolean)
+     outputAsVector = Param(Params._dummy(), "outputAsVector", "if enabled it will output the embeddings as Vectors instead of arrays", typeConverter=TypeConverters.toBoolean)
+
+     name = "EmbeddingsFinisher"
+
+     @keyword_only
+     def __init__(self):
+         super(EmbeddingsFinisher, self).__init__(classname="com.johnsnowlabs.nlp.EmbeddingsFinisher")
+         self._setDefault(
+             cleanAnnotations=False,
+             outputAsVector=False,
+             outputCols=[]
+         )
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setInputCols(self, *value):
+         """Sets names of input annotation columns containing embeddings.
+
+         Parameters
+         ----------
+         *value : str
+             Input columns for the annotator
+         """
+
+         if len(value) == 1 and type(value[0]) == list:
+             return self._set(inputCols=value[0])
+         else:
+             return self._set(inputCols=list(value))
+
+     def setOutputCols(self, *value):
+         """Sets names of finished output columns.
+
+         Parameters
+         ----------
+         *value : List[str]
+             Output columns for the annotator
+         """
+
+         if len(value) == 1 and type(value[0]) == list:
+             return self._set(outputCols=value[0])
+         else:
+             return self._set(outputCols=list(value))
+
+     def setCleanAnnotations(self, value):
+         """Sets whether to remove all the existing annotation columns, by default
+         False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to remove all the existing annotation columns
+         """
+
+         return self._set(cleanAnnotations=value)
+
+     def setOutputAsVector(self, value):
+         """Sets whether to output the embeddings as Vectors instead of arrays,
+         by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to output the embeddings as Vectors instead of arrays
+         """
+
+         return self._set(outputAsVector=value)
+
+     def getInputCols(self):
+         """Gets input column names of annotations."""
+         return self.getOrDefault(self.inputCols)
+
+     def getOutputCols(self):
+         """Gets output column names of annotations."""
+         if len(self.getOrDefault(self.outputCols)) == 0:
+             return ["finished_" + input_col for input_col in self.getInputCols()]
+         else:
+             return self.getOrDefault(self.outputCols)
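Since the docstring advertises compatibility with Spark ML stages that take a ``featuresCol``, a hedged sketch of clustering the finished Vectors with KMeans follows; it continues from the ``result`` DataFrame of the example above, and ``k=2`` and ``seed=42`` are arbitrary choices:

>>> from pyspark.ml.clustering import KMeans
>>> from pyspark.sql.functions import explode
>>> # one Vector per token after exploding the finished column
>>> vectors = result.select(explode("finished_sentence_embeddings").alias("features"))
>>> kmeans = KMeans(k=2, featuresCol="features", seed=42)
>>> model = kmeans.fit(vectors)
>>> model.transform(vectors).select("prediction").show()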