spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
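
The most consequential change in this list is structural: the flat 2.6.x modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/embeddings.py, and friends, all removed above) are replaced by packages such as sparknlp/annotator/ and sparknlp/base/ that give each annotator its own module. A minimal sketch of what this means for user code, assuming (as the added __init__.py files suggest) that the new packages re-export the public classes; the import paths below are taken from the file list, everything else is illustrative:

    # Old flat-module imports should still resolve against the 6.x layout,
    # assuming the package __init__.py files re-export the public classes.
    from sparknlp.base import DocumentAssembler   # was sparknlp/base.py, now sparknlp/base/
    from sparknlp.annotator import Tokenizer      # was sparknlp/annotator.py, now sparknlp/annotator/

    # New in 6.x: per-annotator module paths, e.g. the Word2Vec module whose
    # full diff is reproduced below.
    from sparknlp.annotator.embeddings.word2vec import Word2VecApproach, Word2VecModel
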
sparknlp/annotator/embeddings/word2vec.py (new file; entry 122 above)
@@ -0,0 +1,353 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for Word2Vec."""
+
+
+ from sparknlp.common import *
+
+
+ class Word2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperties):
+     """Trains a Word2Vec model that creates vector representations of words in a
+     text corpus.
+
+     The algorithm first constructs a vocabulary from the corpus and then learns
+     vector representations of words in the vocabulary. The vector representation
+     can be used as features in natural language processing and machine learning
+     algorithms.
+
+     We use the Word2Vec implementation in Spark ML. It uses the skip-gram model
+     in our implementation and a hierarchical softmax method to train the model.
+     The variable names in the implementation match the original C implementation.
+
+     For instantiated/pretrained models, see :class:`.Word2VecModel`.
+
+     For available pretrained models, please see the `Models Hub <https://sparknlp.org/models>`__.
+
+     ====================== =======================
+     Input Annotation types Output Annotation type
+     ====================== =======================
+     ``TOKEN``              ``WORD_EMBEDDINGS``
+     ====================== =======================
+
+     Parameters
+     ----------
+     vectorSize
+         The dimension of codes after transforming from words (> 0), by default
+         100
+     windowSize
+         The window size (context words from [-window, window]) (> 0), by default
+         5
+     numPartitions
+         Number of partitions for sentences of words (> 0), by default 1
+     minCount
+         The minimum number of times a token must appear to be included in the
+         word2vec model's vocabulary (>= 0), by default 1
+     maxSentenceLength
+         Maximum length (in words) of each sentence in the input data. Any
+         sentence longer than this threshold will be divided into chunks of
+         up to this size (> 0), by default 1000
+     stepSize
+         Step size (learning rate) to be used for each iteration of optimization
+         (> 0), by default 0.025
+     maxIter
+         Maximum number of iterations (>= 0), by default 1
+     seed
+         Random seed, by default 44
+
+
+     References
+     ----------
+     For the original C implementation, see https://code.google.com/p/word2vec/
+
+     For the research paper, see `Efficient Estimation of Word Representations in
+     Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
+     Representations of Words and Phrases and their Compositionality
+     <https://arxiv.org/pdf/1310.4546v1.pdf>`__.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = Word2VecApproach() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...         documentAssembler,
+     ...         tokenizer,
+     ...         embeddings
+     ...     ])
+     >>> path = "sherlockholmes.txt"
+     >>> dataset = spark.read.text(path).toDF("text")
+     >>> pipelineModel = pipeline.fit(dataset)
+     """
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+     vectorSize = Param(Params._dummy(),
+                        "vectorSize",
+                        "the dimension of codes after transforming from words (> 0)",
+                        typeConverter=TypeConverters.toInt)
+
+     windowSize = Param(Params._dummy(),
+                        "windowSize",
+                        "the window size (context words from [-window, window]) (> 0)",
+                        typeConverter=TypeConverters.toInt)
+
+     numPartitions = Param(Params._dummy(),
+                           "numPartitions",
+                           "number of partitions for sentences of words (> 0)",
+                           typeConverter=TypeConverters.toInt)
+
+     minCount = Param(Params._dummy(),
+                      "minCount",
+                      "the minimum number of times a token must " +
+                      "appear to be included in the word2vec model's vocabulary (>= 0)",
+                      typeConverter=TypeConverters.toInt)
+
+     maxSentenceLength = Param(Params._dummy(),
+                               "maxSentenceLength",
+                               "Maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will " +
+                               "be divided into chunks of up to this size (> 0)",
+                               typeConverter=TypeConverters.toInt)
+
+     stepSize = Param(Params._dummy(),
+                      "stepSize",
+                      "Step size (learning rate) to be used for each iteration of optimization (> 0)",
+                      typeConverter=TypeConverters.toFloat)
+
+     maxIter = Param(Params._dummy(),
+                     "maxIter",
+                     "maximum number of iterations (>= 0)",
+                     typeConverter=TypeConverters.toInt)
+
+     seed = Param(Params._dummy(),
+                  "seed",
+                  "Random seed",
+                  typeConverter=TypeConverters.toInt)
+
+     def setVectorSize(self, vectorSize):
+         """
+         Sets vector size (default: 100).
+         """
+         return self._set(vectorSize=vectorSize)
+
+     def setWindowSize(self, windowSize):
+         """
+         Sets window size (default: 5).
+         """
+         return self._set(windowSize=windowSize)
+
+     def setStepSize(self, stepSize):
+         """
+         Sets initial learning rate (default: 0.025).
+         """
+         return self._set(stepSize=stepSize)
+
+     def setNumPartitions(self, numPartitions):
+         """
+         Sets number of partitions (default: 1). Use a small number for
+         accuracy.
+         """
+         return self._set(numPartitions=numPartitions)
+
+     def setMaxIter(self, numIterations):
+         """
+         Sets number of iterations (default: 1), which should be smaller
+         than or equal to the number of partitions.
+         """
+         return self._set(maxIter=numIterations)
+
+     def setSeed(self, seed):
+         """
+         Sets random seed.
+         """
+         return self._set(seed=seed)
+
+     def setMinCount(self, minCount):
+         """
+         Sets minCount, the minimum number of times a token must appear
+         to be included in the word2vec model's vocabulary (default: 1).
+         """
+         return self._set(minCount=minCount)
+
+     def setMaxSentenceLength(self, maxSentenceLength):
+         """
+         Sets the maximum length (in words) of each sentence in the input
+         data (default: 1000). Any sentence longer than this threshold is
+         divided into chunks of up to this size (> 0).
+         """
+         return self._set(maxSentenceLength=maxSentenceLength)
+
+     @keyword_only
+     def __init__(self):
+         super(Word2VecApproach, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.Word2VecApproach")
+         self._setDefault(
+             vectorSize=100,
+             windowSize=5,
+             numPartitions=1,
+             minCount=1,
+             maxSentenceLength=1000,
+             stepSize=0.025,
+             maxIter=1,
+             seed=44
+         )
+
+     def _create_model(self, java_model):
+         return Word2VecModel(java_model=java_model)
+
+
+ class Word2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
+     """Word2Vec model that creates vector representations of words in a text
+     corpus.
+
+     The algorithm first constructs a vocabulary from the corpus and then learns
+     vector representations of words in the vocabulary. The vector representation
+     can be used as features in natural language processing and machine learning
+     algorithms.
+
+     We use the Word2Vec implementation in Spark ML. It uses the skip-gram model
+     in our implementation and a hierarchical softmax method to train the model.
+     The variable names in the implementation match the original C implementation.
+
+     This is the instantiated model of the :class:`.Word2VecApproach`. For
+     training your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = Word2VecModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("embeddings")
+
+     The default model is `"word2vec_gigaword_300"`, if no name is provided.
+
+     ====================== =======================
+     Input Annotation types Output Annotation type
+     ====================== =======================
+     ``TOKEN``              ``WORD_EMBEDDINGS``
+     ====================== =======================
+
+     Parameters
+     ----------
+     vectorSize
+         The dimension of codes after transforming from words (> 0), by default
+         100
+
+     References
+     ----------
+     For the original C implementation, see https://code.google.com/p/word2vec/
+
+     For the research paper, see `Efficient Estimation of Word Representations in
+     Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
+     Representations of Words and Phrases and their Compositionality
+     <https://arxiv.org/pdf/1310.4546v1.pdf>`__.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = Word2VecModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...|
+     +--------------------------------------------------------------------------------+
+     """
+     name = "Word2VecModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+     vectorSize = Param(Params._dummy(),
+                        "vectorSize",
+                        "the dimension of codes after transforming from words (> 0)",
+                        typeConverter=TypeConverters.toInt)
+
+     def setVectorSize(self, vectorSize):
+         """
+         Sets vector size (default: 100).
+         """
+         return self._set(vectorSize=vectorSize)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.Word2VecModel", java_model=None):
+         super(Word2VecModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             vectorSize=100
+         )
+
+     @staticmethod
+     def pretrained(name="word2vec_gigaword_300", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "word2vec_gigaword_300"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         Word2VecModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(Word2VecModel, name, lang, remote_loc)
+
+     def getVectors(self):
+         """
+         Returns the vector representation of the words as a dataframe
+         with two fields, word and vector.
+         """
+         return self._call_java("getVectors")
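
Beyond producing embedding annotations, the new Word2VecModel exposes the learned lookup table directly through getVectors(). A short sketch of how that might be used, continuing the Word2VecApproach docstring example above (the stage index is an assumption about that pipeline's layout):

    # Fit the pipeline from the docstring example, then inspect the vectors.
    pipelineModel = pipeline.fit(dataset)
    word2vec = pipelineModel.stages[-1]   # fitted Word2VecModel is the last stage
    vectors = word2vec.getVectors()       # DataFrame with two fields: word, vector
    vectors.show(5, truncate=False)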