spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
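The headline change in the file list is structural: the monolithic modules of 2.6.3rc1 (sparknlp/annotator.py, base.py, common.py, embeddings.py, internal.py, pretrained.py, storage.py, training.py) are deleted and replaced by the sparknlp.annotator, sparknlp.base, sparknlp.common, sparknlp.internal, sparknlp.pretrained and sparknlp.training packages, plus new audio, cv, partition and reader subpackages. A minimal import sketch, assuming (as the sizable package __init__.py files listed above suggest) that 6.2.1 keeps re-exporting the public classes from the package roots:

    import sparknlp
    from sparknlp.base import DocumentAssembler    # formerly defined in sparknlp/base.py
    from sparknlp.annotator import Tokenizer       # formerly defined in sparknlp/annotator.py

    # New in 6.x: annotators are also importable from their dedicated submodules.
    from sparknlp.annotator.dependency.dependency_parser import DependencyParserModel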
sparknlp/annotator/date2_chunk.py
@@ -0,0 +1,88 @@
+ # Copyright 2017-2023 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for Date2Chunk."""
+
+ from sparknlp.common import *
+
+
+ class Date2Chunk(AnnotatorModel):
+     """Converts ``DATE`` type Annotations to ``CHUNK`` type.
+
+     This can be useful if the annotators following DateMatcher or MultiDateMatcher require ``CHUNK`` types.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DATE``               ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     entityName
+         Entity name for the metadata, by default ``"DATE"``.
+
+     Examples
+     --------
+     >>> from pyspark.ml import Pipeline
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> date = DateMatcher() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("date")
+     >>> date2Chunk = Date2Chunk() \\
+     ...     .setInputCols(["date"]) \\
+     ...     .setOutputCol("date_chunk")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     date,
+     ...     date2Chunk
+     ... ])
+     >>> data = spark.createDataFrame([["Omicron is a new variant of COVID-19, which the World Health Organization designated a variant of concern on Nov. 26, 2021/26/11."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("date_chunk").show(1, truncate=False)
+     +----------------------------------------------------+
+     |date_chunk                                          |
+     +----------------------------------------------------+
+     |[{chunk, 118, 121, 2021/01/01, {sentence -> 0}, []}]|
+     +----------------------------------------------------+
+     """
+     name = "Date2Chunk"
+
+     inputAnnotatorTypes = [AnnotatorType.DATE]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     @keyword_only
+     def __init__(self):
+         super(Date2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Date2Chunk")
+         self._setDefault(entityName="DATE")
+
+     entityName = Param(Params._dummy(), "entityName", "Entity name for the metadata",
+                        TypeConverters.toString)
+
+     def setEntityName(self, name):
+         """Sets the entity name for the metadata, by default ``"DATE"``.
+
+         Parameters
+         ----------
+         name : str
+             Entity name for the metadata
+         """
+         self._set(entityName=name)
+         return self
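The entityName parameter above is Date2Chunk's only configuration knob. A minimal sketch of overriding it, reusing the pipeline from the docstring example; the "date_ref" label is a hypothetical choice, and the assumption is that the configured name surfaces in each output chunk's metadata:

    date2Chunk = Date2Chunk() \
        .setInputCols(["date"]) \
        .setOutputCol("date_chunk") \
        .setEntityName("date_ref")  # replaces the "DATE" default set in __init__

    # Inspect the metadata maps of the resulting chunks:
    result.selectExpr("explode(date_chunk.metadata) as metadata").show(truncate=False)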
sparknlp/annotator/dependency/__init__.py
@@ -0,0 +1,17 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Module of annotators for dependency parsing."""
+ from sparknlp.annotator.dependency.dependency_parser import *
+ from sparknlp.annotator.dependency.typed_dependency_parser import *
sparknlp/annotator/dependency/dependency_parser.py
@@ -0,0 +1,294 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the DependencyParser."""
+
+ from sparknlp.common import *
+
+
+ class DependencyParserApproach(AnnotatorApproach):
+     """Trains an unlabeled parser that finds grammatical relations between two
+     words in a sentence.
+
+     For instantiated/pretrained models, see :class:`.DependencyParserModel`.
+
+     The dependency parser provides information about word relationships. For example,
+     dependency parsing can tell you what the subjects and objects of a verb are,
+     as well as which words are modifying (describing) the subject. This can help
+     you find precise answers to specific questions.
+
+     The required training data can be set in two different ways (only one can be
+     chosen for a particular model):
+
+     - Dependency treebank in the
+       `Penn Treebank format <http://www.nltk.org/nltk_data/>`__ set with
+       ``setDependencyTreeBank``
+     - Dataset in the
+       `CoNLL-U format <https://universaldependencies.org/format.html>`__ set
+       with ``setConllU``
+
+     Apart from that, no additional training data is needed.
+
+     ======================== ======================
+     Input Annotation types   Output Annotation type
+     ======================== ======================
+     ``DOCUMENT, POS, TOKEN`` ``DEPENDENCY``
+     ======================== ======================
+
+     Parameters
+     ----------
+     dependencyTreeBank
+         Dependency treebank source files
+     conllU
+         Universal Dependencies source files
+     numberOfIterations
+         Number of iterations in training, converges to better accuracy,
+         by default 10
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> posTagger = PerceptronModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("pos")
+     >>> dependencyParserApproach = DependencyParserApproach() \\
+     ...     .setInputCols(["sentence", "pos", "token"]) \\
+     ...     .setOutputCol("dependency") \\
+     ...     .setDependencyTreeBank("src/test/resources/parser/unlabeled/dependency_treebank")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     tokenizer,
+     ...     posTagger,
+     ...     dependencyParserApproach
+     ... ])
+     >>> emptyDataSet = spark.createDataFrame([[""]]).toDF("text")
+     >>> pipelineModel = pipeline.fit(emptyDataSet)
+
+     Additional training data is not needed; the dependency parser relies on the
+     dependency treebank or CoNLL-U dataset only.
+
+     See Also
+     --------
+     TypedDependencyParserApproach : to extract labels for the dependencies
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.DEPENDENCY
+
+     dependencyTreeBank = Param(Params._dummy(),
+                                "dependencyTreeBank",
+                                "Dependency treebank source files",
+                                typeConverter=TypeConverters.identity)
+
+     conllU = Param(Params._dummy(),
+                    "conllU",
+                    "Universal Dependencies source files",
+                    typeConverter=TypeConverters.identity)
+
+     numberOfIterations = Param(Params._dummy(),
+                                "numberOfIterations",
+                                "Number of iterations in training, converges to better accuracy",
+                                typeConverter=TypeConverters.toInt)
+
+     @keyword_only
+     def __init__(self):
+         super(DependencyParserApproach,
+               self).__init__(classname="com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserApproach")
+         self._setDefault(numberOfIterations=10)
+
+     def setNumberOfIterations(self, value):
+         """Sets the number of iterations in training; more iterations converge to
+         better accuracy, by default 10.
+
+         Parameters
+         ----------
+         value : int
+             Number of iterations
+         """
+         return self._set(numberOfIterations=value)
+
+     def setDependencyTreeBank(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
+         """Sets dependency treebank source files.
+
+         Parameters
+         ----------
+         path : str
+             Path to the source files
+         read_as : str, optional
+             How to read the file, by default ReadAs.TEXT
+         options : dict, optional
+             Options to read the resource, by default {"key": "value"}
+         """
+         opts = options.copy()
+         return self._set(dependencyTreeBank=ExternalResource(path, read_as, opts))
+
+     def setConllU(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
+         """Sets Universal Dependencies source files.
+
+         Parameters
+         ----------
+         path : str
+             Path to the source files
+         read_as : str, optional
+             How to read the file, by default ReadAs.TEXT
+         options : dict, optional
+             Options to read the resource, by default {"key": "value"}
+         """
+         opts = options.copy()
+         return self._set(conllU=ExternalResource(path, read_as, opts))
+
+     def _create_model(self, java_model):
+         return DependencyParserModel(java_model=java_model)
+
+
+ class DependencyParserModel(AnnotatorModel):
+     """Unlabeled parser that finds a grammatical relation between two words in a
+     sentence.
+
+     The dependency parser provides information about word relationships. For example,
+     dependency parsing can tell you what the subjects and objects of a verb are,
+     as well as which words are modifying (describing) the subject. This can help
+     you find precise answers to specific questions.
+
+     This is the instantiated model of the :class:`.DependencyParserApproach`.
+     For training your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> dependencyParser = DependencyParserModel.pretrained() \\
+     ...     .setInputCols(["sentence", "pos", "token"]) \\
+     ...     .setOutputCol("dependency")
+
+
+     The default model is ``"dependency_conllu"``, if no name is provided.
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models>`__.
+
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb>`__.
+
+     ======================== ======================
+     Input Annotation types   Output Annotation type
+     ======================== ======================
+     ``DOCUMENT, POS, TOKEN`` ``DEPENDENCY``
+     ======================== ======================
+
+     Parameters
+     ----------
+     perceptron
+         Dependency parsing perceptron features
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> posTagger = PerceptronModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("pos")
+     >>> dependencyParser = DependencyParserModel.pretrained() \\
+     ...     .setInputCols(["sentence", "pos", "token"]) \\
+     ...     .setOutputCol("dependency")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     tokenizer,
+     ...     posTagger,
+     ...     dependencyParser
+     ... ])
+     >>> data = spark.createDataFrame([[
+     ...     "Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent " +
+     ...     "firm Federal Mogul."
+     ... ]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(arrays_zip(token.result, dependency.result)) as cols") \\
+     ...     .selectExpr("cols['0'] as token", "cols['1'] as dependency").show(8, truncate=False)
+     +------------+------------+
+     |token       |dependency  |
+     +------------+------------+
+     |Unions      |ROOT        |
+     |representing|workers     |
+     |workers     |Unions      |
+     |at          |Turner      |
+     |Turner      |workers     |
+     |Newall      |say         |
+     |say         |Unions      |
+     |they        |disappointed|
+     +------------+------------+
+
+     See Also
+     --------
+     TypedDependencyParserModel : to extract labels for the dependencies
+     """
+     name = "DependencyParserModel"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.DEPENDENCY
+
+     perceptron = Param(Params._dummy(),
+                        "perceptron",
+                        "Dependency parsing perceptron features",
+                        typeConverter=TypeConverters.identity)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserModel", java_model=None):
+         super(DependencyParserModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     @staticmethod
+     def pretrained(name="dependency_conllu", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "dependency_conllu"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         DependencyParserModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(DependencyParserModel, name, lang, remote_loc)
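For quick, driver-side experimentation with the fitted pipeline from the DependencyParserModel example, a LightPipeline sketch (LightPipeline lives in sparknlp.base in both the old and new layouts; the input sentence reuses the example text, and the exact output shape is an assumption):

    from sparknlp.base import LightPipeline

    light = LightPipeline(pipeline.fit(data))
    annotations = light.annotate("Unions representing workers at Turner Newall say they are disappointed.")
    # Pair each token with the head token the parser chose for it,
    # mirroring the arrays_zip projection shown in the docstring.
    print(list(zip(annotations["token"], annotations["dependency"])))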