spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
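The listing above shows the 2.6.x flat modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py) replaced by the sparknlp.annotator, sparknlp.base and sparknlp.common subpackages, and the bundled .pyc artifacts removed from the wheel. The sketch below is not part of the diff; assuming the subpackages listed above keep re-exporting their classes at the top level, it only illustrates what a quick post-upgrade smoke test could look like.

# Hypothetical post-upgrade smoke test (not part of the diff).
# Assumes the subpackages listed above re-export their classes at the top level.
#
#   pip install --upgrade spark-nlp==6.2.1
#
import sparknlp
from sparknlp.base import DocumentAssembler   # previously defined in the flat sparknlp/base.py
from sparknlp.annotator import Tokenizer      # now lives in sparknlp/annotator/token/tokenizer.py

spark = sparknlp.start()       # starts a Spark session with the matching Spark NLP jar
print(sparknlp.version())      # expected to report 6.2.1 after the upgrade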
sparknlp/annotator/dependency/typed_dependency_parser.py (new file)
@@ -0,0 +1,318 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the TypedDependencyParser."""
+
+
+ from sparknlp.common import *
+
+
+ class TypedDependencyParserApproach(AnnotatorApproach):
+     """Labeled parser that finds a grammatical relation between two words in a
+     sentence. Its input is either a CoNLL2009 or ConllU dataset.
+
+     For instantiated/pretrained models, see
+     :class:`.TypedDependencyParserModel`.
+
+     Dependency parsers provide information about word relationship. For example,
+     dependency parsing can tell you what the subjects and objects of a verb are,
+     as well as which words are modifying (describing) the subject. This can help
+     you find precise answers to specific questions.
+
+     The parser requires the dependant tokens beforehand with e.g.
+     DependencyParser. The required training data can be set in two different
+     ways (only one can be chosen for a particular model):
+
+     - Dataset in the `CoNLL 2009 format
+       <https://ufal.mff.cuni.cz/conll2009-st/trial-data.html>`__ set with
+       :meth:`.setConll2009`
+     - Dataset in the `CoNLL-U format
+       <https://universaldependencies.org/format.html>`__ set with
+       :meth:`.setConllU`
+
+     Apart from that, no additional training data is needed.
+
+     ========================== ======================
+     Input Annotation types     Output Annotation type
+     ========================== ======================
+     ``TOKEN, POS, DEPENDENCY`` ``LABELED_DEPENDENCY``
+     ========================== ======================
+
+     Parameters
+     ----------
+     conll2009
+         Path to file with CoNLL 2009 format
+     conllU
+         Universal Dependencies source files
+     numberOfIterations
+         Number of iterations in training, converges to better accuracy
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> posTagger = PerceptronModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("pos")
+     >>> dependencyParser = DependencyParserModel.pretrained() \\
+     ...     .setInputCols(["sentence", "pos", "token"]) \\
+     ...     .setOutputCol("dependency")
+     >>> typedDependencyParser = TypedDependencyParserApproach() \\
+     ...     .setInputCols(["dependency", "pos", "token"]) \\
+     ...     .setOutputCol("dependency_type") \\
+     ...     .setConllU("src/test/resources/parser/labeled/train_small.conllu.txt") \\
+     ...     .setNumberOfIterations(1)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     tokenizer,
+     ...     posTagger,
+     ...     dependencyParser,
+     ...     typedDependencyParser
+     ... ])
+
+     Additional training data is not needed, the dependency parser relies on
+     CoNLL-U only.
+
+     >>> emptyDataSet = spark.createDataFrame([[""]]).toDF("text")
+     >>> pipelineModel = pipeline.fit(emptyDataSet)
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.DEPENDENCY]
+
+     outputAnnotatorType = AnnotatorType.LABELED_DEPENDENCY
+
+     conll2009 = Param(Params._dummy(),
+                       "conll2009",
+                       "Path to file with CoNLL 2009 format",
+                       typeConverter=TypeConverters.identity)
+
+     conllU = Param(Params._dummy(),
+                    "conllU",
+                    "Universal Dependencies source files",
+                    typeConverter=TypeConverters.identity)
+
+     numberOfIterations = Param(Params._dummy(),
+                                "numberOfIterations",
+                                "Number of iterations in training, converges to better accuracy",
+                                typeConverter=TypeConverters.toInt)
+
+     @keyword_only
+     def __init__(self):
+         super(TypedDependencyParserApproach,
+               self).__init__(classname="com.johnsnowlabs.nlp.annotators.parser.typdep.TypedDependencyParserApproach")
+
+     def setConll2009(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
+         """Sets path to file with CoNLL 2009 format.
+
+         Parameters
+         ----------
+         path : str
+             Path to the resource
+         read_as : str, optional
+             How to read the resource, by default ReadAs.TEXT
+         options : dict, optional
+             Options for reading the resource, by default {"key": "value"}
+         """
+         opts = options.copy()
+         return self._set(conll2009=ExternalResource(path, read_as, opts))
+
+     def setConllU(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
+         """Sets path to Universal Dependencies source files.
+
+         Parameters
+         ----------
+         path : str
+             Path to the resource
+         read_as : str, optional
+             How to read the resource, by default ReadAs.TEXT
+         options : dict, optional
+             Options for reading the resource, by default {"key": "value"}
+         """
+         opts = options.copy()
+         return self._set(conllU=ExternalResource(path, read_as, opts))
+
+     def setNumberOfIterations(self, value):
+         """Sets Number of iterations in training, converges to better accuracy.
+
+         Parameters
+         ----------
+         value : int
+             Number of iterations in training
+
+         Returns
+         -------
+         [type]
+             [description]
+         """
+         return self._set(numberOfIterations=value)
+
+     def _create_model(self, java_model):
+         return TypedDependencyParserModel(java_model=java_model)
+
+
+ class TypedDependencyParserModel(AnnotatorModel):
+     """Labeled parser that finds a grammatical relation between two words in a
+     sentence. Its input is either a CoNLL2009 or ConllU dataset.
+
+     Dependency parsers provide information about word relationship. For example,
+     dependency parsing can tell you what the subjects and objects of a verb are,
+     as well as which words are modifying (describing) the subject. This can help
+     you find precise answers to specific questions.
+
+     The parser requires the dependant tokens beforehand with e.g.
+     DependencyParser.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> typedDependencyParser = TypedDependencyParserModel.pretrained() \\
+     ...     .setInputCols(["dependency", "pos", "token"]) \\
+     ...     .setOutputCol("dependency_type")
+
+     The default model is ``"dependency_typed_conllu"``, if no name is provided.
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models>`__.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb>`__.
+
+     ========================== ======================
+     Input Annotation types     Output Annotation type
+     ========================== ======================
+     ``TOKEN, POS, DEPENDENCY`` ``LABELED_DEPENDENCY``
+     ========================== ======================
+
+     Parameters
+     ----------
+     None
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> posTagger = PerceptronModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("pos")
+     >>> dependencyParser = DependencyParserModel.pretrained() \\
+     ...     .setInputCols(["sentence", "pos", "token"]) \\
+     ...     .setOutputCol("dependency")
+     >>> typedDependencyParser = TypedDependencyParserModel.pretrained() \\
+     ...     .setInputCols(["dependency", "pos", "token"]) \\
+     ...     .setOutputCol("dependency_type")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     tokenizer,
+     ...     posTagger,
+     ...     dependencyParser,
+     ...     typedDependencyParser
+     ... ])
+     >>> data = spark.createDataFrame([[
+     ...     "Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent " +
+     ...     "firm Federal Mogul."
+     ... ]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(arrays_zip(token.result, dependency.result, dependency_type.result)) as cols") \\
+     ...     .selectExpr("cols['0'] as token", "cols['1'] as dependency", "cols['2'] as dependency_type") \\
+     ...     .show(8, truncate = False)
+     +------------+------------+---------------+
+     |token       |dependency  |dependency_type|
+     +------------+------------+---------------+
+     |Unions      |ROOT        |root           |
+     |representing|workers     |amod           |
+     |workers     |Unions      |flat           |
+     |at          |Turner      |case           |
+     |Turner      |workers     |flat           |
+     |Newall      |say         |nsubj          |
+     |say         |Unions      |parataxis      |
+     |they        |disappointed|nsubj          |
+     +------------+------------+---------------+
+     """
+
+     name = "TypedDependencyParserModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.DEPENDENCY]
+
+     outputAnnotatorType = AnnotatorType.LABELED_DEPENDENCY
+
+     trainOptions = Param(Params._dummy(),
+                          "trainOptions",
+                          "Training Options",
+                          typeConverter=TypeConverters.identity)
+
+     trainParameters = Param(Params._dummy(),
+                             "trainParameters",
+                             "Training Parameters",
+                             typeConverter=TypeConverters.identity)
+
+     trainDependencyPipe = Param(Params._dummy(),
+                                 "trainDependencyPipe",
+                                 "Training dependency pipe",
+                                 typeConverter=TypeConverters.identity)
+
+     conllFormat = Param(Params._dummy(),
+                         "conllFormat",
+                         "CoNLL Format",
+                         typeConverter=TypeConverters.toString)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.parser.typdep.TypedDependencyParserModel",
+                  java_model=None):
+         super(TypedDependencyParserModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     @staticmethod
+     def pretrained(name="dependency_typed_conllu", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "dependency_typed_conllu"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLPs repositories otherwise.
+
+         Returns
+         -------
+         TypedDependencyParserModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(TypedDependencyParserModel, name, lang, remote_loc)
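The docstring above already shows the full pretrained pipeline on a DataFrame. As a supplementary sketch (not taken from the diff), the same stages can be fitted on an empty DataFrame and wrapped in a LightPipeline to annotate single strings; the sample sentence and the printed column are illustrative only, and the pretrained models are assumed to download successfully.

# Supplementary sketch, not part of the diff: single-string annotation with LightPipeline.
import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import (SentenceDetector, Tokenizer, PerceptronModel,
                                DependencyParserModel, TypedDependencyParserModel)
from pyspark.ml import Pipeline

spark = sparknlp.start()

pipeline = Pipeline().setStages([
    DocumentAssembler().setInputCol("text").setOutputCol("document"),
    SentenceDetector().setInputCols(["document"]).setOutputCol("sentence"),
    Tokenizer().setInputCols(["sentence"]).setOutputCol("token"),
    PerceptronModel.pretrained().setInputCols(["sentence", "token"]).setOutputCol("pos"),
    DependencyParserModel.pretrained().setInputCols(["sentence", "pos", "token"]).setOutputCol("dependency"),
    TypedDependencyParserModel.pretrained().setInputCols(["dependency", "pos", "token"]).setOutputCol("dependency_type"),
])

# All stages are pretrained, so fitting on an empty DataFrame is enough.
model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
light = LightPipeline(model)
print(light.annotate("Unions representing workers at Turner Newall say they are disappointed.")["dependency_type"])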
sparknlp/annotator/document_character_text_splitter.py (new file)
@@ -0,0 +1,228 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the DocumentNormalizer"""
+ from sparknlp.common import *
+
+
+ class DocumentCharacterTextSplitter(AnnotatorModel):
+     """Annotator which splits large documents into chunks of roughly given size.
+
+     DocumentCharacterTextSplitter takes a list of separators. It takes the separators in order and
+     splits subtexts if they are over the chunk length, considering optional overlap of the chunks.
+
+     For example, given chunk size 20 and overlap 5:
+
+     .. code-block:: python
+
+         "He was, I take it, the most perfect reasoning and observing machine that the world has seen."
+
+         ["He was, I take it,", "it, the most", "most perfect", "reasoning and", "and observing", "machine that the", "the world has seen."]
+
+
+     Additionally, you can set
+
+     - custom patterns with setSplitPatterns
+     - whether patterns should be interpreted as regex with setPatternsAreRegex
+     - whether to keep the separators with setKeepSeparators
+     - whether to trim whitespaces with setTrimWhitespace
+     - whether to explode the splits to individual rows with setExplodeSplits
+
+     For extended examples of usage, see the
+     `DocumentCharacterTextSplitterTest <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+
+     chunkSize
+         Size of each chunk of text.
+     chunkOverlap
+         Length of the overlap between text chunks , by default `0`.
+     splitPatterns
+         Patterns to separate the text by in decreasing priority , by default `["\\n\\n", "\\n", " ", ""]`.
+     patternsAreRegex
+         Whether to interpret the split patterns as regular expressions , by default `False`.
+     keepSeparators
+         Whether to keep the separators in the final result , by default `True`.
+     explodeSplits
+         Whether to explode split chunks to separate rows , by default `False`.
+     trimWhitespace
+         Whether to trim whitespaces of extracted chunks , by default `True`.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> textDF = spark.read.text(
+     ...     "sherlockholmes.txt",
+     ...     wholetext=True
+     ... ).toDF("text")
+     >>> documentAssembler = DocumentAssembler().setInputCol("text")
+     >>> textSplitter = DocumentCharacterTextSplitter() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("splits") \\
+     ...     .setChunkSize(20000) \\
+     ...     .setChunkOverlap(200) \\
+     ...     .setExplodeSplits(True)
+     >>> pipeline = Pipeline().setStages([documentAssembler, textSplitter])
+     >>> result = pipeline.fit(textDF).transform(textDF)
+     >>> result.selectExpr(
+     ...     "splits.result",
+     ...     "splits[0].begin",
+     ...     "splits[0].end",
+     ...     "splits[0].end - splits[0].begin as length") \\
+     ...     .show(8, truncate = 80)
+     +--------------------------------------------------------------------------------+---------------+-------------+------+
+     |                                                                          result|splits[0].begin|splits[0].end|length|
+     +--------------------------------------------------------------------------------+---------------+-------------+------+
+     |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...|              0|        19994| 19994|
+     |["And Mademoiselle's address?" he asked.\\n\\n"Is Briony Lodge, Serpentine Aven...|          19798|        39395| 19597|
+     |["How did that help you?"\\n\\n"It was all-important. When a woman thinks that ...|          39371|        59242| 19871|
+     |["'But,' said I, 'there would be millions of red-headed men who\\nwould apply....|          59166|        77833| 18667|
+     |[My friend was an enthusiastic musician, being himself not only a\\nvery capab...|          77835|        97769| 19934|
+     |["And yet I am not convinced of it," I answered. "The cases which\\ncome to li...|          97771|       117248| 19477|
+     |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\\nfeather of...|         117250|       137242| 19992|
+     |["That sounds a little paradoxical."\\n\\n"But it is profoundly True. Singulari...|         137244|       157171| 19927|
+     +--------------------------------------------------------------------------------+---------------+-------------+------+
+
+     """
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     chunkSize = Param(Params._dummy(),
+                       "chunkSize",
+                       "Size of each chunk of text",
+                       typeConverter=TypeConverters.toInt)
+     chunkOverlap = Param(Params._dummy(),
+                          "chunkOverlap",
+                          "Length of the overlap between text chunks",
+                          typeConverter=TypeConverters.toInt)
+     splitPatterns = Param(Params._dummy(),
+                           "splitPatterns",
+                           "Patterns to separate the text by in decreasing priority",
+                           typeConverter=TypeConverters.toListString)
+     patternsAreRegex = Param(Params._dummy(),
+                              "patternsAreRegex",
+                              "Whether to interpret the split patterns as regular expressions",
+                              typeConverter=TypeConverters.toBoolean)
+     keepSeparators = Param(Params._dummy(),
+                            "keepSeparators",
+                            "Whether to keep the separators in the final result",
+                            typeConverter=TypeConverters.toBoolean)
+     explodeSplits = Param(Params._dummy(),
+                           "explodeSplits",
+                           "Whether to explode split chunks to separate rows",
+                           typeConverter=TypeConverters.toBoolean)
+     trimWhitespace = Param(Params._dummy(),
+                            "trimWhitespace",
+                            "Whether to trim whitespaces of extracted chunks",
+                            typeConverter=TypeConverters.toBoolean)
+
+     @keyword_only
+     def __init__(self):
+         super(DocumentCharacterTextSplitter, self).__init__(
+             classname="com.johnsnowlabs.nlp.annotators.DocumentCharacterTextSplitter")
+         self._setDefault(
+             chunkOverlap=0,
+             explodeSplits=False,
+             keepSeparators=True,
+             patternsAreRegex=False,
+             splitPatterns=["\n\n", "\n", " ", ""],
+             trimWhitespace=True
+         )
+
+     def setChunkSize(self, value):
+         """Sets size of each chunk of text.
+
+         Parameters
+         ----------
+         value : int
+             Size of each chunk of text
+         """
+         if value < 1:
+             raise ValueError("Chunk size should be larger than 0.")
+         return self._set(chunkSize=value)
+
+     def setChunkOverlap(self, value):
+         """Sets length of the overlap between text chunks , by default `0`.
+
+         Parameters
+         ----------
+         value : int
+             Length of the overlap between text chunks
+         """
+         if value > self.getOrDefault(self.chunkSize):
+             raise ValueError("Chunk overlap can't be larger than chunk size.")
+         return self._set(chunkOverlap=value)
+
+     def setSplitPatterns(self, value):
+         """Sets patterns to separate the text by in decreasing priority , by default `["\n\n", "\n", " ", ""]`.
+
+         Parameters
+         ----------
+         value : List[str]
+             Patterns to separate the text by in decreasing priority
+         """
+         if len(value) == 0:
+             raise ValueError("Patterns are empty")
+
+         return self._set(splitPatterns=value)
+
+     def setPatternsAreRegex(self, value):
+         """Sets whether to interpret the split patterns as regular expressions , by default `False`.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to interpret the split patterns as regular expressions
+         """
+         return self._set(patternsAreRegex=value)
+
+     def setKeepSeparators(self, value):
+         """Sets whether to keep the separators in the final result , by default `True`.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to keep the separators in the final result
+         """
+         return self._set(keepSeparators=value)
+
+     def setExplodeSplits(self, value):
+         """Sets whether to explode split chunks to separate rows , by default `False`.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to explode split chunks to separate rows
+         """
+         return self._set(explodeSplits=value)
+
+     def setTrimWhitespace(self, value):
+         """Sets whether to trim whitespaces of extracted chunks , by default `True`.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to trim whitespaces of extracted chunks
+         """
+         return self._set(trimWhitespace=value)
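To make the chunk size 20 / overlap 5 example from the docstring concrete, here is an illustrative sketch (not part of the diff) that runs the splitter on an in-memory string instead of a file; the column names text, document and splits are arbitrary choices.

# Illustrative sketch, not part of the diff: DocumentCharacterTextSplitter on an in-memory string.
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import DocumentCharacterTextSplitter
from pyspark.ml import Pipeline

spark = sparknlp.start()
data = spark.createDataFrame(
    [["He was, I take it, the most perfect reasoning and observing machine that the world has seen."]]
).toDF("text")

splitter = DocumentCharacterTextSplitter() \
    .setInputCols(["document"]) \
    .setOutputCol("splits") \
    .setChunkSize(20) \
    .setChunkOverlap(5) \
    .setExplodeSplits(True)

pipeline = Pipeline().setStages([
    DocumentAssembler().setInputCol("text").setOutputCol("document"),
    splitter,
])
result = pipeline.fit(data).transform(data)
result.selectExpr("splits.result").show(truncate=False)   # one short, overlapping chunk per row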