spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/base/token_assembler.py
@@ -0,0 +1,124 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the TokenAssembler."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+
+ from sparknlp.common.annotator_type import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+
+ from sparknlp.common import AnnotatorProperties
+
+
+ class TokenAssembler(AnnotatorTransformer, AnnotatorProperties):
+     """This transformer reconstructs a ``DOCUMENT`` type annotation from tokens,
+     usually after these have been normalized, lemmatized, spell checked, etc.,
+     in order to use this document annotation in further annotators.
+     Requires ``DOCUMENT`` and ``TOKEN`` type annotations as input.
+
+     For more extended examples on document pre-processing see the
+     `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/token-assembler/Assembling_Tokens_to_Documents.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     preservePosition
+         Whether to preserve the actual position of the tokens or reduce them to
+         one space
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+
+     First, the text is tokenized and cleaned
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentenceDetector = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentences")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentences"]) \\
+     ...     .setOutputCol("token")
+     >>> normalizer = Normalizer() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("normalized") \\
+     ...     .setLowercase(False)
+     >>> stopwordsCleaner = StopWordsCleaner() \\
+     ...     .setInputCols(["normalized"]) \\
+     ...     .setOutputCol("cleanTokens") \\
+     ...     .setCaseSensitive(False)
+
+     Then the TokenAssembler turns the cleaned tokens into a ``DOCUMENT`` type
+     structure.
+
+     >>> tokenAssembler = TokenAssembler() \\
+     ...     .setInputCols(["sentences", "cleanTokens"]) \\
+     ...     .setOutputCol("cleanText")
+     >>> data = spark.createDataFrame([["Spark NLP is an open-source text processing library for advanced natural language processing."]]) \\
+     ...     .toDF("text")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentenceDetector,
+     ...     tokenizer,
+     ...     normalizer,
+     ...     stopwordsCleaner,
+     ...     tokenAssembler
+     ... ]).fit(data)
+     >>> result = pipeline.transform(data)
+     >>> result.select("cleanText").show(truncate=False)
+     +---------------------------------------------------------------------------------------------------------------------------+
+     |cleanText                                                                                                                    |
+     +---------------------------------------------------------------------------------------------------------------------------+
+     |[[document, 0, 80, Spark NLP opensource text processing library advanced natural language processing, [sentence -> 0], []]]|
+     +---------------------------------------------------------------------------------------------------------------------------+
+     """
+
+     name = "TokenAssembler"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     preservePosition = Param(Params._dummy(), "preservePosition", "whether to preserve the actual position of the tokens or reduce them to one space", typeConverter=TypeConverters.toBoolean)
+
+     @keyword_only
+     def __init__(self):
+         super(TokenAssembler, self).__init__(classname="com.johnsnowlabs.nlp.TokenAssembler")
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setPreservePosition(self, value):
+         """Sets whether to preserve the actual position of the tokens or
+         reduce them to one space.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to preserve the actual position of the tokens or reduce
+             them to one space
+         """
+         return self._set(preservePosition=value)
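Per the parameter description above, preservePosition=True keeps the tokens' original character positions in the assembled document, while False reduces the spans left by removed tokens to a single space. A minimal sketch, reusing the pipeline from the docstring:

    tokenAssembler = TokenAssembler() \
        .setInputCols(["sentences", "cleanTokens"]) \
        .setOutputCol("cleanText") \
        .setPreservePosition(True)  # keep original offsets; False collapses removed spans to one space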
sparknlp/common/__init__.py
@@ -0,0 +1,26 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Module of common base classes for Spark NLP annotators."""
+ from sparknlp.common.annotator_approach import *
+ from sparknlp.common.annotator_model import *
+ from sparknlp.common.annotator_properties import *
+ from sparknlp.common.coverage_result import *
+ from sparknlp.common.properties import *
+ from sparknlp.common.read_as import *
+ from sparknlp.common.recursive_annotator_approach import *
+ from sparknlp.common.storage import *
+ from sparknlp.common.utils import *
+ from sparknlp.common.annotator_type import *
+ from sparknlp.common.match_strategy import *
+ from sparknlp.common.completion_post_processing import *
sparknlp/common/annotator_approach.py
@@ -0,0 +1,41 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains the base classes for Annotator Approaches."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.util import JavaMLWritable
+ from pyspark.ml.wrapper import JavaEstimator
+
+ import sparknlp.internal as _internal
+ from sparknlp.common.annotator_properties import AnnotatorProperties
+
+
+ class AnnotatorApproach(JavaEstimator, JavaMLWritable, _internal.AnnotatorJavaMLReadable, AnnotatorProperties,
+                         _internal.ParamsGettersSetters):
+
+     @keyword_only
+     def __init__(self, classname):
+         _internal.ParamsGettersSetters.__init__(self)
+         self.__class__._java_class_name = classname
+         self._java_obj = self._new_java_obj(classname, self.uid)
+         self._setDefault(lazyAnnotator=False)
+
+     def _create_model(self, java_model):
+         raise NotImplementedError('Please implement _create_model in %s' % self)
+
+     def __init_subclass__(cls, **kwargs):
+         for required in ('inputAnnotatorTypes', 'outputAnnotatorType'):
+             if not getattr(cls, required):
+                 raise TypeError(f"Can't instantiate class {cls.__name__} without {required} attribute defined")
+         return super().__init_subclass__(**kwargs)
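Because of the __init_subclass__ hook above, every concrete approach must declare its annotator types at class-definition time: the inherited defaults ([] and None) are falsy, so leaving either attribute unset raises a TypeError as soon as the subclass is defined, not when it is fit. A minimal sketch with a hypothetical subclass (MyApproach is illustrative only, not part of the library):

    from sparknlp.common import AnnotatorApproach
    from sparknlp.common.annotator_type import AnnotatorType

    class MyApproach(AnnotatorApproach):  # hypothetical subclass for illustration
        inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
        outputAnnotatorType = AnnotatorType.TOKEN
        # Omitting either attribute above makes the class statement itself raise:
        # TypeError: Can't instantiate class MyApproach without inputAnnotatorTypes attribute defined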
sparknlp/common/annotator_model.py
@@ -0,0 +1,47 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains the base classes for Annotator Models."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.util import JavaMLWritable
+ from pyspark.ml.wrapper import JavaModel
+
+ import sparknlp.internal as _internal
+ from sparknlp.common import AnnotatorProperties
+
+
+ class AnnotatorModel(JavaModel, _internal.AnnotatorJavaMLReadable, JavaMLWritable, AnnotatorProperties,
+                      _internal.ParamsGettersSetters):
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     @keyword_only
+     def __init__(self, classname, java_model=None):
+         super(AnnotatorModel, self).__init__(java_model=java_model)
+         if classname and not java_model:
+             self.__class__._java_class_name = classname
+             self._java_obj = self._new_java_obj(classname, self.uid)
+         if java_model is not None:
+             self._transfer_params_from_java()
+         self._setDefault(lazyAnnotator=False)
+
+     def __init_subclass__(cls, **kwargs):
+         for required in ('inputAnnotatorTypes', 'outputAnnotatorType'):
+             if not getattr(cls, required):
+                 raise TypeError(f"Can't instantiate class {cls.__name__} without {required} attribute defined")
+
+         return super().__init_subclass__(**kwargs)
sparknlp/common/annotator_properties.py
@@ -0,0 +1,114 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains the base classes for Annotator properties."""
+
+ from pyspark.ml.param import TypeConverters, Params, Param
+
+
+ class AnnotatorProperties(Params):
+
+     inputAnnotatorTypes = []
+     optionalInputAnnotatorTypes = []
+
+     outputAnnotatorType = None
+
+     inputCols = Param(Params._dummy(),
+                       "inputCols",
+                       "previous annotations columns, if renamed",
+                       typeConverter=TypeConverters.toListString)
+
+     outputCol = Param(Params._dummy(),
+                       "outputCol",
+                       "output annotation column. can be left default.",
+                       typeConverter=TypeConverters.toString)
+
+     lazyAnnotator = Param(Params._dummy(),
+                           "lazyAnnotator",
+                           "Whether this AnnotatorModel acts as lazy in RecursivePipelines",
+                           typeConverter=TypeConverters.toBoolean
+                           )
+
+     def setInputCols(self, *value):
+         """Sets column names of input annotations.
+
+         Parameters
+         ----------
+         *value : List[str]
+             Input columns for the annotator
+         """
+         if type(value[0]) == str or type(value[0]) == list:
+             self.inputColsValidation(value)
+             if len(value) == 1 and type(value[0]) == list:
+                 return self._set(inputCols=value[0])
+             else:
+                 return self._set(inputCols=list(value))
+         else:
+             raise TypeError("InputCols datatype not supported. It must be either str or list")
+
+     def inputColsValidation(self, value):
+         actual_columns = len(value)
+         if type(value[0]) == list:
+             actual_columns = len(value[0])
+
+         expected_columns = len(self.inputAnnotatorTypes)
+
+         if len(self.optionalInputAnnotatorTypes) == 0:
+             if actual_columns != expected_columns:
+                 raise TypeError(
+                     f"setInputCols in {self.uid} expecting {expected_columns} columns. "
+                     f"Provided column amount: {actual_columns}. "
+                     f"Which should be columns from the following annotators: {self.inputAnnotatorTypes}")
+         else:
+             expected_columns = expected_columns + len(self.optionalInputAnnotatorTypes)
+             if not (actual_columns == len(self.inputAnnotatorTypes) or actual_columns == expected_columns):
+                 raise TypeError(
+                     f"setInputCols in {self.uid} expecting at least {len(self.inputAnnotatorTypes)} columns. "
+                     f"Provided column amount: {actual_columns}. "
+                     f"Which should be columns from at least the following annotators: {self.inputAnnotatorTypes}")
+
+     def getInputCols(self):
+         """Gets current column names of input annotations."""
+         return self.getOrDefault(self.inputCols)
+
+     def setOutputCol(self, value):
+         """Sets output column name of annotations.
+
+         Parameters
+         ----------
+         value : str
+             Name of output column
+         """
+         return self._set(outputCol=value)
+
+     def getOutputCol(self):
+         """Gets output column name of annotations."""
+         return self.getOrDefault(self.outputCol)
+
+     def setLazyAnnotator(self, value):
+         """Sets whether Annotator should be evaluated lazily in a
+         RecursivePipeline.
+
+         Parameters
+         ----------
+         value : bool
+             Whether Annotator should be evaluated lazily in a
+             RecursivePipeline
+         """
+         return self._set(lazyAnnotator=value)
+
+     def getLazyAnnotator(self):
+         """Gets whether Annotator should be evaluated lazily in a
+         RecursivePipeline.
+         """
+         return self.getOrDefault(self.lazyAnnotator)
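inputColsValidation enforces that the number of columns passed to setInputCols matches the annotator's declared inputAnnotatorTypes (plus optionalInputAnnotatorTypes when present). A short sketch against the TokenAssembler shown earlier, which declares [DOCUMENT, TOKEN]; this needs an active Spark NLP session, since instantiating an annotator creates a JVM object:

    from sparknlp.base import TokenAssembler

    TokenAssembler().setInputCols(["sentences", "cleanTokens"])  # OK: 2 columns for 2 declared types
    TokenAssembler().setInputCols("sentences", "cleanTokens")    # OK: varargs form is accepted too
    TokenAssembler().setInputCols(["sentences"])                 # TypeError: expecting 2 columns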
sparknlp/common/annotator_type.py
@@ -0,0 +1,38 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ class AnnotatorType(object):
+     AUDIO = "audio"
+     DOCUMENT = "document"
+     IMAGE = "image"
+     TOKEN = "token"
+     WORDPIECE = "wordpiece"
+     WORD_EMBEDDINGS = "word_embeddings"
+     SENTENCE_EMBEDDINGS = "sentence_embeddings"
+     CATEGORY = "category"
+     DATE = "date"
+     ENTITY = "entity"
+     SENTIMENT = "sentiment"
+     POS = "pos"
+     CHUNK = "chunk"
+     NAMED_ENTITY = "named_entity"
+     NEGEX = "negex"
+     DEPENDENCY = "dependency"
+     LABELED_DEPENDENCY = "labeled_dependency"
+     LANGUAGE = "language"
+     NODE = "node"
+     TABLE = "table"
+     DUMMY = "dummy"
+     DOC_SIMILARITY_RANKINGS = "doc_similarity_rankings"
sparknlp/common/completion_post_processing.py
@@ -0,0 +1,37 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark.ml.param import Param, Params, TypeConverters
+
+
+ class CompletionPostProcessing:
+     removeThinkingTag = Param(
+         Params._dummy(),
+         "removeThinkingTag",
+         "Set a thinking tag (e.g. think) to be removed from output. Will match <TAG>...</TAG>",
+         typeConverter=TypeConverters.toString,
+     )
+
+     def setRemoveThinkingTag(self, value: str):
+         """Set a thinking tag (e.g. `think`) to be removed from output.
+         Will produce the regex: `(?s)<$TAG>.+?</$TAG>`
+         """
+         self._set(removeThinkingTag=value)
+         return self
+
+     def getRemoveThinkingTag(self):
+         """Get the thinking tag to be removed from output."""
+         value = None
+         if self.removeThinkingTag in self._paramMap:
+             value = self._paramMap[self.removeThinkingTag]
+         return value
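The setter above documents the generated pattern as (?s)<$TAG>.+?</$TAG>. A self-contained sketch of the equivalent substitution in plain Python, to show what stripping a thinking tag does to a completion (plain re here, not the library's own code path):

    import re

    tag = "think"
    pattern = rf"(?s)<{tag}>.+?</{tag}>"  # same shape as the documented regex
    completion = "<think>step-by-step reasoning...</think>The answer is 42."
    print(re.sub(pattern, "", completion))  # -> "The answer is 42."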
sparknlp/common/coverage_result.py
@@ -0,0 +1,22 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains utilities for coverage analysis."""
+
+
+ class CoverageResult:
+     def __init__(self, cov_obj):
+         self.covered = cov_obj.covered()
+         self.total = cov_obj.total()
+         self.percentage = cov_obj.percentage()
+
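CoverageResult is a plain Python view over a Java coverage object: it eagerly copies covered, total, and percentage into attributes. A hedged usage sketch, assuming a result DataFrame with a word-embeddings column and the WordEmbeddingsModel coverage helper that returns a CoverageResult:

    from sparknlp.annotator import WordEmbeddingsModel

    # Static helper on WordEmbeddingsModel; result_df is assumed to hold an "embeddings" column
    cov = WordEmbeddingsModel.overallCoverage(result_df, "embeddings")
    print(cov.covered, cov.total, cov.percentage)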
sparknlp/common/match_strategy.py
@@ -0,0 +1,33 @@
+ # Copyright 2017-2023 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Allowed strategies for RuleFactory applications regarding replacement"""
+
+
+ class MatchStrategy(object):
+     """Object that contains constants for the match strategies used in RuleFactory.
+
+     Possible values are:
+
+     ================================== ===============================================================================
+     Value                              Description
+     ================================== ===============================================================================
+     ``MatchStrategy.MATCH_ALL``        This strategy matches all occurrences of all rules in the given text.
+     ``MatchStrategy.MATCH_FIRST``      This strategy matches only the first occurrence of each rule in the given text.
+     ``MatchStrategy.MATCH_COMPLETE``   This strategy matches a rule only if it covers the complete text.
+     ================================== ===============================================================================
+     """
+     MATCH_ALL = "MATCH_ALL"
+     MATCH_FIRST = "MATCH_FIRST"
+     MATCH_COMPLETE = "MATCH_COMPLETE"
+ MATCH_COMPLETE = "MATCH_COMPLETE"