spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
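
Note: the 2.6.3rc1 wheel shipped monolithic modules (sparknlp/annotator.py, base.py, common.py, plus bundled .pyc files), which 6.2.1 replaces with the sparknlp/annotator/, sparknlp/base/, and sparknlp/common/ packages listed above. A quick import sketch of the new layout (module paths taken from the file list; the package-level re-exports are an assumption to verify against the 6.2.1 wheel):

>>> from sparknlp.base import DocumentAssembler, ImageAssembler, LightPipeline
>>> from sparknlp.annotator import Tokenizer
>>> from sparknlp.annotator.embeddings import BertEmbeddings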
sparknlp/base/image_assembler.py
@@ -0,0 +1,172 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the ImageAssembler."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+ from pyspark.sql import SparkSession, DataFrame
+ from pyspark.sql.functions import regexp_replace, col
+
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class ImageAssembler(AnnotatorTransformer):
+     """Prepares images read by Spark into a format that is processable by Spark NLP.
+     This component is needed to process images.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``NONE``               ``IMAGE``
+     ====================== ======================
+
+     Parameters
+     ----------
+     inputCol
+         Input column name
+     outputCol
+         Output column name
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from pyspark.ml import Pipeline
+     >>> data = spark.read.format("image").load("./tmp/images/").toDF("image")
+     >>> imageAssembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler")
+     >>> result = imageAssembler.transform(data)
+     >>> result.select("image_assembler").show()
+     >>> result.select("image_assembler").printSchema()
+     root
+      |-- image_assembler: array (nullable = true)
+      |    |-- element: struct (containsNull = true)
+      |    |    |-- annotatorType: string (nullable = true)
+      |    |    |-- origin: string (nullable = true)
+      |    |    |-- height: integer (nullable = true)
+      |    |    |-- width: integer (nullable = true)
+      |    |    |-- nChannels: integer (nullable = true)
+      |    |    |-- mode: integer (nullable = true)
+      |    |    |-- result: binary (nullable = true)
+      |    |    |-- metadata: map (nullable = true)
+      |    |    |    |-- key: string
+      |    |    |    |-- value: string (valueContainsNull = true)
+     """
+
+     outputAnnotatorType = AnnotatorType.IMAGE
+
+     inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
+     textCol = Param(Params._dummy(), "textCol", "text column name", typeConverter=TypeConverters.toString)
+     outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
+     name = 'ImageAssembler'
+
+     @keyword_only
+     def __init__(self):
+         super(ImageAssembler, self).__init__(classname="com.johnsnowlabs.nlp.ImageAssembler")
+         self._setDefault(outputCol="image_assembler", inputCol='image')
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setInputCol(self, value):
+         """Sets input column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the input column that has image format loaded via spark.read.format("image").load(PATH)
+         """
+         return self._set(inputCol=value)
+
+     def setOutputCol(self, value):
+         """Sets output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Output Column
+         """
+         return self._set(outputCol=value)
+
+     def getOutputCol(self):
+         """Gets output column name of annotations."""
+         return self.getOrDefault(self.outputCol)
+
+     def setTextCol(self, value):
+         """Sets an optional text column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of an optional input text column
+         """
+         return self._set(textCol=value)
+
+     @classmethod
+     def loadImagesAsBytes(cls, spark: SparkSession, path: str):
+         """
+         Loads images from a given path and returns them as raw bytes, instead of the default
+         OpenCV-compatible format. Supported image types include JPEG, PNG, GIF, and BMP.
+
+         Multimodal inference with llama.cpp requires raw bytes as input.
+
+         Parameters
+         ----------
+         spark : SparkSession
+             The active SparkSession.
+         path : str
+             The path to the images. Supported image types are JPEG, PNG, GIF, and BMP.
+
+         Returns
+         -------
+         DataFrame
+             A DataFrame containing the images as raw bytes along with their metadata.
+         """
+
+         # Replace the path separator in the `origin` field and `path` column, so that they match
+         def replace_path(column_name: str):
+             return regexp_replace(col(column_name), ":///", ":/")
+
+         # Load the images as metadata with the default Spark image format
+         data = (
+             spark.read.format("image")
+             .option("dropInvalid", True)
+             .load(path)
+             .withColumn(
+                 "image", col("image").withField("origin", replace_path("image.origin"))
+             )
+         )
+
+         # Load the images as raw binary files
+         image_bytes = (
+             spark.read.format("binaryFile")
+             .option("pathGlobFilter", "*.{jpeg,jpg,png,gif,bmp,JPEG,JPG,PNG,GIF,BMP}")
+             .option("dropInvalid", True)
+             .load(path)
+             .withColumn("path", replace_path("path"))
+         )
+
+         # Join the two datasets on the file path
+         df_joined = data.join(
+             image_bytes, data["image.origin"] == image_bytes["path"], "inner"
+         )
+
+         # Replace the `data` field of the `image` column with raw bytes
+         df_image_replaced = df_joined.withColumn(
+             "image", df_joined["image"].withField("data", df_joined["content"])
+         )
+
+         return df_image_replaced
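
For orientation, a minimal usage sketch of the ImageAssembler added above (doctest style to match the docstrings; assumes an active SparkSession named `spark` and a local `images/` directory, both hypothetical):

>>> from sparknlp.base import ImageAssembler
>>> # Default Spark image format (OpenCV-compatible struct)
>>> data = spark.read.format("image").load("images/")
>>> # Raw bytes instead, e.g. for llama.cpp multimodal inference
>>> data_bytes = ImageAssembler.loadImagesAsBytes(spark, "images/")
>>> assembled = ImageAssembler().setInputCol("image").setOutputCol("image_assembler").transform(data_bytes)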
sparknlp/base/light_pipeline.py
@@ -0,0 +1,429 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the LightPipeline."""
+
+ import sparknlp.internal as _internal
+ from sparknlp.annotation import Annotation
+ from sparknlp.annotation_audio import AnnotationAudio
+ from sparknlp.annotation_image import AnnotationImage
+ from sparknlp.common import AnnotatorApproach, AnnotatorModel
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class LightPipeline:
+     """Creates a LightPipeline from a Spark PipelineModel.
+
+     LightPipeline is a Spark NLP specific Pipeline class equivalent to the
+     Spark ML Pipeline. The difference is that its execution does not adhere
+     to Spark principles; instead it computes everything locally (but in
+     parallel) in order to achieve fast results when dealing with small
+     amounts of data. This means the input is not a Spark DataFrame, but a
+     string or an array of strings to be annotated. To create a Light
+     Pipeline, you need to input an already trained (fit) Spark ML Pipeline.
+
+     Its :meth:`.transform` now has an alternative, :meth:`.annotate`, which
+     directly outputs the results.
+
+     Parameters
+     ----------
+     pipelineModel : :class:`pyspark.ml.PipelineModel`
+         The PipelineModel containing Spark NLP Annotators
+     parse_embeddings : bool, optional
+         Whether to parse embeddings, by default False
+
+     Notes
+     -----
+     Use :meth:`.fullAnnotate` to also output the result as
+     :class:`.Annotation`, with metadata.
+
+     Examples
+     --------
+     >>> from sparknlp.base import LightPipeline
+     >>> light = LightPipeline(pipeline.fit(data))
+     >>> light.annotate("We are very happy about Spark NLP")
+     {
+         'document': ['We are very happy about Spark NLP'],
+         'lemmas': ['We', 'be', 'very', 'happy', 'about', 'Spark', 'NLP'],
+         'pos': ['PRP', 'VBP', 'RB', 'JJ', 'IN', 'NNP', 'NNP'],
+         'sentence': ['We are very happy about Spark NLP'],
+         'spell': ['We', 'are', 'very', 'happy', 'about', 'Spark', 'NLP'],
+         'stems': ['we', 'ar', 'veri', 'happi', 'about', 'spark', 'nlp'],
+         'token': ['We', 'are', 'very', 'happy', 'about', 'Spark', 'NLP']
+     }
+     """
+
+     def __init__(self, pipelineModel, parse_embeddings=False):
+         self.pipeline_model = pipelineModel
+         self.parse_embeddings = parse_embeddings
+         self._lightPipeline = _internal._LightPipeline(pipelineModel, parse_embeddings).apply()
+
+     def _validateStagesInputCols(self, stages):
+         annotator_types = self._getAnnotatorTypes(stages)
+         for stage in stages:
+             if isinstance(stage, AnnotatorApproach) or isinstance(stage, AnnotatorModel):
+                 input_cols = stage.getInputCols()
+                 if type(input_cols) == str:
+                     input_cols = [input_cols]
+                 input_annotator_types = stage.inputAnnotatorTypes + stage.optionalInputAnnotatorTypes
+                 for input_col in input_cols:
+                     annotator_type = annotator_types.get(input_col)
+                     if annotator_type is None or annotator_type not in input_annotator_types:
+                         raise TypeError(f"Wrong or missing inputCols annotators in {stage.uid}."
+                                         f" Make sure such annotators exist in your pipeline,"
+                                         f" with the right output names, and that they have the"
+                                         f" following annotator types: {input_annotator_types}")
+
+     def _skipPipelineValidation(self, stages):
+         exceptional_pipeline = [stage for stage in stages if self._skipStageValidation(stage)]
+         if len(exceptional_pipeline) >= 1:
+             return True
+         else:
+             return False
+
+     def _skipStageValidation(self, stage):
+         return hasattr(stage, 'skipLPInputColsValidation') and stage.skipLPInputColsValidation
+
+     def _getAnnotatorTypes(self, stages):
+         annotator_types = {}
+         for stage in stages:
+             if hasattr(stage, 'getOutputCols'):
+                 output_cols = stage.getOutputCols()
+                 for output_col in output_cols:
+                     annotator_types[output_col] = stage.outputAnnotatorType
+             elif isinstance(stage, AnnotatorApproach) or isinstance(stage, AnnotatorModel) or \
+                     isinstance(stage, AnnotatorTransformer):
+                 if stage.outputAnnotatorType is not None:
+                     annotator_types[stage.getOutputCol()] = stage.outputAnnotatorType
+         return annotator_types
+
+     def _annotationFromJava(self, java_annotations):
+         annotations = []
+         for annotation in java_annotations:
+
+             index = annotation.toString().index("(")
+             annotation_type = annotation.toString()[:index]
+
+             if annotation_type == "AnnotationImage":
+                 result = self.__get_result(annotation)
+                 annotations.append(
+                     AnnotationImage(annotation.annotatorType(),
+                                     annotation.origin(),
+                                     annotation.height(),
+                                     annotation.width(),
+                                     annotation.nChannels(),
+                                     annotation.mode(),
+                                     result,
+                                     annotation.metadata())
+                 )
+             elif annotation_type == "AnnotationAudio":
+                 result = self.__get_result(annotation)
+                 annotations.append(
+                     AnnotationAudio(annotation.annotatorType(),
+                                     result,
+                                     annotation.metadata())
+                 )
+             else:
+                 if self.parse_embeddings:
+                     embeddings = list(annotation.embeddings())
+                 else:
+                     embeddings = []
+                 annotations.append(
+                     Annotation(annotation.annotatorType(),
+                                annotation.begin(),
+                                annotation.end(),
+                                annotation.result(),
+                                annotation.metadata(),
+                                embeddings)
+                 )
+         return annotations
+
+     @staticmethod
+     def __get_result(annotation):
+         try:
+             result = list(annotation.result())
+         except TypeError:
+             result = []
+
+         return result
+
+     def fullAnnotate(self, target, optional_target=""):
+         """Annotates the data provided into `Annotation` type results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         target : list or str or float
+             The data to be annotated
+         optional_target : list or str
+             Optional data to be annotated (currently used for Question Answering)
+
+         Returns
+         -------
+         List[dict]
+             The result of the annotation
+
+         Examples
+         --------
+         >>> from sparknlp.pretrained import PretrainedPipeline
+         >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+         >>> result = explain_document_pipeline.fullAnnotate('U.N. official Ekeus heads for Baghdad.')
+         >>> result[0].keys()
+         dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])
+         >>> result[0]["ner"]
+         [Annotation(named_entity, 0, 2, B-ORG, {'word': 'U.N'}),
+         Annotation(named_entity, 3, 3, O, {'word': '.'}),
+         Annotation(named_entity, 5, 12, O, {'word': 'official'}),
+         Annotation(named_entity, 14, 18, B-PER, {'word': 'Ekeus'}),
+         Annotation(named_entity, 20, 24, O, {'word': 'heads'}),
+         Annotation(named_entity, 26, 28, O, {'word': 'for'}),
+         Annotation(named_entity, 30, 36, B-LOC, {'word': 'Baghdad'}),
+         Annotation(named_entity, 37, 37, O, {'word': '.'})]
+         """
+         stages = self.pipeline_model.stages
+         if not self._skipPipelineValidation(stages):
+             self._validateStagesInputCols(stages)
+
+         if optional_target == "":
+             if self.__isTextInput(target):
+                 result = self.__fullAnnotateText(target)
+             elif self.__isAudioInput(target):
+                 result = self.__fullAnnotateAudio(target)
+             else:
+                 raise TypeError(
+                     "argument for annotation must be 'str' or list[str] or list[float] or list[list[float]]")
+         else:
+             if self.__isTextInput(target) and self.__isTextInput(optional_target):
+                 result = self.__fullAnnotateQuestionAnswering(target, optional_target)
+             else:
+                 raise TypeError("arguments for annotation must be 'str' or list[str]")
+
+         return result
+
+     @staticmethod
+     def __isTextInput(target):
+         if type(target) is str:
+             return True
+         elif type(target) is list and type(target[0]) is str:
+             return True
+         else:
+             return False
+
+     @staticmethod
+     def __isAudioInput(target):
+         if type(target) is list and type(target[0]) is float:
+             return True
+         elif type(target) is list and type(target[0]) is list and type(target[0][0]) is float:
+             return True
+         else:
+             return False
+
+     def __fullAnnotateText(self, target):
+
+         if self.__isPath(target):
+             result = self.fullAnnotateImage(target)
+             return result
+         else:
+             result = []
+             if type(target) is str:
+                 target = [target]
+
+             for annotations_result in self._lightPipeline.fullAnnotateJava(target):
+                 result.append(self.__buildStages(annotations_result))
+             return result
+
+     def __isPath(self, target):
+         if type(target) is list:
+             target = target[0]
+
+         if target.find("/") < 0:
+             return False
+         else:
+             is_valid_file = _internal._ResourceHelper_validFile(target).apply()
+             return is_valid_file
+
+     def __fullAnnotateAudio(self, audios):
+         result = []
+         if type(audios[0]) is float:
+             annotations_dict = self._lightPipeline.fullAnnotateSingleAudioJava(audios)
+             result.append(self.__buildStages(annotations_dict))
+         else:
+             full_annotations = self._lightPipeline.fullAnnotateAudiosJava(audios)
+             for annotations_dict in full_annotations:
+                 result.append(self.__buildStages(annotations_dict))
+
+         return result
+
+     def __fullAnnotateQuestionAnswering(self, question, context):
+         result = []
+         if type(question) is str and type(context) is str:
+             annotations_dict = self._lightPipeline.fullAnnotateJava(question, context)
+             result.append(self.__buildStages(annotations_dict))
+         else:
+             full_annotations = self._lightPipeline.fullAnnotateJava(question, context)
+             for annotations_dict in full_annotations:
+                 result.append(self.__buildStages(annotations_dict))
+
+         return result
+
+     def fullAnnotateImage(self, path_to_image, text=None):
+         """Annotates the data provided into `Annotation` type results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         path_to_image : list or str
+             Source path of image, list of paths to images
+         text : list or str, optional
+             Optional list or str of texts. If None, defaults to empty list if path_to_image is a list, or empty string if path_to_image is a string.
+
+         Returns
+         -------
+         List[AnnotationImage]
+             The result of the annotation
+         """
+         if not isinstance(path_to_image, (str, list)):
+             raise TypeError("argument for path_to_image must be 'str' or 'list[str]'")
+
+         if text is None:
+             text = "" if isinstance(path_to_image, str) else []
+
+         if type(path_to_image) != type(text):
+             raise ValueError("`path_to_image` and `text` must be of the same type")
+
+         stages = self.pipeline_model.stages
+         if not self._skipPipelineValidation(stages):
+             self._validateStagesInputCols(stages)
+
+         if isinstance(path_to_image, str):
+             path_to_image = [path_to_image]
+             text = [text]
+
+         result = []
+
+         for image_result in self._lightPipeline.fullAnnotateImageJava(path_to_image, text):
+             result.append(self.__buildStages(image_result))
+
+         return result
+
+     def __buildStages(self, annotations_result):
+         stages = {}
+         for annotator_type, annotations in annotations_result.items():
+             stages[annotator_type] = self._annotationFromJava(annotations)
+         return stages
+
+     def annotate(self, target, optional_target=""):
+         """Annotates the data provided, extracting the results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         target : list or str
+             The data to be annotated
+         optional_target : list or str
+             Optional data to be annotated (currently used for Question Answering)
+
+         Returns
+         -------
+         List[dict] or dict
+             The result of the annotation
+
+         Examples
+         --------
+         >>> from sparknlp.pretrained import PretrainedPipeline
+         >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+         >>> result = explain_document_pipeline.annotate('U.N. official Ekeus heads for Baghdad.')
+         >>> result.keys()
+         dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])
+         >>> result["ner"]
+         ['B-ORG', 'O', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'O']
+         """
+
+         def reformat(annotations):
+             return {k: list(v) for k, v in annotations.items()}
+
+         stages = self.pipeline_model.stages
+         if not self._skipPipelineValidation(stages):
+             self._validateStagesInputCols(stages)
+
+         if optional_target == "":
+             if type(target) is str:
+                 annotations = self._lightPipeline.annotateJava(target)
+                 result = reformat(annotations)
+             elif type(target) is list:
+                 if type(target[0]) is list:
370
+ raise TypeError("target is a 1D list")
371
+ annotations = self._lightPipeline.annotateJava(target)
372
+ result = list(map(lambda a: reformat(a), list(annotations)))
373
+ else:
374
+ raise TypeError("target for annotation must be 'str' or list")
375
+
376
+ else:
377
+ if type(target) is str and type(optional_target) is str:
378
+ annotations = self._lightPipeline.annotateJava(target, optional_target)
379
+ result = reformat(annotations)
380
+ elif type(target) is list and type(optional_target) is list:
381
+ if type(target[0]) is list or type(optional_target[0]) is list:
382
+ raise TypeError("target and optional_target is a 1D list")
383
+ annotations = self._lightPipeline.annotateJava(target, optional_target)
384
+ result = list(map(lambda a: reformat(a), list(annotations)))
385
+ else:
386
+ raise TypeError("target and optional_target for annotation must be both 'str' or both lists")
387
+
388
+ return result
389
+
390
+ def transform(self, dataframe):
391
+ """Transforms a dataframe provided with the stages of the LightPipeline.
392
+
393
+ Parameters
394
+ ----------
395
+ dataframe : :class:`pyspark.sql.DataFrame`
396
+ The Dataframe to be transformed
397
+
398
+ Returns
399
+ -------
400
+ :class:`pyspark.sql.DataFrame`
401
+ The transformed DataFrame
402
+ """
403
+ return self.pipeline_model.transform(dataframe)
404
+
405
+ def setIgnoreUnsupported(self, value):
406
+ """Sets whether to ignore unsupported AnnotatorModels.
407
+
408
+ Parameters
409
+ ----------
410
+ value : bool
411
+ Whether to ignore unsupported AnnotatorModels.
412
+
413
+ Returns
414
+ -------
415
+ LightPipeline
416
+ The current LightPipeline
417
+ """
418
+ self._lightPipeline.setIgnoreUnsupported(value)
419
+ return self
420
+
421
+ def getIgnoreUnsupported(self):
422
+ """Gets whether to ignore unsupported AnnotatorModels.
423
+
424
+ Returns
425
+ -------
426
+ bool
427
+ Whether to ignore unsupported AnnotatorModels.
428
+ """
429
+ return self._lightPipeline.getIgnoreUnsupported()
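
And a minimal end-to-end sketch of the LightPipeline defined above (again doctest style; the two-stage pipeline is illustrative and assumes an active SparkSession named `spark`):

>>> from pyspark.ml import Pipeline
>>> from sparknlp.base import DocumentAssembler, LightPipeline
>>> from sparknlp.annotator import Tokenizer
>>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
>>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
>>> pipeline = Pipeline(stages=[documentAssembler, tokenizer])
>>> data = spark.createDataFrame([["We are very happy about Spark NLP"]]).toDF("text")
>>> light = LightPipeline(pipeline.fit(data))
>>> light.annotate("We are very happy about Spark NLP")["token"]
['We', 'are', 'very', 'happy', 'about', 'Spark', 'NLP']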