spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,190 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark import keyword_only
+ from pyspark.ml.param import Param, Params, TypeConverters
+ from pyspark.ml.param.shared import HasInputCol, HasOutputCol
+ from pyspark.ml.util import JavaMLReadable, JavaMLWritable
+ from pyspark.ml.wrapper import JavaTransformer
+
+ from sparknlp.reader.enums import TextStripperType
+
+
+ class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
+                 JavaMLReadable, JavaMLWritable):
+     """
+     Extract text from PDF documents as either a single string or multiple strings per page.
+     Input is a column with the binary content of PDF files. Output is a column with the
+     extracted text, with options to include page numbers or split pages.
+
+     Parameters
+     ----------
+     pageNumCol : str, optional
+         Page number output column name.
+     partitionNum : int, optional
+         Number of partitions (default is 0).
+     storeSplittedPdf : bool, optional
+         Whether to store the content of split PDFs (default is False).
+     splitPage : bool, optional
+         Enable/disable splitting per page (default is True).
+     onlyPageNum : bool, optional
+         Whether to extract only page numbers (default is False).
+     textStripper : str or TextStripperType, optional
+         Defines the layout and formatting type.
+     sort : bool, optional
+         Enable/disable sorting content per page (default is False).
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.reader import *
+     >>> from pyspark.ml import Pipeline
+     >>> pdf_path = "Documents/files/pdf"
+     >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
+     >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
+     >>> pipeline = Pipeline(stages=[pdf_to_text])
+     >>> pipeline_model = pipeline.fit(data_frame)
+     >>> pdf_df = pipeline_model.transform(data_frame)
+     >>> pdf_df.show()
+     +--------------------+--------------------+
+     |                path|    modificationTime|
+     +--------------------+--------------------+
+     |file:/Users/paula...|2025-05-15 11:33:...|
+     |file:/Users/paula...|2025-05-15 11:33:...|
+     +--------------------+--------------------+
+     >>> pdf_df.printSchema()
+     root
+      |-- path: string (nullable = true)
+      |-- modificationTime: timestamp (nullable = true)
+      |-- length: long (nullable = true)
+      |-- text: string (nullable = true)
+      |-- height_dimension: integer (nullable = true)
+      |-- width_dimension: integer (nullable = true)
+      |-- content: binary (nullable = true)
+      |-- exception: string (nullable = true)
+      |-- pagenum: integer (nullable = true)
+     """
+     pageNumCol = Param(Params._dummy(), "pageNumCol",
+                        "Page number output column name.",
+                        typeConverter=TypeConverters.toString)
+
+     partitionNum = Param(Params._dummy(), "partitionNum",
+                          "Number of partitions.",
+                          typeConverter=TypeConverters.toInt)
+
+     storeSplittedPdf = Param(Params._dummy(), "storeSplittedPdf",
+                              "Force to store the content of split PDFs.",
+                              typeConverter=TypeConverters.toBoolean)
+
+     splitPage = Param(Params._dummy(), "splitPage",
+                       "Param to enable/disable splitting the document per page.",
+                       typeConverter=TypeConverters.toBoolean)
+
+     textStripper = Param(Params._dummy(), "textStripper",
+                          "Text stripper type used for output layout and formatting.",
+                          typeConverter=TypeConverters.toString)
+
+     sort = Param(Params._dummy(), "sort",
+                  "Param to enable/disable sorting lines.",
+                  typeConverter=TypeConverters.toBoolean)
+
+     onlyPageNum = Param(Params._dummy(), "onlyPageNum",
+                         "Force to extract only the number of pages.",
+                         typeConverter=TypeConverters.toBoolean)
+
+     extractCoordinates = Param(Params._dummy(), "extractCoordinates",
+                                "Force extraction of coordinates of text.",
+                                typeConverter=TypeConverters.toBoolean)
+
+     normalizeLigatures = Param(Params._dummy(), "normalizeLigatures",
+                                "Whether to convert ligature chars such as 'fl' into their corresponding chars (e.g., {'f', 'l'}).",
+                                typeConverter=TypeConverters.toBoolean)
+
+     @keyword_only
+     def __init__(self):
+         """
+         __init__(self)
+         """
+         super(PdfToText, self).__init__()
+         self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)
+
+     def setInputCol(self, value):
+         """
+         Sets the value of :py:attr:`inputCol`.
+         """
+         return self._set(inputCol=value)
+
+     def setOutputCol(self, value):
+         """
+         Sets the value of :py:attr:`outputCol`.
+         """
+         return self._set(outputCol=value)
+
+     def setPageNumCol(self, value):
+         """
+         Sets the value of :py:attr:`pageNumCol`.
+         """
+         return self._set(pageNumCol=value)
+
+     def setPartitionNum(self, value):
+         """
+         Sets the value of :py:attr:`partitionNum`.
+         """
+         return self._set(partitionNum=value)
+
+     def setStoreSplittedPdf(self, value):
+         """
+         Sets the value of :py:attr:`storeSplittedPdf`.
+         """
+         return self._set(storeSplittedPdf=value)
+
+     def setSplitPage(self, value):
+         """
+         Sets the value of :py:attr:`splitPage`.
+         """
+         return self._set(splitPage=value)
+
+     def setOnlyPageNum(self, value):
+         """
+         Sets the value of :py:attr:`onlyPageNum`.
+         """
+         return self._set(onlyPageNum=value)
+
+     def setTextStripper(self, value):
+         """
+         Sets the value of :py:attr:`textStripper`.
+         """
+         if isinstance(value, TextStripperType):
+             value = value.value
+         if value not in [i.value for i in TextStripperType]:
+             type_value = type(value)
+             raise ValueError(f"Param textStripper must be a 'TextStripperType' enum but got {type_value}.")
+         return self._set(textStripper=str(value))
+
+     def setSort(self, value):
+         """
+         Sets the value of :py:attr:`sort`.
+         """
+         return self._set(sort=value)
+
+     def setExtractCoordinates(self, value):
+         """
+         Sets the value of :py:attr:`extractCoordinates`.
+         """
+         return self._set(extractCoordinates=value)
+
+     def setNormalizeLigatures(self, value):
+         """
+         Sets the value of :py:attr:`normalizeLigatures`.
+         """
+         return self._set(normalizeLigatures=value)
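
Editor's note: a minimal usage sketch for the PdfToText stage added above, assuming an active `spark` session. The TextStripperType members live in sparknlp/reader/enums.py, which is not shown in this diff, so PDF_LAYOUT_TEXT_STRIPPER below is an assumed member name; the column names follow the schema printed in the docstring.

    from pyspark.ml import Pipeline
    from sparknlp.reader import PdfToText
    from sparknlp.reader.enums import TextStripperType

    # Binary PDF content loaded via Spark's binaryFile source, as in the docstring example.
    data_frame = spark.read.format("binaryFile").load("Documents/files/pdf")

    pdf_to_text = (PdfToText()
        .setInputCol("content")      # binary column produced by binaryFile
        .setOutputCol("text")
        .setSplitPage(True)          # one output row per page (default True)
        .setPageNumCol("pagenum")
        # Assumed enum member; setTextStripper also accepts the member's string
        # value and validates it against the enum, raising ValueError otherwise.
        .setTextStripper(TextStripperType.PDF_LAYOUT_TEXT_STRIPPER))

    pdf_df = Pipeline(stages=[pdf_to_text]).fit(data_frame).transform(data_frame)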
@@ -0,0 +1,124 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark import keyword_only
+
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+ from sparknlp.partition.partition_properties import *
+
+
+ class Reader2Doc(
+     AnnotatorTransformer,
+     HasReaderProperties,
+     HasHTMLReaderProperties,
+     HasEmailReaderProperties,
+     HasExcelReaderProperties,
+     HasPowerPointProperties,
+     HasTextReaderProperties
+ ):
+     """
+     The Reader2Doc annotator makes it easier to read files within existing
+     Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+     Reader2Doc can be used for extracting structured content from various document types
+     using Spark NLP readers. It supports reading from many file types and returns parsed
+     output as a structured Spark DataFrame.
+
+     Supported formats include:
+
+     - Plain text
+     - HTML
+     - Word (.doc/.docx)
+     - Excel (.xls/.xlsx)
+     - PowerPoint (.ppt/.pptx)
+     - Email files (.eml, .msg)
+     - PDFs
+
+     Examples
+     --------
+     >>> from johnsnowlabs.reader import Reader2Doc
+     >>> from johnsnowlabs.nlp.base import DocumentAssembler
+     >>> from pyspark.ml import Pipeline
+     >>> # Initialize Reader2Doc for PDF files
+     >>> reader2doc = Reader2Doc() \\
+     ...     .setContentType("application/pdf") \\
+     ...     .setContentPath(f"{pdf_directory}/")
+     >>> # Build the pipeline with the Reader2Doc stage
+     >>> pipeline = Pipeline(stages=[reader2doc])
+     >>> # Fit the pipeline to an empty DataFrame
+     >>> pipeline_model = pipeline.fit(empty_data_set)
+     >>> result_df = pipeline_model.transform(empty_data_set)
+     >>> # Show the resulting DataFrame
+     >>> result_df.show()
+     +--------------------------------------------------------------------------------------------------------------------------------------------+
+     |document                                                                                                                                    |
+     +--------------------------------------------------------------------------------------------------------------------------------------------+
+     |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}]                        |
+     |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]      |
+     |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
+     +--------------------------------------------------------------------------------------------------------------------------------------------+
+     """
+
+     name = "Reader2Doc"
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     excludeNonText = Param(
+         Params._dummy(),
+         "excludeNonText",
+         "Whether to exclude non-text content from the output. Default is False.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setExcludeNonText(self, value):
+         """Sets whether to exclude non-text content from the output.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to exclude non-text content from the output. Default is False.
+         """
+         return self._set(excludeNonText=value)
+
+     joinString = Param(
+         Params._dummy(),
+         "joinString",
+         "If outputAsDocument is true, specifies the string used to join elements into a single document.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setJoinString(self, value):
+         """
+         If outputAsDocument is true, specifies the string used to join elements into a single document.
+         """
+         return self._set(joinString=value)
+
+     @keyword_only
+     def __init__(self):
+         super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
+         self._setDefault(
+             outputCol="document",
+             explodeDocs=False,
+             contentType="",
+             flattenOutput=False,
+             outputAsDocument=True,
+             outputFormat="plain-text",
+             excludeNonText=False,
+             joinString="\n"
+         )
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
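
Editor's note: the two params introduced above, excludeNonText and joinString, are not exercised by the docstring example. A hedged sketch follows; the content path is a placeholder, the defaults come from _setDefault(), and the empty DataFrame stands in for the docstring's empty_data_set.

    from pyspark.ml import Pipeline
    from sparknlp.reader.reader2doc import Reader2Doc

    empty_data_set = spark.createDataFrame([], "value string")  # stand-in empty DataFrame

    reader2doc = (Reader2Doc()
        .setContentType("text/html")
        .setContentPath("/path/to/html/")  # placeholder path
        .setExcludeNonText(True)           # drop non-text elements (default False)
        .setJoinString(" "))               # used when outputAsDocument=True (default "\n")

    result_df = Pipeline(stages=[reader2doc]).fit(empty_data_set).transform(empty_data_set)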
@@ -0,0 +1,136 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+ from sparknlp.partition.partition_properties import *
+
+
+ class Reader2Image(
+     AnnotatorTransformer,
+     HasReaderProperties,
+     HasHTMLReaderProperties,
+     HasPdfProperties
+ ):
+     """
+     The Reader2Image annotator makes it easier to read files containing images within existing
+     Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Image can be used for
+     extracting structured image content from various document types using Spark NLP readers. It
+     supports reading from many file types and returns parsed output as a structured Spark DataFrame.
+
+     Supported formats include HTML and Markdown.
+
+     == Example ==
+     This example demonstrates how to load HTML files with images and process them into a structured
+     Spark DataFrame using Reader2Image.
+
+     Expected output:
+     +-------------------+--------------------+
+     |           fileName|               image|
+     +-------------------+--------------------+
+     |example-images.html|[{image, example-...|
+     |example-images.html|[{image, example-...|
+     +-------------------+--------------------+
+
+     Schema:
+     root
+      |-- fileName: string (nullable = true)
+      |-- image: array (nullable = false)
+      |    |-- element: struct (containsNull = true)
+      |    |    |-- annotatorType: string (nullable = true)
+      |    |    |-- origin: string (nullable = true)
+      |    |    |-- height: integer (nullable = false)
+      |    |    |-- width: integer (nullable = false)
+      |    |    |-- nChannels: integer (nullable = false)
+      |    |    |-- mode: integer (nullable = false)
+      |    |    |-- result: binary (nullable = true)
+      |    |    |-- metadata: map (nullable = true)
+      |    |    |    |-- key: string
+      |    |    |    |-- value: string (valueContainsNull = true)
+      |    |    |-- text: string (nullable = true)
+     """
+
+     name = "Reader2Image"
+     outputAnnotatorType = AnnotatorType.IMAGE
+
+     userMessage = Param(
+         Params._dummy(),
+         "userMessage",
+         "Custom user message.",
+         typeConverter=TypeConverters.toString
+     )
+
+     promptTemplate = Param(
+         Params._dummy(),
+         "promptTemplate",
+         "Format of the output prompt.",
+         typeConverter=TypeConverters.toString
+     )
+
+     customPromptTemplate = Param(
+         Params._dummy(),
+         "customPromptTemplate",
+         "Custom prompt template for image models.",
+         typeConverter=TypeConverters.toString
+     )
+
+     @keyword_only
+     def __init__(self):
+         super(Reader2Image, self).__init__(classname="com.johnsnowlabs.reader.Reader2Image")
+         self._setDefault(
+             contentType="",
+             outputFormat="image",
+             explodeDocs=True,
+             userMessage="Describe this image",
+             promptTemplate="qwen2vl-chat",
+             readAsImage=True,
+             customPromptTemplate="",
+             ignoreExceptions=True
+         )
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setUserMessage(self, value: str):
+         """Sets custom user message.
+
+         Parameters
+         ----------
+         value : str
+             Custom user message to include.
+         """
+         return self._set(userMessage=value)
+
+     def setPromptTemplate(self, value: str):
+         """Sets format of the output prompt.
+
+         Parameters
+         ----------
+         value : str
+             Prompt template format.
+         """
+         return self._set(promptTemplate=value)
+
+     def setCustomPromptTemplate(self, value: str):
+         """Sets custom prompt template for image models.
+
+         Parameters
+         ----------
+         value : str
+             Custom prompt template string.
+         """
+         return self._set(customPromptTemplate=value)
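
Editor's note: the Reader2Image docstring shows the expected output but not the code that produces it. A minimal sketch follows, assuming the setContentType/setContentPath setters inherited through HasReaderProperties and an HTML input like the example-images.html file named above; the path is a placeholder and empty_data_set is the same empty-DataFrame stand-in used earlier.

    from pyspark.ml import Pipeline
    from sparknlp.reader.reader2image import Reader2Image

    reader2image = (Reader2Image()
        .setContentType("text/html")
        .setContentPath("/path/to/example-images.html")  # placeholder path
        .setUserMessage("Describe this image"))          # same as the default

    result_df = Pipeline(stages=[reader2image]).fit(empty_data_set).transform(empty_data_set)
    result_df.printSchema()  # should match the schema listed in the docstring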
@@ -0,0 +1,44 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark import keyword_only
+
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+ from sparknlp.partition.partition_properties import *
+
+
+ class Reader2Table(
+     AnnotatorTransformer,
+     HasReaderProperties,
+     HasEmailReaderProperties,
+     HasExcelReaderProperties,
+     HasHTMLReaderProperties,
+     HasPowerPointProperties,
+     HasTextReaderProperties
+ ):
+     name = 'Reader2Table'
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     @keyword_only
+     def __init__(self):
+         super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
+         self._setDefault(outputCol="document", outputFormat="json-table", inferTableStructure=True,
+                          outputAsDocument=False)
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
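
Editor's note: Reader2Table ships without a docstring. By analogy with Reader2Doc above, a hedged usage sketch follows; the content type and path are placeholders, and the defaults (outputCol="document", outputFormat="json-table", inferTableStructure=True) come from _setDefault().

    from pyspark.ml import Pipeline
    from sparknlp.reader.reader2table import Reader2Table

    reader2table = (Reader2Table()
        .setContentType("text/html")          # placeholder content type
        .setContentPath("/path/to/tables/"))  # placeholder path

    result_df = Pipeline(stages=[reader2table]).fit(empty_data_set).transform(empty_data_set)
    result_df.select("document").show(truncate=False)  # one JSON table per row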
@@ -0,0 +1,159 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark import keyword_only
+
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+ from sparknlp.partition.partition_properties import *
+
+
+ class ReaderAssembler(
+     AnnotatorTransformer,
+     HasReaderProperties,
+     HasHTMLReaderProperties,
+     HasEmailReaderProperties,
+     HasExcelReaderProperties,
+     HasPowerPointProperties,
+     HasTextReaderProperties,
+     HasPdfProperties
+ ):
+     """
+     The ReaderAssembler annotator provides a unified interface for combining multiple Spark NLP
+     readers (such as Reader2Doc, Reader2Table, and Reader2Image) into a single, configurable
+     component. It automatically orchestrates the execution of different readers based on input
+     type, configured priorities, and fallback strategies, allowing you to handle diverse content
+     formats without manually chaining multiple readers in your pipeline.
+
+     ReaderAssembler simplifies the process of building flexible pipelines capable of ingesting
+     and processing documents, tables, and images in a consistent way. It handles reader selection,
+     ordering, and fault tolerance internally, ensuring that pipelines remain concise, robust, and
+     easy to maintain.
+
+     Examples
+     --------
+     >>> from johnsnowlabs.reader import ReaderAssembler
+     >>> from pyspark.ml import Pipeline
+     >>>
+     >>> reader_assembler = ReaderAssembler() \\
+     ...     .setContentType("text/html") \\
+     ...     .setContentPath("/table-image.html") \\
+     ...     .setOutputCol("document")
+     >>>
+     >>> pipeline = Pipeline(stages=[reader_assembler])
+     >>> pipeline_model = pipeline.fit(empty_data_set)
+     >>> result_df = pipeline_model.transform(empty_data_set)
+     >>>
+     >>> result_df.show()
+     +--------+----------------------+----------------------+----------------------+---------+
+     |fileName|         document_text|        document_table|        document_image|exception|
+     +--------+----------------------+----------------------+----------------------+---------+
+     |    null|[{'document', 0, 26...|[{'document', 0, 50...|[{'image', , 5, 5, ...|     null|
+     +--------+----------------------+----------------------+----------------------+---------+
+
+     This annotator is especially useful when working with heterogeneous input data, for example
+     when a dataset includes PDFs, spreadsheets, and images, as Spark NLP automatically invokes
+     the appropriate reader for each file type while preserving a unified schema in the output.
+     """
+
+     name = 'ReaderAssembler'
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     excludeNonText = Param(
+         Params._dummy(),
+         "excludeNonText",
+         "Whether to exclude non-text content from the output. Default is False.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     userMessage = Param(
+         Params._dummy(),
+         "userMessage",
+         "Custom user message.",
+         typeConverter=TypeConverters.toString
+     )
+
+     promptTemplate = Param(
+         Params._dummy(),
+         "promptTemplate",
+         "Format of the output prompt.",
+         typeConverter=TypeConverters.toString
+     )
+
+     customPromptTemplate = Param(
+         Params._dummy(),
+         "customPromptTemplate",
+         "Custom prompt template for image models.",
+         typeConverter=TypeConverters.toString
+     )
+
+     @keyword_only
+     def __init__(self):
+         super(ReaderAssembler, self).__init__(classname="com.johnsnowlabs.reader.ReaderAssembler")
+         self._setDefault(contentType="",
+                          explodeDocs=False,
+                          userMessage="Describe this image",
+                          promptTemplate="qwen2vl-chat",
+                          readAsImage=True,
+                          customPromptTemplate="",
+                          ignoreExceptions=True,
+                          flattenOutput=False,
+                          titleThreshold=18)
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setExcludeNonText(self, value):
+         """Sets whether to exclude non-text content from the output.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to exclude non-text content from the output. Default is False.
+         """
+         return self._set(excludeNonText=value)
+
+     def setUserMessage(self, value: str):
+         """Sets custom user message.
+
+         Parameters
+         ----------
+         value : str
+             Custom user message to include.
+         """
+         return self._set(userMessage=value)
+
+     def setPromptTemplate(self, value: str):
+         """Sets format of the output prompt.
+
+         Parameters
+         ----------
+         value : str
+             Prompt template format.
+         """
+         return self._set(promptTemplate=value)
+
+     def setCustomPromptTemplate(self, value: str):
+         """Sets custom prompt template for image models.
+
+         Parameters
+         ----------
+         value : str
+             Custom prompt template string.
+         """
+         return self._set(customPromptTemplate=value)