spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,902 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for partition properties used in reading various document types."""
15
+ from typing import Dict
16
+ from pyspark.ml.param import Param, Params, TypeConverters
17
+
18
+
19
class HasReaderProperties(Params):
    """Mixin that declares the Spark params shared by Spark NLP document readers.

    Each parameter is a :class:`pyspark.ml.param.Param`, and each ``set*``
    method forwards to ``Params._set`` and returns ``self`` so calls can be
    chained fluently.
    """

    inputCol = Param(
        Params._dummy(),
        "inputCol",
        "input column name",
        typeConverter=TypeConverters.toString
    )

    def setInputCol(self, value: str):
        """Sets input column name.

        Parameters
        ----------
        value : str
            Name of the Input Column
        """
        return self._set(inputCol=value)

    outputCol = Param(
        Params._dummy(),
        "outputCol",
        "output column name",
        typeConverter=TypeConverters.toString
    )

    def setOutputCol(self, value: str):
        """Sets output column name.

        Parameters
        ----------
        value : str
            Name of the Output Column
        """
        return self._set(outputCol=value)

    contentPath = Param(
        Params._dummy(),
        "contentPath",
        "Path to the content source.",
        typeConverter=TypeConverters.toString
    )

    def setContentPath(self, value: str):
        """Sets content path.

        Parameters
        ----------
        value : str
            Path to the content source.
        """
        return self._set(contentPath=value)

    contentType = Param(
        Params._dummy(),
        "contentType",
        "Set the content type to load following MIME specification.",
        typeConverter=TypeConverters.toString
    )

    def setContentType(self, value: str):
        """Sets content type following MIME specification.

        Parameters
        ----------
        value : str
            Content type string (MIME format).
        """
        return self._set(contentType=value)

    storeContent = Param(
        Params._dummy(),
        "storeContent",
        "Whether to include the raw file content in the output DataFrame "
        "as a separate 'content' column, alongside the structured output.",
        typeConverter=TypeConverters.toBoolean
    )

    def setStoreContent(self, value: bool):
        """Sets whether to store raw file content.

        Parameters
        ----------
        value : bool
            True to include raw file content, False otherwise.
        """
        return self._set(storeContent=value)

    titleFontSize = Param(
        Params._dummy(),
        "titleFontSize",
        "Minimum font size threshold used as part of heuristic rules to detect "
        "title elements based on formatting (e.g., bold, centered, capitalized).",
        typeConverter=TypeConverters.toInt
    )

    def setTitleFontSize(self, value: int):
        """Sets minimum font size for detecting titles.

        Parameters
        ----------
        value : int
            Minimum font size threshold for title detection.
        """
        return self._set(titleFontSize=value)

    inferTableStructure = Param(
        Params._dummy(),
        "inferTableStructure",
        "Whether to generate an HTML table representation from structured table content. "
        "When enabled, a full <table> element is added alongside cell-level elements, "
        "based on row and column layout.",
        typeConverter=TypeConverters.toBoolean
    )

    def setInferTableStructure(self, value: bool):
        """Sets whether to infer table structure.

        Parameters
        ----------
        value : bool
            True to generate HTML table representation, False otherwise.
        """
        return self._set(inferTableStructure=value)

    includePageBreaks = Param(
        Params._dummy(),
        "includePageBreaks",
        "Whether to detect and tag content with page break metadata. "
        "In Word documents, this includes manual and section breaks. "
        "In Excel files, this includes page breaks based on column boundaries.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIncludePageBreaks(self, value: bool):
        """Sets whether to include page break metadata.

        Parameters
        ----------
        value : bool
            True to detect and tag page breaks, False otherwise.
        """
        return self._set(includePageBreaks=value)

    ignoreExceptions = Param(
        Params._dummy(),
        "ignoreExceptions",
        "Whether to ignore exceptions during processing.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIgnoreExceptions(self, value: bool):
        """Sets whether to ignore exceptions during processing.

        Parameters
        ----------
        value : bool
            True to ignore exceptions, False otherwise.
        """
        return self._set(ignoreExceptions=value)

    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "Whether to explode the documents into separate rows.",
        typeConverter=TypeConverters.toBoolean
    )

    def setExplodeDocs(self, value: bool):
        """Sets whether to explode the documents into separate rows.

        Parameters
        ----------
        value : bool
            True to split documents into multiple rows, False to keep them in one row.
        """
        return self._set(explodeDocs=value)

    flattenOutput = Param(
        Params._dummy(),
        "flattenOutput",
        "If true, output is flattened to plain text with minimal metadata",
        typeConverter=TypeConverters.toBoolean
    )

    def setFlattenOutput(self, value: bool):
        """Sets whether to flatten the output to plain text with minimal metadata.

        Parameters
        ----------
        value : bool
            If true, output is flattened to plain text with minimal metadata
        """
        return self._set(flattenOutput=value)

    titleThreshold = Param(
        Params._dummy(),
        "titleThreshold",
        "Minimum font size threshold for title detection in PDF docs",
        typeConverter=TypeConverters.toFloat
    )

    def setTitleThreshold(self, value: float):
        """Sets the minimum font size threshold for title detection in PDF documents.

        Parameters
        ----------
        value : float
            Minimum font size threshold for title detection in PDF docs
        """
        return self._set(titleThreshold=value)

    outputAsDocument = Param(
        Params._dummy(),
        "outputAsDocument",
        "Whether to return all sentences joined into a single document",
        typeConverter=TypeConverters.toBoolean
    )

    def setOutputAsDocument(self, value: bool):
        """Sets whether to return all sentences joined into a single document.

        Parameters
        ----------
        value : bool
            Whether to return all sentences joined into a single document
        """
        return self._set(outputAsDocument=value)
246
+
247
+
248
class HasEmailReaderProperties(Params):
    """Param mixin for email readers: controls attachment-content extraction."""

    addAttachmentContent = Param(
        Params._dummy(),
        "addAttachmentContent",
        "Whether to extract and include the textual content of plain-text attachments in the output",
        typeConverter=TypeConverters.toBoolean,
    )

    def setAddAttachmentContent(self, value):
        """Enable or disable inclusion of plain-text attachment content.

        Parameters
        ----------
        value : bool
            True to append the text of plain-text attachments to the output,
            False to leave attachments out.
        """
        return self._set(addAttachmentContent=value)

    def getAddAttachmentContent(self):
        """Return the current addAttachmentContent setting.

        Returns
        -------
        bool
            True when plain-text attachment content is included in the output.
        """
        return self.getOrDefault(self.addAttachmentContent)
277
+
278
+
279
class HasExcelReaderProperties(Params):
    """Param mixin for Excel readers: cell joining and row aggregation options."""

    cellSeparator = Param(
        Params._dummy(),
        "cellSeparator",
        "String used to join cell values in a row when assembling textual output.",
        typeConverter=TypeConverters.toString,
    )

    def setCellSeparator(self, value):
        """Set the delimiter placed between cell values of a row.

        Parameters
        ----------
        value : str
            Delimiter used to concatenate cell values.
        """
        return self._set(cellSeparator=value)

    def getCellSeparator(self):
        """Return the delimiter placed between cell values of a row.

        Returns
        -------
        str
            Delimiter used to concatenate cell values.
        """
        return self.getOrDefault(self.cellSeparator)

    appendCells = Param(
        Params._dummy(),
        "appendCells",
        "Whether to append all rows into a single content block instead of creating separate elements per row.",
        typeConverter=TypeConverters.toBoolean,
    )

    def setAppendCells(self, value):
        """Choose between one merged content block or per-row elements.

        Parameters
        ----------
        value : bool
            True to merge rows into one block, False for individual elements.
        """
        return self._set(appendCells=value)

    def getAppendCells(self):
        """Return whether rows are merged into a single content block.

        Returns
        -------
        bool
            True to merge rows into one block, False for individual elements.
        """
        return self.getOrDefault(self.appendCells)
337
+
338
+
339
class HasHTMLReaderProperties(Params):
    """Mixin with Spark params for reading HTML content, locally or from URLs."""

    timeout = Param(
        Params._dummy(),
        "timeout",
        "Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs.",
        typeConverter=TypeConverters.toInt
    )

    def setTimeout(self, value: int):
        """
        Sets the timeout (in seconds) for reading remote HTML resources.

        Parameters
        ----------
        value : int
            Timeout in seconds for remote content retrieval.
        """
        return self._set(timeout=value)

    def getTimeout(self):
        """
        Gets the timeout value for reading remote HTML resources.

        Returns
        -------
        int
            Timeout in seconds.
        """
        return self.getOrDefault(self.timeout)

    def setHeaders(self, headers: Dict[str, str]):
        """Sets HTTP headers to send when fetching remote HTML resources.

        Forwarded to the JVM side via the ``setHeadersPython`` method rather
        than a regular Spark param.

        Parameters
        ----------
        headers : Dict[str, str]
            Header name to header value mapping.
        """
        self._call_java("setHeadersPython", headers)
        return self

    outputFormat = Param(
        Params._dummy(),
        # Fixed: the description previously omitted 'json-table' from the
        # listed options even though it is the default.
        "outputFormat",
        "Output format for the table content. Options are 'plain-text', 'html-table' or 'json-table'. Default is 'json-table'.",
        typeConverter=TypeConverters.toString
    )

    def setOutputFormat(self, value: str):
        """Sets output format for the table content.

        Options
        -------
        - 'plain-text'
        - 'html-table'
        - 'json-table' (default)

        Parameters
        ----------
        value : str
            Output format for the table content.
        """
        return self._set(outputFormat=value)
395
+
396
+
397
class HasPowerPointProperties(Params):
    """Param mixin for PowerPoint readers: speaker-note extraction toggle."""

    includeSlideNotes = Param(
        Params._dummy(),
        "includeSlideNotes",
        "Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements.",
        typeConverter=TypeConverters.toBoolean,
    )

    def setIncludeSlideNotes(self, value):
        """Enable or disable extraction of speaker notes from slides.

        Parameters
        ----------
        value : bool
            If True, notes are included as narrative text elements.
        """
        return self._set(includeSlideNotes=value)

    def getIncludeSlideNotes(self):
        """Return whether speaker notes are extracted from slides.

        Returns
        -------
        bool
            True if notes are included as narrative text elements.
        """
        return self.getOrDefault(self.includeSlideNotes)
426
+
427
+
428
class HasTextReaderProperties(Params):
    """Mixin with parameters that control plain-text parsing heuristics:
    title detection, broken-paragraph grouping, and tag-attribute extraction.
    """

    titleLengthSize = Param(
        Params._dummy(),
        "titleLengthSize",
        "Maximum character length used to determine if a text block qualifies as a title during parsing.",
        typeConverter=TypeConverters.toInt
    )

    def setTitleLengthSize(self, value):
        """Set the maximum character length used to identify title blocks.

        Parameters
        ----------
        value : int
            Maximum number of characters a text block can have to be considered a title.

        Returns
        -------
        self
            The instance with updated `titleLengthSize` parameter.
        """
        return self._set(titleLengthSize=value)

    def getTitleLengthSize(self):
        """Get the configured maximum title length.

        Returns
        -------
        int
            The maximum character length used to detect title blocks.
        """
        return self.getOrDefault(self.titleLengthSize)

    groupBrokenParagraphs = Param(
        Params._dummy(),
        "groupBrokenParagraphs",
        "Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
        typeConverter=TypeConverters.toBoolean
    )

    def setGroupBrokenParagraphs(self, value):
        """Enable or disable grouping of broken paragraphs.

        Parameters
        ----------
        value : bool
            True to merge fragmented lines into paragraphs, False to leave lines as-is.

        Returns
        -------
        self
            The instance with updated `groupBrokenParagraphs` parameter.
        """
        return self._set(groupBrokenParagraphs=value)

    def getGroupBrokenParagraphs(self):
        """Get whether broken paragraph grouping is enabled.

        Returns
        -------
        bool
            True if grouping of broken paragraphs is enabled, False otherwise.
        """
        return self.getOrDefault(self.groupBrokenParagraphs)

    paragraphSplit = Param(
        Params._dummy(),
        "paragraphSplit",
        "Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
        typeConverter=TypeConverters.toString
    )

    def setParagraphSplit(self, value):
        """Set the regex pattern used to split paragraphs when grouping broken paragraphs.

        Parameters
        ----------
        value : str
            Regular expression string used to detect paragraph boundaries.

        Returns
        -------
        self
            The instance with updated `paragraphSplit` parameter.
        """
        return self._set(paragraphSplit=value)

    def getParagraphSplit(self):
        """Get the paragraph-splitting regex pattern.

        Returns
        -------
        str
            The regex pattern used to detect paragraph boundaries.
        """
        return self.getOrDefault(self.paragraphSplit)

    shortLineWordThreshold = Param(
        Params._dummy(),
        "shortLineWordThreshold",
        "Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
        typeConverter=TypeConverters.toInt
    )

    def setShortLineWordThreshold(self, value):
        """Set the maximum word count for a line to be considered short.

        Parameters
        ----------
        value : int
            Number of words under which a line is considered 'short'.

        Returns
        -------
        self
            The instance with updated `shortLineWordThreshold` parameter.
        """
        return self._set(shortLineWordThreshold=value)

    def getShortLineWordThreshold(self):
        """Get the short line word threshold.

        Returns
        -------
        int
            Word count threshold for short lines used in paragraph grouping.
        """
        return self.getOrDefault(self.shortLineWordThreshold)

    maxLineCount = Param(
        Params._dummy(),
        "maxLineCount",
        "Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
        typeConverter=TypeConverters.toInt
    )

    def setMaxLineCount(self, value):
        """Set the maximum number of lines to inspect when estimating paragraph layout.

        Parameters
        ----------
        value : int
            Maximum number of lines to evaluate for layout heuristics.

        Returns
        -------
        self
            The instance with updated `maxLineCount` parameter.
        """
        return self._set(maxLineCount=value)

    def getMaxLineCount(self):
        """Get the maximum number of lines used for layout heuristics.

        Returns
        -------
        int
            The configured maximum number of lines to consider.
        """
        return self.getOrDefault(self.maxLineCount)

    threshold = Param(
        Params._dummy(),
        "threshold",
        "Threshold ratio of empty lines used to decide between new line-based or broken-paragraph grouping.",
        typeConverter=TypeConverters.toFloat
    )

    def setThreshold(self, value):
        """Set the empty-line ratio threshold for paragraph grouping decision.

        Parameters
        ----------
        value : float
            Ratio (0.0-1.0) of empty lines used to switch grouping strategies.

        Returns
        -------
        self
            The instance with updated `threshold` parameter.
        """
        return self._set(threshold=value)

    def getThreshold(self):
        """Get the configured empty-line threshold ratio.

        Returns
        -------
        float
            The ratio used to decide paragraph grouping strategy.
        """
        return self.getOrDefault(self.threshold)

    extractTagAttributes = Param(
        Params._dummy(),
        "extractTagAttributes",
        "Extract attribute values into separate lines when parsing tag-based formats (e.g., HTML or XML).",
        typeConverter=TypeConverters.toListString
    )

    def setExtractTagAttributes(self, attributes: list[str]):
        """Specify which tag attributes should have their values extracted as
        text when parsing tag-based formats (e.g., HTML or XML).

        Parameters
        ----------
        attributes : list[str]
            Attribute names whose values will be extracted as text.

        Returns
        -------
        self
            The instance with updated `extractTagAttributes` parameter.
        """
        return self._set(extractTagAttributes=attributes)

    def getExtractTagAttributes(self):
        """Get the list of tag attribute names configured to be extracted.

        Returns
        -------
        list[str]
            The attribute names whose values will be extracted as text.
        """
        return self.getOrDefault(self.extractTagAttributes)
647
+
648
+
649
class HasChunkerProperties(Params):
    """Mixin with parameters controlling how extracted content is chunked."""

    chunkingStrategy = Param(
        Params._dummy(),
        "chunkingStrategy",
        "Set the chunking strategy",
        typeConverter=TypeConverters.toString
    )

    def setChunkingStrategy(self, value):
        """Sets the chunking strategy.

        Parameters
        ----------
        value : str
            Name of the chunking strategy to apply.
        """
        return self._set(chunkingStrategy=value)

    def getChunkingStrategy(self):
        """Gets the configured chunking strategy.

        Returns
        -------
        str
            Name of the chunking strategy.
        """
        return self.getOrDefault(self.chunkingStrategy)

    maxCharacters = Param(
        Params._dummy(),
        "maxCharacters",
        "Set the maximum number of characters",
        typeConverter=TypeConverters.toInt
    )

    def setMaxCharacters(self, value):
        """Sets the maximum number of characters per chunk.

        Parameters
        ----------
        value : int
            Maximum number of characters allowed in a chunk.
        """
        return self._set(maxCharacters=value)

    def getMaxCharacters(self):
        """Gets the maximum number of characters per chunk.

        Returns
        -------
        int
            Maximum number of characters allowed in a chunk.
        """
        return self.getOrDefault(self.maxCharacters)

    newAfterNChars = Param(
        Params._dummy(),
        "newAfterNChars",
        "Insert a new chunk after N characters",
        typeConverter=TypeConverters.toInt
    )

    def setNewAfterNChars(self, value):
        """Sets the soft limit after which a new chunk is started.

        Parameters
        ----------
        value : int
            Number of characters after which a new chunk is inserted.
        """
        return self._set(newAfterNChars=value)

    def getNewAfterNChars(self):
        """Gets the soft limit after which a new chunk is started.

        Returns
        -------
        int
            Number of characters after which a new chunk is inserted.
        """
        return self.getOrDefault(self.newAfterNChars)

    overlap = Param(
        Params._dummy(),
        "overlap",
        "Set the number of overlapping characters between chunks",
        typeConverter=TypeConverters.toInt
    )

    def setOverlap(self, value):
        """Sets the number of overlapping characters between chunks.

        Parameters
        ----------
        value : int
            Number of characters shared between consecutive chunks.
        """
        return self._set(overlap=value)

    def getOverlap(self):
        """Gets the number of overlapping characters between chunks.

        Returns
        -------
        int
            Number of characters shared between consecutive chunks.
        """
        return self.getOrDefault(self.overlap)

    combineTextUnderNChars = Param(
        Params._dummy(),
        "combineTextUnderNChars",
        "Threshold to merge adjacent small sections",
        typeConverter=TypeConverters.toInt
    )

    def setCombineTextUnderNChars(self, value):
        """Sets the threshold under which adjacent small sections are merged.

        Parameters
        ----------
        value : int
            Character-count threshold for merging adjacent small sections.
        """
        return self._set(combineTextUnderNChars=value)

    def getCombineTextUnderNChars(self):
        """Gets the threshold under which adjacent small sections are merged.

        Returns
        -------
        int
            Character-count threshold for merging adjacent small sections.
        """
        return self.getOrDefault(self.combineTextUnderNChars)

    overlapAll = Param(
        Params._dummy(),
        "overlapAll",
        "Apply overlap context between all sections, not just split chunks",
        typeConverter=TypeConverters.toBoolean
    )

    def setOverlapAll(self, value):
        """Sets whether overlap context is applied between all sections.

        Parameters
        ----------
        value : bool
            True to apply overlap to all sections, not just split chunks.
        """
        return self._set(overlapAll=value)

    def getOverlapAll(self):
        """Gets whether overlap context is applied between all sections.

        Returns
        -------
        bool
            True if overlap is applied to all sections, not just split chunks.
        """
        return self.getOrDefault(self.overlapAll)
710
+
711
+
712
+ from pyspark.ml.param import Param, Params, TypeConverters
713
+
714
+
715
class HasPdfProperties(Params):
    """Mixin with parameters used when reading and splitting PDF documents."""

    pageNumCol = Param(
        Params._dummy(),
        "pageNumCol",
        "Page number output column name.",
        typeConverter=TypeConverters.toString
    )

    def setPageNumCol(self, value: str):
        """Sets page number output column name.

        Parameters
        ----------
        value : str
            Name of the column for page numbers.
        """
        return self._set(pageNumCol=value)

    originCol = Param(
        Params._dummy(),
        "originCol",
        "Input column name with original path of file.",
        typeConverter=TypeConverters.toString
    )

    def setOriginCol(self, value: str):
        """Sets input column with original file path.

        Parameters
        ----------
        value : str
            Column name that stores the file path.
        """
        return self._set(originCol=value)

    partitionNum = Param(
        Params._dummy(),
        "partitionNum",
        "Number of partitions.",
        typeConverter=TypeConverters.toInt
    )

    def setPartitionNum(self, value: int):
        """Sets number of partitions.

        Parameters
        ----------
        value : int
            Number of partitions to use.
        """
        return self._set(partitionNum=value)

    storeSplittedPdf = Param(
        Params._dummy(),
        "storeSplittedPdf",
        "Force to store bytes content of splitted pdf.",
        typeConverter=TypeConverters.toBoolean
    )

    def setStoreSplittedPdf(self, value: bool):
        """Sets whether to store byte content of split PDF pages.

        Parameters
        ----------
        value : bool
            True to store PDF page bytes, False otherwise.
        """
        return self._set(storeSplittedPdf=value)

    splitPage = Param(
        Params._dummy(),
        "splitPage",
        "Enable/disable splitting per page to identify page numbers and improve performance.",
        typeConverter=TypeConverters.toBoolean
    )

    def setSplitPage(self, value: bool):
        """Sets whether to split PDF into pages.

        Parameters
        ----------
        value : bool
            True to split per page, False otherwise.
        """
        return self._set(splitPage=value)

    onlyPageNum = Param(
        Params._dummy(),
        "onlyPageNum",
        "Extract only page numbers.",
        typeConverter=TypeConverters.toBoolean
    )

    def setOnlyPageNum(self, value: bool):
        """Sets whether to extract only page numbers.

        Parameters
        ----------
        value : bool
            True to extract only page numbers, False otherwise.
        """
        return self._set(onlyPageNum=value)

    textStripper = Param(
        Params._dummy(),
        "textStripper",
        "Text stripper type used for output layout and formatting.",
        typeConverter=TypeConverters.toString
    )

    def setTextStripper(self, value: str):
        """Sets text stripper type.

        Parameters
        ----------
        value : str
            Text stripper type for layout and formatting.
        """
        return self._set(textStripper=value)

    sort = Param(
        Params._dummy(),
        "sort",
        "Enable/disable sorting content on the page.",
        typeConverter=TypeConverters.toBoolean
    )

    def setSort(self, value: bool):
        """Sets whether to sort content on the page.

        Parameters
        ----------
        value : bool
            True to sort content, False otherwise.
        """
        return self._set(sort=value)

    extractCoordinates = Param(
        Params._dummy(),
        "extractCoordinates",
        "Force extract coordinates of text.",
        typeConverter=TypeConverters.toBoolean
    )

    def setExtractCoordinates(self, value: bool):
        """Sets whether to extract coordinates of text.

        Parameters
        ----------
        value : bool
            True to extract coordinates, False otherwise.
        """
        return self._set(extractCoordinates=value)

    normalizeLigatures = Param(
        Params._dummy(),
        "normalizeLigatures",
        "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
        typeConverter=TypeConverters.toBoolean
    )

    def setNormalizeLigatures(self, value: bool):
        """Sets whether to normalize ligatures (e.g., fl → f + l).

        Parameters
        ----------
        value : bool
            True to normalize ligatures, False otherwise.
        """
        return self._set(normalizeLigatures=value)

    readAsImage = Param(
        Params._dummy(),
        "readAsImage",
        "Read PDF pages as images.",
        typeConverter=TypeConverters.toBoolean
    )

    def setReadAsImage(self, value: bool):
        """Sets whether to read PDF pages as images.

        Parameters
        ----------
        value : bool
            True to read as images, False otherwise.
        """
        return self._set(readAsImage=value)