spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/partition/partition_properties.py

@@ -0,0 +1,902 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for partition properties used in reading various document types."""
+from typing import Dict
+from pyspark.ml.param import Param, Params, TypeConverters
+
+
+class HasReaderProperties(Params):
+    inputCol = Param(
+        Params._dummy(),
+        "inputCol",
+        "input column name",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setInputCol(self, value):
+        """Sets input column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the Input Column
+        """
+        return self._set(inputCol=value)
+
+    outputCol = Param(
+        Params._dummy(),
+        "outputCol",
+        "output column name",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOutputCol(self, value):
+        """Sets output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the Output Column
+        """
+        return self._set(outputCol=value)
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the content source.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentPath(self, value: str):
+        """Sets content path.
+
+        Parameters
+        ----------
+        value : str
+            Path to the content source.
+        """
+        return self._set(contentPath=value)
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Set the content type to load following MIME specification.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentType(self, value: str):
+        """Sets content type following MIME specification.
+
+        Parameters
+        ----------
+        value : str
+            Content type string (MIME format).
+        """
+        return self._set(contentType=value)
+
+    storeContent = Param(
+        Params._dummy(),
+        "storeContent",
+        "Whether to include the raw file content in the output DataFrame "
+        "as a separate 'content' column, alongside the structured output.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreContent(self, value: bool):
+        """Sets whether to store raw file content.
+
+        Parameters
+        ----------
+        value : bool
+            True to include raw file content, False otherwise.
+        """
+        return self._set(storeContent=value)
+
+    titleFontSize = Param(
+        Params._dummy(),
+        "titleFontSize",
+        "Minimum font size threshold used as part of heuristic rules to detect "
+        "title elements based on formatting (e.g., bold, centered, capitalized).",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleFontSize(self, value: int):
+        """Sets minimum font size for detecting titles.
+
+        Parameters
+        ----------
+        value : int
+            Minimum font size threshold for title detection.
+        """
+        return self._set(titleFontSize=value)
+
+    inferTableStructure = Param(
+        Params._dummy(),
+        "inferTableStructure",
+        "Whether to generate an HTML table representation from structured table content. "
+        "When enabled, a full <table> element is added alongside cell-level elements, "
+        "based on row and column layout.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setInferTableStructure(self, value: bool):
+        """Sets whether to infer table structure.
+
+        Parameters
+        ----------
+        value : bool
+            True to generate HTML table representation, False otherwise.
+        """
+        return self._set(inferTableStructure=value)
+
+    includePageBreaks = Param(
+        Params._dummy(),
+        "includePageBreaks",
+        "Whether to detect and tag content with page break metadata. "
+        "In Word documents, this includes manual and section breaks. "
+        "In Excel files, this includes page breaks based on column boundaries.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludePageBreaks(self, value: bool):
+        """Sets whether to include page break metadata.
+
+        Parameters
+        ----------
+        value : bool
+            True to detect and tag page breaks, False otherwise.
+        """
+        return self._set(includePageBreaks=value)
+
+    ignoreExceptions = Param(
+        Params._dummy(),
+        "ignoreExceptions",
+        "Whether to ignore exceptions during processing.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIgnoreExceptions(self, value: bool):
+        """Sets whether to ignore exceptions during processing.
+
+        Parameters
+        ----------
+        value : bool
+            True to ignore exceptions, False otherwise.
+        """
+        return self._set(ignoreExceptions=value)
+
+    explodeDocs = Param(
+        Params._dummy(),
+        "explodeDocs",
+        "Whether to explode the documents into separate rows.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExplodeDocs(self, value: bool):
+        """Sets whether to explode the documents into separate rows.
+
+        Parameters
+        ----------
+        value : bool
+            True to split documents into multiple rows, False to keep them in one row.
+        """
+        return self._set(explodeDocs=value)
+
+    flattenOutput = Param(
+        Params._dummy(),
+        "flattenOutput",
+        "If true, output is flattened to plain text with minimal metadata",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setFlattenOutput(self, value):
+        """Sets whether to flatten the output to plain text with minimal metadata.
+
+        Parameters
+        ----------
+        value : bool
+            If true, output is flattened to plain text with minimal metadata
+        """
+        return self._set(flattenOutput=value)
+
+    titleThreshold = Param(
+        Params._dummy(),
+        "titleThreshold",
+        "Minimum font size threshold for title detection in PDF docs",
+        typeConverter=TypeConverters.toFloat
+    )
+
+    def setTitleThreshold(self, value):
+        """Sets the minimum font size threshold for title detection in PDF documents.
+
+        Parameters
+        ----------
+        value : float
+            Minimum font size threshold for title detection in PDF docs
+        """
+        return self._set(titleThreshold=value)
+
+    outputAsDocument = Param(
+        Params._dummy(),
+        "outputAsDocument",
+        "Whether to return all sentences joined into a single document",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setOutputAsDocument(self, value):
+        """Sets whether to return all sentences joined into a single document.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to return all sentences joined into a single document
+        """
+        return self._set(outputAsDocument=value)
+
+
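A minimal usage sketch for HasReaderProperties. DemoReader and all the values shown are hypothetical illustrations; in the package this mixin is meant to be combined into Spark-backed readers such as those under sparknlp/partition/:

```python
from sparknlp.partition.partition_properties import HasReaderProperties

class DemoReader(HasReaderProperties):
    """Hypothetical stand-in for a reader that mixes in these properties."""

demo = DemoReader()
# Each setter returns self via Params._set, so calls chain naturally.
demo.setContentPath("file:///tmp/docs") \
    .setContentType("text/html") \
    .setStoreContent(True) \
    .setTitleFontSize(16)

# Values land in the pyspark param map and can be read back.
print(demo.getOrDefault(demo.contentType))   # text/html
print(demo.getOrDefault(demo.storeContent))  # True
```
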
+class HasEmailReaderProperties(Params):
+    addAttachmentContent = Param(
+        Params._dummy(),
+        "addAttachmentContent",
+        "Whether to extract and include the textual content of plain-text attachments in the output",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setAddAttachmentContent(self, value):
+        """
+        Sets whether to extract and include the textual content of plain-text attachments in the output.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to include text from plain-text attachments.
+        """
+        return self._set(addAttachmentContent=value)
+
+    def getAddAttachmentContent(self):
+        """
+        Gets whether to extract and include the textual content of plain-text attachments in the output.
+
+        Returns
+        -------
+        bool
+            Whether to include text from plain-text attachments.
+        """
+        return self.getOrDefault(self.addAttachmentContent)
+
+
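The email mixin follows the same pattern, with a paired getter. A short sketch (DemoEmailReader is hypothetical):

```python
from sparknlp.partition.partition_properties import HasEmailReaderProperties

class DemoEmailReader(HasEmailReaderProperties):
    """Hypothetical stand-in for an email reader."""

email = DemoEmailReader()
email.setAddAttachmentContent(True)
print(email.getAddAttachmentContent())  # True
```
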
+class HasExcelReaderProperties(Params):
+    cellSeparator = Param(
+        Params._dummy(),
+        "cellSeparator",
+        "String used to join cell values in a row when assembling textual output.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setCellSeparator(self, value):
+        """
+        Sets the string used to join cell values in a row when assembling textual output.
+
+        Parameters
+        ----------
+        value : str
+            Delimiter used to concatenate cell values.
+        """
+        return self._set(cellSeparator=value)
+
+    def getCellSeparator(self):
+        """
+        Gets the string used to join cell values in a row when assembling textual output.
+
+        Returns
+        -------
+        str
+            Delimiter used to concatenate cell values.
+        """
+        return self.getOrDefault(self.cellSeparator)
+
+    appendCells = Param(
+        Params._dummy(),
+        "appendCells",
+        "Whether to append all rows into a single content block instead of creating separate elements per row.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setAppendCells(self, value):
+        """
+        Sets whether to append all rows into a single content block.
+
+        Parameters
+        ----------
+        value : bool
+            True to merge rows into one block, False for individual elements.
+        """
+        return self._set(appendCells=value)
+
+    def getAppendCells(self):
+        """
+        Gets whether to append all rows into a single content block.
+
+        Returns
+        -------
+        bool
+            True to merge rows into one block, False for individual elements.
+        """
+        return self.getOrDefault(self.appendCells)
+
+
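A sketch of the Excel options (DemoExcelReader and the separator value are hypothetical): setAppendCells(False) keeps each row as its own element, while cellSeparator controls how cells within a row are joined.

```python
from sparknlp.partition.partition_properties import HasExcelReaderProperties

class DemoExcelReader(HasExcelReaderProperties):
    """Hypothetical stand-in for an Excel reader."""

xls = DemoExcelReader()
xls.setCellSeparator(" | ").setAppendCells(False)
print(xls.getCellSeparator())  # " | "
```
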
+class HasHTMLReaderProperties(Params):
+    timeout = Param(
+        Params._dummy(),
+        "timeout",
+        "Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTimeout(self, value):
+        """
+        Sets the timeout (in seconds) for reading remote HTML resources.
+
+        Parameters
+        ----------
+        value : int
+            Timeout in seconds for remote content retrieval.
+        """
+        return self._set(timeout=value)
+
+    def getTimeout(self):
+        """
+        Gets the timeout value for reading remote HTML resources.
+
+        Returns
+        -------
+        int
+            Timeout in seconds.
+        """
+        return self.getOrDefault(self.timeout)
+
+    def setHeaders(self, headers: Dict[str, str]):
+        """Sets custom HTTP headers to use when fetching remote HTML resources."""
+        self._call_java("setHeadersPython", headers)
+        return self
+
+    outputFormat = Param(
+        Params._dummy(),
+        "outputFormat",
+        "Output format for the table content. Options are 'plain-text', 'html-table', or 'json-table'. Default is 'json-table'.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOutputFormat(self, value: str):
+        """Sets output format for the table content.
+
+        Options
+        -------
+        - 'plain-text'
+        - 'html-table'
+        - 'json-table' (default)
+
+        Parameters
+        ----------
+        value : str
+            Output format for the table content.
+        """
+        return self._set(outputFormat=value)
+
+
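The HTML mixin mixes plain params (timeout, outputFormat) with one JVM-backed call. A sketch, with the caveat that setHeaders() forwards to the JVM via _call_java("setHeadersPython", ...) and therefore only works on a real Spark-backed transformer, not on a bare mixin subclass like the hypothetical DemoHtmlReader below:

```python
from sparknlp.partition.partition_properties import HasHTMLReaderProperties

class DemoHtmlReader(HasHTMLReaderProperties):
    """Hypothetical stand-in; lacks the JVM backing that setHeaders() needs."""

html = DemoHtmlReader()
html.setTimeout(30).setOutputFormat("html-table")
print(html.getTimeout())  # 30
```
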
+class HasPowerPointProperties(Params):
+    includeSlideNotes = Param(
+        Params._dummy(),
+        "includeSlideNotes",
+        "Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludeSlideNotes(self, value):
+        """
+        Sets whether to extract speaker notes from slides.
+
+        Parameters
+        ----------
+        value : bool
+            If True, notes are included as narrative text elements.
+        """
+        return self._set(includeSlideNotes=value)
+
+    def getIncludeSlideNotes(self):
+        """
+        Gets whether to extract speaker notes from slides.
+
+        Returns
+        -------
+        bool
+            True if notes are included as narrative text elements.
+        """
+        return self.getOrDefault(self.includeSlideNotes)
+
+
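A sketch for slide decks (DemoSlidesReader is hypothetical): enabling includeSlideNotes makes speaker notes come through as narrative text elements.

```python
from sparknlp.partition.partition_properties import HasPowerPointProperties

class DemoSlidesReader(HasPowerPointProperties):
    """Hypothetical stand-in for a PowerPoint reader."""

slides = DemoSlidesReader()
slides.setIncludeSlideNotes(True)
print(slides.getIncludeSlideNotes())  # True
```
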
+class HasTextReaderProperties(Params):
+    titleLengthSize = Param(
+        Params._dummy(),
+        "titleLengthSize",
+        "Maximum character length used to determine if a text block qualifies as a title during parsing.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleLengthSize(self, value):
+        """Set the maximum character length used to identify title blocks.
+
+        Parameters
+        ----------
+        value : int
+            Maximum number of characters a text block can have to be considered a title.
+
+        Returns
+        -------
+        self
+            The instance with updated `titleLengthSize` parameter.
+        """
+        return self._set(titleLengthSize=value)
+
+    def getTitleLengthSize(self):
+        """Get the configured maximum title length.
+
+        Returns
+        -------
+        int
+            The maximum character length used to detect title blocks.
+        """
+        return self.getOrDefault(self.titleLengthSize)
+
+    groupBrokenParagraphs = Param(
+        Params._dummy(),
+        "groupBrokenParagraphs",
+        "Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setGroupBrokenParagraphs(self, value):
+        """Enable or disable grouping of broken paragraphs.
+
+        Parameters
+        ----------
+        value : bool
+            True to merge fragmented lines into paragraphs, False to leave lines as-is.
+
+        Returns
+        -------
+        self
+            The instance with updated `groupBrokenParagraphs` parameter.
+        """
+        return self._set(groupBrokenParagraphs=value)
+
+    def getGroupBrokenParagraphs(self):
+        """Get whether broken paragraph grouping is enabled.
+
+        Returns
+        -------
+        bool
+            True if grouping of broken paragraphs is enabled, False otherwise.
+        """
+        return self.getOrDefault(self.groupBrokenParagraphs)
+
+    paragraphSplit = Param(
+        Params._dummy(),
+        "paragraphSplit",
+        "Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setParagraphSplit(self, value):
+        """Set the regex pattern used to split paragraphs when grouping broken paragraphs.
+
+        Parameters
+        ----------
+        value : str
+            Regular expression string used to detect paragraph boundaries.
+
+        Returns
+        -------
+        self
+            The instance with updated `paragraphSplit` parameter.
+        """
+        return self._set(paragraphSplit=value)
+
+    def getParagraphSplit(self):
+        """Get the paragraph-splitting regex pattern.
+
+        Returns
+        -------
+        str
+            The regex pattern used to detect paragraph boundaries.
+        """
+        return self.getOrDefault(self.paragraphSplit)
+
+    shortLineWordThreshold = Param(
+        Params._dummy(),
+        "shortLineWordThreshold",
+        "Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setShortLineWordThreshold(self, value):
+        """Set the maximum word count for a line to be considered short.
+
+        Parameters
+        ----------
+        value : int
+            Number of words under which a line is considered 'short'.
+
+        Returns
+        -------
+        self
+            The instance with updated `shortLineWordThreshold` parameter.
+        """
+        return self._set(shortLineWordThreshold=value)
+
+    def getShortLineWordThreshold(self):
+        """Get the short line word threshold.
+
+        Returns
+        -------
+        int
+            Word count threshold for short lines used in paragraph grouping.
+        """
+        return self.getOrDefault(self.shortLineWordThreshold)
+
+    maxLineCount = Param(
+        Params._dummy(),
+        "maxLineCount",
+        "Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setMaxLineCount(self, value):
+        """Set the maximum number of lines to inspect when estimating paragraph layout.
+
+        Parameters
+        ----------
+        value : int
+            Maximum number of lines to evaluate for layout heuristics.
+
+        Returns
+        -------
+        self
+            The instance with updated `maxLineCount` parameter.
+        """
+        return self._set(maxLineCount=value)
+
+    def getMaxLineCount(self):
+        """Get the maximum number of lines used for layout heuristics.
+
+        Returns
+        -------
+        int
+            The configured maximum number of lines to consider.
+        """
+        return self.getOrDefault(self.maxLineCount)
+
+    threshold = Param(
+        Params._dummy(),
+        "threshold",
+        "Threshold ratio of empty lines used to decide between new line-based or broken-paragraph grouping.",
+        typeConverter=TypeConverters.toFloat
+    )
+
+    def setThreshold(self, value):
+        """Set the empty-line ratio threshold for paragraph grouping decision.
+
+        Parameters
+        ----------
+        value : float
+            Ratio (0.0-1.0) of empty lines used to switch grouping strategies.
+
+        Returns
+        -------
+        self
+            The instance with updated `threshold` parameter.
+        """
+        return self._set(threshold=value)
+
+    def getThreshold(self):
+        """Get the configured empty-line threshold ratio.
+
+        Returns
+        -------
+        float
+            The ratio used to decide paragraph grouping strategy.
+        """
+        return self.getOrDefault(self.threshold)
+
+    extractTagAttributes = Param(
+        Params._dummy(),
+        "extractTagAttributes",
+        "Extract attribute values into separate lines when parsing tag-based formats (e.g., HTML or XML).",
+        typeConverter=TypeConverters.toListString
+    )
+
+    def setExtractTagAttributes(self, attributes: list[str]):
+        """Specify which tag attributes should have their values extracted as text
+        when parsing tag-based formats (e.g., HTML or XML).
+
+        Parameters
+        ----------
+        attributes : list[str]
+            Attribute names to extract.
+
+        Returns
+        -------
+        self
+            The instance with the updated `extractTagAttributes` parameter.
+        """
+        return self._set(extractTagAttributes=attributes)
+
+    def getExtractTagAttributes(self):
+        """Get the list of tag attribute names configured to be extracted.
+
+        Returns
+        -------
+        list[str]
+            The attribute names whose values will be extracted as text.
+        """
+        return self.getOrDefault(self.extractTagAttributes)
+
+
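A sketch of tuning the broken-paragraph heuristics for plain text. DemoTextReader is hypothetical and the values are illustrative only, not verified package defaults:

```python
from sparknlp.partition.partition_properties import HasTextReaderProperties

class DemoTextReader(HasTextReaderProperties):
    """Hypothetical stand-in for a plain-text reader."""

txt = DemoTextReader()
# Treat two or more consecutive (possibly whitespace-padded) newlines
# as a paragraph boundary, and call lines under 5 words "short".
txt.setGroupBrokenParagraphs(True) \
   .setParagraphSplit(r"(\s*\n\s*){2,}") \
   .setShortLineWordThreshold(5) \
   .setThreshold(0.5)
print(txt.getParagraphSplit())
```
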
+class HasChunkerProperties(Params):
+
+    chunkingStrategy = Param(
+        Params._dummy(),
+        "chunkingStrategy",
+        "Set the chunking strategy",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setChunkingStrategy(self, value):
+        """Sets the chunking strategy."""
+        return self._set(chunkingStrategy=value)
+
+    maxCharacters = Param(
+        Params._dummy(),
+        "maxCharacters",
+        "Set the maximum number of characters",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setMaxCharacters(self, value):
+        """Sets the maximum number of characters per chunk."""
+        return self._set(maxCharacters=value)
+
+    newAfterNChars = Param(
+        Params._dummy(),
+        "newAfterNChars",
+        "Insert a new chunk after N characters",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setNewAfterNChars(self, value):
+        """Sets the character count after which a new chunk is started."""
+        return self._set(newAfterNChars=value)
+
+    overlap = Param(
+        Params._dummy(),
+        "overlap",
+        "Set the number of overlapping characters between chunks",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setOverlap(self, value):
+        """Sets the number of overlapping characters between chunks."""
+        return self._set(overlap=value)
+
+    combineTextUnderNChars = Param(
+        Params._dummy(),
+        "combineTextUnderNChars",
+        "Threshold to merge adjacent small sections",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setCombineTextUnderNChars(self, value):
+        """Sets the threshold under which adjacent small sections are merged."""
+        return self._set(combineTextUnderNChars=value)
+
+    overlapAll = Param(
+        Params._dummy(),
+        "overlapAll",
+        "Apply overlap context between all sections, not just split chunks",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setOverlapAll(self, value):
+        """Sets whether to apply overlap context between all sections, not just split chunks."""
+        return self._set(overlapAll=value)
+
+
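A sketch of a chunking configuration. DemoChunker is hypothetical, and "basic" is an illustrative strategy name that has not been verified against this release:

```python
from sparknlp.partition.partition_properties import HasChunkerProperties

class DemoChunker(HasChunkerProperties):
    """Hypothetical stand-in for a chunking reader."""

chunker = DemoChunker()
# Hard cap at 512 chars, start a new chunk after 400,
# and carry 50 chars of overlap between split chunks.
chunker.setChunkingStrategy("basic") \
       .setMaxCharacters(512) \
       .setNewAfterNChars(400) \
       .setOverlap(50) \
       .setOverlapAll(False)
```
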
+class HasPdfProperties(Params):
+
+    pageNumCol = Param(
+        Params._dummy(),
+        "pageNumCol",
+        "Page number output column name.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setPageNumCol(self, value: str):
+        """Sets page number output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the column for page numbers.
+        """
+        return self._set(pageNumCol=value)
+
+    originCol = Param(
+        Params._dummy(),
+        "originCol",
+        "Input column name with original path of file.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOriginCol(self, value: str):
+        """Sets input column with original file path.
+
+        Parameters
+        ----------
+        value : str
+            Column name that stores the file path.
+        """
+        return self._set(originCol=value)
+
+    partitionNum = Param(
+        Params._dummy(),
+        "partitionNum",
+        "Number of partitions.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setPartitionNum(self, value: int):
+        """Sets number of partitions.
+
+        Parameters
+        ----------
+        value : int
+            Number of partitions to use.
+        """
+        return self._set(partitionNum=value)
+
+    storeSplittedPdf = Param(
+        Params._dummy(),
+        "storeSplittedPdf",
+        "Whether to store the byte content of each split PDF page.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreSplittedPdf(self, value: bool):
+        """Sets whether to store byte content of split PDF pages.
+
+        Parameters
+        ----------
+        value : bool
+            True to store PDF page bytes, False otherwise.
+        """
+        return self._set(storeSplittedPdf=value)
+
+    splitPage = Param(
+        Params._dummy(),
+        "splitPage",
+        "Enable/disable splitting per page to identify page numbers and improve performance.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setSplitPage(self, value: bool):
+        """Sets whether to split PDF into pages.
+
+        Parameters
+        ----------
+        value : bool
+            True to split per page, False otherwise.
+        """
+        return self._set(splitPage=value)
+
+    onlyPageNum = Param(
+        Params._dummy(),
+        "onlyPageNum",
+        "Extract only page numbers.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setOnlyPageNum(self, value: bool):
+        """Sets whether to extract only page numbers.
+
+        Parameters
+        ----------
+        value : bool
+            True to extract only page numbers, False otherwise.
+        """
+        return self._set(onlyPageNum=value)
+
+    textStripper = Param(
+        Params._dummy(),
+        "textStripper",
+        "Text stripper type used for output layout and formatting.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setTextStripper(self, value: str):
+        """Sets text stripper type.
+
+        Parameters
+        ----------
+        value : str
+            Text stripper type for layout and formatting.
+        """
+        return self._set(textStripper=value)
+
+    sort = Param(
+        Params._dummy(),
+        "sort",
+        "Enable/disable sorting content on the page.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setSort(self, value: bool):
+        """Sets whether to sort content on the page.
+
+        Parameters
+        ----------
+        value : bool
+            True to sort content, False otherwise.
+        """
+        return self._set(sort=value)
+
+    extractCoordinates = Param(
+        Params._dummy(),
+        "extractCoordinates",
+        "Whether to extract the coordinates of text.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExtractCoordinates(self, value: bool):
+        """Sets whether to extract coordinates of text.
+
+        Parameters
+        ----------
+        value : bool
+            True to extract coordinates, False otherwise.
+        """
+        return self._set(extractCoordinates=value)
+
+    normalizeLigatures = Param(
+        Params._dummy(),
+        "normalizeLigatures",
+        "Whether to convert ligature chars such as 'fl' into their corresponding chars (e.g., {'f', 'l'}).",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setNormalizeLigatures(self, value: bool):
+        """Sets whether to normalize ligatures (e.g., fl → f + l).
+
+        Parameters
+        ----------
+        value : bool
+            True to normalize ligatures, False otherwise.
+        """
+        return self._set(normalizeLigatures=value)
+
+    readAsImage = Param(
+        Params._dummy(),
+        "readAsImage",
+        "Read PDF pages as images.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setReadAsImage(self, value: bool):
+        """Sets whether to read PDF pages as images.
+
+        Parameters
+        ----------
+        value : bool
+            True to read as images, False otherwise.
+        """
+        return self._set(readAsImage=value)
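
Finally, a sketch of a per-page PDF read with coordinate extraction. DemoPdfReader and the column name are hypothetical; in the package, PDF-oriented components such as those under sparknlp/reader/ would mix HasPdfProperties into a Spark-backed transformer:

```python
from sparknlp.partition.partition_properties import HasPdfProperties

class DemoPdfReader(HasPdfProperties):
    """Hypothetical stand-in for a PDF reader."""

pdf = DemoPdfReader()
pdf.setSplitPage(True) \
   .setExtractCoordinates(True) \
   .setNormalizeLigatures(True) \
   .setPageNumCol("pagenum")
print(pdf.getOrDefault(pdf.pageNumCol))  # pagenum
```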