spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/base/finisher.py
@@ -0,0 +1,217 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the Finisher."""
+
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+from sparknlp.internal import AnnotatorTransformer
+
+
+class Finisher(AnnotatorTransformer):
+    """Converts annotation results into a format that is easier to use.
+
+    It is useful to extract the results from Spark NLP Pipelines. The Finisher
+    outputs annotation values as ``String``.
+
+    For more extended examples on document pre-processing see the
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``ANY``                ``NONE``
+    ====================== ======================
+
+    Parameters
+    ----------
+    inputCols
+        Input annotations
+    outputCols
+        Output finished annotation cols
+    valueSplitSymbol
+        Character separating values, by default #
+    annotationSplitSymbol
+        Character separating annotations, by default @
+    cleanAnnotations
+        Whether to remove annotation columns, by default True
+    includeMetadata
+        Whether to include annotation metadata, by default False
+    outputAsArray
+        Finisher generates an Array with the results instead of string, by
+        default True
+    parseEmbeddingsVectors
+        Whether to include embeddings vectors in the process, by default False
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from sparknlp.pretrained import PretrainedPipeline
+    >>> data = spark.createDataFrame([[1, "New York and New Jersey aren't that far apart actually."]]).toDF("id", "text")
+
+    Define a pretrained pipeline that extracts Named Entities, amongst other
+    things, and apply the `Finisher` to it.
+
+    >>> pipeline = PretrainedPipeline("explain_document_dl")
+    >>> finisher = Finisher().setInputCols("entities").setOutputCols("output")
+    >>> explainResult = pipeline.transform(data)
+
+    Show results.
+
+    >>> explainResult.selectExpr("explode(entities)").show(truncate=False)
+    +------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |entities                                                                                                                                              |
+    +------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |[[chunk, 0, 7, New York, [entity -> LOC, sentence -> 0, chunk -> 0], []], [chunk, 13, 22, New Jersey, [entity -> LOC, sentence -> 0, chunk -> 1], []]]|
+    +------------------------------------------------------------------------------------------------------------------------------------------------------+
+    >>> result = finisher.transform(explainResult)
+    >>> result.select("output").show(truncate=False)
+    +----------------------+
+    |output                |
+    +----------------------+
+    |[New York, New Jersey]|
+    +----------------------+
+
+    See Also
+    --------
+    Finisher : for finishing Strings
+    """
+
+    inputCols = Param(Params._dummy(), "inputCols", "input annotations", typeConverter=TypeConverters.toListString)
+    outputCols = Param(Params._dummy(), "outputCols", "output finished annotation cols", typeConverter=TypeConverters.toListString)
+    valueSplitSymbol = Param(Params._dummy(), "valueSplitSymbol", "character separating values", typeConverter=TypeConverters.toString)
+    annotationSplitSymbol = Param(Params._dummy(), "annotationSplitSymbol", "character separating annotations", typeConverter=TypeConverters.toString)
+    cleanAnnotations = Param(Params._dummy(), "cleanAnnotations", "whether to remove annotation columns", typeConverter=TypeConverters.toBoolean)
+    includeMetadata = Param(Params._dummy(), "includeMetadata", "whether to include annotation metadata", typeConverter=TypeConverters.toBoolean)
+    outputAsArray = Param(Params._dummy(), "outputAsArray", "finisher generates an Array with the results instead of string", typeConverter=TypeConverters.toBoolean)
+    parseEmbeddingsVectors = Param(Params._dummy(), "parseEmbeddingsVectors", "whether to include embeddings vectors in the process", typeConverter=TypeConverters.toBoolean)
+    name = "Finisher"
+
+    @keyword_only
+    def __init__(self):
+        super(Finisher, self).__init__(classname="com.johnsnowlabs.nlp.Finisher")
+        self._setDefault(
+            cleanAnnotations=True,
+            includeMetadata=False,
+            outputAsArray=True,
+            parseEmbeddingsVectors=False,
+            valueSplitSymbol="#",
+            annotationSplitSymbol="@",
+            outputCols=[]
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setInputCols(self, *value):
+        """Sets column names of input annotations.
+
+        Parameters
+        ----------
+        *value : List[str]
+            Input columns for the annotator
+        """
+        if len(value) == 1 and isinstance(value[0], list):
+            return self._set(inputCols=value[0])
+        else:
+            return self._set(inputCols=list(value))
+
+    def setOutputCols(self, *value):
+        """Sets column names of finished output annotations.
+
+        Parameters
+        ----------
+        *value : List[str]
+            List of output columns
+        """
+        if len(value) == 1 and isinstance(value[0], list):
+            return self._set(outputCols=value[0])
+        else:
+            return self._set(outputCols=list(value))
+
+    def setValueSplitSymbol(self, value):
+        """Sets character separating values, by default #.
+
+        Parameters
+        ----------
+        value : str
+            Character to separate values
+        """
+        return self._set(valueSplitSymbol=value)
+
+    def setAnnotationSplitSymbol(self, value):
+        """Sets character separating annotations, by default @.
+
+        Parameters
+        ----------
+        value : str
+            Character to separate annotations
+        """
+        return self._set(annotationSplitSymbol=value)
+
+    def setCleanAnnotations(self, value):
+        """Sets whether to remove annotation columns, by default True.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to remove annotation columns
+        """
+        return self._set(cleanAnnotations=value)
+
+    def setIncludeMetadata(self, value):
+        """Sets whether to include annotation metadata.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to include annotation metadata
+        """
+        return self._set(includeMetadata=value)
+
+    def setOutputAsArray(self, value):
+        """Sets whether to generate an array with the results instead of a
+        string.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to generate an array with the results instead of a string
+        """
+        return self._set(outputAsArray=value)
+
+    def setParseEmbeddingsVectors(self, value):
+        """Sets whether to include embeddings vectors in the process.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to include embeddings vectors in the process
+        """
+        return self._set(parseEmbeddingsVectors=value)
+
+    def getInputCols(self):
+        """Gets column names of input annotations."""
+        return self.getOrDefault(self.inputCols)
+
+    def getOutputCols(self):
+        """Gets column names of finished output annotations."""
+        if len(self.getOrDefault(self.outputCols)) == 0:
+            return ["finished_" + input_col for input_col in self.getInputCols()]
+        else:
+            return self.getOrDefault(self.outputCols)
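The docstring example above relies on the `explain_document_dl` pretrained pipeline, which downloads models at runtime. For reference, a minimal self-contained sketch of the same finishing step using only local stages; the `Tokenizer` stage and column names are illustrative, not part of the diff:

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer

spark = sparknlp.start()

# Assemble raw text into DOCUMENT annotations, tokenize, then use the
# Finisher to turn the TOKEN annotations back into a plain Spark column.
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
finisher = Finisher().setInputCols(["token"]).setOutputCols(["token_out"])

pipeline = Pipeline(stages=[document_assembler, tokenizer, finisher])
data = spark.createDataFrame([["New York and New Jersey aren't that far apart."]]).toDF("text")

# With the default outputAsArray=True, "token_out" is an array<string> column;
# with setOutputAsArray(False), values are joined using valueSplitSymbol ("#").
pipeline.fit(data).transform(data).select("token_out").show(truncate=False)
```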
sparknlp/base/gguf_ranking_finisher.py
@@ -0,0 +1,234 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the GGUFRankingFinisher."""
+
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+from sparknlp.internal import AnnotatorTransformer
+
+
+class GGUFRankingFinisher(AnnotatorTransformer):
+    """Finisher for AutoGGUFReranker outputs that provides ranking capabilities
+    including top-k selection, sorting by relevance score, and score normalization.
+
+    This finisher processes the output of AutoGGUFReranker, which contains documents with
+    relevance scores in their metadata. It provides several options for post-processing:
+
+    - Top-k selection: Select only the top k documents by relevance score
+    - Score thresholding: Filter documents by minimum relevance score
+    - Min-max scaling: Normalize relevance scores to 0-1 range
+    - Sorting: Sort documents by relevance score in descending order
+    - Ranking: Add rank information to document metadata
+
+    The finisher preserves the document annotation structure while adding ranking information
+    to the metadata and optionally filtering/sorting the documents.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/finisher/gguf_ranking_finisher_example.py>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    inputCols
+        Name of input annotation columns containing reranked documents
+    outputCol
+        Name of output annotation column containing ranked documents, by default "ranked_documents"
+    topK
+        Maximum number of top documents to return based on relevance score (-1 for no limit), by default -1
+    minRelevanceScore
+        Minimum relevance score threshold for filtering documents, by default Double.MinValue
+    minMaxScaling
+        Whether to apply min-max scaling to normalize relevance scores to 0-1 range, by default False
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> reranker = AutoGGUFReranker.pretrained() \\
+    ...     .setInputCols("document") \\
+    ...     .setOutputCol("reranked_documents") \\
+    ...     .setQuery("A man is eating pasta.")
+    >>> finisher = GGUFRankingFinisher() \\
+    ...     .setInputCols("reranked_documents") \\
+    ...     .setOutputCol("ranked_documents") \\
+    ...     .setTopK(3) \\
+    ...     .setMinMaxScaling(True)
+    >>> pipeline = Pipeline().setStages([documentAssembler, reranker, finisher])
+    >>> data = spark.createDataFrame([
+    ...     ("A man is eating food.",),
+    ...     ("A man is eating a piece of bread.",),
+    ...     ("The girl is carrying a baby.",),
+    ...     ("A man is riding a horse.",)
+    ... ], ["text"])
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("ranked_documents").show(truncate=False)
+    # Documents will be sorted by relevance with rank information in metadata
+    """
+
+    name = "GGUFRankingFinisher"
+
+    inputCols = Param(Params._dummy(),
+                      "inputCols",
+                      "Name of input annotation columns containing reranked documents",
+                      typeConverter=TypeConverters.toListString)
+
+    outputCol = Param(Params._dummy(),
+                      "outputCol",
+                      "Name of output annotation column containing ranked documents",
+                      typeConverter=TypeConverters.toListString)
+
+    topK = Param(Params._dummy(),
+                 "topK",
+                 "Maximum number of top documents to return based on relevance score (-1 for no limit)",
+                 typeConverter=TypeConverters.toInt)
+
+    minRelevanceScore = Param(Params._dummy(),
+                              "minRelevanceScore",
+                              "Minimum relevance score threshold for filtering documents",
+                              typeConverter=TypeConverters.toFloat)
+
+    minMaxScaling = Param(Params._dummy(),
+                          "minMaxScaling",
+                          "Whether to apply min-max scaling to normalize relevance scores to 0-1 range",
+                          typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self):
+        super(GGUFRankingFinisher, self).__init__(
+            classname="com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher")
+        self._setDefault(
+            topK=-1,
+            minRelevanceScore=float('-inf'),  # Equivalent to Double.MinValue
+            minMaxScaling=False,
+            outputCol=["ranked_documents"]
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setInputCols(self, *value):
+        """Sets input annotation column names.
+
+        Parameters
+        ----------
+        value : List[str]
+            Input annotation column names containing reranked documents
+        """
+        if len(value) == 1 and isinstance(value[0], list):
+            return self._set(inputCols=value[0])
+        else:
+            return self._set(inputCols=list(value))
+
+    def getInputCols(self):
+        """Gets input annotation column names.
+
+        Returns
+        -------
+        List[str]
+            Input annotation column names
+        """
+        return self.getOrDefault(self.inputCols)
+
+    def setOutputCol(self, value):
+        """Sets output annotation column name.
+
+        Parameters
+        ----------
+        value : str
+            Output annotation column name
+        """
+        return self._set(outputCol=[value])
+
+    def getOutputCol(self):
+        """Gets output annotation column name.
+
+        Returns
+        -------
+        str
+            Output annotation column name
+        """
+        output_cols = self.getOrDefault(self.outputCol)
+        return output_cols[0] if output_cols else "ranked_documents"
+
+    def setTopK(self, value):
+        """Sets maximum number of top documents to return.
+
+        Parameters
+        ----------
+        value : int
+            Maximum number of top documents to return (-1 for no limit)
+        """
+        return self._set(topK=value)
+
+    def getTopK(self):
+        """Gets maximum number of top documents to return.
+
+        Returns
+        -------
+        int
+            Maximum number of top documents to return
+        """
+        return self.getOrDefault(self.topK)
+
+    def setMinRelevanceScore(self, value):
+        """Sets minimum relevance score threshold.
+
+        Parameters
+        ----------
+        value : float
+            Minimum relevance score threshold
+        """
+        return self._set(minRelevanceScore=value)
+
+    def getMinRelevanceScore(self):
+        """Gets minimum relevance score threshold.
+
+        Returns
+        -------
+        float
+            Minimum relevance score threshold
+        """
+        return self.getOrDefault(self.minRelevanceScore)
+
+    def setMinMaxScaling(self, value):
+        """Sets whether to apply min-max scaling.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to apply min-max scaling to normalize scores
+        """
+        return self._set(minMaxScaling=value)
+
+    def getMinMaxScaling(self):
+        """Gets whether to apply min-max scaling.
+
+        Returns
+        -------
+        bool
+            Whether min-max scaling is enabled
+        """
+        return self.getOrDefault(self.minMaxScaling)
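The docstring lists the post-processing options, but the hunk does not show how they compose (that logic lives in the JVM class `com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher`). As a rough illustration only, assuming thresholding is applied before scaling and top-k truncation happens after sorting, the semantics could be sketched in plain Python like this; `rank_documents` is a hypothetical helper, not part of the package:

```python
def rank_documents(docs, top_k=-1, min_relevance_score=float("-inf"), min_max_scaling=False):
    """Illustrative sketch of the finisher's documented ranking semantics.

    `docs` is a list of (text, relevance_score) pairs, mimicking the
    relevance scores AutoGGUFReranker stores in annotation metadata.
    """
    # 1. Score thresholding: drop documents below the minimum score.
    kept = [(text, score) for text, score in docs if score >= min_relevance_score]

    # 2. Optional min-max scaling of the surviving scores into the 0-1 range.
    if min_max_scaling and kept:
        scores = [score for _, score in kept]
        lo, hi = min(scores), max(scores)
        span = (hi - lo) or 1.0  # guard against all-identical scores
        kept = [(text, (score - lo) / span) for text, score in kept]

    # 3. Sort by relevance score, descending.
    ranked = sorted(kept, key=lambda pair: pair[1], reverse=True)

    # 4. Top-k selection (-1 means no limit), then attach 1-based ranks.
    if top_k >= 0:
        ranked = ranked[:top_k]
    return [(rank, text, score) for rank, (text, score) in enumerate(ranked, start=1)]


print(rank_documents([("pasta", 0.94), ("bread", 0.71), ("horse", 0.12)], top_k=2))
# [(1, 'pasta', 0.94), (2, 'bread', 0.71)]
```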
sparknlp/base/graph_finisher.py
@@ -0,0 +1,125 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the GraphFinisher."""
+
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+from sparknlp.internal import AnnotatorTransformer
+
+
+class GraphFinisher(AnnotatorTransformer):
+    """Helper class to convert the knowledge graph from GraphExtraction into a
+    generic format, such as RDF.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``NONE``               ``NONE``
+    ====================== ======================
+
+    Parameters
+    ----------
+    inputCol
+        Name of input annotation column
+    outputCol
+        Name of finisher output column
+    cleanAnnotations
+        Whether to remove all the existing annotation columns, by default True
+    outputAsArray
+        Whether to generate an Array with the results, by default True
+
+    Examples
+    --------
+    This is a continuation of the example of
+    :class:`.GraphExtraction`. To see how the graph is extracted, see the
+    documentation of that class.
+
+    >>> graphFinisher = GraphFinisher() \\
+    ...     .setInputCol("graph") \\
+    ...     .setOutputCol("graph_finished") \\
+    ...     .setOutputAsArray(False)
+    >>> finishedResult = graphFinisher.transform(result)
+    >>> finishedResult.select("text", "graph_finished").show(truncate=False)
+    +-----------------------------------------------------+-----------------------------------------------------------------------+
+    |text                                                 |graph_finished                                                         |
+    +-----------------------------------------------------+-----------------------------------------------------------------------+
+    |You and John prefer the morning flight through Denver|[[(prefer,nsubj,morning), (morning,flat,flight), (flight,flat,Denver)]]|
+    +-----------------------------------------------------+-----------------------------------------------------------------------+
+    """
+    inputCol = Param(Params._dummy(), "inputCol", "Name of input annotation col", typeConverter=TypeConverters.toString)
+    outputCol = Param(Params._dummy(), "outputCol", "Name of finisher output col", typeConverter=TypeConverters.toString)
+    cleanAnnotations = Param(Params._dummy(),
+                             "cleanAnnotations",
+                             "Whether to remove all the existing annotation columns",
+                             typeConverter=TypeConverters.toBoolean)
+    outputAsArray = Param(Params._dummy(), "outputAsArray", "Finisher generates an Array with the results",
+                          typeConverter=TypeConverters.toBoolean)
+
+    name = "GraphFinisher"
+
+    @keyword_only
+    def __init__(self):
+        super(GraphFinisher, self).__init__(classname="com.johnsnowlabs.nlp.GraphFinisher")
+        self._setDefault(
+            cleanAnnotations=True,
+            outputAsArray=True
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setInputCol(self, value):
+        """Sets name of input annotation column.
+
+        Parameters
+        ----------
+        value : str
+            Name of input annotation column.
+        """
+        return self._set(inputCol=value)
+
+    def setOutputCol(self, value):
+        """Sets name of finisher output column.
+
+        Parameters
+        ----------
+        value : str
+            Name of finisher output column.
+        """
+        return self._set(outputCol=value)
+
+    def setCleanAnnotations(self, value):
+        """Sets whether to remove all the existing annotation columns, by
+        default True.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to remove all the existing annotation columns, by default True.
+        """
+        return self._set(cleanAnnotations=value)
+
+    def setOutputAsArray(self, value):
+        """Sets whether to generate an Array with the results, by default True.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to generate an Array with the results, by default True.
+        """
+        return self._set(outputAsArray=value)
+
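With ``setOutputAsArray(False)``, as in the example above, the finished graph arrives as one string of ``(subject,relation,object)`` triples. A small hypothetical helper (not part of the package) to split such a string back into Python tuples, assuming the bracketed format shown in the docstring:

```python
import re

def parse_finished_graph(finished):
    """Split a GraphFinisher string such as
    '[[(prefer,nsubj,morning), (morning,flat,flight)]]' into 3-tuples."""
    # Capture everything between each pair of parentheses, then split on commas.
    return [tuple(t.split(",")) for t in re.findall(r"\(([^)]*)\)", finished)]

print(parse_finished_graph("[[(prefer,nsubj,morning), (morning,flat,flight), (flight,flat,Denver)]]"))
# [('prefer', 'nsubj', 'morning'), ('morning', 'flat', 'flight'), ('flight', 'flat', 'Denver')]
```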
sparknlp/base/has_recursive_fit.py
@@ -0,0 +1,24 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains HasRecursiveFit property for estimators."""
+
+from abc import ABC
+
+from sparknlp.internal import RecursiveEstimator
+
+
+class HasRecursiveFit(RecursiveEstimator, ABC):
+    """Properties for the implementation of the RecursivePipeline."""
+    pass
+
sparknlp/base/has_recursive_transform.py
@@ -0,0 +1,22 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains HasRecursiveTransform property for Spark transformers."""
+
+from sparknlp.internal import RecursiveTransformer
+
+
+class HasRecursiveTransform(RecursiveTransformer):
+    """Properties for the implementation of the RecursivePipeline."""
+    pass
+
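Both of these marker classes exist to support ``RecursivePipeline`` (see ``sparknlp/base/recursive_pipeline.py`` in the file list above), a drop-in replacement for Spark's ``Pipeline`` that lets a stage see the pipeline fitted so far. A brief usage sketch, with an illustrative stage list:

```python
import sparknlp
from sparknlp.base import DocumentAssembler, Finisher, RecursivePipeline
from sparknlp.annotator import Tokenizer

spark = sparknlp.start()

# RecursivePipeline behaves like pyspark.ml.Pipeline, except that stages built
# on HasRecursiveFit / HasRecursiveTransform also receive the pipeline
# assembled so far, so they can apply the earlier stages to their own
# resources (e.g. dictionaries) during fit/transform.
pipeline = RecursivePipeline(stages=[
    DocumentAssembler().setInputCol("text").setOutputCol("document"),
    Tokenizer().setInputCols(["document"]).setOutputCol("token"),
    Finisher().setInputCols(["token"]),
])
data = spark.createDataFrame([["Recursive pipelines know about themselves."]]).toDF("text")
model = pipeline.fit(data)
```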