spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
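
Note: the 2.6.3rc1 wheel shipped monolithic modules (sparknlp/annotator.py, base.py, common.py, plus bundled .pyc files), which 6.2.1 replaces with the sparknlp/annotator/, sparknlp/base/, and sparknlp/common/ packages listed above. A quick import sketch of the new layout (module paths taken from the file list; the package-level re-exports are an assumption to verify against the 6.2.1 wheel):

>>> from sparknlp.base import DocumentAssembler, ImageAssembler, LightPipeline
>>> from sparknlp.annotator import Tokenizer
>>> from sparknlp.annotator.embeddings import BertEmbeddings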
sparknlp/base/image_assembler.py
@@ -0,0 +1,172 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the ImageAssembler."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+ from pyspark.sql import SparkSession, DataFrame
+ from pyspark.sql.functions import regexp_replace, col
+
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class ImageAssembler(AnnotatorTransformer):
+     """Prepares images read by Spark into a format that is processable by Spark NLP.
+     This component is needed to process images.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``NONE``               ``IMAGE``
+     ====================== ======================
+
+     Parameters
+     ----------
+     inputCol
+         Input column name
+     outputCol
+         Output column name
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from pyspark.ml import Pipeline
+     >>> data = spark.read.format("image").load("./tmp/images/").toDF("image")
+     >>> imageAssembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler")
+     >>> result = imageAssembler.transform(data)
+     >>> result.select("image_assembler").show()
+     >>> result.select("image_assembler").printSchema()
+     root
+      |-- image_assembler: array (nullable = true)
+      |    |-- element: struct (containsNull = true)
+      |    |    |-- annotatorType: string (nullable = true)
+      |    |    |-- origin: string (nullable = true)
+      |    |    |-- height: integer (nullable = true)
+      |    |    |-- width: integer (nullable = true)
+      |    |    |-- nChannels: integer (nullable = true)
+      |    |    |-- mode: integer (nullable = true)
+      |    |    |-- result: binary (nullable = true)
+      |    |    |-- metadata: map (nullable = true)
+      |    |    |    |-- key: string
+      |    |    |    |-- value: string (valueContainsNull = true)
+     """
+
+     outputAnnotatorType = AnnotatorType.IMAGE
+
+     inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
+     textCol = Param(Params._dummy(), "textCol", "text column name", typeConverter=TypeConverters.toString)
+     outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
+     name = 'ImageAssembler'
+
+     @keyword_only
+     def __init__(self):
+         super(ImageAssembler, self).__init__(classname="com.johnsnowlabs.nlp.ImageAssembler")
+         self._setDefault(outputCol="image_assembler", inputCol='image')
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setInputCol(self, value):
+         """Sets input column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the input column that has image format loaded via spark.read.format("image").load(PATH)
+         """
+         return self._set(inputCol=value)
+
+     def setOutputCol(self, value):
+         """Sets output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Output Column
+         """
+         return self._set(outputCol=value)
+
+     def getOutputCol(self):
+         """Gets output column name of annotations."""
+         return self.getOrDefault(self.outputCol)
+
+     def setTextCol(self, value):
+         """Sets an optional text column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of an optional input text column
+         """
+         return self._set(textCol=value)
+
+     @classmethod
+     def loadImagesAsBytes(cls, spark: SparkSession, path: str):
+         """
+         Loads images from a given path and returns them as raw bytes, instead of the default
+         OpenCV-compatible format. Supported image types include JPEG, PNG, GIF, and BMP.
+
+         Multimodal inference with llama.cpp requires raw bytes as input.
+
+         Parameters
+         ----------
+         spark : SparkSession
+             The active SparkSession.
+         path : str
+             The path to the images. Supported image types are JPEG, PNG, GIF, and BMP.
+
+         Returns
+         -------
+         DataFrame
+             A DataFrame containing the images as raw bytes along with their metadata.
+         """
+
+         # Replace the path separator in the `origin` field and `path` column, so that they match
+         def replace_path(column_name: str):
+             return regexp_replace(col(column_name), ":///", ":/")
+
+         # Load the images as metadata with the default Spark image format
+         data = (
+             spark.read.format("image")
+             .option("dropInvalid", True)
+             .load(path)
+             .withColumn(
+                 "image", col("image").withField("origin", replace_path("image.origin"))
+             )
+         )
+
+         # Load the images as raw binary files
+         image_bytes = (
+             spark.read.format("binaryFile")
+             .option("pathGlobFilter", "*.{jpeg,jpg,png,gif,bmp,JPEG,JPG,PNG,GIF,BMP}")
+             .option("dropInvalid", True)
+             .load(path)
+             .withColumn("path", replace_path("path"))
+         )
+
+         # Join the two datasets on the file path
+         df_joined = data.join(
+             image_bytes, data["image.origin"] == image_bytes["path"], "inner"
+         )
+
+         # Replace the `data` field of the `image` column with raw bytes
+         df_image_replaced = df_joined.withColumn(
+             "image", df_joined["image"].withField("data", df_joined["content"])
+         )
+
+         return df_image_replaced
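
For orientation, a minimal usage sketch of the ImageAssembler added above (doctest style to match the docstrings; assumes an active SparkSession named `spark` and a local `images/` directory, both hypothetical):

>>> from sparknlp.base import ImageAssembler
>>> # Default Spark image format (OpenCV-compatible struct)
>>> data = spark.read.format("image").load("images/")
>>> # Raw bytes instead, e.g. for llama.cpp multimodal inference
>>> data_bytes = ImageAssembler.loadImagesAsBytes(spark, "images/")
>>> assembled = ImageAssembler().setInputCol("image").setOutputCol("image_assembler").transform(data_bytes)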
sparknlp/base/light_pipeline.py
@@ -0,0 +1,429 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the LightPipeline."""
+
+ import sparknlp.internal as _internal
+ from sparknlp.annotation import Annotation
+ from sparknlp.annotation_audio import AnnotationAudio
+ from sparknlp.annotation_image import AnnotationImage
+ from sparknlp.common import AnnotatorApproach, AnnotatorModel
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class LightPipeline:
+     """Creates a LightPipeline from a Spark PipelineModel.
+
+     LightPipeline is a Spark NLP specific Pipeline class equivalent to the
+     Spark ML Pipeline. The difference is that its execution does not adhere
+     to Spark principles; instead it computes everything locally (but in
+     parallel) in order to achieve fast results when dealing with small
+     amounts of data. This means the input is not a Spark DataFrame, but a
+     string or an array of strings to be annotated. To create a Light
+     Pipeline, you need to input an already trained (fit) Spark ML Pipeline.
+
+     Its :meth:`.transform` now has an alternative, :meth:`.annotate`, which
+     directly outputs the results.
+
+     Parameters
+     ----------
+     pipelineModel : :class:`pyspark.ml.PipelineModel`
+         The PipelineModel containing Spark NLP Annotators
+     parse_embeddings : bool, optional
+         Whether to parse embeddings, by default False
+
+     Notes
+     -----
+     Use :meth:`.fullAnnotate` to also output the result as
+     :class:`.Annotation`, with metadata.
+
+     Examples
+     --------
+     >>> from sparknlp.base import LightPipeline
+     >>> light = LightPipeline(pipeline.fit(data))
+     >>> light.annotate("We are very happy about Spark NLP")
+     {
+         'document': ['We are very happy about Spark NLP'],
+         'lemmas': ['We', 'be', 'very', 'happy', 'about', 'Spark', 'NLP'],
+         'pos': ['PRP', 'VBP', 'RB', 'JJ', 'IN', 'NNP', 'NNP'],
+         'sentence': ['We are very happy about Spark NLP'],
+         'spell': ['We', 'are', 'very', 'happy', 'about', 'Spark', 'NLP'],
+         'stems': ['we', 'ar', 'veri', 'happi', 'about', 'spark', 'nlp'],
+         'token': ['We', 'are', 'very', 'happy', 'about', 'Spark', 'NLP']
+     }
+     """
+
+     def __init__(self, pipelineModel, parse_embeddings=False):
+         self.pipeline_model = pipelineModel
+         self.parse_embeddings = parse_embeddings
+         self._lightPipeline = _internal._LightPipeline(pipelineModel, parse_embeddings).apply()
+
+     def _validateStagesInputCols(self, stages):
+         annotator_types = self._getAnnotatorTypes(stages)
+         for stage in stages:
+             if isinstance(stage, AnnotatorApproach) or isinstance(stage, AnnotatorModel):
+                 input_cols = stage.getInputCols()
+                 if type(input_cols) == str:
+                     input_cols = [input_cols]
+                 input_annotator_types = stage.inputAnnotatorTypes + stage.optionalInputAnnotatorTypes
+                 for input_col in input_cols:
+                     annotator_type = annotator_types.get(input_col)
+                     if annotator_type is None or annotator_type not in input_annotator_types:
+                         raise TypeError(f"Wrong or missing inputCols annotators in {stage.uid}."
+                                         f" Make sure such annotators exist in your pipeline,"
+                                         f" with the right output names, and that they have the"
+                                         f" following annotator types: {input_annotator_types}")
+
+     def _skipPipelineValidation(self, stages):
+         exceptional_pipeline = [stage for stage in stages if self._skipStageValidation(stage)]
+         if len(exceptional_pipeline) >= 1:
+             return True
+         else:
+             return False
+
+     def _skipStageValidation(self, stage):
+         return hasattr(stage, 'skipLPInputColsValidation') and stage.skipLPInputColsValidation
+
+     def _getAnnotatorTypes(self, stages):
+         annotator_types = {}
+         for stage in stages:
+             if hasattr(stage, 'getOutputCols'):
+                 output_cols = stage.getOutputCols()
+                 for output_col in output_cols:
+                     annotator_types[output_col] = stage.outputAnnotatorType
+             elif isinstance(stage, AnnotatorApproach) or isinstance(stage, AnnotatorModel) or \
+                     isinstance(stage, AnnotatorTransformer):
+                 if stage.outputAnnotatorType is not None:
+                     annotator_types[stage.getOutputCol()] = stage.outputAnnotatorType
+         return annotator_types
+
+     def _annotationFromJava(self, java_annotations):
+         annotations = []
+         for annotation in java_annotations:
+
+             index = annotation.toString().index("(")
+             annotation_type = annotation.toString()[:index]
+
+             if annotation_type == "AnnotationImage":
+                 result = self.__get_result(annotation)
+                 annotations.append(
+                     AnnotationImage(annotation.annotatorType(),
+                                     annotation.origin(),
+                                     annotation.height(),
+                                     annotation.width(),
+                                     annotation.nChannels(),
+                                     annotation.mode(),
+                                     result,
+                                     annotation.metadata())
+                 )
+             elif annotation_type == "AnnotationAudio":
+                 result = self.__get_result(annotation)
+                 annotations.append(
+                     AnnotationAudio(annotation.annotatorType(),
+                                     result,
+                                     annotation.metadata())
+                 )
+             else:
+                 if self.parse_embeddings:
+                     embeddings = list(annotation.embeddings())
+                 else:
+                     embeddings = []
+                 annotations.append(
+                     Annotation(annotation.annotatorType(),
+                                annotation.begin(),
+                                annotation.end(),
+                                annotation.result(),
+                                annotation.metadata(),
+                                embeddings)
+                 )
+         return annotations
+
+     @staticmethod
+     def __get_result(annotation):
+         try:
+             result = list(annotation.result())
+         except TypeError:
+             result = []
+
+         return result
+
+     def fullAnnotate(self, target, optional_target=""):
+         """Annotates the data provided into `Annotation` type results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         target : list or str or float
+             The data to be annotated
+         optional_target : list or str
+             Optional data to be annotated (currently used for Question Answering)
+
+         Returns
+         -------
+         List[dict]
+             The result of the annotation
+
+         Examples
+         --------
+         >>> from sparknlp.pretrained import PretrainedPipeline
+         >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+         >>> result = explain_document_pipeline.fullAnnotate('U.N. official Ekeus heads for Baghdad.')
+         >>> result[0].keys()
+         dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])
+         >>> result[0]["ner"]
+         [Annotation(named_entity, 0, 2, B-ORG, {'word': 'U.N'}),
+         Annotation(named_entity, 3, 3, O, {'word': '.'}),
+         Annotation(named_entity, 5, 12, O, {'word': 'official'}),
+         Annotation(named_entity, 14, 18, B-PER, {'word': 'Ekeus'}),
+         Annotation(named_entity, 20, 24, O, {'word': 'heads'}),
+         Annotation(named_entity, 26, 28, O, {'word': 'for'}),
+         Annotation(named_entity, 30, 36, B-LOC, {'word': 'Baghdad'}),
+         Annotation(named_entity, 37, 37, O, {'word': '.'})]
+         """
+         stages = self.pipeline_model.stages
+         if not self._skipPipelineValidation(stages):
+             self._validateStagesInputCols(stages)
+
+         if optional_target == "":
+             if self.__isTextInput(target):
+                 result = self.__fullAnnotateText(target)
+             elif self.__isAudioInput(target):
+                 result = self.__fullAnnotateAudio(target)
+             else:
+                 raise TypeError(
+                     "argument for annotation must be 'str' or list[str] or list[float] or list[list[float]]")
+         else:
+             if self.__isTextInput(target) and self.__isTextInput(optional_target):
+                 result = self.__fullAnnotateQuestionAnswering(target, optional_target)
+             else:
+                 raise TypeError("arguments for annotation must be 'str' or list[str]")
+
+         return result
+
+     @staticmethod
+     def __isTextInput(target):
+         if type(target) is str:
+             return True
+         elif type(target) is list and type(target[0]) is str:
+             return True
+         else:
+             return False
+
+     @staticmethod
+     def __isAudioInput(target):
+         if type(target) is list and type(target[0]) is float:
+             return True
+         elif type(target) is list and type(target[0]) is list and type(target[0][0]) is float:
+             return True
+         else:
+             return False
+
+     def __fullAnnotateText(self, target):
+
+         if self.__isPath(target):
+             result = self.fullAnnotateImage(target)
+             return result
+         else:
+             result = []
+             if type(target) is str:
+                 target = [target]
+
+             for annotations_result in self._lightPipeline.fullAnnotateJava(target):
+                 result.append(self.__buildStages(annotations_result))
+             return result
+
+     def __isPath(self, target):
+         if type(target) is list:
+             target = target[0]
+
+         if target.find("/") < 0:
+             return False
+         else:
+             is_valid_file = _internal._ResourceHelper_validFile(target).apply()
+             return is_valid_file
+
+     def __fullAnnotateAudio(self, audios):
+         result = []
+         if type(audios[0]) is float:
+             annotations_dict = self._lightPipeline.fullAnnotateSingleAudioJava(audios)
+             result.append(self.__buildStages(annotations_dict))
+         else:
+             full_annotations = self._lightPipeline.fullAnnotateAudiosJava(audios)
+             for annotations_dict in full_annotations:
+                 result.append(self.__buildStages(annotations_dict))
+
+         return result
+
+     def __fullAnnotateQuestionAnswering(self, question, context):
+         result = []
+         if type(question) is str and type(context) is str:
+             annotations_dict = self._lightPipeline.fullAnnotateJava(question, context)
+             result.append(self.__buildStages(annotations_dict))
+         else:
+             full_annotations = self._lightPipeline.fullAnnotateJava(question, context)
+             for annotations_dict in full_annotations:
+                 result.append(self.__buildStages(annotations_dict))
+
+         return result
+
+     def fullAnnotateImage(self, path_to_image, text=None):
+         """Annotates the data provided into `Annotation` type results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         path_to_image : list or str
+             Source path of image, list of paths to images
+         text : list or str, optional
+             Optional list or str of texts. If None, defaults to empty list if path_to_image is a list, or empty string if path_to_image is a string.
+
+         Returns
+         -------
+         List[AnnotationImage]
+             The result of the annotation
+         """
+         if not isinstance(path_to_image, (str, list)):
+             raise TypeError("argument for path_to_image must be 'str' or 'list[str]'")
+
+         if text is None:
+             text = "" if isinstance(path_to_image, str) else []
+
+         if type(path_to_image) != type(text):
+             raise ValueError("`path_to_image` and `text` must be of the same type")
+
+         stages = self.pipeline_model.stages
+         if not self._skipPipelineValidation(stages):
+             self._validateStagesInputCols(stages)
+
+         if isinstance(path_to_image, str):
+             path_to_image = [path_to_image]
+             text = [text]
+
+         result = []
+
+         for image_result in self._lightPipeline.fullAnnotateImageJava(path_to_image, text):
+             result.append(self.__buildStages(image_result))
+
+         return result
+
+     def __buildStages(self, annotations_result):
+         stages = {}
+         for annotator_type, annotations in annotations_result.items():
+             stages[annotator_type] = self._annotationFromJava(annotations)
+         return stages
+
+     def annotate(self, target, optional_target=""):
+         """Annotates the data provided, extracting the results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         target : list or str
+             The data to be annotated
+         optional_target : list or str
+             Optional data to be annotated (currently used for Question Answering)
+
+         Returns
+         -------
+         List[dict] or dict
+             The result of the annotation
+
+         Examples
+         --------
+         >>> from sparknlp.pretrained import PretrainedPipeline
+         >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+         >>> result = explain_document_pipeline.annotate('U.N. official Ekeus heads for Baghdad.')
+         >>> result.keys()
+         dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])
+         >>> result["ner"]
+         ['B-ORG', 'O', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'O']
+         """
+
+         def reformat(annotations):
+             return {k: list(v) for k, v in annotations.items()}
+
+         stages = self.pipeline_model.stages
+         if not self._skipPipelineValidation(stages):
+             self._validateStagesInputCols(stages)
+
+         if optional_target == "":
+             if type(target) is str:
+                 annotations = self._lightPipeline.annotateJava(target)
+                 result = reformat(annotations)
+             elif type(target) is list:
+                 if type(target[0]) is list:
370
+ raise TypeError("target is a 1D list")
371
+ annotations = self._lightPipeline.annotateJava(target)
372
+ result = list(map(lambda a: reformat(a), list(annotations)))
373
+ else:
374
+ raise TypeError("target for annotation must be 'str' or list")
375
+
376
+ else:
377
+ if type(target) is str and type(optional_target) is str:
378
+ annotations = self._lightPipeline.annotateJava(target, optional_target)
379
+ result = reformat(annotations)
380
+ elif type(target) is list and type(optional_target) is list:
381
+ if type(target[0]) is list or type(optional_target[0]) is list:
382
+ raise TypeError("target and optional_target is a 1D list")
383
+ annotations = self._lightPipeline.annotateJava(target, optional_target)
384
+ result = list(map(lambda a: reformat(a), list(annotations)))
385
+ else:
386
+ raise TypeError("target and optional_target for annotation must be both 'str' or both lists")
387
+
388
+ return result
389
+
390
+ def transform(self, dataframe):
391
+ """Transforms a dataframe provided with the stages of the LightPipeline.
392
+
393
+ Parameters
394
+ ----------
395
+ dataframe : :class:`pyspark.sql.DataFrame`
396
+ The Dataframe to be transformed
397
+
398
+ Returns
399
+ -------
400
+ :class:`pyspark.sql.DataFrame`
401
+ The transformed DataFrame
402
+ """
403
+ return self.pipeline_model.transform(dataframe)
404
+
405
+ def setIgnoreUnsupported(self, value):
406
+ """Sets whether to ignore unsupported AnnotatorModels.
407
+
408
+ Parameters
409
+ ----------
410
+ value : bool
411
+ Whether to ignore unsupported AnnotatorModels.
412
+
413
+ Returns
414
+ -------
415
+ LightPipeline
416
+ The current LightPipeline
417
+ """
418
+ self._lightPipeline.setIgnoreUnsupported(value)
419
+ return self
420
+
421
+ def getIgnoreUnsupported(self):
422
+ """Gets whether to ignore unsupported AnnotatorModels.
423
+
424
+ Returns
425
+ -------
426
+ bool
427
+ Whether to ignore unsupported AnnotatorModels.
428
+ """
429
+ return self._lightPipeline.getIgnoreUnsupported()
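
And a minimal end-to-end sketch of the LightPipeline defined above (again doctest style; the two-stage pipeline is illustrative and assumes an active SparkSession named `spark`):

>>> from pyspark.ml import Pipeline
>>> from sparknlp.base import DocumentAssembler, LightPipeline
>>> from sparknlp.annotator import Tokenizer
>>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
>>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
>>> pipeline = Pipeline(stages=[documentAssembler, tokenizer])
>>> data = spark.createDataFrame([["We are very happy about Spark NLP"]]).toDF("text")
>>> light = LightPipeline(pipeline.fit(data))
>>> light.annotate("We are very happy about Spark NLP")["token"]
['We', 'are', 'very', 'happy', 'about', 'Spark', 'NLP']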