spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/common/read_as.py ADDED

```diff
@@ -0,0 +1,33 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utility classes for reading resources."""
+
+
+class ReadAs(object):
+    """Object that contains constants for how to read Spark Resources.
+
+    Possible values are:
+
+    ================= =======================================
+    Value             Description
+    ================= =======================================
+    ``ReadAs.TEXT``   Read the resource as text.
+    ``ReadAs.SPARK``  Read the resource as a Spark DataFrame.
+    ``ReadAs.BINARY`` Read the resource as a binary file.
+    ================= =======================================
+    """
+    TEXT = "TEXT"
+    SPARK = "SPARK"
+    BINARY = "BINARY"
+
```
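The `ReadAs` constants above are passed wherever an annotator loads an external resource. As a minimal usage sketch (not part of this diff; `Lemmatizer.setDictionary` is a real Spark NLP API, but the dictionary path here is hypothetical):

```python
# Sketch: telling an annotator how to interpret an external resource.
# "lemmas.txt" is a hypothetical dictionary file with lines such as
# "see -> sees,saw,seen"; ReadAs.TEXT reads it as plain text.
from sparknlp.annotator import Lemmatizer
from sparknlp.common import ReadAs

lemmatizer = Lemmatizer() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma") \
    .setDictionary("lemmas.txt", "->", "\t", read_as=ReadAs.TEXT)
```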
sparknlp/common/recursive_annotator_approach.py ADDED

```diff
@@ -0,0 +1,35 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains base classes for recursive AnnotatorApproaches."""
+
+from pyspark import keyword_only
+from pyspark.ml.util import JavaMLWritable
+
+import sparknlp.internal as _internal
+from sparknlp.common import AnnotatorProperties
+
+
+class RecursiveAnnotatorApproach(_internal.RecursiveEstimator, JavaMLWritable, _internal.AnnotatorJavaMLReadable,
+                                 AnnotatorProperties,
+                                 _internal.ParamsGettersSetters):
+    @keyword_only
+    def __init__(self, classname):
+        _internal.ParamsGettersSetters.__init__(self)
+        self.__class__._java_class_name = classname
+        self._java_obj = self._new_java_obj(classname, self.uid)
+        self._setDefault(lazyAnnotator=False)
+
+    def _create_model(self, java_model):
+        raise NotImplementedError('Please implement _create_model in %s' % self)
+
```
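`RecursiveAnnotatorApproach` is the base class for approaches that can see the pipeline built so far while fitting. A hypothetical subclass sketch (the JVM class name below is made up; the concrete approaches shipped in the package supply their own):

```python
from sparknlp.common import RecursiveAnnotatorApproach


class MyRecursiveApproach(RecursiveAnnotatorApproach):
    def __init__(self):
        # classname must point at a JVM-side estimator; this one is hypothetical.
        super(MyRecursiveApproach, self).__init__(
            classname="com.example.nlp.MyRecursiveApproach")

    def _create_model(self, java_model):
        # A real subclass would wrap java_model in its matching AnnotatorModel.
        raise NotImplementedError("model wrapping omitted in this sketch")
```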
sparknlp/common/storage.py ADDED

```diff
@@ -0,0 +1,149 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utility classes for handling storage."""
+
+from pyspark.ml.param import Param, Params, TypeConverters
+
+from sparknlp.common.utils import ExternalResource
+from sparknlp.common.properties import HasCaseSensitiveProperties
+import sparknlp.internal as _internal
+
+
+class HasStorageRef:
+    storageRef = Param(Params._dummy(), "storageRef",
+                       "unique reference name for identification",
+                       TypeConverters.toString)
+
+    def setStorageRef(self, value):
+        """Sets unique reference name for identification.
+
+        Parameters
+        ----------
+        value : str
+            Unique reference name for identification
+        """
+        return self._set(storageRef=value)
+
+    def getStorageRef(self):
+        """Gets unique reference name for identification.
+
+        Returns
+        -------
+        str
+            Unique reference name for identification
+        """
+        return self.getOrDefault("storageRef")
+
+
+class HasStorageOptions:
+    includeStorage = Param(Params._dummy(),
+                           "includeStorage",
+                           "whether to include indexed storage in trained model",
+                           typeConverter=TypeConverters.toBoolean)
+
+    enableInMemoryStorage = Param(Params._dummy(),
+                                  "enableInMemoryStorage",
+                                  "whether to load whole indexed storage in memory (in-memory lookup)",
+                                  typeConverter=TypeConverters.toBoolean)
+
+    def setIncludeStorage(self, value):
+        """Sets whether to include indexed storage in trained model.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to include indexed storage in trained model
+        """
+        return self._set(includeStorage=value)
+
+    def getIncludeStorage(self):
+        """Gets whether to include indexed storage in trained model.
+
+        Returns
+        -------
+        bool
+            Whether to include indexed storage in trained model
+        """
+        return self.getOrDefault("includeStorage")
+
+    def setEnableInMemoryStorage(self, value):
+        """Sets whether to load whole indexed storage in memory (in-memory lookup)
+
+        Parameters
+        ----------
+        value : bool
+            Whether to load whole indexed storage in memory (in-memory lookup)
+        """
+        return self._set(enableInMemoryStorage=value)
+
+    def getEnableInMemoryStorage(self):
+        return self.getOrDefault("enableInMemoryStorage")
+
+
+class HasStorageModel(HasStorageRef, HasCaseSensitiveProperties, HasStorageOptions):
+
+    def saveStorage(self, path, spark):
+        """Saves the current model to storage.
+
+        Parameters
+        ----------
+        path : str
+            Path for saving the model.
+        spark : :class:`pyspark.sql.SparkSession`
+            The current SparkSession
+        """
+        self._transfer_params_to_java()
+        self._java_obj.saveStorage(path, spark._jsparkSession, False)
+
+    @staticmethod
+    def loadStorage(path, spark, storage_ref):
+        raise NotImplementedError("AnnotatorModel with HasStorageModel did not implement 'loadStorage'")
+
+    @staticmethod
+    def loadStorages(path, spark, storage_ref, databases):
+        for database in databases:
+            _internal._StorageHelper(path, spark, database, storage_ref, within_storage=False)
+
+
+class HasStorage(HasStorageRef, HasCaseSensitiveProperties, HasStorageOptions):
+    storagePath = Param(Params._dummy(),
+                        "storagePath",
+                        "path to file",
+                        typeConverter=TypeConverters.identity)
+
+    def setStoragePath(self, path, read_as):
+        """Sets path to file.
+
+        Parameters
+        ----------
+        path : str
+            Path to file
+        read_as : str
+            How to interpret the file
+
+        Notes
+        -----
+        See :class:`ReadAs <sparknlp.common.ReadAs>` for reading options.
+        """
+        return self._set(storagePath=ExternalResource(path, read_as, {}))
+
+    def getStoragePath(self):
+        """Gets path to file.
+
+        Returns
+        -------
+        str
+            path to file
+        """
+        return self.getOrDefault("storagePath")
```
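These mixins surface in user code through annotators such as `WordEmbeddings`, which combines `HasStorage` (where to build the index from) with `HasStorageRef` (the name downstream models use to find that index later). A minimal sketch, assuming a local GloVe text file (the path and ref name are illustrative):

```python
from sparknlp.annotator import WordEmbeddings
from sparknlp.common import ReadAs

embeddings = (
    WordEmbeddings()
    .setStoragePath("glove.6B.100d.txt", ReadAs.TEXT)  # HasStorage: source file to index
    .setDimension(100)
    .setStorageRef("glove_100d")  # HasStorageRef: name other models resolve against
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)
```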
sparknlp/common/utils.py ADDED

```diff
@@ -0,0 +1,39 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities for annotators."""
+
+from sparknlp.common.read_as import ReadAs
+import sparknlp.internal as _internal
+
+
+def ExternalResource(path, read_as=ReadAs.TEXT, options={}):
+    """Returns a representation fo an External Resource.
+
+    How the resource is read can be set with `read_as`.
+
+    Parameters
+    ----------
+    path : str
+        Path to the resource
+    read_as : str, optional
+        How to read the resource, by default ReadAs.TEXT
+    options : dict, optional
+        Options to read the resource, by default {}
+    """
+    return _internal._ExternalResource(path, read_as, options).apply()
+
+
+def RegexRule(rule, identifier):
+    return _internal._RegexRule(rule, identifier).apply()
+
```
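`ExternalResource` is what `HasStorage.setStoragePath` builds under the hood, and it can be constructed directly when an annotator parameter expects one. A small sketch with a hypothetical CSV path:

```python
from sparknlp.common import ExternalResource, ReadAs

# Wraps path + read mode + parsing options into the JVM-side representation.
resource = ExternalResource(
    "entities.csv",
    read_as=ReadAs.TEXT,
    options={"format": "csv", "delimiter": ","},
)
```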
sparknlp/functions.py CHANGED

```diff
@@ -1,28 +1,295 @@
-
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains helper functions to assist in transforming Annotation results.
+"""
+
+from pyspark.sql.functions import udf, array
 from pyspark.sql.types import *
 from pyspark.sql import DataFrame
 from sparknlp.annotation import Annotation
 
 
 def map_annotations(f, output_type: DataType):
+    """Creates a Spark UDF to map over an Annotator's results.
+
+    Parameters
+    ----------
+    f : function
+        The function to be applied over the results
+    output_type : :class:`pyspark.sql.types.DataType`
+        Output type of the data
+
+    Returns
+    -------
+    :func:`pyspark.sql.functions.udf`
+        Spark UserDefinedFunction (udf)
+
+    Examples
+    --------
+    >>> from sparknlp.pretrained import PretrainedPipeline
+    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+    >>> result = explain_document_pipeline.transform(data)
+
+    The array type must be provided in order to tell Spark the expected output
+    type of our column. We are using an Annotation array here.
+
+    >>> from sparknlp.functions import *
+    >>> def nnp_tokens(annotations: List[Row]):
+    ...     return list(
+    ...         filter(lambda annotation: annotation.result == 'NNP', annotations)
+    ...     )
+    >>> result.select(
+    ...     map_annotations(nnp_tokens, Annotation.arrayType())('pos').alias("nnp")
+    ... ).selectExpr("explode(nnp) as nnp").show(truncate=False)
+    +-----------------------------------------+
+    |nnp                                      |
+    +-----------------------------------------+
+    |[pos, 0, 2, NNP, [word -> U.N], []]      |
+    |[pos, 14, 18, NNP, [word -> Epeus], []]  |
+    |[pos, 30, 36, NNP, [word -> Baghdad], []]|
+    +-----------------------------------------+
+    """
     return udf(
-        lambda content: f(content),
+        lambda content: [ Annotation.toRow(a) for a in f([Annotation.fromRow(r) for r in content])],
         output_type
     )
 
+def map_annotations_array(f, output_type: DataType):
+    """Creates a Spark UDF to map over an Annotator's array results.
+
+    Parameters
+    ----------
+    f : function
+        The function to be applied over the results
+    output_type : :class:`pyspark.sql.types.DataType`
+        Output type of the data
+
+    Returns
+    -------
+    :func:`pyspark.sql.functions.udf`
+        Spark UserDefinedFunction (udf)
+    """
+    return udf(
+        lambda cols: [Annotation.toRow(item) for item in f([Annotation.fromRow(r) for col in cols for r in col])],
+        output_type
+    )
 
 def map_annotations_strict(f):
+    """Creates a Spark UDF to map over an Annotator's results, for which the
+    return type is explicitly defined as a `Annotation.dataType()`.
+
+    Parameters
+    ----------
+    f : function
+        The function to be applied over the results
+
+    Returns
+    -------
+    :func:`pyspark.sql.functions.udf`
+        Spark UserDefinedFunction (udf)
+
+    Examples
+    --------
+    >>> from sparknlp.pretrained import PretrainedPipeline
+    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+    >>> result = explain_document_pipeline.transform(data)
+    >>> def nnp_tokens(annotations):
+    ...     return list(
+    ...         filter(lambda annotation: annotation.result == 'NNP', annotations)
+    ...     )
+    >>> result.select(
+    ...     map_annotations_strict(nnp_tokens)('pos').alias("nnp")
+    ... ).selectExpr("explode(nnp) as nnp").show(truncate=False)
+    +-----------------------------------------+
+    |nnp                                      |
+    +-----------------------------------------+
+    |[pos, 0, 2, NNP, [word -> U.N], []]      |
+    |[pos, 14, 18, NNP, [word -> Epeus], []]  |
+    |[pos, 30, 36, NNP, [word -> Baghdad], []]|
+    +-----------------------------------------+
+    """
     return udf(
-        lambda content: f(content),
+        lambda content: [ Annotation.toRow(a) for a in f([Annotation.fromRow(r) for r in content])],
         ArrayType(Annotation.dataType())
     )
 
 
-def map_annotations_col(dataframe: DataFrame, f, column, output_column,
-
+def map_annotations_col(dataframe: DataFrame, f, column: str, output_column: str, annotatyon_type: str,
+                        output_type: DataType = Annotation.arrayType()):
+    """Creates a Spark UDF to map over a column of Annotation results.
+
+    Parameters
+    ----------
+    dataframe : DataFrame
+        Input DataFrame
+    f : function
+        Function to apply to the column
+    column : str
+        Name of the input column
+    output_column : str
+        Name of the output column
+    annotatyon_type : str
+        Annotator type
+    output_type : DataType, optional
+        Output type, by default Annotation.arrayType()
+
+    Returns
+    -------
+    :class:`pyspark.sql.DataFrame`
+        Transformed DataFrame
+
+    Examples
+    --------
+    >>> from sparknlp.pretrained import PretrainedPipeline
+    >>> from sparknlp.functions import *
+    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+    >>> result = explain_document_pipeline.transform(data)
+    >>> chunks_df = map_annotations_col(
+    ...     result,
+    ...     lambda x: [
+    ...         Annotation("chunk", a.begin, a.end, a.result, a.metadata, a.embeddings)
+    ...         for a in x
+    ...     ],
+    ...     "pos",
+    ...     "pos_chunk",
+    ...     "chunk",
+    ... )
+    >>> chunks_df.selectExpr("explode(pos_chunk)").show()
+    +--------------------+
+    |                 col|
+    +--------------------+
+    |[chunk, 0, 2, NNP...|
+    |[chunk, 3, 3, ., ...|
+    |[chunk, 5, 12, JJ...|
+    |[chunk, 14, 18, N...|
+    |[chunk, 20, 24, V...|
+    |[chunk, 26, 28, I...|
+    |[chunk, 30, 36, N...|
+    |[chunk, 37, 37, ....|
+    +--------------------+
+    """
+    return dataframe.withColumn(output_column, map_annotations(f, output_type)(column).alias(output_column, metadata={
+        'annotatorType': annotatyon_type}))
+
+def map_annotations_cols(dataframe: DataFrame, f, columns: list, output_column: str, annotatyon_type: str,
+                         output_type: DataType = Annotation.arrayType()):
+    """Creates a Spark UDF to map over multiple columns of Annotation results.
+
+    Parameters
+    ----------
+    dataframe : DataFrame
+        Input DataFrame
+    f : function
+        Function to apply to the column
+    columns : list
+        Name of the input column
+    output_column : str
+        Name of the output column
+    annotatyon_type : str
+        Annotator type
+    output_type : DataType, optional
+        Output type, by default Annotation.arrayType()
+
+    Returns
+    -------
+    :class:`pyspark.sql.DataFrame`
+        Transformed DataFrame
+
+    Examples
+    --------
+    >>> from sparknlp.pretrained import PretrainedPipeline
+    >>> from sparknlp.functions import *
+    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+    >>> result = explain_document_pipeline.transform(data)
+    >>> chunks_df = map_annotations_cols(
+    ...     result,
+    ...     lambda x: [
+    ...         Annotation("tag", a.begin, a.end, a.result, a.metadata, a.embeddings)
+    ...         for a in x
+    ...     ],
+    ...     ["pos", "ner"],
+    ...     "tags",
+    ...     "chunk"
+    ... )
+    >>> chunks_df.selectExpr("explode(tags)").show(truncate=False)
+    +-------------------------------------------+
+    |col                                        |
+    +-------------------------------------------+
+    |[tag, 0, 2, NNP, [word -> U.N], []]        |
+    |[tag, 3, 3, ., [word -> .], []]            |
+    |[tag, 5, 12, JJ, [word -> official], []]   |
+    |[tag, 14, 18, NNP, [word -> Epeus], []]    |
+    |[tag, 20, 24, VBZ, [word -> heads], []]    |
+    |[tag, 26, 28, IN, [word -> for], []]       |
+    |[tag, 30, 36, NNP, [word -> Baghdad], []]  |
+    |[tag, 37, 37, ., [word -> .], []]          |
+    |[tag, 0, 2, B-ORG, [word -> U.N], []]      |
+    |[tag, 3, 3, O, [word -> .], []]            |
+    |[tag, 5, 12, O, [word -> official], []]    |
+    |[tag, 14, 18, B-PER, [word -> Ekeus], []]  |
+    |[tag, 20, 24, O, [word -> heads], []]      |
+    |[tag, 26, 28, O, [word -> for], []]        |
+    |[tag, 30, 36, B-LOC, [word -> Baghdad], []]|
+    |[tag, 37, 37, O, [word -> .], []]          |
+    +-------------------------------------------+
+    """
+    return dataframe.withColumn(output_column, map_annotations_array(f, output_type)(array(*columns)).alias(output_column, metadata={
+        'annotatorType': annotatyon_type}))
 
 
 def filter_by_annotations_col(dataframe, f, column):
+    """Applies a filter over a column of Annotations.
+
+    Parameters
+    ----------
+    dataframe : DataFrame
+        Input DataFrame
+    f : function
+        Filter function
+    column : str
+        Name of the column
+
+    Returns
+    -------
+    :class:`pyspark.sql.DataFrame`
+        Filtered DataFrame
+
+    Examples
+    --------
+    >>> from sparknlp.pretrained import PretrainedPipeline
+    >>> from sparknlp.functions import *
+    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+    >>> result = explain_document_pipeline.transform(data)
+    >>> def filter_pos(annotation: Annotation):
+    ...     return annotation.result == "NNP"
+    >>> filter_by_annotations_col(
+    ...     explode_annotations_col(result, "pos", "pos"), filter_pos, "pos"
+    ... ).select("pos").show(truncate=False)
+    +-----------------------------------------+
+    |pos                                      |
+    +-----------------------------------------+
+    |[pos, 0, 2, NNP, [word -> U.N], []]      |
+    |[pos, 14, 18, NNP, [word -> Epeus], []]  |
+    |[pos, 30, 36, NNP, [word -> Baghdad], []]|
+    +-----------------------------------------+
+    """
     this_udf = udf(
         lambda content: f(content),
         BooleanType()
@@ -31,5 +298,48 @@ def filter_by_annotations_col(dataframe, f, column):
 
 
 def explode_annotations_col(dataframe: DataFrame, column, output_column):
+    """Explodes an Annotation column, putting each result onto a separate row.
+
+    Parameters
+    ----------
+    dataframe : DataFrame
+        The Spark DataFrame containing output Annotations
+    column : str
+        Name of the column
+    output_column : str
+        Name of the output column
+
+    Returns
+    -------
+    :class:`pyspark.sql.DataFrame`
+        Transformed DataFrame
+
+    Examples
+    --------
+    >>> from sparknlp.pretrained import PretrainedPipeline
+    >>> from sparknlp.functions import *
+    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+    >>> result = explain_document_pipeline.transform(data)
+    >>> result.select("pos.result").show(truncate=False)
+    +----------------------------------+
+    |result                            |
+    +----------------------------------+
+    |[NNP, ., JJ, NNP, VBZ, IN, NNP, .]|
+    +----------------------------------+
+    >>> explode_annotations_col(result, "pos", "pos").select("pos.result").show()
+    +------+
+    |result|
+    +------+
+    |   NNP|
+    |     .|
+    |    JJ|
+    |   NNP|
+    |   VBZ|
+    |    IN|
+    |   NNP|
+    |     .|
+    +------+
+    """
     from pyspark.sql.functions import explode
     return dataframe.withColumn(output_column, explode(column))
```
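The substantive change in this file is inside the UDF bodies: Spark hands a UDF plain `Row` objects, so the new lambdas convert `Row` to `Annotation` before calling the user function and back to `Row` afterwards, letting `f` work with `Annotation` objects directly. A minimal end-to-end sketch mirroring the docstring examples above (assumes an active SparkSession `spark` and a Spark NLP installation):

```python
from sparknlp.annotation import Annotation
from sparknlp.functions import map_annotations_col
from sparknlp.pretrained import PretrainedPipeline

pipeline = PretrainedPipeline("explain_document_dl")
data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
result = pipeline.transform(data)

# f receives a list of Annotation objects, not Rows, thanks to the round-trip.
upper_tokens = map_annotations_col(
    result,
    lambda annotations: [
        Annotation("token", a.begin, a.end, a.result.upper(), a.metadata, a.embeddings)
        for a in annotations
    ],
    "token",        # input column of Annotations
    "upper_token",  # output column
    "token",        # annotator type written into the column metadata
)
upper_tokens.selectExpr("explode(upper_token.result)").show()
```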