spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
|
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains classes concerning Wav2Vec2ForCTC."""

from sparknlp.common import *


class Wav2Vec2ForCTC(AnnotatorModel, HasBatchedAnnotateAudio,
                     HasAudioFeatureProperties, HasEngine):
    """Speech-to-text annotator wrapping the Wav2Vec2 model with a language
    modeling head for Connectionist Temporal Classification (CTC).

    Wav2Vec2 was proposed in *wav2vec 2.0: A Framework for Self-Supervised
    Learning of Speech Representations* by Alexei Baevski, Henry Zhou,
    Abdelrahman Mohamed, and Michael Auli.

    The annotator transcribes audio to text. Audio input must already be
    pre-processed into an array of floats.

    Note that this annotator is currently not supported on Apple Silicon
    processors (e.g. the M1), since the processor lacks the instructions
    required for XLA.

    Pretrained models can be loaded with :meth:`.pretrained`:

    >>> speechToText = Wav2Vec2ForCTC.pretrained() \\
    ...     .setInputCols(["audio_assembler"]) \\
    ...     .setOutputCol("text")

    If no name is provided, the default model ``"asr_wav2vec2_base_960h"``
    is used. For available pretrained models please see the
    `Models Hub <https://sparknlp.org/models>`__.

    To see which models are compatible and how to import them see
    https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and for
    extended examples see
    `Wav2Vec2ForCTCTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTCTestSpec.scala>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``AUDIO``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Size of each batch, by default 2

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> audioAssembler = AudioAssembler() \\
    ...     .setInputCol("audio_content") \\
    ...     .setOutputCol("audio_assembler")
    >>> speechToText = Wav2Vec2ForCTC \\
    ...     .pretrained() \\
    ...     .setInputCols(["audio_assembler"]) \\
    ...     .setOutputCol("text")
    >>> pipeline = Pipeline().setStages([audioAssembler, speechToText])
    >>> processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")
    >>> result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)
    >>> result.select("text.result").show(truncate = False)
    +------------------------------------------------------------------------------------------+
    |result                                                                                    |
    +------------------------------------------------------------------------------------------+
    |[MISTER QUILTER IS THE APOSTLE OF THE MIDLE CLASES AND WE ARE GLAD TO WELCOME HIS GOSPEL ]|
    +------------------------------------------------------------------------------------------+
    """

    # Unique annotator identifier used by Spark NLP.
    name = "Wav2Vec2ForCTC"

    inputAnnotatorTypes = [AnnotatorType.AUDIO]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    configProtoBytes = Param(
        Params._dummy(),
        "configProtoBytes",
        "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
        TypeConverters.toListInt)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.audio.Wav2Vec2ForCTC",
                 java_model=None):
        super(Wav2Vec2ForCTC, self).__init__(classname=classname, java_model=java_model)
        # Process two audio annotations per batch unless the user overrides it.
        self._setDefault(batchSize=2)

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        Wav2Vec2ForCTC
            The restored model
        """
        # Imported lazily to avoid a circular import at module load time.
        from sparknlp.internal import _Wav2Vec2ForCTC
        java_obj = _Wav2Vec2ForCTC(folder, spark_session._jsparkSession)._java_obj
        return Wav2Vec2ForCTC(java_model=java_obj)

    @staticmethod
    def pretrained(name="asr_wav2vec2_base_960h", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default
            "asr_wav2vec2_base_960h"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        Wav2Vec2ForCTC
            The restored model
        """
        # Imported lazily to avoid a circular import at module load time.
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(Wav2Vec2ForCTC, name, lang, remote_loc)
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# Copyright 2017-2022 John Snow Labs
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Contains classes concerning WhisperForCTC."""
|
|
16
|
+
|
|
17
|
+
from sparknlp.common import *
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class WhisperForCTC(AnnotatorModel,
|
|
21
|
+
HasBatchedAnnotateAudio,
|
|
22
|
+
HasAudioFeatureProperties,
|
|
23
|
+
HasEngine, HasGeneratorProperties):
|
|
24
|
+
"""Whisper Model with a language modeling head on top for Connectionist Temporal Classification
|
|
25
|
+
(CTC).
|
|
26
|
+
|
|
27
|
+
Whisper is an automatic speech recognition (ASR) system trained on 680,000 hours of
|
|
28
|
+
multilingual and multitask supervised data collected from the web. It transcribe in multiple
|
|
29
|
+
languages, as well as translate from those languages into English.
|
|
30
|
+
|
|
31
|
+
The audio needs to be provided pre-processed an array of floats.
|
|
32
|
+
|
|
33
|
+
Note that at the moment, this annotator only supports greedy search and only Spark Versions
|
|
34
|
+
3.4 and up are supported.
|
|
35
|
+
|
|
36
|
+
For multilingual models, the language and the task (transcribe or translate) can be set with
|
|
37
|
+
``setLanguage`` and ``setTask``.
|
|
38
|
+
|
|
39
|
+
Pretrained models can be loaded with ``pretrained`` of the companion object:
|
|
40
|
+
|
|
41
|
+
.. code-block:: python
|
|
42
|
+
|
|
43
|
+
speechToText = WhisperForCTC.pretrained() \\
|
|
44
|
+
.setInputCols(["audio_assembler"]) \\
|
|
45
|
+
.setOutputCol("text")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
The default model is ``"asr_whisper_tiny_opt"``, if no name is provided.
|
|
49
|
+
|
|
50
|
+
For available pretrained models please see the `Models Hub <https://sparknlp.org/models>`__.
|
|
51
|
+
|
|
52
|
+
To see which models are compatible and how to import them see
|
|
53
|
+
https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended
|
|
54
|
+
examples, see
|
|
55
|
+
`WhisperForCTCTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala>`__.
|
|
56
|
+
|
|
57
|
+
**References:**
|
|
58
|
+
|
|
59
|
+
`Robust Speech Recognition via Large-Scale Weak Supervision <https://arxiv.org/abs/2212.04356>`__
|
|
60
|
+
|
|
61
|
+
**Paper Abstract:**
|
|
62
|
+
|
|
63
|
+
*We study the capabilities of speech processing systems trained simply to predict large
|
|
64
|
+
amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual
|
|
65
|
+
and multitask supervision, the resulting models generalize well to standard benchmarks and are
|
|
66
|
+
often competitive with prior fully supervised results but in a zero- shot transfer setting
|
|
67
|
+
without the need for any fine- tuning. When compared to humans, the models approach their
|
|
68
|
+
accuracy and robustness. We are releasing models and inference code to serve as a foundation
|
|
69
|
+
for further work on robust speech processing.*
|
|
70
|
+
|
|
71
|
+
====================== ======================
|
|
72
|
+
Input Annotation types Output Annotation type
|
|
73
|
+
====================== ======================
|
|
74
|
+
``AUDIO`` ``DOCUMENT``
|
|
75
|
+
====================== ======================
|
|
76
|
+
|
|
77
|
+
Parameters
|
|
78
|
+
----------
|
|
79
|
+
task
|
|
80
|
+
The formatted task for the audio. Either `<|translate|>` or `<|transcribe|>`.
|
|
81
|
+
language
|
|
82
|
+
The language for the audio, formatted to e.g. `<|en|>`. Check the model description for
|
|
83
|
+
supported languages.
|
|
84
|
+
isMultilingual
|
|
85
|
+
Whether the model is multilingual
|
|
86
|
+
minOutputLength
|
|
87
|
+
Minimum length of the sequence to be generated
|
|
88
|
+
maxOutputLength
|
|
89
|
+
Maximum length of output text
|
|
90
|
+
doSample
|
|
91
|
+
Whether or not to use sampling; use greedy decoding otherwise
|
|
92
|
+
temperature
|
|
93
|
+
The value used to module the next token probabilities
|
|
94
|
+
topK
|
|
95
|
+
The number of highest probability vocabulary tokens to keep for top-k-filtering
|
|
96
|
+
topP
|
|
97
|
+
If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are
|
|
98
|
+
kept for generation
|
|
99
|
+
repetitionPenalty
|
|
100
|
+
The parameter for repetition penalty. 1.0 means no penalty.
|
|
101
|
+
See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details
|
|
102
|
+
noRepeatNgramSize
|
|
103
|
+
If set to int > 0, all ngrams of that size can only occur once
|
|
104
|
+
beamSize
|
|
105
|
+
The Number of beams for beam search
|
|
106
|
+
|
|
107
|
+
Examples
|
|
108
|
+
--------
|
|
109
|
+
>>> import sparknlp
|
|
110
|
+
>>> from sparknlp.base import *
|
|
111
|
+
>>> from sparknlp.annotator import *
|
|
112
|
+
>>> from pyspark.ml import Pipeline
|
|
113
|
+
>>> audioAssembler = AudioAssembler() \\
|
|
114
|
+
... .setInputCol("audio_content") \\
|
|
115
|
+
... .setOutputCol("audio_assembler")
|
|
116
|
+
>>> speechToText = WhisperForCTC.pretrained() \\
|
|
117
|
+
... .setInputCols(["audio_assembler"]) \\
|
|
118
|
+
... .setOutputCol("text")
|
|
119
|
+
>>> pipeline = Pipeline().setStages([audioAssembler, speechToText])
|
|
120
|
+
>>> processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")
|
|
121
|
+
>>> result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)
|
|
122
|
+
>>> result.select("text.result").show(truncate = False)
|
|
123
|
+
+------------------------------------------------------------------------------------------+
|
|
124
|
+
|result |
|
|
125
|
+
+------------------------------------------------------------------------------------------+
|
|
126
|
+
|[ Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.]|
|
|
127
|
+
+------------------------------------------------------------------------------------------+
|
|
128
|
+
"""
|
|
129
|
+
name = "WhisperForCTC"
|
|
130
|
+
|
|
131
|
+
inputAnnotatorTypes = [AnnotatorType.AUDIO]
|
|
132
|
+
|
|
133
|
+
outputAnnotatorType = AnnotatorType.DOCUMENT
|
|
134
|
+
|
|
135
|
+
configProtoBytes = Param(Params._dummy(),
|
|
136
|
+
"configProtoBytes",
|
|
137
|
+
"ConfigProto from tensorflow, serialized into byte array. Get with "
|
|
138
|
+
"config_proto.SerializeToString()",
|
|
139
|
+
TypeConverters.toListInt)
|
|
140
|
+
|
|
141
|
+
language = Param(Params._dummy(), "language", "Optional parameter to set the language for the transcription.",
|
|
142
|
+
typeConverter=TypeConverters.toString)
|
|
143
|
+
|
|
144
|
+
isMultilingual = Param(Params._dummy(), "isMultilingual", "Whether the model is multilingual.",
|
|
145
|
+
typeConverter=TypeConverters.toBoolean)
|
|
146
|
+
|
|
147
|
+
def setConfigProtoBytes(self, b):
    """Sets the serialized TensorFlow ``ConfigProto`` for the session.

    Parameters
    ----------
    b : List[int]
        ``ConfigProto`` serialized into a byte array, e.g. obtained with
        ``config_proto.SerializeToString()``
    """
    return self._set(configProtoBytes=b)
|
|
156
|
+
|
|
157
|
+
def getLanguage(self):
    """Gets the language for the transcription."""
    return self.getOrDefault(self.language)
|
|
160
|
+
|
|
161
|
+
def getIsMultilingual(self):
    """Returns whether the loaded model supports multiple languages."""
    return self.getOrDefault(self.isMultilingual)
|
|
164
|
+
|
|
165
|
+
def setLanguage(self, value):
    """Sets the language token for the audio, formatted like ``<|en|>``.

    Refer to the model description for the languages it supports.

    Parameters
    ----------
    value : String
        Formatted language code
    """
    return self._call_java("setLanguage", value)
|
|
175
|
+
|
|
176
|
+
def setTask(self, value):
    """Sets the formatted task token: either ``<|translate|>`` or
    ``<|transcribe|>``.

    Note that translation is only available on multilingual models.

    Parameters
    ----------
    value : String
        Formatted task
    """
    return self._call_java("setTask", value)
|
|
187
|
+
|
|
188
|
+
@keyword_only
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.audio.WhisperForCTC",
             java_model=None):
    """Creates the annotator, wrapping either a fresh JVM-side instance of
    `classname` or an already-loaded `java_model` (as used by
    :meth:`loadSavedModel` and :meth:`pretrained`).
    """
    super(WhisperForCTC, self).__init__(
        classname=classname,
        java_model=java_model
    )
    # Generation defaults: doSample=False together with topK=1, topP=1.0,
    # beamSize=1 and nReturnSequences=1 amounts to plain greedy decoding
    # with no sampling and a single returned sequence.
    self._setDefault(
        minOutputLength=0,
        maxOutputLength=448,  # presumably Whisper's decoder limit — TODO confirm
        doSample=False,
        temperature=1.0,
        topK=1,
        topP=1.0,
        repetitionPenalty=1.0,  # 1.0 = no repetition penalty applied
        noRepeatNgramSize=0,  # 0 = no n-gram blocking
        batchSize=2,
        beamSize=1,
        nReturnSequences=1,
        isMultilingual=True,
    )
|
|
209
|
+
|
|
210
|
+
@staticmethod
def loadSavedModel(folder, spark_session):
    """Loads a locally saved model.

    Parameters
    ----------
    folder : str
        Folder of the saved model
    spark_session : pyspark.sql.SparkSession
        The current SparkSession

    Returns
    -------
    WhisperForCTC
        The restored model
    """
    # Imported lazily to avoid a circular import at module load time.
    from sparknlp.internal import _WhisperForCTC
    loader = _WhisperForCTC(folder, spark_session._jsparkSession)
    return WhisperForCTC(java_model=loader._java_obj)
|
|
229
|
+
|
|
230
|
+
@staticmethod
def pretrained(name="asr_whisper_tiny_opt", lang="xx", remote_loc=None):
    """Downloads and loads a pretrained model.

    Parameters
    ----------
    name : str, optional
        Name of the pretrained model, by default
        "asr_whisper_tiny_opt"
    lang : str, optional
        Language of the pretrained model, by default "xx"
    remote_loc : str, optional
        Optional remote address of the resource, by default None. Will use
        Spark NLPs repositories otherwise.

    Returns
    -------
    WhisperForCTC
        The restored model
    """
    # Imported lazily to avoid a circular import at module load time.
    from sparknlp.pretrained import ResourceDownloader
    return ResourceDownloader.downloadModel(WhisperForCTC, name, lang, remote_loc)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright 2017-2022 John Snow Labs
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
"""Contains classes for Chunk2Doc."""
|
|
15
|
+
|
|
16
|
+
from pyspark import keyword_only
|
|
17
|
+
|
|
18
|
+
from sparknlp.common import AnnotatorProperties
|
|
19
|
+
from sparknlp.common.annotator_type import AnnotatorType
|
|
20
|
+
from sparknlp.internal import AnnotatorTransformer
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties):
    """Converts a ``CHUNK`` type column back into ``DOCUMENT``.

    This is handy whenever a ``CHUNK`` result needs to be re-tokenized or
    analyzed further downstream.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``CHUNK``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    None

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.pretrained import PretrainedPipeline

    Location entities are extracted and converted back into ``DOCUMENT`` type for
    further processing.

    >>> data = spark.createDataFrame([[1, "New York and New Jersey aren't that far apart actually."]]).toDF("id", "text")

    Define pretrained pipeline that extracts Named Entities amongst other things
    and apply `Chunk2Doc` on it.

    >>> pipeline = PretrainedPipeline("explain_document_dl")
    >>> chunkToDoc = Chunk2Doc().setInputCols("entities").setOutputCol("chunkConverted")
    >>> explainResult = pipeline.transform(data)

    Show results.

    >>> result = chunkToDoc.transform(explainResult)
    >>> result.selectExpr("explode(chunkConverted)").show(truncate=False)
    +------------------------------------------------------------------------------+
    |col                                                                           |
    +------------------------------------------------------------------------------+
    |[document, 0, 7, New York, [entity -> LOC, sentence -> 0, chunk -> 0], []]    |
    |[document, 13, 22, New Jersey, [entity -> LOC, sentence -> 0, chunk -> 1], []]|
    +------------------------------------------------------------------------------+

    See Also
    --------
    Doc2Chunk : for converting `DOCUMENT` annotations to `CHUNK`
    """

    name = "Chunk2Doc"

    inputAnnotatorTypes = [AnnotatorType.CHUNK]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    @keyword_only
    def __init__(self):
        super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunk2Doc")

    @keyword_only
    def setParams(self):
        # Forward whatever keyword arguments were captured by @keyword_only.
        return self._set(**self._input_kwargs)
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# Copyright 2017-2022 John Snow Labs
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
"""Contains classes for the Chunker."""
|
|
15
|
+
from sparknlp.common import *
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Chunker(AnnotatorModel):
    """Matches a pattern of part-of-speech tags to extract meaningful phrases
    from a document. The part-of-speech tags found for a sentence are mapped
    onto it and can then be matched with regular expressions. Each tag is
    wrapped in angle brackets ``<>`` so it is easy to tell apart from the
    text itself.

    For example, this sentence results in the form:

    .. code-block:: none

        "Peter Pipers employees are picking pecks of pickled peppers."
        "<NNP><NNP><NNS><VBP><VBG><NNS><IN><JJ><NNS><.>"


    These tags are then extracted by setting ``regexParsers``, e.g.:

    >>> chunker = Chunker() \\
    ...     .setInputCols(["sentence", "pos"]) \\
    ...     .setOutputCol("chunk") \\
    ...     .setRegexParsers(["<NNP>+", "<NNS>+"])

    Inside the regular expressions, tags enclosed in angle brackets act as
    groups, so ``"<NNP>+"`` here matches one or more proper nouns in a row.

    For more extended examples see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, POS``      ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    regexParsers
        An array of grammar based chunk parsers

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols("document") \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> POSTag = PerceptronModel.pretrained() \\
    ...     .setInputCols("document", "token") \\
    ...     .setOutputCol("pos")
    >>> chunker = Chunker() \\
    ...     .setInputCols("sentence", "pos") \\
    ...     .setOutputCol("chunk") \\
    ...     .setRegexParsers(["<NNP>+", "<NNS>+"])
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...       documentAssembler,
    ...       sentence,
    ...       tokenizer,
    ...       POSTag,
    ...       chunker
    ...     ])
    >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(chunk) as result").show(truncate=False)
    +-------------------------------------------------------------+
    |result                                                       |
    +-------------------------------------------------------------+
    |[chunk, 0, 11, Peter Pipers, [sentence -> 0, chunk -> 0], []]|
    |[chunk, 13, 21, employees, [sentence -> 0, chunk -> 1], []]  |
    |[chunk, 35, 39, pecks, [sentence -> 0, chunk -> 2], []]      |
    |[chunk, 52, 58, peppers, [sentence -> 0, chunk -> 3], []]    |
    +-------------------------------------------------------------+

    See Also
    --------
    PerceptronModel : for Part-Of-Speech tagging
    """
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS]

    outputAnnotatorType = AnnotatorType.CHUNK

    # Grammar-based chunk parser patterns; POS classes are written in angle
    # brackets and treated as groups (see setRegexParsers).
    regexParsers = Param(Params._dummy(),
                         "regexParsers",
                         "an array of grammar based chunk parsers",
                         typeConverter=TypeConverters.toListString)

    name = "Chunker"

    @keyword_only
    def __init__(self):
        super(Chunker, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunker")

    def setRegexParsers(self, value):
        """Sets an array of grammar based chunk parsers.

        POS classes should be enclosed in angle brackets, then treated as
        groups.

        Parameters
        ----------
        value : List[str]
            Array of grammar based chunk parsers


        Examples
        --------
        >>> chunker = Chunker() \\
        ...     .setInputCols("sentence", "pos") \\
        ...     .setOutputCol("chunk") \\
        ...     .setRegexParsers(["<NNP>+", "<NNS>+"])
        """
        return self._set(regexParsers=value)
|