spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff shows the changes between publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/matcher/text_matcher.py
@@ -0,0 +1,290 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the TextMatcher."""
+
+
+from sparknlp.common import *
+
+
+class TextMatcher(AnnotatorApproach):
+    """Annotator to match exact phrases (by token) provided in a file against a
+    Document.
+
+    A text file of predefined phrases must be provided with
+    :meth:`.setEntities`.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    entities
+        ExternalResource for entities
+    caseSensitive
+        Whether to match regardless of case, by default True
+    mergeOverlapping
+        Whether to merge overlapping matched chunks, by default False
+    entityValue
+        Value for the entity metadata field
+    buildFromTokens
+        Whether the TextMatcher should take the CHUNK from TOKEN or not
+
+    Examples
+    --------
+    In this example, the entities file is of the form::
+
+        ...
+        dolore magna aliqua
+        lorem ipsum dolor. sit
+        laborum
+        ...
+
+    where each line represents an entity phrase to be extracted.
+
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> data = spark.createDataFrame([["Hello dolore magna aliqua. Lorem ipsum dolor. sit in laborum"]]).toDF("text")
+    >>> entityExtractor = TextMatcher() \\
+    ...     .setInputCols(["document", "token"]) \\
+    ...     .setEntities("src/test/resources/entity-extractor/test-phrases.txt", ReadAs.TEXT) \\
+    ...     .setOutputCol("entity") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([documentAssembler, tokenizer, entityExtractor])
+    >>> results = pipeline.fit(data).transform(data)
+    >>> results.selectExpr("explode(entity) as result").show(truncate=False)
+    +------------------------------------------------------------------------------------------+
+    |result                                                                                    |
+    +------------------------------------------------------------------------------------------+
+    |[chunk, 6, 24, dolore magna aliqua, [entity -> entity, sentence -> 0, chunk -> 0], []]    |
+    |[chunk, 27, 48, Lorem ipsum dolor. sit, [entity -> entity, sentence -> 0, chunk -> 1], []]|
+    |[chunk, 53, 59, laborum, [entity -> entity, sentence -> 0, chunk -> 2], []]               |
+    +------------------------------------------------------------------------------------------+
+
+    See Also
+    --------
+    BigTextMatcher : to match large amounts of text
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    entities = Param(Params._dummy(),
+                     "entities",
+                     "ExternalResource for entities",
+                     typeConverter=TypeConverters.identity)
+
+    caseSensitive = Param(Params._dummy(),
+                          "caseSensitive",
+                          "whether to match regardless of case. Defaults true",
+                          typeConverter=TypeConverters.toBoolean)
+
+    mergeOverlapping = Param(Params._dummy(),
+                             "mergeOverlapping",
+                             "whether to merge overlapping matched chunks. Defaults false",
+                             typeConverter=TypeConverters.toBoolean)
+
+    entityValue = Param(Params._dummy(),
+                        "entityValue",
+                        "value for the entity metadata field",
+                        typeConverter=TypeConverters.toString)
+
+    buildFromTokens = Param(Params._dummy(),
+                            "buildFromTokens",
+                            "whether the TextMatcher should take the CHUNK from TOKEN or not",
+                            typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self):
+        super(TextMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.TextMatcher")
+        self._setDefault(inputCols=[AnnotatorType.DOCUMENT, AnnotatorType.TOKEN])
+        self._setDefault(caseSensitive=True)
+        self._setDefault(mergeOverlapping=False)
+
+    def _create_model(self, java_model):
+        return TextMatcherModel(java_model=java_model)
+
+    def setEntities(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
+        """Sets the external resource for the entities.
+
+        Parameters
+        ----------
+        path : str
+            Path to the external resource
+        read_as : str, optional
+            How to read the resource, by default ReadAs.TEXT
+        options : dict, optional
+            Options for reading the resource, by default {"format": "text"}
+        """
+        return self._set(entities=ExternalResource(path, read_as, options.copy()))
+
+    def setCaseSensitive(self, b):
+        """Sets whether to match regardless of case, by default True.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to match regardless of case
+        """
+        return self._set(caseSensitive=b)
+
+    def setMergeOverlapping(self, b):
+        """Sets whether to merge overlapping matched chunks, by default False.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to merge overlapping matched chunks
+        """
+        return self._set(mergeOverlapping=b)
+
+    def setEntityValue(self, b):
+        """Sets value for the entity metadata field.
+
+        Parameters
+        ----------
+        b : str
+            Value for the entity metadata field
+        """
+        return self._set(entityValue=b)
+
+    def setBuildFromTokens(self, b):
+        """Sets whether the TextMatcher should take the CHUNK from TOKEN or not.
+
+        Parameters
+        ----------
+        b : bool
+            Whether the TextMatcher should take the CHUNK from TOKEN or not
+        """
+        return self._set(buildFromTokens=b)
+
+
+class TextMatcherModel(AnnotatorModel):
+    """Instantiated model of the TextMatcher.
+
+    This is the instantiated model of the :class:`.TextMatcher`. For training
+    your own model, please see the documentation of that class.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    mergeOverlapping
+        Whether to merge overlapping matched chunks, by default False
+    entityValue
+        Value for the entity metadata field
+    buildFromTokens
+        Whether the TextMatcher should take the CHUNK from TOKEN or not
+    """
+    name = "TextMatcherModel"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    mergeOverlapping = Param(Params._dummy(),
+                             "mergeOverlapping",
+                             "whether to merge overlapping matched chunks. Defaults false",
+                             typeConverter=TypeConverters.toBoolean)
+
+    searchTrie = Param(Params._dummy(),
+                       "searchTrie",
+                       "searchTrie",
+                       typeConverter=TypeConverters.identity)
+
+    entityValue = Param(Params._dummy(),
+                        "entityValue",
+                        "value for the entity metadata field",
+                        typeConverter=TypeConverters.toString)
+
+    buildFromTokens = Param(Params._dummy(),
+                            "buildFromTokens",
+                            "whether the TextMatcher should take the CHUNK from TOKEN or not",
+                            typeConverter=TypeConverters.toBoolean)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.TextMatcherModel", java_model=None):
+        super(TextMatcherModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+
+    def setMergeOverlapping(self, b):
+        """Sets whether to merge overlapping matched chunks, by default False.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to merge overlapping matched chunks
+        """
+        return self._set(mergeOverlapping=b)
+
+    def setEntityValue(self, b):
+        """Sets value for the entity metadata field.
+
+        Parameters
+        ----------
+        b : str
+            Value for the entity metadata field
+        """
+        return self._set(entityValue=b)
+
+    def setBuildFromTokens(self, b):
+        """Sets whether the TextMatcher should take the CHUNK from TOKEN or not.
+
+        Parameters
+        ----------
+        b : bool
+            Whether the TextMatcher should take the CHUNK from TOKEN or not
+        """
+        return self._set(buildFromTokens=b)
+
+    @staticmethod
+    def pretrained(name, lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        TextMatcherModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
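For context, TextMatcherModel.pretrained() above follows Spark NLP's standard download-and-load pattern. A minimal usage sketch (not part of the diff), assuming a Spark session started via sparknlp.start(); the model name is hypothetical and used only for illustration:

    import sparknlp
    from sparknlp.annotator import TextMatcherModel

    spark = sparknlp.start()

    # "entity_matcher_sample" is a hypothetical model name; pretrained()
    # resolves the name against Spark NLP's model repository and loads it.
    matcher = TextMatcherModel.pretrained("entity_matcher_sample", lang="en") \
        .setInputCols(["document", "token"]) \
        .setOutputCol("entity") \
        .setMergeOverlapping(True)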
sparknlp/annotator/n_gram_generator.py
@@ -0,0 +1,141 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the NGramGenerator."""
+from sparknlp.common import *
+
+
+class NGramGenerator(AnnotatorModel):
+    """A feature transformer that converts the input array of strings
+    (annotatorType ``TOKEN``) into an array of n-grams (annotatorType
+    ``CHUNK``).
+
+    Null values in the input array are ignored. It returns an array of n-grams
+    where each n-gram is represented by a space-separated string of words.
+
+    When the input is empty, an empty array is returned. When the input array
+    length is less than n (number of elements per n-gram), no n-grams are
+    returned.
+
+    For more extended examples see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``TOKEN``              ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    n
+        Number of elements per n-gram (>=1), by default 2
+    enableCumulative
+        Whether to calculate just the actual n-grams, by default False
+    delimiter
+        Character to use to join the tokens, by default " "
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> nGrams = NGramGenerator() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("ngrams") \\
+    ...     .setN(2)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentence,
+    ...     tokenizer,
+    ...     nGrams
+    ... ])
+    >>> data = spark.createDataFrame([["This is my sentence."]]).toDF("text")
+    >>> results = pipeline.fit(data).transform(data)
+    >>> results.selectExpr("explode(ngrams) as result").show(truncate=False)
+    +------------------------------------------------------------+
+    |result                                                      |
+    +------------------------------------------------------------+
+    |[chunk, 0, 6, This is, [sentence -> 0, chunk -> 0], []]     |
+    |[chunk, 5, 9, is my, [sentence -> 0, chunk -> 1], []]       |
+    |[chunk, 8, 18, my sentence, [sentence -> 0, chunk -> 2], []]|
+    |[chunk, 11, 19, sentence ., [sentence -> 0, chunk -> 3], []]|
+    +------------------------------------------------------------+
+    """
+
+    name = "NGramGenerator"
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    @keyword_only
+    def __init__(self):
+        super(NGramGenerator, self).__init__(classname="com.johnsnowlabs.nlp.annotators.NGramGenerator")
+        self._setDefault(
+            n=2,
+            enableCumulative=False
+        )
+
+    n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)", typeConverter=TypeConverters.toInt)
+    enableCumulative = Param(Params._dummy(), "enableCumulative", "whether to calculate just the actual n-grams " +
+                             "or all n-grams from 1 through n", typeConverter=TypeConverters.toBoolean)
+
+    delimiter = Param(Params._dummy(), "delimiter", "String to use to join the tokens",
+                      typeConverter=TypeConverters.toString)
+
+    def setN(self, value):
+        """Sets number of elements per n-gram (>=1), by default 2.
+
+        Parameters
+        ----------
+        value : int
+            Number of elements per n-gram (>=1), by default 2
+        """
+        return self._set(n=value)
+
+    def setEnableCumulative(self, value):
+        """Sets whether to calculate just the actual n-grams, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to calculate just the actual n-grams
+        """
+        return self._set(enableCumulative=value)
+
+    def setDelimiter(self, value):
+        """Sets the character used to join the tokens.
+
+        Parameters
+        ----------
+        value : str
+            Character to use to join the tokens
+
+        Raises
+        ------
+        Exception
+            Delimiter should have length == 1
+        """
+        if len(value) > 1:
+            raise Exception("Delimiter should have length == 1")
+        return self._set(delimiter=value)
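The docstring example above shows plain bigrams only. enableCumulative switches the output to all n-grams from 1 through n, and setDelimiter changes the join character (it raises when given more than one character). A minimal sketch of those two parameters (not part of the diff), reusing the "token" column from the docstring pipeline:

    # Emit unigrams and bigrams, joined by "_" instead of a space.
    nGramsCumulative = NGramGenerator() \
        .setInputCols(["token"]) \
        .setOutputCol("ngrams") \
        .setN(2) \
        .setEnableCumulative(True) \
        .setDelimiter("_")  # a multi-character value would raise an Exception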
sparknlp/annotator/ner/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2017-2023 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module of annotators for named entity recognition."""
+from sparknlp.annotator.ner.ner_approach import *
+from sparknlp.annotator.ner.ner_converter import *
+from sparknlp.annotator.ner.ner_crf import *
+from sparknlp.annotator.ner.ner_dl import *
+from sparknlp.annotator.ner.ner_dl_graph_checker import *
+from sparknlp.annotator.ner.ner_overwriter import *
+from sparknlp.annotator.ner.zero_shot_ner_model import *
sparknlp/annotator/ner/ner_approach.py
@@ -0,0 +1,94 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains base classes for NER Annotators."""
+
+from sparknlp.common import *
+
+
+class NerApproach(Params):
+    """Base class for Ner*Approach Annotators
+    """
+    labelColumn = Param(Params._dummy(),
+                        "labelColumn",
+                        "Column with label per each token",
+                        typeConverter=TypeConverters.toString)
+
+    entities = Param(Params._dummy(), "entities", "Entities to recognize", TypeConverters.toListString)
+
+    minEpochs = Param(Params._dummy(), "minEpochs", "Minimum number of epochs to train", TypeConverters.toInt)
+
+    maxEpochs = Param(Params._dummy(), "maxEpochs", "Maximum number of epochs to train", TypeConverters.toInt)
+
+    randomSeed = Param(Params._dummy(), "randomSeed", "Random seed", TypeConverters.toInt)
+
+    def setLabelColumn(self, value):
+        """Sets name of column for data labels.
+
+        Parameters
+        ----------
+        value : str
+            Column for data labels
+        """
+        return self._set(labelColumn=value)
+
+    def setEntities(self, tags):
+        """Sets entities to recognize.
+
+        Parameters
+        ----------
+        tags : List[str]
+            List of entities
+        """
+        return self._set(entities=tags)
+
+    def setMinEpochs(self, epochs):
+        """Sets minimum number of epochs to train.
+
+        Parameters
+        ----------
+        epochs : int
+            Minimum number of epochs to train
+        """
+        return self._set(minEpochs=epochs)
+
+    def setMaxEpochs(self, epochs):
+        """Sets maximum number of epochs to train.
+
+        Parameters
+        ----------
+        epochs : int
+            Maximum number of epochs to train
+        """
+        return self._set(maxEpochs=epochs)
+
+    def setRandomSeed(self, seed):
+        """Sets random seed for shuffling.
+
+        Parameters
+        ----------
+        seed : int
+            Random seed for shuffling
+        """
+        return self._set(randomSeed=seed)
+
+    def getLabelColumn(self):
+        """Gets column for label per each token.
+
+        Returns
+        -------
+        str
+            Column with label per each token
+        """
+        return self.getOrDefault(self.labelColumn)
+
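NerApproach is a mixin rather than a standalone annotator: the trainable NER annotators in this release (ner_crf.py and ner_dl.py in the file list above) inherit these setters. A minimal sketch (not part of the diff) of how they surface on NerDLApproach, assuming a training DataFrame with a "label" column such as one produced by the CoNLL reader:

    from sparknlp.annotator import NerDLApproach

    # setLabelColumn, setMinEpochs, setMaxEpochs and setRandomSeed all
    # come from the NerApproach mixin defined above.
    nerTagger = NerDLApproach() \
        .setInputCols(["sentence", "token", "embeddings"]) \
        .setOutputCol("ner") \
        .setLabelColumn("label") \
        .setMinEpochs(1) \
        .setMaxEpochs(10) \
        .setRandomSeed(0)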
sparknlp/annotator/ner/ner_converter.py
@@ -0,0 +1,148 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the NerConverter."""
+
+from sparknlp.common import *
+
+
+class NerConverter(AnnotatorModel):
+    """Converts an IOB or IOB2 representation of NER to a user-friendly one, by
+    associating the tokens of recognized entities and their label. Results in
+    ``CHUNK`` Annotation type.
+
+    NER chunks can then be filtered by setting a whitelist with
+    ``setWhiteList``. Chunks with no associated entity (tagged "O") are
+    filtered.
+
+    See also `Inside–outside–beginning (tagging)
+    <https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>`__
+    for more information.
+
+    ================================= ======================
+    Input Annotation types            Output Annotation type
+    ================================= ======================
+    ``DOCUMENT, TOKEN, NAMED_ENTITY`` ``CHUNK``
+    ================================= ======================
+
+    Parameters
+    ----------
+    whiteList
+        If defined, list of entities to process. The rest will be ignored. Do
+        not include IOB prefix on labels
+    preservePosition
+        Whether to preserve the original position of the tokens in the original document
+        or use the modified tokens, by default `True`
+
+    Examples
+    --------
+    This is a continuation of the example of the :class:`.NerDLModel`. See that
+    class on how to extract the entities. The output of the NerDLModel follows
+    the Annotator schema and can be converted like so:
+
+    >>> result.selectExpr("explode(ner)").show(truncate=False)
+    +----------------------------------------------------+
+    |col                                                 |
+    +----------------------------------------------------+
+    |[named_entity, 0, 2, B-ORG, [word -> U.N], []]      |
+    |[named_entity, 3, 3, O, [word -> .], []]            |
+    |[named_entity, 5, 12, O, [word -> official], []]    |
+    |[named_entity, 14, 18, B-PER, [word -> Ekeus], []]  |
+    |[named_entity, 20, 24, O, [word -> heads], []]      |
+    |[named_entity, 26, 28, O, [word -> for], []]        |
+    |[named_entity, 30, 36, B-LOC, [word -> Baghdad], []]|
+    |[named_entity, 37, 37, O, [word -> .], []]          |
+    +----------------------------------------------------+
+
+    After the converter is used:
+
+    >>> converter = NerConverter() \\
+    ...     .setInputCols(["sentence", "token", "ner"]) \\
+    ...     .setOutputCol("entities")
+    >>> converter.transform(result).selectExpr("explode(entities)").show(truncate=False)
+    +------------------------------------------------------------------------+
+    |col                                                                     |
+    +------------------------------------------------------------------------+
+    |[chunk, 0, 2, U.N, [entity -> ORG, sentence -> 0, chunk -> 0], []]      |
+    |[chunk, 14, 18, Ekeus, [entity -> PER, sentence -> 0, chunk -> 1], []]  |
+    |[chunk, 30, 36, Baghdad, [entity -> LOC, sentence -> 0, chunk -> 2], []]|
+    +------------------------------------------------------------------------+
+    """
+    name = 'NerConverter'
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.NAMED_ENTITY]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    whiteList = Param(
+        Params._dummy(),
+        "whiteList",
+        "If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels",
+        typeConverter=TypeConverters.toListString
+    )
+
+    preservePosition = Param(
+        Params._dummy(),
+        "preservePosition",
+        "Whether to preserve the original position of the tokens in the original document or use the modified tokens",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    nerHasNoSchema = Param(
+        Params._dummy(),
+        "nerHasNoSchema",
+        "set this to true if your NER tags come from a model that does not have an IOB/IOB2 schema",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setWhiteList(self, entities):
+        """Sets list of entities to process. The rest will be ignored.
+
+        Does not include IOB prefix on labels.
+
+        Parameters
+        ----------
+        entities : List[str]
+            If defined, list of entities to process. The rest will be ignored.
+
+        """
+        return self._set(whiteList=entities)
+
+    def setPreservePosition(self, value):
+        """
+        Sets whether to preserve the original position of the tokens in the original document
+        or use the modified tokens, by default `True`.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to preserve the original position of the tokens in the original
+            document or use the modified tokens
+        """
+        return self._set(preservePosition=value)
+
+    def setNerHasNoSchema(self, value):
+        """
+        Set this to true if your NER tags come from a model that does not have an IOB/IOB2 schema.
+
+        Parameters
+        ----------
+        value : bool
+            Set this to true if your NER tags come from a model that does not have an IOB/IOB2 schema
+        """
+        return self._set(nerHasNoSchema=value)
+
+    @keyword_only
+    def __init__(self):
+        super(NerConverter, self).__init__(
+            classname="com.johnsnowlabs.nlp.annotators.ner.NerConverter")