spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, exactly as they were published to their public registries, and is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/stemmer.py
@@ -0,0 +1,79 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the Stemmer."""
+from sparknlp.common import *
+
+
+class Stemmer(AnnotatorModel):
+    """Returns hard-stems out of words with the objective of retrieving the
+    meaningful part of the word.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``TOKEN``              ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    None
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> stemmer = Stemmer() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("stem")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     tokenizer,
+    ...     stemmer
+    ... ])
+    >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]]) \\
+    ...     .toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("stem.result").show(truncate = False)
+    +-------------------------------------------------------------+
+    |result                                                       |
+    +-------------------------------------------------------------+
+    |[peter, piper, employe, ar, pick, peck, of, pickl, pepper, .]|
+    +-------------------------------------------------------------+
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.TOKEN
+
+    language = Param(Params._dummy(), "language", "stemmer algorithm", typeConverter=TypeConverters.toString)
+
+    name = "Stemmer"
+
+    @keyword_only
+    def __init__(self):
+        super(Stemmer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Stemmer")
+        self._setDefault(
+            language="english"
+        )
sparknlp/annotator/stop_words_cleaner.py
@@ -0,0 +1,190 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the StopWordsCleaner."""
+from sparknlp.common import *
+
+
+class StopWordsCleaner(AnnotatorModel):
+    """This annotator takes a sequence of strings (e.g. the output of a
+    Tokenizer, Normalizer, Lemmatizer, and Stemmer) and drops all the stop words
+    from the input sequences.
+
+    By default, it uses stop words from MLlibs `StopWordsRemover
+    <https://spark.apache.org/docs/latest/ml-features#stopwordsremover>`__. Stop
+    words can also be defined by explicitly setting them with
+    :meth:`.setStopWords` or loaded from pretrained models using ``pretrained``
+    of its companion object.
+
+
+    >>> stopWords = StopWordsCleaner.pretrained() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("cleanTokens")
+
+    This will load the default pretrained model ``"stopwords_en"``.
+
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Stop+Words+Removal>`__.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``TOKEN``              ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    stopWords
+        The words to be filtered out, by default english stopwords from Spark ML
+    caseSensitive
+        Whether to consider case, by default False
+    locale
+        Locale of the input. Ignored when case sensitive, by default locale of
+        the JVM
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentenceDetector = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> stopWords = StopWordsCleaner() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("cleanTokens") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentenceDetector,
+    ...     tokenizer,
+    ...     stopWords
+    ... ])
+    >>> data = spark.createDataFrame([
+    ...     ["This is my first sentence. This is my second."],
+    ...     ["This is my third sentence. This is my forth."]
+    ... ]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("cleanTokens.result").show(truncate=False)
+    +-------------------------------+
+    |result                         |
+    +-------------------------------+
+    |[first, sentence, ., second, .]|
+    |[third, sentence, ., forth, .] |
+    +-------------------------------+
+    """
+
+    name = "StopWordsCleaner"
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.TOKEN
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.StopWordsCleaner", java_model=None):
+        super(StopWordsCleaner, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            stopWords=StopWordsCleaner.loadDefaultStopWords("english"),
+            caseSensitive=False,
+            locale=self._java_obj.getLocale()
+        )
+
+    stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out",
+                      typeConverter=TypeConverters.toListString)
+    caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " +
+                          "comparison over the stop words", typeConverter=TypeConverters.toBoolean)
+    locale = Param(Params._dummy(), "locale", "locale of the input. ignored when case sensitive " +
+                   "is true", typeConverter=TypeConverters.toString)
+
+    def setStopWords(self, value):
+        """Sets the words to be filtered out, by default english stopwords from
+        Spark ML.
+
+        Parameters
+        ----------
+        value : List[str]
+            The words to be filtered out
+        """
+        return self._set(stopWords=value)
+
+    def setCaseSensitive(self, value):
+        """Sets whether to do a case sensitive comparison, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to do a case sensitive comparison
+        """
+        return self._set(caseSensitive=value)
+
+    def setLocale(self, value):
+        """Sets locale of the input. Ignored when case sensitive, by default
+        locale of the JVM.
+
+        Parameters
+        ----------
+        value : str
+            Locale of the input
+        """
+        return self._set(locale=value)
+
+    def loadDefaultStopWords(language="english"):
+        """Loads the default stop words for the given language.
+
+        Supported languages: danish, dutch, english, finnish, french, german,
+        hungarian, italian, norwegian, portuguese, russian, spanish, swedish,
+        turkish
+
+        Parameters
+        ----------
+        language : str, optional
+            Language stopwords to load, by default "english"
+        """
+        from pyspark.ml.wrapper import _jvm
+        stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover
+        return list(stopWordsObj.loadDefaultStopWords(language))
+
+    @staticmethod
+    def pretrained(name="stopwords_en", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "stopwords_en"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        StopWordsCleaner
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(StopWordsCleaner, name, lang, remote_loc)
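The docstring above shows the pretrained and default setups; a custom list works through the same API. A minimal sketch, assuming a Spark session and a `token` column from a `Tokenizer` as in the docstring example, that extends Spark ML's built-in English list via `loadDefaultStopWords` (the added words are illustrative):

```python
from sparknlp.annotator import StopWordsCleaner

# Start from Spark ML's built-in English stop words and extend the list
# with illustrative, domain-specific additions.
custom_stop_words = StopWordsCleaner.loadDefaultStopWords("english") + ["etc", "eg"]

cleaner = StopWordsCleaner() \
    .setInputCols(["token"]) \
    .setOutputCol("cleanTokens") \
    .setStopWords(custom_stop_words) \
    .setCaseSensitive(False)
```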
sparknlp/annotator/tf_ner_dl_graph_builder.py
@@ -0,0 +1,179 @@
+from pyspark.ml import Model, Estimator
+from pyspark.ml.util import DefaultParamsWritable, DefaultParamsReadable
+from sparknlp.common import *
+
+
+class TFNerDLGraphBuilderModel(Model, DefaultParamsWritable, DefaultParamsReadable):
+    def _transform(self, dataset):
+        return dataset
+
+
+class TFNerDLGraphBuilder(Estimator, DefaultParamsWritable, DefaultParamsReadable):
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.WORD_EMBEDDINGS]
+
+    labelColumn = Param(Params._dummy(),
+                        "labelColumn",
+                        "Labels",
+                        typeConverter=TypeConverters.toString)
+
+    inputCols = Param(Params._dummy(),
+                      "inputCols",
+                      "Input columns",
+                      typeConverter=TypeConverters.toListString)
+
+    graphFolder = Param(Params._dummy(), "graphFolder", "Folder path that contains external graph files",
+                        TypeConverters.toString)
+
+    graphFile = Param(Params._dummy(), "graphFile", "Graph file name. If empty, default name is generated.",
+                      TypeConverters.toString)
+
+    hiddenUnitsNumber = Param(Params._dummy(),
+                              "hiddenUnitsNumber",
+                              "Number of hidden units",
+                              typeConverter=TypeConverters.toInt)
+
+    def setHiddenUnitsNumber(self, value):
+        """Sets the number of hidden units for AssertionDLApproach and MedicalNerApproach
+
+        Parameters
+        ----------
+        value : int
+            Number of hidden units for AssertionDLApproach and MedicalNerApproach
+        """
+        return self._set(hiddenUnitsNumber=value)
+
+    def getHiddenUnitsNumber(self):
+        """Gets the number of hidden units for AssertionDLApproach and MedicalNerApproach."""
+        return self.getOrDefault(self.hiddenUnitsNumber)
+
+    def setLabelColumn(self, value):
+        """Sets the name of the column for data labels.
+
+        Parameters
+        ----------
+        value : str
+            Column for data labels
+        """
+        return self._set(labelColumn=value)
+
+    def getLabelColumn(self):
+        """Gets the name of the label column."""
+        return self.getOrDefault(self.labelColumn)
+
+    def setInputCols(self, *value):
+        """Sets column names of input annotations.
+
+        Parameters
+        ----------
+        *value : List[str]
+            Input columns for the annotator
+        """
+        if type(value[0]) == str or type(value[0]) == list:
+            self.inputColsValidation(value)
+            if len(value) == 1 and type(value[0]) == list:
+                return self._set(inputCols=value[0])
+            else:
+                return self._set(inputCols=list(value))
+        else:
+            raise TypeError("InputCols datatype not supported. It must be either str or list")
+
+    def inputColsValidation(self, value):
+        actual_columns = len(value)
+        if type(value[0]) == list:
+            actual_columns = len(value[0])
+
+        expected_columns = len(self.inputAnnotatorTypes)
+
+        if actual_columns != expected_columns:
+            raise TypeError(
+                f"setInputCols in {self.uid} expecting {expected_columns} columns. "
+                f"Provided column amount: {actual_columns}. "
+                f"Which should be columns from the following annotators: {self.inputAnnotatorTypes}")
+
+    def getInputCols(self):
+        """Gets current column names of input annotations."""
+        return self.getOrDefault(self.inputCols)
+
+    def setGraphFolder(self, value):
+        """Sets folder path that contains external graph files.
+
+        Parameters
+        ----------
+        value : str
+            Folder path that contains external graph files.
+        """
+        return self._set(graphFolder=value)
+
+    def getGraphFolder(self):
+        """Gets the graph folder."""
+        return self.getOrDefault(self.graphFolder)
+
+    def setGraphFile(self, value):
+        """Sets the graph file name.
+
+        Parameters
+        ----------
+        value : str
+            Graph file name. If set to "auto", then the graph builder will use the model specific default graph
+            file name.
+        """
+        return self._set(graphFile=value)
+
+    def getGraphFile(self):
+        """Gets the graph file name."""
+        return self.getOrDefault(self.graphFile)
+
+    def _fit(self, dataset):
+        from ..training.tfgraphs import tf_graph, tf_graph_1x
+
+        build_params = {}
+
+        from sparknlp.internal import _NerDLGraphBuilder
+
+        params_java = _NerDLGraphBuilder(
+            dataset,
+            self.getInputCols(),
+            self.getLabelColumn())._java_obj
+        params = list(map(int, params_java.toString().replace("(", "").replace(")", "").split(",")))
+        build_params["ntags"] = params[0]
+        build_params["embeddings_dim"] = params[1]
+        build_params["nchars"] = params[2]
+        if self.getHiddenUnitsNumber() is not None:
+            build_params["lstm_size"] = self.getHiddenUnitsNumber()
+
+        graph_file = "auto"
+        if self.getGraphFile() is not None:
+            graph_file = self.getGraphFile()
+
+        graph_folder = ""
+        if self.getGraphFolder() is not None:
+            graph_folder = self.getGraphFolder()
+
+        print("Ner DL Graph Builder configuration:")
+        print("Graph folder: {}".format(graph_folder))
+        print("Graph file name: {}".format(graph_file))
+        print("Build params: ", end="")
+        print(build_params)
+
+        try:
+            tf_graph.build("ner_dl", build_params=build_params, model_location=self.getGraphFolder(),
+                           model_filename=graph_file)
+        except Exception:
+            print("Can't build the tensorflow graph with TF 2 graph factory, attempting TF 1.15 factory")
+            try:
+                tf_graph_1x.build("ner_dl", build_params=build_params, model_location=self.getGraphFolder())
+            except Exception:
+                raise Exception("The tensorflow graphs can't be built.")
+
+        return TFNerDLGraphBuilderModel()
+
+    def __init__(self):
+        super(TFNerDLGraphBuilder, self).__init__()
+        self._setDefault(
+            labelColumn=None,
+            inputCols=None,
+            graphFolder=None,
+            graphFile=None,
+            hiddenUnitsNumber=None
+        )
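Unlike most annotators in this diff, `TFNerDLGraphBuilder` ships with no usage example: it is a plain Spark ML `Estimator` whose `_fit` inspects the training data to derive `ntags`, `embeddings_dim`, and `nchars`, writes a matching TensorFlow graph to `graphFolder`, and returns a pass-through model. A hedged sketch of the intended wiring, placing it in a `Pipeline` ahead of `NerDLApproach` and pointing both at the same folder (the column names and the `ner_graphs` path are illustrative assumptions):

```python
from pyspark.ml import Pipeline
from sparknlp.annotator import NerDLApproach, TFNerDLGraphBuilder

graph_folder = "ner_graphs"  # illustrative output folder for generated graphs

# Derives the graph dimensions from the training data and writes the graph file.
graph_builder = TFNerDLGraphBuilder() \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("label") \
    .setGraphFile("auto") \
    .setGraphFolder(graph_folder) \
    .setHiddenUnitsNumber(20)

# The NER trainer then picks up the freshly generated graph from the same folder.
ner_dl = NerDLApproach() \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("label") \
    .setOutputCol("ner") \
    .setGraphFolder(graph_folder)

pipeline = Pipeline().setStages([graph_builder, ner_dl])
# ner_model = pipeline.fit(training_data)  # training_data: an annotated DataFrame
```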
sparknlp/annotator/token/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module of annotators for text tokenization."""
+from sparknlp.annotator.token.chunk_tokenizer import *
+from sparknlp.annotator.token.recursive_tokenizer import *
+from sparknlp.annotator.token.regex_tokenizer import *
+from sparknlp.annotator.token.tokenizer import *
sparknlp/annotator/token/chunk_tokenizer.py
@@ -0,0 +1,118 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the ChunkTokenizer."""
+
+from sparknlp.common import *
+from sparknlp.annotator.token.tokenizer import Tokenizer, TokenizerModel
+
+
+class ChunkTokenizer(Tokenizer):
+    """Tokenizes and flattens extracted NER chunks.
+
+    The ChunkTokenizer will split the extracted NER ``CHUNK`` type Annotations
+    and will create ``TOKEN`` type Annotations.
+    The result is then flattened, resulting in a single array.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``CHUNK``              ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    None
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from sparknlp.common import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentenceDetector = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> entityExtractor = TextMatcher() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setEntities("src/test/resources/entity-extractor/test-chunks.txt", ReadAs.TEXT) \\
+    ...     .setOutputCol("entity")
+    >>> chunkTokenizer = ChunkTokenizer() \\
+    ...     .setInputCols(["entity"]) \\
+    ...     .setOutputCol("chunk_token")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentenceDetector,
+    ...     tokenizer,
+    ...     entityExtractor,
+    ...     chunkTokenizer
+    ... ])
+    >>> data = spark.createDataFrame([
+    ...     ["Hello world, my name is Michael, I am an artist and I work at Benezar"],
+    ...     ["Robert, an engineer from Farendell, graduated last year. The other one, Lucas, graduated last week."]
+    ... ]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("entity.result as entity", "chunk_token.result as chunk_token").show(truncate=False)
+    +-----------------------------------------------+---------------------------------------------------+
+    |entity                                         |chunk_token                                        |
+    +-----------------------------------------------+---------------------------------------------------+
+    |[world, Michael, work at Benezar]              |[world, Michael, work, at, Benezar]                |
+    |[engineer from Farendell, last year, last week]|[engineer, from, Farendell, last, year, last, week]|
+    +-----------------------------------------------+---------------------------------------------------+
+    """
+    name = 'ChunkTokenizer'
+
+    inputAnnotatorTypes = [AnnotatorType.CHUNK]
+
+    @keyword_only
+    def __init__(self):
+        super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ChunkTokenizer")
+
+    def _create_model(self, java_model):
+        return ChunkTokenizerModel(java_model=java_model)
+
+
+class ChunkTokenizerModel(TokenizerModel):
+    """Instantiated model of the ChunkTokenizer.
+
+    This is the instantiated model of the :class:`.ChunkTokenizer`.
+    For training your own model, please see the documentation of that class.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``CHUNK``              ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    None
+    """
+    name = 'ChunkTokenizerModel'
+
+    inputAnnotatorTypes = [AnnotatorType.CHUNK]
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ChunkTokenizerModel", java_model=None):
+        super(TokenizerModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+