spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/token/tokenizer.py
@@ -0,0 +1,561 @@
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the Tokenizer."""


from sparknlp.common import *


class Tokenizer(AnnotatorApproach):
    """Tokenizes raw text in document type columns into TokenizedSentence.

    This class represents a non-fitted tokenizer. Fitting it will cause the
    internal RuleFactory to construct the rules for tokenizing from the input
    configuration.

    Identifies tokens with tokenization open standards. A few rules will help
    customize it if the defaults do not fit the user's needs.

    For extended examples of usage see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    targetPattern
        Pattern to grab from text as token candidates, by default ``\\S+``
    prefixPattern
        Regex with groups and begins with ``\\A`` to match target prefix, by
        default ``\\A([^\\s\\w\\$\\.]*)``
    suffixPattern
        Regex with groups and ends with ``\\z`` to match target suffix, by
        default ``([^\\s\\w]?)([^\\s\\w]*)\\z``
    infixPatterns
        Regex patterns that match tokens within a single target. Groups
        identify different sub-tokens. Multiple defaults are provided.
    exceptions
        Words that won't be affected by tokenization rules
    exceptionsPath
        Path to file containing list of exceptions
    caseSensitiveExceptions
        Whether to care for case sensitiveness in exceptions, by default True
    contextChars
        Character list used to separate from token boundaries, by default ['.',
        ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"]
    splitPattern
        Pattern to separate from the inside of tokens. Takes priority over
        splitChars.
    splitChars
        Character list used to separate from the inside of tokens
    minLength
        Set the minimum allowed length for each token, by default 0
    maxLength
        Set the maximum allowed length for each token, by default 99999

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> data = spark.createDataFrame([["I'd like to say we didn't expect that. Jane's boyfriend."]]).toDF("text")
    >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token").fit(data)
    >>> pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)
    >>> result = pipeline.transform(data)
    >>> result.selectExpr("token.result").show(truncate=False)
    +-----------------------------------------------------------------------+
    |output                                                                 |
    +-----------------------------------------------------------------------+
    |[I'd, like, to, say, we, didn't, expect, that, ., Jane's, boyfriend, .]|
    +-----------------------------------------------------------------------+
    """

    name = 'Tokenizer'

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.TOKEN

    targetPattern = Param(Params._dummy(),
                          "targetPattern",
                          "pattern to grab from text as token candidates. Defaults \S+",
                          typeConverter=TypeConverters.toString)

    prefixPattern = Param(Params._dummy(),
                          "prefixPattern",
                          "regex with groups and begins with \A to match target prefix. Defaults to \A([^\s\w\$\.]*)",
                          typeConverter=TypeConverters.toString)

    suffixPattern = Param(Params._dummy(),
                          "suffixPattern",
                          "regex with groups and ends with \z to match target suffix. Defaults to ([^\s\w]?)([^\s\w]*)\z",
                          typeConverter=TypeConverters.toString)

    infixPatterns = Param(Params._dummy(),
                          "infixPatterns",
                          "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults",
                          typeConverter=TypeConverters.toListString)

    exceptions = Param(Params._dummy(),
                       "exceptions",
                       "Words that won't be affected by tokenization rules",
                       typeConverter=TypeConverters.toListString)

    exceptionsPath = Param(Params._dummy(),
                           "exceptionsPath",
                           "path to file containing list of exceptions",
                           typeConverter=TypeConverters.identity)

    caseSensitiveExceptions = Param(Params._dummy(),
                                    "caseSensitiveExceptions",
                                    "Whether to care for case sensitiveness in exceptions",
                                    typeConverter=TypeConverters.toBoolean)

    contextChars = Param(Params._dummy(),
                         "contextChars",
                         "character list used to separate from token boundaries",
                         typeConverter=TypeConverters.toListString)

    splitPattern = Param(Params._dummy(),
                         "splitPattern",
                         "character list used to separate from the inside of tokens",
                         typeConverter=TypeConverters.toString)

    splitChars = Param(Params._dummy(),
                       "splitChars",
                       "character list used to separate from the inside of tokens",
                       typeConverter=TypeConverters.toListString)

    minLength = Param(Params._dummy(),
                      "minLength",
                      "Set the minimum allowed length for each token",
                      typeConverter=TypeConverters.toInt)

    maxLength = Param(Params._dummy(),
                      "maxLength",
                      "Set the maximum allowed length for each token",
                      typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Tokenizer")
        self._setDefault(
            targetPattern="\\S+",
            contextChars=[".", ",", ";", ":", "!", "?", "*", "-", "(", ")", "\"", "'"],
            caseSensitiveExceptions=True,
            minLength=0,
            maxLength=99999
        )

    def getInfixPatterns(self):
        """Gets regex patterns that match tokens within a single target. Groups
        identify different sub-tokens.

        Returns
        -------
        List[str]
            The infix patterns
        """
        return self.getOrDefault("infixPatterns")

    def getSuffixPattern(self):
        """Gets regex with groups and ends with ``\\z`` to match target suffix.

        Returns
        -------
        str
            The suffix pattern
        """
        return self.getOrDefault("suffixPattern")

    def getPrefixPattern(self):
        """Gets regex with groups and begins with ``\\A`` to match target
        prefix.

        Returns
        -------
        str
            The prefix pattern
        """
        return self.getOrDefault("prefixPattern")

    def getContextChars(self):
        """Gets character list used to separate from token boundaries.

        Returns
        -------
        List[str]
            Character list used to separate from token boundaries
        """
        return self.getOrDefault("contextChars")

    def getSplitChars(self):
        """Gets character list used to separate from the inside of tokens.

        Returns
        -------
        List[str]
            Character list used to separate from the inside of tokens
        """
        return self.getOrDefault("splitChars")

    def setTargetPattern(self, value):
        """Sets pattern to grab from text as token candidates, by default
        ``\\S+``.

        Parameters
        ----------
        value : str
            Pattern to grab from text as token candidates
        """
        return self._set(targetPattern=value)

    def setPrefixPattern(self, value):
        """Sets regex with groups and begins with ``\\A`` to match target prefix, by
        default ``\\A([^\\s\\w\\$\\.]*)``.

        Parameters
        ----------
        value : str
            Regex with groups and begins with ``\\A`` to match target prefix
        """
        return self._set(prefixPattern=value)

    def setSuffixPattern(self, value):
        """Sets regex with groups and ends with ``\\z`` to match target suffix,
        by default ``([^\\s\\w]?)([^\\s\\w]*)\\z``.

        Parameters
        ----------
        value : str
            Regex with groups and ends with ``\\z`` to match target suffix
        """
        return self._set(suffixPattern=value)

    def setInfixPatterns(self, value):
        """Sets regex patterns that match tokens within a single target. Groups
        identify different sub-tokens.

        Parameters
        ----------
        value : List[str]
            Regex patterns that match tokens within a single target
        """
        return self._set(infixPatterns=value)

    def addInfixPattern(self, value):
        """Adds an additional regex pattern that match tokens within a single
        target. Groups identify different sub-tokens.

        Parameters
        ----------
        value : str
            Regex pattern that match tokens within a single target
        """
        try:
            infix_patterns = self.getInfixPatterns()
        except KeyError:
            infix_patterns = []
        infix_patterns.insert(0, value)
        return self._set(infixPatterns=infix_patterns)

    def setExceptions(self, value):
        """Sets words that won't be affected by tokenization rules.

        Parameters
        ----------
        value : List[str]
            Words that won't be affected by tokenization rules
        """
        return self._set(exceptions=value)

    def getExceptions(self):
        """Gets words that won't be affected by tokenization rules.

        Returns
        -------
        List[str]
            Words that won't be affected by tokenization rules
        """
        return self.getOrDefault("exceptions")

    def setExceptionsPath(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
        """Path to txt file with list of token exceptions

        Parameters
        ----------
        path : str
            Path to the source file
        read_as : str, optional
            How to read the file, by default ReadAs.TEXT
        options : dict, optional
            Options to read the resource, by default {"format": "text"}
        """
        opts = options.copy()
        return self._set(exceptionsPath=ExternalResource(path, read_as, opts))

    def addException(self, value):
        """Adds an additional word that won't be affected by tokenization rules.

        Parameters
        ----------
        value : str
            Additional word that won't be affected by tokenization rules
        """
        try:
            exception_tokens = self.getExceptions()
        except KeyError:
            exception_tokens = []
        exception_tokens.append(value)
        return self._set(exceptions=exception_tokens)

    def setCaseSensitiveExceptions(self, value):
        """Sets whether to care for case sensitiveness in exceptions, by default
        True.

        Parameters
        ----------
        value : bool
            Whether to care for case sensitiveness in exceptions
        """
        return self._set(caseSensitiveExceptions=value)

    def getCaseSensitiveExceptions(self):
        """Gets whether to care for case sensitiveness in exceptions.

        Returns
        -------
        bool
            Whether to care for case sensitiveness in exceptions
        """
        return self.getOrDefault("caseSensitiveExceptions")

    def setContextChars(self, value):
        """Sets character list used to separate from token boundaries, by
        default ['.', ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"].

        Parameters
        ----------
        value : List[str]
            Character list used to separate from token boundaries
        """
        return self._set(contextChars=value)

    def addContextChars(self, value):
        """Adds an additional character to the list used to separate from token
        boundaries.

        Parameters
        ----------
        value : str
            Additional context character
        """
        try:
            context_chars = self.getContextChars()
        except KeyError:
            context_chars = []
        context_chars.append(value)
        return self._set(contextChars=context_chars)

    def setSplitPattern(self, value):
        """Sets pattern to separate from the inside of tokens. Takes priority
        over splitChars.

        Parameters
        ----------
        value : str
            Pattern used to separate from the inside of tokens
        """
        return self._set(splitPattern=value)

    def setSplitChars(self, value):
        """Sets character list used to separate from the inside of tokens.

        Parameters
        ----------
        value : List[str]
            Character list used to separate from the inside of tokens
        """
        return self._set(splitChars=value)

    def addSplitChars(self, value):
        """Adds an additional character to separate from the inside of tokens.

        Parameters
        ----------
        value : str
            Additional character to separate from the inside of tokens
        """
        try:
            split_chars = self.getSplitChars()
        except KeyError:
            split_chars = []
        split_chars.append(value)
        return self._set(splitChars=split_chars)

    def setMinLength(self, value):
        """Sets the minimum allowed length for each token, by default 0.

        Parameters
        ----------
        value : int
            Minimum allowed length for each token
        """
        return self._set(minLength=value)

    def setMaxLength(self, value):
        """Sets the maximum allowed length for each token, by default 99999.

        Parameters
        ----------
        value : int
            Maximum allowed length for each token
        """
        return self._set(maxLength=value)

    def _create_model(self, java_model):
        return TokenizerModel(java_model=java_model)


class TokenizerModel(AnnotatorModel):
    """Tokenizes raw text into word pieces, tokens. Identifies tokens with
    tokenization open standards. A few rules will help customize it if the
    defaults do not fit the user's needs.

    This class represents an already fitted :class:`.Tokenizer`.

    See the main class Tokenizer for more examples of usage.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    splitPattern
        Character list used to separate from the inside of tokens
    splitChars
        Character list used to separate from the inside of tokens
    """
    name = "TokenizerModel"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.TOKEN

    exceptions = Param(Params._dummy(),
                       "exceptions",
                       "Words that won't be affected by tokenization rules",
                       typeConverter=TypeConverters.toListString)

    caseSensitiveExceptions = Param(Params._dummy(),
                                    "caseSensitiveExceptions",
                                    "Whether to care for case sensitiveness in exceptions",
                                    typeConverter=TypeConverters.toBoolean)

    targetPattern = Param(Params._dummy(),
                          "targetPattern",
                          "pattern to grab from text as token candidates. Defaults \S+",
                          typeConverter=TypeConverters.toString)

    rules = Param(Params._dummy(),
                  "rules",
                  "Rules structure factory containing pre processed regex rules",
                  typeConverter=TypeConverters.identity)

    splitPattern = Param(Params._dummy(),
                         "splitPattern",
                         "character list used to separate from the inside of tokens",
                         typeConverter=TypeConverters.toString)

    splitChars = Param(Params._dummy(),
                       "splitChars",
                       "character list used to separate from the inside of tokens",
                       typeConverter=TypeConverters.toListString)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.TokenizerModel", java_model=None):
        super(TokenizerModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            targetPattern="\\S+",
            caseSensitiveExceptions=True
        )

    def setSplitPattern(self, value):
        """Sets pattern to separate from the inside of tokens. Takes priority
        over splitChars.

        Parameters
        ----------
        value : str
            Pattern used to separate from the inside of tokens
        """
        return self._set(splitPattern=value)

    def setSplitChars(self, value):
        """Sets character list used to separate from the inside of tokens.

        Parameters
        ----------
        value : List[str]
            Character list used to separate from the inside of tokens
        """
        return self._set(splitChars=value)

    def addSplitChars(self, value):
        """Adds an additional character to separate from the inside of tokens.

        Parameters
        ----------
        value : str
            Additional character to separate from the inside of tokens
        """
        try:
            split_chars = self.getSplitChars()
        except KeyError:
            split_chars = []
        split_chars.append(value)
        return self._set(splitChars=split_chars)

    @staticmethod
    def pretrained(name="token_rules", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "token_rules"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        TokenizerModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(TokenizerModel, name, lang, remote_loc)
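
The Tokenizer above is configured entirely through the setters documented in its docstrings. A minimal usage sketch follows (it is not part of the package contents); it assumes Spark NLP is installed, that sparknlp.start() can create a session, and that the sample text, the "New York" exception, and the length threshold are illustrative choices only:

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from pyspark.ml import Pipeline

spark = sparknlp.start()

data = spark.createDataFrame([["Dr. Smith moved to New York in mid-2021."]]).toDF("text")

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

# Keep "New York" together as a single token, match exceptions case-insensitively,
# additionally split tokens on hyphens, and drop tokens shorter than two characters.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
    .addException("New York")
    .setCaseSensitiveExceptions(False)
    .setSplitChars(["-"])
    .setMinLength(2)
)

pipeline = Pipeline(stages=[document_assembler, tokenizer]).fit(data)
pipeline.transform(data).selectExpr("token.result").show(truncate=False)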
sparknlp/annotator/token2_chunk.py
@@ -0,0 +1,76 @@
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for Token2Chunk."""


from sparknlp.common import *


class Token2Chunk(AnnotatorModel):
    """Converts ``TOKEN`` type Annotations to ``CHUNK`` type.

    This can be useful if entities have already been extracted as ``TOKEN``
    and following annotators require ``CHUNK`` types.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    None

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> token2chunk = Token2Chunk() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("chunk")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     token2chunk
    ... ])
    >>> data = spark.createDataFrame([["One Two Three Four"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(chunk) as result").show(truncate=False)
    +------------------------------------------+
    |result                                    |
    +------------------------------------------+
    |[chunk, 0, 2, One, [sentence -> 0], []]   |
    |[chunk, 4, 6, Two, [sentence -> 0], []]   |
    |[chunk, 8, 12, Three, [sentence -> 0], []]|
    |[chunk, 14, 17, Four, [sentence -> 0], []]|
    +------------------------------------------+
    """
    name = "Token2Chunk"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.CHUNK

    def __init__(self):
        super(Token2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Token2Chunk")
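
The docstring example above runs Token2Chunk through a regular Spark ML Pipeline over a DataFrame. The same fitted pipeline can also be wrapped in a LightPipeline (sparknlp/base/light_pipeline.py in the listing above) to annotate individual strings without building a DataFrame per request. A minimal sketch, assuming a Spark session and the documentAssembler, tokenizer, and token2chunk stages from the docstring example are already in scope:

from pyspark.ml import Pipeline
from sparknlp.base import LightPipeline

# Fit on an empty DataFrame: the stages here are rule-based, so no training data is needed.
empty_data = spark.createDataFrame([[""]]).toDF("text")
pipeline_model = Pipeline(stages=[documentAssembler, tokenizer, token2chunk]).fit(empty_data)

light = LightPipeline(pipeline_model)
# annotate() returns a dict of plain result strings per output column.
print(light.annotate("One Two Three Four"))
# fullAnnotate() keeps begin/end offsets and metadata for each chunk.
print(light.fullAnnotate("One Two Three Four"))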
sparknlp/annotator/ws/__init__.py
@@ -0,0 +1,16 @@
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module of annotators for word segmentation."""
from sparknlp.annotator.ws.word_segmenter import *
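
The word_segmenter module imported here provides the word segmentation annotators (sparknlp/annotator/ws/word_segmenter.py in the listing above) for languages written without whitespace between words. A minimal sketch of loading a pretrained segmenter; the model name "wordseg_pku" and language "zh" are assumptions taken from the public models hub, not something this diff pins down, and the sample sentence is illustrative:

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import WordSegmenterModel
from pyspark.ml import Pipeline

spark = sparknlp.start()

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

# Assumed example model/language pair; substitute the segmenter you actually need.
word_segmenter = WordSegmenterModel.pretrained("wordseg_pku", "zh") \
    .setInputCols(["document"]) \
    .setOutputCol("token")

data = spark.createDataFrame([["然而，這樣的處理也衍生了一些問題。"]]).toDF("text")
pipeline = Pipeline(stages=[document_assembler, word_segmenter]).fit(data)
pipeline.transform(data).selectExpr("token.result").show(truncate=False)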