spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
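
The file listing above reflects the main structural change between 2.6.3rc1 and 6.2.1: the old monolithic modules (`sparknlp/annotator.py`, `sparknlp/base.py`, `sparknlp/common.py`, `sparknlp/internal.py`, `sparknlp/pretrained.py`, `sparknlp/training.py`) are removed and replaced by packages of the same names, with roughly one file per annotator. The top-level import paths stay the same, so code written against the 2.6.x API should continue to import as before. A minimal sketch of a pipeline against the 6.x layout (assuming the usual `sparknlp.start()` entry point; sample text and column names are illustrative):

    import sparknlp
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import RegexTokenizer  # now defined in sparknlp/annotator/token/regex_tokenizer.py
    from pyspark.ml import Pipeline

    # Start a Spark session with the Spark NLP jar attached.
    spark = sparknlp.start()

    pipeline = Pipeline(stages=[
        DocumentAssembler().setInputCol("text").setOutputCol("document"),
        RegexTokenizer().setInputCols(["document"]).setOutputCol("token"),
    ])

    data = spark.createDataFrame([["Spark NLP 6.2.1 keeps the 2.6.x import paths."]]).toDF("text")
    pipeline.fit(data).transform(data).select("token.result").show(truncate=False)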

sparknlp/annotator/token/recursive_tokenizer.py (new file, +205 lines)

@@ -0,0 +1,205 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the RecursiveTokenizer."""
+
+from sparknlp.common import *
+
+
+class RecursiveTokenizer(AnnotatorApproach):
+    """Tokenizes raw text recursively based on a handful of definable rules.
+
+    Unlike the Tokenizer, the RecursiveTokenizer operates based only on these
+    string-array parameters:
+
+    - ``prefixes``: Strings that will be split when found at the beginning of
+      a token.
+    - ``suffixes``: Strings that will be split when found at the end of a token.
+    - ``infixes``: Strings that will be split when found in the middle of a token.
+    - ``whitelist``: Whitelist of strings not to split.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    prefixes
+        Strings to be considered independent tokens when found at the beginning
+        of a word, by default ["'", '"', '(', '[', '\\n']
+    suffixes
+        Strings to be considered independent tokens when found at the end of a
+        word, by default ['.', ':', '%', ',', ';', '?', "'", '"', ')', ']',
+        '\\n', '!', "'s"]
+    infixes
+        Strings to be considered independent tokens when found in the middle of
+        a word, by default ['\\n', '(', ')']
+    whitelist
+        Strings to be considered as single tokens, by default ["it\'s",
+        "that\'s", "there\'s", "he\'s", "she\'s", "what\'s", "let\'s", "who\'s",
+        "It\'s", "That\'s", "There\'s", "He\'s", "She\'s", "What\'s", "Let\'s",
+        "Who\'s"]
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = RecursiveTokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     tokenizer
+    ... ])
+    >>> data = spark.createDataFrame([["One, after the Other, (and) again. PO, QAM,"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("token.result").show(truncate=False)
+    +------------------------------------------------------------------+
+    |result                                                            |
+    +------------------------------------------------------------------+
+    |[One, ,, after, the, Other, ,, (, and, ), again, ., PO, ,, QAM, ,]|
+    +------------------------------------------------------------------+
+    """
+    name = 'RecursiveTokenizer'
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.TOKEN
+
+    prefixes = Param(Params._dummy(),
+                     "prefixes",
+                     "strings to be considered independent tokens when found at the beginning of a word",
+                     typeConverter=TypeConverters.toListString)
+
+    suffixes = Param(Params._dummy(),
+                     "suffixes",
+                     "strings to be considered independent tokens when found at the end of a word",
+                     typeConverter=TypeConverters.toListString)
+
+    infixes = Param(Params._dummy(),
+                    "infixes",
+                    "strings to be considered independent tokens when found in the middle of a word",
+                    typeConverter=TypeConverters.toListString)
+
+    whitelist = Param(Params._dummy(),
+                      "whitelist",
+                      "strings to be considered as single tokens",
+                      typeConverter=TypeConverters.toListString)
+
+    def setPrefixes(self, p):
+        """Sets strings to be considered independent tokens when found at the
+        beginning of a word, by default ["'", '"', '(', '[', '\\n'].
+
+        Parameters
+        ----------
+        p : List[str]
+            Strings to be considered independent tokens when found at the
+            beginning of a word
+        """
+        return self._set(prefixes=p)
+
+    def setSuffixes(self, s):
+        """Sets strings to be considered independent tokens when found at the
+        end of a word, by default ['.', ':', '%', ',', ';', '?', "'", '"', ')',
+        ']', '\\n', '!', "'s"].
+
+        Parameters
+        ----------
+        s : List[str]
+            Strings to be considered independent tokens when found at the end of
+            a word
+        """
+        return self._set(suffixes=s)
+
+    def setInfixes(self, i):
+        """Sets strings to be considered independent tokens when found in the
+        middle of a word, by default ['\\n', '(', ')'].
+
+        Parameters
+        ----------
+        i : List[str]
+            Strings to be considered independent tokens when found in the middle
+            of a word
+
+        Returns
+        -------
+        RecursiveTokenizer
+            This annotator, allowing setter calls to be chained
+        """
+        return self._set(infixes=i)
+
+    def setWhitelist(self, w):
+        """Sets strings to be considered as single tokens, by default ["it\'s",
+        "that\'s", "there\'s", "he\'s", "she\'s", "what\'s", "let\'s", "who\'s",
+        "It\'s", "That\'s", "There\'s", "He\'s", "She\'s", "What\'s", "Let\'s",
+        "Who\'s"].
+
+        Parameters
+        ----------
+        w : List[str]
+            Strings to be considered as single tokens
+        """
+        return self._set(whitelist=w)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer"):
+        super(RecursiveTokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer")
+        self._setDefault(
+            prefixes=["'", "\"", "(", "[", "\n"],
+            infixes=["\n", "(", ")"],
+            suffixes=[".", ":", "%", ",", ";", "?", "'", "\"", ")", "]", "\n", "!", "'s"],
+            whitelist=["it's", "that's", "there's", "he's", "she's", "what's", "let's", "who's", \
+                       "It's", "That's", "There's", "He's", "She's", "What's", "Let's", "Who's"]
+        )
+
+    def _create_model(self, java_model):
+        return RecursiveTokenizerModel(java_model=java_model)
+
+
+class RecursiveTokenizerModel(AnnotatorModel):
+    """Instantiated model of the RecursiveTokenizer.
+
+    This is the instantiated model of the :class:`.RecursiveTokenizer`.
+    For training your own model, please see the documentation of that class.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    None
+    """
+    name = 'RecursiveTokenizerModel'
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.TOKEN
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizerModel", java_model=None):
+        super(RecursiveTokenizerModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
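
The hunk above also defines setters for each of the rule lists (`setPrefixes`, `setSuffixes`, `setInfixes`, `setWhitelist`). A minimal sketch of overriding the defaults; the rule values below are illustrative, not the package defaults:

    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import RecursiveTokenizer
    from pyspark.ml import Pipeline

    # Illustrative rule sets; see the docstring above for the actual defaults.
    tokenizer = RecursiveTokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token") \
        .setPrefixes(["'", "\"", "(", "["]) \
        .setSuffixes([".", ",", ";", "?", "!", ")", "]"]) \
        .setInfixes(["(", ")"]) \
        .setWhitelist(["it's", "don't", "won't"])  # keep these contractions as single tokens

    pipeline = Pipeline(stages=[
        DocumentAssembler().setInputCol("text").setOutputCol("document"),
        tokenizer,
    ])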

sparknlp/annotator/token/regex_tokenizer.py (new file, +208 lines)

@@ -0,0 +1,208 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the RegexTokenizer."""
+
+
+from sparknlp.common import *
+
+
+class RegexTokenizer(AnnotatorModel):
+    """A tokenizer that splits text by a regex pattern.
+
+    The pattern needs to be set with :meth:`.setPattern` and this sets the
+    delimiting pattern or how the tokens should be split. By default this
+    pattern is ``\\s+`` which means that tokens should be split by 1 or more
+    whitespace characters.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    minLength
+        Set the minimum allowed length for each token, by default 1
+    maxLength
+        Set the maximum allowed length for each token
+    toLowercase
+        Indicates whether to convert all characters to lowercase before
+        tokenizing, by default False
+    pattern
+        Regex pattern used for tokenizing, by default ``\\s+``
+    positionalMask
+        Using a positional mask to guarantee the incremental progression of the
+        tokenization, by default False
+    trimWhitespace
+        Using a trimWhitespace flag to remove whitespaces from identified tokens,
+        by default False
+    preservePosition
+        Using a preservePosition flag to preserve initial indexes before eventual whitespace removal in tokens,
+        by default True
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> regexTokenizer = RegexTokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("regexToken") \\
+    ...     .setToLowercase(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     regexTokenizer
+    ... ])
+    >>> data = spark.createDataFrame([["This is my first sentence.\\nThis is my second."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("regexToken.result").show(truncate=False)
+    +-------------------------------------------------------+
+    |result                                                 |
+    +-------------------------------------------------------+
+    |[this, is, my, first, sentence., this, is, my, second.]|
+    +-------------------------------------------------------+
+    """
+
+    name = "RegexTokenizer"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.TOKEN
+
+    @keyword_only
+    def __init__(self):
+        super(RegexTokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexTokenizer")
+        self._setDefault(
+            inputCols=["document"],
+            outputCol="regexToken",
+            toLowercase=False,
+            minLength=1,
+            pattern="\\s+",
+            positionalMask=False,
+            trimWhitespace=False,
+            preservePosition=True
+        )
+
+    minLength = Param(Params._dummy(),
+                      "minLength",
+                      "Set the minimum allowed length for each token",
+                      typeConverter=TypeConverters.toInt)
+
+    maxLength = Param(Params._dummy(),
+                      "maxLength",
+                      "Set the maximum allowed length for each token",
+                      typeConverter=TypeConverters.toInt)
+
+    toLowercase = Param(Params._dummy(),
+                        "toLowercase",
+                        "Indicates whether to convert all characters to lowercase before tokenizing.",
+                        typeConverter=TypeConverters.toBoolean)
+
+    pattern = Param(Params._dummy(),
+                    "pattern",
+                    "regex pattern used for tokenizing. Defaults to \\s+",
+                    typeConverter=TypeConverters.toString)
+
+    positionalMask = Param(Params._dummy(),
+                           "positionalMask",
+                           "Using a positional mask to guarantee the incremental progression of the tokenization.",
+                           typeConverter=TypeConverters.toBoolean)
+
+    trimWhitespace = Param(Params._dummy(),
+                           "trimWhitespace",
+                           "Indicates whether to use a trimWhitespace flag to remove whitespaces from identified tokens.",
+                           typeConverter=TypeConverters.toBoolean)
+
+    preservePosition = Param(Params._dummy(),
+                             "preservePosition",
+                             "Indicates whether to preserve initial indexes before eventual whitespace removal in tokens.",
+                             typeConverter=TypeConverters.toBoolean)
+
+    def setMinLength(self, value):
+        """Sets the minimum allowed length for each token, by default 1.
+
+        Parameters
+        ----------
+        value : int
+            Minimum allowed length for each token
+        """
+        return self._set(minLength=value)
+
+    def setMaxLength(self, value):
+        """Sets the maximum allowed length for each token.
+
+        Parameters
+        ----------
+        value : int
+            Maximum allowed length for each token
+        """
+        return self._set(maxLength=value)
+
+    def setToLowercase(self, value):
+        """Sets whether to convert all characters to lowercase before
+        tokenizing, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to convert all characters to lowercase before tokenizing
+        """
+        return self._set(toLowercase=value)
+
+    def setPattern(self, value):
+        """Sets the regex pattern used for tokenizing, by default ``\\s+``.
+
+        Parameters
+        ----------
+        value : str
+            Regex pattern used for tokenizing
+        """
+        return self._set(pattern=value)
+
+    def setPositionalMask(self, value):
+        """Sets whether to use a positional mask to guarantee the incremental
+        progression of the tokenization, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to use a positional mask
+        """
+        return self._set(positionalMask=value)
+
+    def setTrimWhitespace(self, value):
+        """Sets whether to remove whitespaces from identified tokens, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to remove whitespaces from identified tokens
+        """
+        return self._set(trimWhitespace=value)
+
+    def setPreservePosition(self, value):
+        """Sets whether to preserve initial indexes before eventual whitespace removal in tokens, by default True.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to preserve initial indexes before whitespace removal
+        """
+        return self._set(preservePosition=value)
+
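
Because RegexTokenizer splits on the delimiter given by `setPattern` (``\s+`` by default), it can also tokenize on arbitrary separators. A minimal sketch with a non-default pattern and length filter; the pattern and values are illustrative, not package defaults:

    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import RegexTokenizer
    from pyspark.ml import Pipeline

    # Split on runs of characters that are not letters, digits, or apostrophes
    # (an illustrative delimiter pattern), and drop tokens shorter than 2 characters.
    regexTokenizer = RegexTokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("regexToken") \
        .setPattern("[^a-zA-Z0-9']+") \
        .setMinLength(2) \
        .setTrimWhitespace(True)

    pipeline = Pipeline(stages=[
        DocumentAssembler().setInputCol("text").setOutputCol("document"),
        regexTokenizer,
    ])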