spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
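
The most consequential change in this listing is structural: the monolithic 2.x modules (sparknlp/annotator.py at -3006 lines, sparknlp/base.py, sparknlp/embeddings.py, and the stale .pyc artifacts) are removed, replaced by the package trees added above. A minimal sketch of what that means for user code, assuming the new __init__.py files re-export their classes the way the flat 2.x modules did:

    import sparknlp

    # Flat 2.x-style imports are assumed to keep resolving, now via package
    # __init__.py re-exports instead of single monolithic modules.
    from sparknlp.base import DocumentAssembler, Finisher
    from sparknlp.annotator import Tokenizer, Lemmatizer

    # The same class is assumed to be reachable at its new subpackage path.
    from sparknlp.annotator.lemmatizer import Lemmatizer as LemmatizerDirect
    assert Lemmatizer is LemmatizerDirect

    print(sparknlp.version())  # expected to report "6.2.1" for this wheel
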
sparknlp/annotator/lemmatizer.py
@@ -0,0 +1,250 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the Lemmatizer."""
+from sparknlp.common import *
+
+
+class Lemmatizer(AnnotatorApproach):
+    """Class to find lemmas out of words with the objective of returning a base
+    dictionary word.
+
+    Retrieves the significant part of a word. A dictionary of predefined lemmas
+    must be provided with :meth:`.setDictionary`.
+
+    For instantiated/pretrained models, see :class:`.LemmatizerModel`.
+
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``TOKEN``              ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    dictionary
+        Lemmatizer external dictionary.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+
+    In this example, the lemma dictionary ``lemmas_small.txt`` has the form of::
+
+        ...
+        pick -> pick picks picking picked
+        peck -> peck pecking pecked pecks
+        pickle -> pickle pickles pickled pickling
+        pepper -> pepper peppers peppered peppering
+        ...
+
+    where each key is delimited by ``->`` and values are delimited by ``\\t``
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentenceDetector = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> lemmatizer = Lemmatizer() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("lemma") \\
+    ...     .setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\\t")
+    >>> pipeline = Pipeline() \\
+    ...     .setStages([
+    ...       documentAssembler,
+    ...       sentenceDetector,
+    ...       tokenizer,
+    ...       lemmatizer
+    ...     ])
+    >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]]) \\
+    ...     .toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("lemma.result").show(truncate=False)
+    +------------------------------------------------------------------+
+    |result                                                            |
+    +------------------------------------------------------------------+
+    |[Peter, Pipers, employees, are, pick, peck, of, pickle, pepper, .]|
+    +------------------------------------------------------------------+
+    """
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.TOKEN
+
+    dictionary = Param(Params._dummy(),
+                       "dictionary",
+                       "lemmatizer external dictionary." +
+                       " needs 'keyDelimiter' and 'valueDelimiter' in options for parsing target text",
+                       typeConverter=TypeConverters.identity)
+
+    formCol = Param(Params._dummy(),
+                    "formCol",
+                    "Column that corresponds to CoNLLU(formCol=) output",
+                    typeConverter=TypeConverters.toString)
+
+    lemmaCol = Param(Params._dummy(),
+                     "lemmaCol",
+                     "Column that corresponds to CoNLLU(lemmaCol=) output",
+                     typeConverter=TypeConverters.toString)
+
+    @keyword_only
+    def __init__(self):
+        super(Lemmatizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Lemmatizer")
+        self._setDefault(
+            formCol="form",
+            lemmaCol="lemma"
+        )
+
+    def _create_model(self, java_model):
+        return LemmatizerModel(java_model=java_model)
+
+    def setFormCol(self, value):
+        """Sets the column that corresponds to CoNLLU(formCol=) output.
+
+        Parameters
+        ----------
+        value : str
+            Name of column for Array of Form tokens
+        """
+        return self._set(formCol=value)
+
+    def setLemmaCol(self, value):
+        """Sets the column that corresponds to CoNLLU(lemmaCol=) output.
+
+        Parameters
+        ----------
+        value : str
+            Name of column for Array of Lemma tokens
+        """
+        return self._set(lemmaCol=value)
+
+    def setDictionary(self, path, key_delimiter, value_delimiter, read_as=ReadAs.TEXT,
+                      options={"format": "text"}):
+        """Sets the external dictionary for the lemmatizer.
+
+        Parameters
+        ----------
+        path : str
+            Path to the source files
+        key_delimiter : str
+            Delimiter for the key
+        value_delimiter : str
+            Delimiter for the values
+        read_as : str, optional
+            How to read the file, by default ReadAs.TEXT
+        options : dict, optional
+            Options to read the resource, by default {"format": "text"}
+
+        Examples
+        --------
+        Here, each key in the file is delimited by ``"->"`` and values are
+        delimited by ``\\t``::
+
+            ...
+            pick -> pick picks picking picked
+            peck -> peck pecking pecked pecks
+            pickle -> pickle pickles pickled pickling
+            pepper -> pepper peppers peppered peppering
+            ...
+
+        This file can then be parsed with
+
+        >>> lemmatizer = Lemmatizer() \\
+        ...     .setInputCols(["token"]) \\
+        ...     .setOutputCol("lemma") \\
+        ...     .setDictionary("lemmas_small.txt", "->", "\\t")
+        """
+        opts = options.copy()
+        if "keyDelimiter" not in opts:
+            opts["keyDelimiter"] = key_delimiter
+        if "valueDelimiter" not in opts:
+            opts["valueDelimiter"] = value_delimiter
+        return self._set(dictionary=ExternalResource(path, read_as, opts))
+
+
+class LemmatizerModel(AnnotatorModel):
+    """Instantiated Model of the Lemmatizer.
+
+    This is the instantiated model of the :class:`.Lemmatizer`.
+    For training your own model, please see the documentation of that class.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> lemmatizer = LemmatizerModel.pretrained() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("lemma")
+
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``TOKEN``              ``TOKEN``
+    ====================== ======================
+
+    Parameters
+    ----------
+    None
+
+    Examples
+    --------
+    The lemmatizer from the example of the :class:`.Lemmatizer` can be replaced
+    with:
+
+    >>> lemmatizer = LemmatizerModel.pretrained() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("lemma")
+    """
+    name = "LemmatizerModel"
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.TOKEN
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.LemmatizerModel", java_model=None):
+        super(LemmatizerModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+
+    @staticmethod
+    def pretrained(name="lemma_antbnc", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "lemma_antbnc"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        LemmatizerModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(LemmatizerModel, name, lang, remote_loc)
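
For inference, the pretrained path in the hunk above pairs naturally with the LightPipeline added in sparknlp/base/light_pipeline.py. A hedged sketch, assuming the usual annotate() interface and that the default "lemma_antbnc" model is reachable:

    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler, LightPipeline
    from sparknlp.annotator import Tokenizer, LemmatizerModel

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    lemmatizer = LemmatizerModel.pretrained("lemma_antbnc", "en") \
        .setInputCols(["token"]) \
        .setOutputCol("lemma")

    pipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])
    empty = spark.createDataFrame([[""]]).toDF("text")

    # LightPipeline runs the fitted pipeline on plain strings, off the cluster.
    light = LightPipeline(pipeline.fit(empty))
    print(light.annotate("Peter Pipers employees are picking pecks of pickled peppers.")["lemma"])
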
sparknlp/annotator/matcher/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module of annotators for text matching."""
+from sparknlp.annotator.matcher.big_text_matcher import *
+from sparknlp.annotator.matcher.date_matcher import *
+from sparknlp.annotator.matcher.multi_date_matcher import *
+from sparknlp.annotator.matcher.regex_matcher import *
+from sparknlp.annotator.matcher.text_matcher import *
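
Because this initializer wildcard-imports every matcher submodule, the classes are reachable one level up as well; a quick sanity check, assuming each submodule exports its public classes as the hunks here do:

    # Both paths are assumed to resolve to the same re-exported class.
    from sparknlp.annotator.matcher import BigTextMatcher
    from sparknlp.annotator.matcher.big_text_matcher import BigTextMatcher as Direct

    assert BigTextMatcher is Direct
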
sparknlp/annotator/matcher/big_text_matcher.py
@@ -0,0 +1,272 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the BigTextMatcher."""
+
+from sparknlp.common import *
+from sparknlp.annotator.matcher.text_matcher import TextMatcherModel
+
+
+class BigTextMatcher(AnnotatorApproach, HasStorage):
+    """Annotator to match exact phrases (by token) provided in a file against a
+    Document.
+
+    A text file of predefined phrases must be provided with ``setStoragePath``.
+
+    In contrast to the normal ``TextMatcher``, the ``BigTextMatcher`` is
+    designed for large corpora.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    entities
+        ExternalResource for entities
+    caseSensitive
+        whether to ignore case in index lookups, by default True
+    mergeOverlapping
+        whether to merge overlapping matched chunks, by default False
+    tokenizer
+        TokenizerModel to use to tokenize input file for building a Trie
+
+    Examples
+    --------
+    In this example, the entities file is of the form::
+
+        ...
+        dolore magna aliqua
+        lorem ipsum dolor. sit
+        laborum
+        ...
+
+    where each line represents an entity phrase to be extracted.
+
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols("document") \\
+    ...     .setOutputCol("token")
+    >>> data = spark.createDataFrame([["Hello dolore magna aliqua. Lorem ipsum dolor. sit in laborum"]]).toDF("text")
+    >>> entityExtractor = BigTextMatcher() \\
+    ...     .setInputCols("document", "token") \\
+    ...     .setStoragePath("src/test/resources/entity-extractor/test-phrases.txt", ReadAs.TEXT) \\
+    ...     .setOutputCol("entity") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([documentAssembler, tokenizer, entityExtractor])
+    >>> results = pipeline.fit(data).transform(data)
+    >>> results.selectExpr("explode(entity)").show(truncate=False)
+    +--------------------------------------------------------------------+
+    |col                                                                 |
+    +--------------------------------------------------------------------+
+    |[chunk, 6, 24, dolore magna aliqua, [sentence -> 0, chunk -> 0], []]|
+    |[chunk, 53, 59, laborum, [sentence -> 0, chunk -> 1], []]           |
+    +--------------------------------------------------------------------+
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    entities = Param(Params._dummy(),
+                     "entities",
+                     "ExternalResource for entities",
+                     typeConverter=TypeConverters.identity)
+
+    caseSensitive = Param(Params._dummy(),
+                          "caseSensitive",
+                          "whether to ignore case in index lookups",
+                          typeConverter=TypeConverters.toBoolean)
+
+    mergeOverlapping = Param(Params._dummy(),
+                             "mergeOverlapping",
+                             "whether to merge overlapping matched chunks. Defaults false",
+                             typeConverter=TypeConverters.toBoolean)
+
+    tokenizer = Param(Params._dummy(),
+                      "tokenizer",
+                      "TokenizerModel to use to tokenize input file for building a Trie",
+                      typeConverter=TypeConverters.identity)
+
+    @keyword_only
+    def __init__(self):
+        super(BigTextMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.btm.BigTextMatcher")
+        self._setDefault(caseSensitive=True)
+        self._setDefault(mergeOverlapping=False)
+
+    def _create_model(self, java_model):
+        return TextMatcherModel(java_model=java_model)
+
+    def setEntities(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
+        """Sets ExternalResource for entities.
+
+        Parameters
+        ----------
+        path : str
+            Path to the resource
+        read_as : str, optional
+            How to read the resource, by default ReadAs.TEXT
+        options : dict, optional
+            Options for reading the resource, by default {"format": "text"}
+        """
+        return self._set(entities=ExternalResource(path, read_as, options.copy()))
+
+    def setCaseSensitive(self, b):
+        """Sets whether to ignore case in index lookups, by default True.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to ignore case in index lookups
+        """
+        return self._set(caseSensitive=b)
+
+    def setMergeOverlapping(self, b):
+        """Sets whether to merge overlapping matched chunks, by default False.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to merge overlapping matched chunks
+        """
+        return self._set(mergeOverlapping=b)
+
+    def setTokenizer(self, tokenizer_model):
+        """Sets TokenizerModel to use to tokenize input file for building a
+        Trie.
+
+        Parameters
+        ----------
+        tokenizer_model : :class:`TokenizerModel <sparknlp.annotator.TokenizerModel>`
+            TokenizerModel to use to tokenize input file
+        """
+        tokenizer_model._transfer_params_to_java()
+        return self._set(tokenizer=tokenizer_model._java_obj)
+
+
+class BigTextMatcherModel(AnnotatorModel, HasStorageModel):
+    """Instantiated model of the BigTextMatcher.
+
+    This is the instantiated model of the :class:`.BigTextMatcher`.
+    For training your own model, please see the documentation of that class.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    caseSensitive
+        Whether to ignore case in index lookups
+    mergeOverlapping
+        Whether to merge overlapping matched chunks, by default False
+    searchTrie
+        SearchTrie
+    """
+    name = "BigTextMatcherModel"
+    databases = ['TMVOCAB', 'TMEDGES', 'TMNODES']
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    caseSensitive = Param(Params._dummy(),
+                          "caseSensitive",
+                          "whether to ignore case in index lookups",
+                          typeConverter=TypeConverters.toBoolean)
+
+    mergeOverlapping = Param(Params._dummy(),
+                             "mergeOverlapping",
+                             "whether to merge overlapping matched chunks. Defaults false",
+                             typeConverter=TypeConverters.toBoolean)
+
+    searchTrie = Param(Params._dummy(),
+                       "searchTrie",
+                       "searchTrie",
+                       typeConverter=TypeConverters.identity)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.btm.TextMatcherModel", java_model=None):
+        super(BigTextMatcherModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+
+    def setMergeOverlapping(self, b):
+        """Sets whether to merge overlapping matched chunks, by default False.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to merge overlapping matched chunks, by default False
+        """
+        return self._set(mergeOverlapping=b)
+
+    def setCaseSensitive(self, v):
+        """Sets whether to ignore case in index lookups.
+
+        Parameters
+        ----------
+        v : bool
+            Whether to ignore case in index lookups
+        """
+        return self._set(caseSensitive=v)
+
+    @staticmethod
+    def pretrained(name, lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str
+            Name of the pretrained model
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        TextMatcherModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
+
+    @staticmethod
+    def loadStorage(path, spark, storage_ref):
+        """Loads the model from storage.
+
+        Parameters
+        ----------
+        path : str
+            Path to the model
+        spark : :class:`pyspark.sql.SparkSession`
+            The current SparkSession
+        storage_ref : str
+            Identifiers for the model parameters
+        """
+        HasStorageModel.loadStorages(path, spark, storage_ref, BigTextMatcherModel.databases)
+
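
Since BigTextMatcherModel carries HasStorageModel state in the 'TMVOCAB', 'TMEDGES', and 'TMNODES' databases, reloading it is a two-step affair: restore the on-disk storage, then load the annotator. A minimal sketch, assuming `model_path` points at a model previously saved with model.write().save(model_path), `spark` is an active Spark NLP session, and the "btm_entities" storage reference is hypothetical:

    from sparknlp.annotator import BigTextMatcherModel

    model_path = "/models/btm_entities"  # hypothetical save location

    # Restore the storage databases first, keyed by the model's storage reference...
    BigTextMatcherModel.loadStorage(model_path, spark, storage_ref="btm_entities")

    # ...then reload the annotator itself and wire it into a pipeline.
    matcher = BigTextMatcherModel.load(model_path) \
        .setInputCols(["document", "token"]) \
        .setOutputCol("entity")
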