spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff compares publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
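
The headline change across this version jump is structural: the monolithic flat modules from 2.6.3rc1 (sparknlp/annotator.py, sparknlp/base.py, and the rest listed near the end of the table) are removed and replaced by the sparknlp.annotator and sparknlp.base packages with one module per annotator. A minimal sketch of what this means for imports, assuming spark-nlp 6.2.1 is installed; the subpackage __init__ files re-export their classes, so the flat 2.6.x import style is expected to keep resolving:

# A minimal sketch of the reorganized layout (assumes spark-nlp 6.2.1 is
# installed; the subpackages re-export everything through sparknlp.annotator,
# so 2.6.x-style flat imports should keep working).
from sparknlp.annotator import Lemmatizer                  # flat, 2.6.x-style import
from sparknlp.annotator.lemmatizer import Lemmatizer as L  # explicit per-module path

assert Lemmatizer is L  # both names resolve to the same class via the re-exports

Three of the new files are reproduced in the hunks below.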
sparknlp/annotator/lemmatizer.py
@@ -0,0 +1,250 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the Lemmatizer."""
+ from sparknlp.common import *
+
+
+ class Lemmatizer(AnnotatorApproach):
+     """Class to find lemmas out of words with the objective of returning a base
+     dictionary word.
+
+     Retrieves the significant part of a word. A dictionary of predefined lemmas
+     must be provided with :meth:`.setDictionary`.
+
+     For instantiated/pretrained models, see :class:`.LemmatizerModel`.
+
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     dictionary
+         Lemmatizer external dictionary.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+
+     In this example, the lemma dictionary ``lemmas_small.txt`` has the form of::
+
+         ...
+         pick -> pick picks picking picked
+         peck -> peck pecking pecked pecks
+         pickle -> pickle pickles pickled pickling
+         pepper -> pepper peppers peppered peppering
+         ...
+
+     where each key is delimited by ``->`` and values are delimited by ``\\t``.
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentenceDetector = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> lemmatizer = Lemmatizer() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("lemma") \\
+     ...     .setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\\t")
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...         documentAssembler,
+     ...         sentenceDetector,
+     ...         tokenizer,
+     ...         lemmatizer
+     ...     ])
+     >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]]) \\
+     ...     .toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("lemma.result").show(truncate=False)
+     +------------------------------------------------------------------+
+     |result                                                            |
+     +------------------------------------------------------------------+
+     |[Peter, Pipers, employees, are, pick, peck, of, pickle, pepper, .]|
+     +------------------------------------------------------------------+
+     """
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     dictionary = Param(Params._dummy(),
+                        "dictionary",
+                        "lemmatizer external dictionary." +
+                        " needs 'keyDelimiter' and 'valueDelimiter' in options for parsing target text",
+                        typeConverter=TypeConverters.identity)
+
+     formCol = Param(Params._dummy(),
+                     "formCol",
+                     "Column that corresponds to CoNLLU(formCol=) output",
+                     typeConverter=TypeConverters.toString)
+
+     lemmaCol = Param(Params._dummy(),
+                      "lemmaCol",
+                      "Column that corresponds to CoNLLU(lemmaCol=) output",
+                      typeConverter=TypeConverters.toString)
+
+     @keyword_only
+     def __init__(self):
+         super(Lemmatizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Lemmatizer")
+         self._setDefault(
+             formCol="form",
+             lemmaCol="lemma"
+         )
+
+     def _create_model(self, java_model):
+         return LemmatizerModel(java_model=java_model)
+
+     def setFormCol(self, value):
+         """Sets the name of the column that corresponds to CoNLLU(formCol=) output.
+
+         Parameters
+         ----------
+         value : str
+             Name of the column for the array of form tokens
+         """
+         return self._set(formCol=value)
+
+     def setLemmaCol(self, value):
+         """Sets the name of the column that corresponds to CoNLLU(lemmaCol=) output.
+
+         Parameters
+         ----------
+         value : str
+             Name of the column for the array of lemma tokens
+         """
+         return self._set(lemmaCol=value)
+
+     def setDictionary(self, path, key_delimiter, value_delimiter, read_as=ReadAs.TEXT,
+                       options={"format": "text"}):
+         """Sets the external dictionary for the lemmatizer.
+
+         Parameters
+         ----------
+         path : str
+             Path to the source files
+         key_delimiter : str
+             Delimiter for the key
+         value_delimiter : str
+             Delimiter for the values
+         read_as : str, optional
+             How to read the file, by default ReadAs.TEXT
+         options : dict, optional
+             Options to read the resource, by default {"format": "text"}
+
+         Examples
+         --------
+         Here, each key in the file is delimited by ``"->"`` and the values
+         are delimited by ``\\t``::
+
+             ...
+             pick -> pick picks picking picked
+             peck -> peck pecking pecked pecks
+             pickle -> pickle pickles pickled pickling
+             pepper -> pepper peppers peppered peppering
+             ...
+
+         This file can then be parsed with
+
+         >>> lemmatizer = Lemmatizer() \\
+         ...     .setInputCols(["token"]) \\
+         ...     .setOutputCol("lemma") \\
+         ...     .setDictionary("lemmas_small.txt", "->", "\\t")
+         """
+         opts = options.copy()
+         if "keyDelimiter" not in opts:
+             opts["keyDelimiter"] = key_delimiter
+         if "valueDelimiter" not in opts:
+             opts["valueDelimiter"] = value_delimiter
+         return self._set(dictionary=ExternalResource(path, read_as, opts))
+
+
+ class LemmatizerModel(AnnotatorModel):
+     """Instantiated Model of the Lemmatizer.
+
+     This is the instantiated model of the :class:`.Lemmatizer`.
+     For training your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> lemmatizer = LemmatizerModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("lemma")
+
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+
+     Examples
+     --------
+     The lemmatizer from the example of the :class:`.Lemmatizer` can be replaced
+     with:
+
+     >>> lemmatizer = LemmatizerModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("lemma")
+     """
+     name = "LemmatizerModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.LemmatizerModel", java_model=None):
+         super(LemmatizerModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     @staticmethod
+     def pretrained(name="lemma_antbnc", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "lemma_antbnc"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         LemmatizerModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(LemmatizerModel, name, lang, remote_loc)
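
For readers comparing the two APIs, here is a hedged, self-contained sketch of the pretrained path documented above, run end to end with a LightPipeline (assumes sparknlp.start() can provision a Spark session and that the lemma_antbnc model can be downloaded):

import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import Tokenizer, LemmatizerModel
from pyspark.ml import Pipeline

spark = sparknlp.start()  # local Spark session with Spark NLP on the classpath

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
lemmatizer = LemmatizerModel.pretrained("lemma_antbnc", "en") \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

pipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])
model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))  # no training needed

# LightPipeline annotates small inputs on the driver, skipping distributed overhead.
light = LightPipeline(model)
print(light.annotate("Peter Pipers employees are picking pecks of pickled peppers.")["lemma"])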
sparknlp/annotator/matcher/__init__.py
@@ -0,0 +1,20 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Module of annotators for text matching."""
+ from sparknlp.annotator.matcher.big_text_matcher import *
+ from sparknlp.annotator.matcher.date_matcher import *
+ from sparknlp.annotator.matcher.multi_date_matcher import *
+ from sparknlp.annotator.matcher.regex_matcher import *
+ from sparknlp.annotator.matcher.text_matcher import *
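
The wildcard imports mean each matcher should be reachable from the subpackage as well as from sparknlp.annotator. A small sketch (the output format value is illustrative):

# A small sketch of the new matcher subpackage (paths assume 6.2.1).
from sparknlp.annotator.matcher import DateMatcher  # re-exported by the __init__ above

dateMatcher = DateMatcher() \
    .setInputCols(["document"]) \
    .setOutputCol("date") \
    .setOutputFormat("yyyy/MM/dd")  # illustrative output format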
sparknlp/annotator/matcher/big_text_matcher.py
@@ -0,0 +1,272 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the BigTextMatcher."""
+
+ from sparknlp.common import *
+ from sparknlp.annotator.matcher.text_matcher import TextMatcherModel
+
+
+ class BigTextMatcher(AnnotatorApproach, HasStorage):
+     """Annotator to match exact phrases (by token) provided in a file against a
+     Document.
+
+     A text file of predefined phrases must be provided with ``setStoragePath``.
+
+     In contrast to the normal ``TextMatcher``, the ``BigTextMatcher`` is
+     designed for large corpora.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     entities
+         ExternalResource for entities
+     caseSensitive
+         whether to ignore case in index lookups, by default True
+     mergeOverlapping
+         whether to merge overlapping matched chunks, by default False
+     tokenizer
+         TokenizerModel to use to tokenize input file for building a Trie
+
+     Examples
+     --------
+     In this example, the entities file is of the form::
+
+         ...
+         dolore magna aliqua
+         lorem ipsum dolor. sit
+         laborum
+         ...
+
+     where each line represents an entity phrase to be extracted.
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols("document") \\
+     ...     .setOutputCol("token")
+     >>> data = spark.createDataFrame([["Hello dolore magna aliqua. Lorem ipsum dolor. sit in laborum"]]).toDF("text")
+     >>> entityExtractor = BigTextMatcher() \\
+     ...     .setInputCols("document", "token") \\
+     ...     .setStoragePath("src/test/resources/entity-extractor/test-phrases.txt", ReadAs.TEXT) \\
+     ...     .setOutputCol("entity") \\
+     ...     .setCaseSensitive(False)
+     >>> pipeline = Pipeline().setStages([documentAssembler, tokenizer, entityExtractor])
+     >>> results = pipeline.fit(data).transform(data)
+     >>> results.selectExpr("explode(entity)").show(truncate=False)
+     +--------------------------------------------------------------------+
+     |col                                                                 |
+     +--------------------------------------------------------------------+
+     |[chunk, 6, 24, dolore magna aliqua, [sentence -> 0, chunk -> 0], []]|
+     |[chunk, 53, 59, laborum, [sentence -> 0, chunk -> 1], []]           |
+     +--------------------------------------------------------------------+
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     entities = Param(Params._dummy(),
+                      "entities",
+                      "ExternalResource for entities",
+                      typeConverter=TypeConverters.identity)
+
+     caseSensitive = Param(Params._dummy(),
+                           "caseSensitive",
+                           "whether to ignore case in index lookups",
+                           typeConverter=TypeConverters.toBoolean)
+
+     mergeOverlapping = Param(Params._dummy(),
+                              "mergeOverlapping",
+                              "whether to merge overlapping matched chunks. Defaults to false",
+                              typeConverter=TypeConverters.toBoolean)
+
+     tokenizer = Param(Params._dummy(),
+                       "tokenizer",
+                       "TokenizerModel to use to tokenize input file for building a Trie",
+                       typeConverter=TypeConverters.identity)
+
+     @keyword_only
+     def __init__(self):
+         super(BigTextMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.btm.BigTextMatcher")
+         self._setDefault(caseSensitive=True)
+         self._setDefault(mergeOverlapping=False)
+
+     def _create_model(self, java_model):
+         return TextMatcherModel(java_model=java_model)
+
+     def setEntities(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
+         """Sets the ExternalResource for entities.
+
+         Parameters
+         ----------
+         path : str
+             Path to the resource
+         read_as : str, optional
+             How to read the resource, by default ReadAs.TEXT
+         options : dict, optional
+             Options for reading the resource, by default {"format": "text"}
+         """
+         return self._set(entities=ExternalResource(path, read_as, options.copy()))
+
+     def setCaseSensitive(self, b):
+         """Sets whether to ignore case in index lookups, by default True.
+
+         Parameters
+         ----------
+         b : bool
+             Whether to ignore case in index lookups
+         """
+         return self._set(caseSensitive=b)
+
+     def setMergeOverlapping(self, b):
+         """Sets whether to merge overlapping matched chunks, by default False.
+
+         Parameters
+         ----------
+         b : bool
+             Whether to merge overlapping matched chunks
+
+         """
+         return self._set(mergeOverlapping=b)
+
+     def setTokenizer(self, tokenizer_model):
+         """Sets the TokenizerModel to use to tokenize the input file for
+         building a Trie.
+
+         Parameters
+         ----------
+         tokenizer_model : :class:`TokenizerModel <sparknlp.annotator.TokenizerModel>`
+             TokenizerModel to use to tokenize the input file
+
+         """
+         tokenizer_model._transfer_params_to_java()
+         return self._set(tokenizer=tokenizer_model._java_obj)
+
+
+ class BigTextMatcherModel(AnnotatorModel, HasStorageModel):
+     """Instantiated model of the BigTextMatcher.
+
+     This is the instantiated model of the :class:`.BigTextMatcher`.
+     For training your own model, please see the documentation of that class.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     caseSensitive
+         Whether to ignore case in index lookups
+     mergeOverlapping
+         Whether to merge overlapping matched chunks, by default False
+     searchTrie
+         SearchTrie
+     """
+     name = "BigTextMatcherModel"
+     databases = ['TMVOCAB', 'TMEDGES', 'TMNODES']
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     caseSensitive = Param(Params._dummy(),
+                           "caseSensitive",
+                           "whether to ignore case in index lookups",
+                           typeConverter=TypeConverters.toBoolean)
+
+     mergeOverlapping = Param(Params._dummy(),
+                              "mergeOverlapping",
+                              "whether to merge overlapping matched chunks. Defaults to false",
+                              typeConverter=TypeConverters.toBoolean)
+
+     searchTrie = Param(Params._dummy(),
+                        "searchTrie",
+                        "searchTrie",
+                        typeConverter=TypeConverters.identity)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.btm.TextMatcherModel", java_model=None):
+         super(BigTextMatcherModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     def setMergeOverlapping(self, b):
+         """Sets whether to merge overlapping matched chunks, by default False.
+
+         Parameters
+         ----------
+         b : bool
+             Whether to merge overlapping matched chunks
+         """
+         return self._set(mergeOverlapping=b)
+
+     def setCaseSensitive(self, v):
+         """Sets whether to ignore case in index lookups.
+
+         Parameters
+         ----------
+         v : bool
+             Whether to ignore case in index lookups
+         """
+         return self._set(caseSensitive=v)
+
+     @staticmethod
+     def pretrained(name, lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str
+             Name of the pretrained model
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         TextMatcherModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
+
+     @staticmethod
+     def loadStorage(path, spark, storage_ref):
+         """Loads the model from storage.
+
+         Parameters
+         ----------
+         path : str
+             Path to the model
+         spark : :class:`pyspark.sql.SparkSession`
+             The current SparkSession
+         storage_ref : str
+             Identifiers for the model parameters
+         """
+         HasStorageModel.loadStorages(path, spark, storage_ref, BigTextMatcherModel.databases)
+
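
setTokenizer above ships without a docstring example; the following hedged sketch shows one way a fitted TokenizerModel could be supplied for building the Trie (assumes spark-nlp 6.2.1; "entity-phrases.txt" is a placeholder path):

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, BigTextMatcher
from sparknlp.common import ReadAs

spark = sparknlp.start()
data = spark.createDataFrame([["Hello dolore magna aliqua"]]).toDF("text")
docs = DocumentAssembler().setInputCol("text").setOutputCol("document").transform(data)

# Tokenizer is an Estimator; fitting it yields the TokenizerModel that
# BigTextMatcher reuses to tokenize the phrase file for its Trie.
tokenizer_model = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token") \
    .fit(docs)

matcher = BigTextMatcher() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("entity") \
    .setStoragePath("entity-phrases.txt", ReadAs.TEXT) \
    .setTokenizer(tokenizer_model)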