spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0

sparknlp/annotator/matcher/date_matcher.py

@@ -0,0 +1,303 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the DateMatcher."""
+
+from sparknlp.common import *
+
+
+class DateMatcherUtils(Params):
+    """Base class for DateMatcher Annotators
+    """
+    inputFormats = Param(Params._dummy(),
+                         "inputFormats",
+                         "input formats list of patterns to match",
+                         typeConverter=TypeConverters.toListString)
+
+    outputFormat = Param(Params._dummy(),
+                         "outputFormat",
+                         "desired output format for dates extracted",
+                         typeConverter=TypeConverters.toString)
+
+    readMonthFirst = Param(Params._dummy(),
+                           "readMonthFirst",
+                           "Whether to parse 'July 5 2015' as 07/05/2015 or as 05/07/2015",
+                           typeConverter=TypeConverters.toBoolean
+                           )
+
+    defaultDayWhenMissing = Param(Params._dummy(),
+                                  "defaultDayWhenMissing",
+                                  "which day to set when it is missing from parsed input",
+                                  typeConverter=TypeConverters.toInt
+                                  )
+
+    anchorDateYear = Param(Params._dummy(),
+                           "anchorDateYear",
+                           "Add an anchor year for the relative dates such as a day after tomorrow. If not set it "
+                           "will use the current year. Example: 2021",
+                           typeConverter=TypeConverters.toInt
+                           )
+
+    anchorDateMonth = Param(Params._dummy(),
+                            "anchorDateMonth",
+                            "Add an anchor month for the relative dates such as a day after tomorrow. If not set it "
+                            "will use the current month. Example: 1 which means January",
+                            typeConverter=TypeConverters.toInt
+                            )
+
+    anchorDateDay = Param(Params._dummy(),
+                          "anchorDateDay",
+                          "Add an anchor day of the month for the relative dates such as a day after tomorrow. If not "
+                          "set it will use the current day. Example: 11",
+                          typeConverter=TypeConverters.toInt
+                          )
+
+    sourceLanguage = Param(Params._dummy(),
+                           "sourceLanguage",
+                           "source language for explicit translation",
+                           typeConverter=TypeConverters.toString)
+
+    relaxedFactoryStrategy = Param(Params._dummy(),
+                                   "relaxedFactoryStrategy",
+                                   "Matched strategy to search relaxed dates",
+                                   typeConverter=TypeConverters.toString)
+
+    aggressiveMatching = Param(Params._dummy(),
+                               "aggressiveMatching",
+                               "Whether to aggressively attempt to find date matches, even in ambiguous or less common formats",
+                               typeConverter=TypeConverters.toBoolean)
+
+    def setInputFormats(self, value):
+        """Sets input formats patterns to match in the documents.
+
+        Parameters
+        ----------
+        value : List[str]
+            Input formats regex patterns to match dates in documents
+        """
+        return self._set(inputFormats=value)
+
+    def setOutputFormat(self, value):
+        """Sets desired output format for extracted dates, by default yyyy/MM/dd.
+
+        Not all of the date information needs to be included. For example
+        ``"YYYY"`` is also a valid input.
+
+        Parameters
+        ----------
+        value : str
+            Desired output format for dates extracted.
+        """
+        return self._set(outputFormat=value)
+
+    def setReadMonthFirst(self, value):
+        """Sets whether to parse the date in mm/dd/yyyy format instead of
+        dd/mm/yyyy, by default True.
+
+        For example July 5th 2015, would be parsed as 07/05/2015 instead of
+        05/07/2015.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to parse the date in mm/dd/yyyy format instead of
+            dd/mm/yyyy.
+        """
+        return self._set(readMonthFirst=value)
+
+    def setDefaultDayWhenMissing(self, value):
+        """Sets which day to set when it is missing from parsed input,
+        by default 1.
+
+        Parameters
+        ----------
+        value : int
+            The day to use when it is missing from the parsed input
+        """
+        return self._set(defaultDayWhenMissing=value)
+
+    def setAnchorDateYear(self, value):
+        """Sets an anchor year for the relative dates such as a day after
+        tomorrow. If not set it will use the current year.
+
+        Example: 2021
+
+        Parameters
+        ----------
+        value : int
+            The anchor year for relative dates
+        """
+        return self._set(anchorDateYear=value)
+
+    def setAnchorDateMonth(self, value):
+        """Sets an anchor month for the relative dates such as a day after
+        tomorrow. If not set it will use the current month.
+
+        Example: 1 which means January
+
+        Parameters
+        ----------
+        value : int
+            The anchor month for relative dates
+        """
+        normalizedMonth = value - 1
+        return self._set(anchorDateMonth=normalizedMonth)
+
+    def setSourceLanguage(self, value):
+        return self._set(sourceLanguage=value)
+
+    def setAnchorDateDay(self, value):
+        """Sets an anchor day of the month for the relative dates such as a day
+        after tomorrow. If not set it will use the current day.
+
+        Example: 11
+
+        Parameters
+        ----------
+        value : int
+            The anchor day for relative dates
+        """
+        return self._set(anchorDateDay=value)
+
+    def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
+        """Sets matched strategy to search relaxed dates by ordered rules, from more exhaustive to less exhaustive.
+
+        Not all of the date information needs to be included. For example
+        ``"YYYY"`` is also a valid input.
+
+        Parameters
+        ----------
+        matchStrategy : MatchStrategy
+            Matched strategy to search relaxed dates by ordered rules, from more exhaustive to less exhaustive
+        """
+        return self._set(relaxedFactoryStrategy=matchStrategy)
+
+    def setAggressiveMatching(self, value):
+        """Sets whether to aggressively attempt to find date matches, even in ambiguous or less common formats.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to aggressively attempt to find date matches, even in ambiguous or less common formats
+        """
+        return self._set(aggressiveMatching=value)
+
+
+class DateMatcher(AnnotatorModel, DateMatcherUtils):
+    """Matches standard date formats into a provided format.
+    Reads from different forms of date and time expressions and converts them
+    to a provided date format.
+
+    Extracts only **one** date per document. Use with sentence detector to find
+    matches in each sentence.
+    To extract multiple dates from a document, please use the
+    :class:`.MultiDateMatcher`.
+
+    Reads the following kind of dates::
+
+        "1978-01-28", "1984/04/02,1/02/1980", "2/28/79",
+        "The 31st of April in the year 2008", "Fri, 21 Nov 1997", "Jan 21,
+        '97", "Sun", "Nov 21", "jan 1st", "next thursday", "last wednesday",
+        "today", "tomorrow", "yesterday", "next week", "next month",
+        "next year", "day after", "the day before", "0600h", "06:00 hours",
+        "6pm", "5:30 a.m.", "at 5", "12:59", "23:59", "1988/11/23 6pm",
+        "next week at 7.30", "5 am tomorrow"
+
+    For example ``"The 31st of April in the year 2008"`` will be converted into
+    ``2008/04/31``.
+
+    Pretrained pipelines are available for this module, see
+    `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
+
+    For extended examples of usage, see the
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DATE``
+    ====================== ======================
+
+    Parameters
+    ----------
+    dateFormat
+        Desired format for dates extracted, by default yyyy/MM/dd.
+    readMonthFirst
+        Whether to parse the date in mm/dd/yyyy format instead of dd/mm/yyyy,
+        by default True.
+    defaultDayWhenMissing
+        Which day to set when it is missing from parsed input, by default 1.
+    anchorDateYear
+        Add an anchor year for the relative dates such as a day after tomorrow.
+        If not set it will use the current year. Example: 2021
+    anchorDateMonth
+        Add an anchor month for the relative dates such as a day after tomorrow.
+        If not set it will use the current month. Example: 1 which means January
+    anchorDateDay
+        Add an anchor day of the month for the relative dates such as a day after
+        tomorrow. If not set it will use the current day. Example: 11
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> date = DateMatcher() \\
+    ...     .setInputCols("document") \\
+    ...     .setOutputCol("date") \\
+    ...     .setAnchorDateYear(2020) \\
+    ...     .setAnchorDateMonth(1) \\
+    ...     .setAnchorDateDay(11) \\
+    ...     .setOutputFormat("yyyy/MM/dd")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     date
+    ... ])
+    >>> data = spark.createDataFrame([["Fri, 21 Nov 1997"], ["next week at 7.30"], ["see you a day after"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("date").show(truncate=False)
+    +-------------------------------------------------+
+    |date                                             |
+    +-------------------------------------------------+
+    |[[date, 5, 15, 1997/11/21, [sentence -> 0], []]] |
+    |[[date, 0, 8, 2020/01/18, [sentence -> 0], []]]  |
+    |[[date, 10, 18, 2020/01/12, [sentence -> 0], []]]|
+    +-------------------------------------------------+
+
+    See Also
+    --------
+    MultiDateMatcher : for matching multiple dates in a document
+    """
+
+    name = "DateMatcher"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DATE
+
+    @keyword_only
+    def __init__(self):
+        super(DateMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.DateMatcher")
+        self._setDefault(
+            inputFormats=[""],
+            outputFormat="yyyy/MM/dd",
+            readMonthFirst=True,
+            defaultDayWhenMissing=1,
+            anchorDateYear=-1,
+            anchorDateMonth=-1,
+            anchorDateDay=-1
+        )
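
A note on `setAnchorDateMonth` above: the setter stores the month zero-based (`normalizedMonth = value - 1`), so callers pass the familiar 1-based month while the `anchorDateMonth` Param holds the zero-based value, presumably to line up with a zero-based month convention in the Scala implementation (an assumption; the Scala side is not part of this diff). A minimal sketch, assuming a running Spark session with Spark NLP started:

import sparknlp
from sparknlp.annotator import DateMatcher

spark = sparknlp.start()

date_matcher = DateMatcher() \
    .setInputCols("document") \
    .setOutputCol("date") \
    .setAnchorDateMonth(1)  # 1-based input: 1 means January

# The stored Param value is zero-based, because the setter subtracted 1.
print(date_matcher.getOrDefault("anchorDateMonth"))  # prints 0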

sparknlp/annotator/matcher/multi_date_matcher.py

@@ -0,0 +1,109 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for MultiDateMatcher."""
+
+from sparknlp.common import *
+from sparknlp.annotator.matcher.date_matcher import DateMatcherUtils
+
+
+class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
+    """Matches standard date formats into a provided format.
+
+    Reads the following kind of dates::
+
+        "1978-01-28", "1984/04/02,1/02/1980", "2/28/79",
+        "The 31st of April in the year 2008", "Fri, 21 Nov 1997", "Jan 21,
+        '97", "Sun", "Nov 21", "jan 1st", "next thursday", "last wednesday",
+        "today", "tomorrow", "yesterday", "next week", "next month",
+        "next year", "day after", "the day before", "0600h", "06:00 hours",
+        "6pm", "5:30 a.m.", "at 5", "12:59", "23:59", "1988/11/23 6pm",
+        "next week at 7.30", "5 am tomorrow"
+
+    For example ``"The 31st of April in the year 2008"`` will be converted into
+    ``2008/04/31``.
+
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DATE``
+    ====================== ======================
+
+    Parameters
+    ----------
+    dateFormat
+        Desired format for dates extracted, by default yyyy/MM/dd.
+    readMonthFirst
+        Whether to parse the date in mm/dd/yyyy format instead of dd/mm/yyyy,
+        by default True.
+    defaultDayWhenMissing
+        Which day to set when it is missing from parsed input, by default 1.
+    anchorDateYear
+        Add an anchor year for the relative dates such as a day after tomorrow.
+        If not set it will use the current year. Example: 2021
+    anchorDateMonth
+        Add an anchor month for the relative dates such as a day after tomorrow.
+        If not set it will use the current month. Example: 1 which means January
+    anchorDateDay
+        Add an anchor day of the month for the relative dates such as a day after
+        tomorrow. If not set it will use the current day. Example: 11
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> date = MultiDateMatcher() \\
+    ...     .setInputCols("document") \\
+    ...     .setOutputCol("date") \\
+    ...     .setAnchorDateYear(2020) \\
+    ...     .setAnchorDateMonth(1) \\
+    ...     .setAnchorDateDay(11) \\
+    ...     .setOutputFormat("yyyy/MM/dd")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     date
+    ... ])
+    >>> data = spark.createDataFrame([["I saw him yesterday and he told me that he will visit us next week"]]) \\
+    ...     .toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(date) as dates").show(truncate=False)
+    +-----------------------------------------------+
+    |dates                                          |
+    +-----------------------------------------------+
+    |[date, 57, 65, 2020/01/18, [sentence -> 0], []]|
+    |[date, 10, 18, 2020/01/10, [sentence -> 0], []]|
+    +-----------------------------------------------+
+    """
+
+    name = "MultiDateMatcher"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DATE
+
+    @keyword_only
+    def __init__(self):
+        super(MultiDateMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.MultiDateMatcher")
+        self._setDefault(
+            inputFormats=[""],
+            outputFormat="yyyy/MM/dd",
+            readMonthFirst=True,
+            defaultDayWhenMissing=1
+        )
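
`MultiDateMatcher` shares all of its parameters with `DateMatcher` through the common `DateMatcherUtils` base, so the practical difference is only how many matches survive: `DateMatcher` keeps one date per document, `MultiDateMatcher` keeps them all. A minimal sketch contrasting the two on the same input (column names are illustrative), assuming a running Spark session with Spark NLP started:

import sparknlp
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import DateMatcher, MultiDateMatcher

spark = sparknlp.start()

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
singleDate = DateMatcher().setInputCols("document").setOutputCol("single_date")
allDates = MultiDateMatcher().setInputCols("document").setOutputCol("all_dates")

data = spark.createDataFrame(
    [["I saw him yesterday and he will visit us next week"]]
).toDF("text")

result = Pipeline(stages=[documentAssembler, singleDate, allDates]) \
    .fit(data).transform(data)

# single_date holds at most one annotation; all_dates holds one per match.
result.selectExpr("size(single_date)", "size(all_dates)").show()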

sparknlp/annotator/matcher/regex_matcher.py

@@ -0,0 +1,221 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the RegexMatcher."""
+
+from sparknlp.common import *
+
+
+class RegexMatcher(AnnotatorApproach):
+    """Uses rules to match a set of regular expressions and associate them with a
+    provided identifier.
+
+    A rule consists of a regex pattern and an identifier, delimited by a character of
+    choice. An example could be `"\\d{4}\\/\\d\\d\\/\\d\\d,date"` which will match
+    strings like `"1970/01/01"` to the identifier `"date"`.
+
+    Rules must be provided by either :meth:`.setRules` (followed by
+    :meth:`.setDelimiter`) or an external file.
+
+    To use an external file, a dictionary of predefined regular expressions must be
+    provided with :meth:`.setExternalRules`. The dictionary can be set in the form of a
+    delimited text file.
+
+    Pretrained pipelines are available for this module, see `Pipelines
+    <https://sparknlp.org/docs/en/pipelines>`__.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    strategy
+        Can be either MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE, by default
+        "MATCH_ALL"
+    rules
+        Regex rules to match the identifier with
+    delimiter
+        Delimiter for rules provided with setRules
+    externalRules
+        external resource to rules, needs 'delimiter' in options
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+
+    In this example, the ``rules.txt`` has the form of::
+
+        the\\s\\w+, followed by 'the'
+        ceremonies, ceremony
+
+    where each regex is separated by the identifier ``","``
+
+    >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
+    >>> sentence = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
+    >>> regexMatcher = RegexMatcher() \\
+    ...     .setExternalRules("src/test/resources/regex-matcher/rules.txt", ",") \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("regex") \\
+    ...     .setStrategy("MATCH_ALL")
+    >>> pipeline = Pipeline().setStages([documentAssembler, sentence, regexMatcher])
+    >>> data = spark.createDataFrame([[
+    ...     "My first sentence with the first rule. This is my second sentence with ceremonies rule."
+    ... ]]).toDF("text")
+    >>> results = pipeline.fit(data).transform(data)
+    >>> results.selectExpr("explode(regex) as result").show(truncate=False)
+    +--------------------------------------------------------------------------------------------+
+    |result                                                                                      |
+    +--------------------------------------------------------------------------------------------+
+    |[chunk, 23, 31, the first, [identifier -> followed by 'the', sentence -> 0, chunk -> 0], []]|
+    |[chunk, 71, 80, ceremonies, [identifier -> ceremony, sentence -> 1, chunk -> 0], []]        |
+    +--------------------------------------------------------------------------------------------+
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    strategy = Param(Params._dummy(),
+                     "strategy",
+                     "MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
+                     typeConverter=TypeConverters.toString)
+    externalRules = Param(Params._dummy(),
+                          "externalRules",
+                          "external resource to rules, needs 'delimiter' in options",
+                          typeConverter=TypeConverters.identity)
+    rules = Param(Params._dummy(),
+                  "rules",
+                  "Regex rules to match the identifier with",
+                  typeConverter=TypeConverters.toListString)
+    delimiter = Param(Params._dummy(),
+                      "delimiter",
+                      "Delimiter for rules",
+                      typeConverter=TypeConverters.toString)
+
+    @keyword_only
+    def __init__(self):
+        super(RegexMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexMatcher")
+        self._setDefault(
+            strategy="MATCH_ALL"
+        )
+
+    def setStrategy(self, value):
+        """Sets matching strategy, by default "MATCH_ALL".
+
+        Can be either MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE.
+
+        Parameters
+        ----------
+        value : str
+            Matching Strategy
+        """
+        return self._set(strategy=value)
+
+    def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
+        """Sets external resource to rules, needs 'delimiter' in options.
+
+        Only one of either parameter `rules` or `externalRules` must be set.
+
+        Parameters
+        ----------
+        path : str
+            Path to the source files
+        delimiter : str
+            Delimiter for the dictionary file. Can also be set in `options`.
+        read_as : str, optional
+            How to read the file, by default ReadAs.TEXT
+        options : dict, optional
+            Options to read the resource, by default {"format": "text"}
+        """
+        opts = options.copy()
+        if "delimiter" not in opts:
+            opts["delimiter"] = delimiter
+        return self._set(externalRules=ExternalResource(path, read_as, opts))
+
+    def setRules(self, value):
+        """Sets the regex rules to match the identifier with.
+
+        The rules must consist of a regex pattern and an identifier for that pattern. The regex
+        pattern and the identifier must be delimited by a character that will also have to be set
+        with `setDelimiter`.
+
+        Only one of either parameter `rules` or `externalRules` must be set.
+
+        Examples
+        --------
+        >>> regexMatcher = RegexMatcher() \\
+        ...     .setRules(["\\d{4}\\/\\d\\d\\/\\d\\d,date", "\\d{2}\\/\\d\\d\\/\\d\\d,short_date"]) \\
+        ...     .setDelimiter(",") \\
+        ...     .setInputCols(["sentence"]) \\
+        ...     .setOutputCol("regex") \\
+        ...     .setStrategy("MATCH_ALL")
+
+        Parameters
+        ----------
+        value : List[str]
+            List of rules
+        """
+        return self._set(rules=value)
+
+    def setDelimiter(self, value):
+        """Sets the delimiter for rules.
+
+        Parameters
+        ----------
+        value : str
+            Delimiter for the rules
+        """
+        return self._set(delimiter=value)
+
+    def _create_model(self, java_model):
+        return RegexMatcherModel(java_model=java_model)
+
+
+class RegexMatcherModel(AnnotatorModel):
+    """Instantiated model of the RegexMatcher.
+
+    This is the instantiated model of the :class:`.RegexMatcher`.
+    For training your own model, please see the documentation of that class.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    None
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RegexMatcherModel", java_model=None):
+        super(RegexMatcherModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+
+    name = "RegexMatcherModel"
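
The class docstring above demonstrates the external-file path (`setExternalRules`); the inline alternative pairs `setRules` with `setDelimiter`. A minimal end-to-end sketch of that path (input text and column names are illustrative), assuming a running Spark session with Spark NLP started:

import sparknlp
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import RegexMatcher

spark = sparknlp.start()

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

# Each rule is "<regex><delimiter><identifier>"; setDelimiter tells the
# matcher where the pattern ends and the identifier begins.
regexMatcher = RegexMatcher() \
    .setRules([r"\d{4}/\d\d/\d\d,date", r"\d{2}/\d\d/\d\d,short_date"]) \
    .setDelimiter(",") \
    .setInputCols(["document"]) \
    .setOutputCol("regex") \
    .setStrategy("MATCH_ALL")

data = spark.createDataFrame([["Dates seen: 2021/01/01 and 21/01/15."]]).toDF("text")

Pipeline(stages=[documentAssembler, regexMatcher]) \
    .fit(data).transform(data) \
    .selectExpr("explode(regex) as match") \
    .show(truncate=False)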