spark-nlp: spark_nlp-2.6.3rc1-py2.py3-none-any.whl → spark_nlp-6.2.1-py2.py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ sparknlp/annotator/date2_chunk.py
@@ -0,0 +1,88 @@
+# Copyright 2017-2023 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for Date2Chunk."""
+
+from sparknlp.common import *
+
+
+class Date2Chunk(AnnotatorModel):
+    """Converts ``DATE`` type Annotations to ``CHUNK`` type.
+
+    This can be useful if annotators that follow DateMatcher and MultiDateMatcher require ``CHUNK`` types.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DATE``               ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    entityName
+        Entity name for the metadata, by default ``"DATE"``.
+
+    Examples
+    --------
+    >>> from pyspark.ml import Pipeline
+
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> date = DateMatcher() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("date")
+    >>> date2Chunk = Date2Chunk() \\
+    ...     .setInputCols(["date"]) \\
+    ...     .setOutputCol("date_chunk")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     date,
+    ...     date2Chunk
+    ... ])
+    >>> data = spark.createDataFrame([["Omicron is a new variant of COVID-19, which the World Health Organization designated a variant of concern on Nov. 26, 2021/26/11."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("date_chunk").show(1, truncate=False)
+    +----------------------------------------------------+
+    |date_chunk                                          |
+    +----------------------------------------------------+
+    |[{chunk, 118, 121, 2021/01/01, {sentence -> 0}, []}]|
+    +----------------------------------------------------+
+    """
+    name = "Date2Chunk"
+
+    inputAnnotatorTypes = [AnnotatorType.DATE]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    @keyword_only
+    def __init__(self):
+        super(Date2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Date2Chunk")
+        self._setDefault(entityName="DATE")
+
+    entityName = Param(Params._dummy(), "entityName", "Entity name for the metadata",
+                       TypeConverters.toString)
+
+    def setEntityName(self, name):
+        """Sets the entity name for the metadata, by default ``"DATE"``.
+
+        Parameters
+        ----------
+        name : str
+            Entity name for the metadata
+        """
+        self._set(entityName=name)
+        return self
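The hunk above adds the new `Date2Chunk` annotator. As a quick orientation, here is a minimal usage sketch assembled from its docstring; the `sparknlp.start()` bootstrap and the custom entity label `"DATE_MENTION"` are illustrative assumptions, not part of the diff.

```python
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import DateMatcher, Date2Chunk
from pyspark.ml import Pipeline

spark = sparknlp.start()

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

date_matcher = DateMatcher() \
    .setInputCols(["document"]) \
    .setOutputCol("date")

# setEntityName overrides the entity label written into the chunk metadata
# (the default is "DATE"); the label below is a hypothetical example.
date_to_chunk = Date2Chunk() \
    .setInputCols(["date"]) \
    .setOutputCol("date_chunk") \
    .setEntityName("DATE_MENTION")

pipeline = Pipeline(stages=[document_assembler, date_matcher, date_to_chunk])
data = spark.createDataFrame([["The report was filed on Nov. 26, 2021."]]).toDF("text")
pipeline.fit(data).transform(data).select("date_chunk").show(truncate=False)
```

Per the parameter description, `setEntityName` only affects the metadata; the chunk boundaries and text still come from the upstream `DATE` annotation.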
--- /dev/null
+++ sparknlp/annotator/dependency/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module of annotators for dependency parsing."""
+from sparknlp.annotator.dependency.dependency_parser import *
+from sparknlp.annotator.dependency.typed_dependency_parser import *
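Since this `__init__.py` wildcard-imports both parser modules, the classes resolve from either package level. A small sanity-check sketch, assuming Spark NLP 6.x is installed:

```python
# Both import paths resolve to the same class objects, thanks to the
# re-exports in sparknlp/annotator/dependency/__init__.py (and in the
# annotator package __init__ one level up).
from sparknlp.annotator.dependency import DependencyParserApproach, DependencyParserModel
from sparknlp.annotator import DependencyParserModel as TopLevelModel

assert TopLevelModel is DependencyParserModel
```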
--- /dev/null
+++ sparknlp/annotator/dependency/dependency_parser.py
@@ -0,0 +1,294 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the DependencyParser."""
+
+from sparknlp.common import *
+
+
+class DependencyParserApproach(AnnotatorApproach):
+    """Trains an unlabeled parser that finds grammatical relations between two
+    words in a sentence.
+
+    For instantiated/pretrained models, see :class:`.DependencyParserModel`.
+
+    A dependency parser provides information about word relationships. For example,
+    dependency parsing can tell you what the subjects and objects of a verb are,
+    as well as which words modify (describe) the subject. This can help
+    you find precise answers to specific questions.
+
+    The required training data can be set in two different ways (only one can be
+    chosen for a particular model):
+
+    - Dependency treebank in the
+      `Penn Treebank format <http://www.nltk.org/nltk_data/>`__ set with
+      ``setDependencyTreeBank``
+    - Dataset in the
+      `CoNLL-U format <https://universaldependencies.org/format.html>`__ set
+      with ``setConllU``
+
+    Apart from that, no additional training data is needed.
+
+    ======================== ======================
+    Input Annotation types   Output Annotation type
+    ======================== ======================
+    ``DOCUMENT, POS, TOKEN`` ``DEPENDENCY``
+    ======================== ======================
+
+    Parameters
+    ----------
+    dependencyTreeBank
+        Dependency treebank source files
+    conllU
+        Universal Dependencies source files
+    numberOfIterations
+        Number of iterations in training, converges to better accuracy,
+        by default 10
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> posTagger = PerceptronModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("pos")
+    >>> dependencyParserApproach = DependencyParserApproach() \\
+    ...     .setInputCols(["sentence", "pos", "token"]) \\
+    ...     .setOutputCol("dependency") \\
+    ...     .setDependencyTreeBank("src/test/resources/parser/unlabeled/dependency_treebank")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentence,
+    ...     tokenizer,
+    ...     posTagger,
+    ...     dependencyParserApproach
+    ... ])
+    >>> emptyDataSet = spark.createDataFrame([[""]]).toDF("text")
+    >>> pipelineModel = pipeline.fit(emptyDataSet)
+
+    Additional training data is not needed; the dependency parser relies on the
+    dependency treebank / CoNLL-U files only.
+
+    See Also
+    --------
+    TypedDependencyParserApproach : to extract labels for the dependencies
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.DEPENDENCY
+
+    dependencyTreeBank = Param(Params._dummy(),
+                               "dependencyTreeBank",
+                               "Dependency treebank source files",
+                               typeConverter=TypeConverters.identity)
+
+    conllU = Param(Params._dummy(),
+                   "conllU",
+                   "Universal Dependencies source files",
+                   typeConverter=TypeConverters.identity)
+
+    numberOfIterations = Param(Params._dummy(),
+                               "numberOfIterations",
+                               "Number of iterations in training, converges to better accuracy",
+                               typeConverter=TypeConverters.toInt)
+
+    @keyword_only
+    def __init__(self):
+        super(DependencyParserApproach,
+              self).__init__(classname="com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserApproach")
+        self._setDefault(numberOfIterations=10)
+
+    def setNumberOfIterations(self, value):
+        """Sets the number of iterations in training, which converges to better
+        accuracy, by default 10.
+
+        Parameters
+        ----------
+        value : int
+            Number of iterations
+        """
+        return self._set(numberOfIterations=value)
+
+    def setDependencyTreeBank(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
+        """Sets the dependency treebank source files.
+
+        Parameters
+        ----------
+        path : str
+            Path to the source files
+        read_as : str, optional
+            How to read the file, by default ReadAs.TEXT
+        options : dict, optional
+            Options to read the resource, by default {"key": "value"}
+        """
+        opts = options.copy()
+        return self._set(dependencyTreeBank=ExternalResource(path, read_as, opts))
+
+    def setConllU(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
+        """Sets the Universal Dependencies source files.
+
+        Parameters
+        ----------
+        path : str
+            Path to the source files
+        read_as : str, optional
+            How to read the file, by default ReadAs.TEXT
+        options : dict, optional
+            Options to read the resource, by default {"key": "value"}
+        """
+        opts = options.copy()
+        return self._set(conllU=ExternalResource(path, read_as, opts))
+
+    def _create_model(self, java_model):
+        return DependencyParserModel(java_model=java_model)
+
+
+class DependencyParserModel(AnnotatorModel):
+    """Unlabeled parser that finds a grammatical relation between two words in a
+    sentence.
+
+    A dependency parser provides information about word relationships. For example,
+    dependency parsing can tell you what the subjects and objects of a verb are,
+    as well as which words modify (describe) the subject. This can help
+    you find precise answers to specific questions.
+
+    This is the instantiated model of the :class:`.DependencyParserApproach`.
+    For training your own model, please see the documentation of that class.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> dependencyParserModel = DependencyParserModel.pretrained() \\
+    ...     .setInputCols(["sentence", "pos", "token"]) \\
+    ...     .setOutputCol("dependency")
+
+
+    The default model is ``"dependency_conllu"``, if no name is provided.
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models>`__.
+
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb>`__.
+
+    ======================== ======================
+    Input Annotation types   Output Annotation type
+    ======================== ======================
+    ``DOCUMENT, POS, TOKEN`` ``DEPENDENCY``
+    ======================== ======================
+
+    Parameters
+    ----------
+    perceptron
+        Dependency parsing perceptron features
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> posTagger = PerceptronModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("pos")
+    >>> dependencyParser = DependencyParserModel.pretrained() \\
+    ...     .setInputCols(["sentence", "pos", "token"]) \\
+    ...     .setOutputCol("dependency")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentence,
+    ...     tokenizer,
+    ...     posTagger,
+    ...     dependencyParser
+    ... ])
+    >>> data = spark.createDataFrame([[
+    ...     "Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent " +
+    ...     "firm Federal Mogul."
+    ... ]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(arrays_zip(token.result, dependency.result)) as cols") \\
+    ...     .selectExpr("cols['0'] as token", "cols['1'] as dependency").show(8, truncate = False)
+    +------------+------------+
+    |token       |dependency  |
+    +------------+------------+
+    |Unions      |ROOT        |
+    |representing|workers     |
+    |workers     |Unions      |
+    |at          |Turner      |
+    |Turner      |workers     |
+    |Newall      |say         |
+    |say         |Unions      |
+    |they        |disappointed|
+    +------------+------------+
+
+    See Also
+    --------
+    TypedDependencyParserModel : to extract labels for the dependencies
+    """
+    name = "DependencyParserModel"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.DEPENDENCY
+
+    perceptron = Param(Params._dummy(),
+                       "perceptron",
+                       "Dependency parsing perceptron features",
+                       typeConverter=TypeConverters.identity)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserModel", java_model=None):
+        super(DependencyParserModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+
+    @staticmethod
+    def pretrained(name="dependency_conllu", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "dependency_conllu"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        DependencyParserModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(DependencyParserModel, name, lang, remote_loc)
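To round off the dependency-parser hunk, here is a quick-inference sketch that combines the pretrained pipeline from the `DependencyParserModel` docstring with `LightPipeline`; the model names and sample sentence come from the docstrings above, while the `LightPipeline` wrapper is an assumption about typical usage rather than part of this diff.

```python
import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import (
    SentenceDetector, Tokenizer, PerceptronModel, DependencyParserModel
)
from pyspark.ml import Pipeline

spark = sparknlp.start()

pipeline = Pipeline(stages=[
    DocumentAssembler().setInputCol("text").setOutputCol("document"),
    SentenceDetector().setInputCols(["document"]).setOutputCol("sentence"),
    Tokenizer().setInputCols(["sentence"]).setOutputCol("token"),
    PerceptronModel.pretrained().setInputCols(["sentence", "token"]).setOutputCol("pos"),
    DependencyParserModel.pretrained().setInputCols(["sentence", "pos", "token"]).setOutputCol("dependency"),
])

# Fit on an empty frame (no training happens for pretrained stages), then
# wrap the result in a LightPipeline for fast, driver-side annotation.
empty = spark.createDataFrame([[""]]).toDF("text")
light = LightPipeline(pipeline.fit(empty))

# annotate() returns a dict keyed by output column; "dependency" holds the
# head word each token attaches to, as in the docstring example.
print(light.annotate("Unions representing workers say they are disappointed."))
```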