spark-nlp 2.6.3rc1-py2.py3-none-any.whl → 6.2.1-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/logging/comet.py
@@ -0,0 +1,467 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Package that contains classes for integration with Comet."""
+
+try:
+    import comet_ml
+except AttributeError:
+    # Python 3.6
+    comet_ml = None
+except ModuleNotFoundError:
+    # Python 3.7+
+    comet_ml = None
+
+import threading
+import time
+import os
+
+
+class CometLogger:
+    """Logger class for Comet integration
+
+    `Comet <https://www.comet.ml/>`__ is a meta machine learning platform
+    designed to help AI practitioners and teams build reliable machine learning
+    models for real-world applications by streamlining the machine learning
+    model lifecycle. By leveraging Comet, users can track, compare, explain and
+    reproduce their machine learning experiments.
+
+    To log a Spark NLP annotator, it will need an "outputLogsPath" parameter,
+    as the CometLogger reads the log file generated during the training process.
+
+    For more examples see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/logging/Comet_SparkNLP_Integration.ipynb>`__.
+
+    Parameters
+    ----------
+    workspace : str, optional
+        Name of the workspace in Comet, by default None
+    project_name : str, optional
+        Name of the project in Comet, by default None
+    comet_mode : str, optional
+        Mode of logging, by default None. If set to "offline" then offline mode
+        will be used, otherwise online.
+    experiment_id : str, optional
+        Id of the experiment, if it is reused, by default None
+    tags : List[str], optional
+        List of tags for the experiment, by default None
+
+    Attributes
+    ----------
+    experiment : comet_ml.Experiment
+        Object representing the Comet experiment
+
+    Raises
+    ------
+    ImportError
+        If the package comet-ml is not installed
+
+    Examples
+    --------
+    Metrics while training an annotator can be logged with for example:
+
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from sparknlp.logging.comet import CometLogger
+    >>> spark = sparknlp.start()
+
+    To run an online experiment, the logger is defined like so.
+
+    >>> OUTPUT_LOG_PATH = "./run"
+    >>> logger = CometLogger()
+
+    Then the experiment can start like so:
+
+    >>> document = DocumentAssembler() \\
+    ...     .setInputCol("text")\\
+    ...     .setOutputCol("document")
+    >>> embds = UniversalSentenceEncoder.pretrained() \\
+    ...     .setInputCols("document") \\
+    ...     .setOutputCol("sentence_embeddings")
+    >>> multiClassifier = MultiClassifierDLApproach() \\
+    ...     .setInputCols("sentence_embeddings") \\
+    ...     .setOutputCol("category") \\
+    ...     .setLabelColumn("labels") \\
+    ...     .setBatchSize(128) \\
+    ...     .setLr(1e-3) \\
+    ...     .setThreshold(0.5) \\
+    ...     .setShufflePerEpoch(False) \\
+    ...     .setEnableOutputLogs(True) \\
+    ...     .setOutputLogsPath(OUTPUT_LOG_PATH) \\
+    ...     .setMaxEpochs(1)
+    >>> logger.monitor(logdir=OUTPUT_LOG_PATH, model=multiClassifier)
+    >>> trainDataset = spark.createDataFrame(
+    ...     [("Nice.", ["positive"]), ("That's bad.", ["negative"])],
+    ...     schema=["text", "labels"],
+    ... )
+    >>> pipeline = Pipeline(stages=[document, embds, multiClassifier])
+    >>> pipeline.fit(trainDataset)
+    >>> logger.end()
+
+    If you are using a jupyter notebook, it is possible to display the live web
+    interface with
+
+    >>> logger.experiment.display(tab='charts')
+    """
+
+    def __init__(
+        self,
+        workspace=None,
+        project_name=None,
+        comet_mode=None,
+        experiment_id=None,
+        tags=None,
+        **experiment_kwargs,
+    ):
+        if comet_ml is None:
+            raise ImportError(
+                "`comet_ml` is not installed. Please install it with `pip install comet-ml`."
+            )
+
+        self.comet_mode = comet_mode
+        self.workspace = workspace
+        self.project_name = project_name
+        self.experiment_id = experiment_id
+        self.experiment_kwargs = experiment_kwargs
+
+        self.experiment = self._get_experiment(
+            self.comet_mode,
+            self.workspace,
+            self.project_name,
+            self.experiment_id,
+            **self.experiment_kwargs,
+        )
+        self.experiment.log_other("Created from", "SparkNLP")
+        if tags is not None:
+            self.experiment.add_tags(tags)
+
+        self._watch_file = False
+        self._monitor_thread_timeout = 5
+        self.thread = None
+
+    def _get_experiment(
+        self,
+        mode,
+        workspace=None,
+        project_name=None,
+        experiment_id=None,
+        **experiment_kwargs,
+    ):
+        if mode == "offline":
+            if experiment_id is not None:
+                return comet_ml.ExistingOfflineExperiment(
+                    previous_experiment=experiment_id,
+                    workspace=workspace,
+                    project_name=project_name,
+                    **experiment_kwargs,
+                )
+
+            return comet_ml.OfflineExperiment(
+                workspace=workspace,
+                project_name=project_name,
+                **experiment_kwargs,
+            )
+
+        else:
+            if experiment_id is not None:
+                return comet_ml.ExistingExperiment(
+                    previous_experiment=experiment_id,
+                    workspace=workspace,
+                    project_name=project_name,
+                    **experiment_kwargs,
+                )
+
+            return comet_ml.Experiment(
+                workspace=workspace,
+                project_name=project_name,
+                **experiment_kwargs,
+            )
+
+    def log_pipeline_parameters(self, pipeline, stages=None):
+        """Iterates over the different stages in a pyspark PipelineModel object
+        and logs the parameters to Comet.
+
+        Parameters
+        ----------
+        pipeline : pyspark.ml.PipelineModel
+            PipelineModel object
+        stages : List[str], optional
+            Names of the stages of the pipeline to include, by default None (logs all)
+
+        Examples
+        --------
+        The pipeline model contains the Spark NLP annotators that were fitted
+        to a dataframe.
+
+        >>> logger.log_pipeline_parameters(pipeline_model)
+        """
+        self.experiment.log_other("pipeline_uid", pipeline.uid)
+        if stages is None:
+            stages = [s.name for s in pipeline.stages]
+
+        for stage in pipeline.stages:
+            if stage.name not in stages:
+                continue
+
+            params = stage.extractParamMap()
+            for param, param_value in params.items():
+                self.experiment.log_parameter(f"{stage.name}-{param.name}", param_value)
+
+    def log_visualization(self, html, name="viz.html"):
+        """Uploads a NER visualization from Spark NLP Display to comet.
+
+        Parameters
+        ----------
+        html : str
+            HTML of the Spark NLP Display visualization
+        name : str, optional
+            Name for the visualization in comet, by default "viz.html"
+
+        Examples
+        --------
+        This example has NER chunks (NER extracted by e.g. :class:`.NerDLModel`
+        and converted by a :class:`.NerConverter`) extracted in the column
+        "ner_chunk".
+
+        >>> from sparknlp_display import NerVisualizer
+        >>> logger = CometLogger()
+        >>> for idx, result in enumerate(results.collect()):
+        ...     viz = NerVisualizer().display(
+        ...         result=result,
+        ...         label_col='ner_chunk',
+        ...         document_col='document',
+        ...         return_html=True
+        ...     )
+        ...     logger.log_visualization(viz, name=f'viz-{idx}.html')
+        """
+        self.log_asset_data(html, name)
+
+    def log_metrics(self, metrics, step=None, epoch=None, prefix=None):
+        """Submits logs of evaluation metrics.
+
+        Parameters
+        ----------
+        metrics : dict
+            Dictionary with key value pairs corresponding to the measured metric
+            and its value
+        step : int, optional
+            Used to associate a specific step, by default None
+        epoch : int, optional
+            Used to associate a specific epoch, by default None
+        prefix : str, optional
+            Name prefix for this metric, by default None. This can be used to
+            identify for example different features by name.
+
+        Examples
+        --------
+        In this example, sklearn is used to retrieve the metrics.
+
+        >>> from sklearn.preprocessing import MultiLabelBinarizer
+        >>> from sklearn.metrics import classification_report
+        >>> prediction = model.transform(testDataset)
+        >>> preds_df = prediction.select('labels', 'category.result').toPandas()
+
+        >>> mlb = MultiLabelBinarizer()
+        >>> y_true = mlb.fit_transform(preds_df['labels'])
+        >>> y_pred = mlb.fit_transform(preds_df['result'])
+        >>> report = classification_report(y_true, y_pred, output_dict=True)
+
+        Iterate over the report and log the metrics:
+
+        >>> for key, value in report.items():
+        ...     logger.log_metrics(value, prefix=key)
+        >>> logger.end()
+
+        If you are using Spark NLP in a notebook, then you can display the
+        metrics directly with
+
+        >>> logger.experiment.display(tab='metrics')
+        """
+        self.experiment.log_metrics(metrics, step=step, epoch=epoch, prefix=prefix)
+
+    def log_parameters(self, parameters, step=None):
+        """Logs a dictionary (or dictionary-like object) of multiple parameters.
+
+        Parameters
+        ----------
+        parameters : dict
+            Parameters in a key : value form
+        step : int, optional
+            Used to associate a specific step, by default None
+        """
+        self.experiment.log_parameters(parameters, step=step)
+
+    def log_completed_run(self, log_file_path):
+        """Submit logs of training metrics after a run has completed.
+
+        Parameters
+        ----------
+        log_file_path : str
+            Path to log file containing training metrics
+        """
+        with open(log_file_path, "r") as f:
+            stats = f.read().splitlines()
+
+        self._parse_log_entry(stats)
+        self.experiment.log_other("log_file_path", log_file_path)
+
+    def log_asset(self, asset_path, metadata=None, step=None):
+        """Uploads an asset to comet.
+
+        Parameters
+        ----------
+        asset_path : str
+            Path to the asset
+        metadata : dict, optional
+            Some additional data to attach to the asset. Must be a
+            JSON-encodable dict, by default None
+        step : int, optional
+            Used to associate a specific step, by default None
+        """
+        self.experiment.log_asset(asset_path, metadata=metadata, step=step)
+
+    def log_asset_data(self, asset, name, overwrite=False, metadata=None, step=None):
+        """Uploads the data given to comet (str, binary, or JSON).
+
+        Parameters
+        ----------
+        asset : str or bytes or dict
+            Data to be saved as asset
+        name : str
+            A custom file name to be displayed
+        overwrite : bool, optional
+            If True will overwrite all existing assets with the same name, by
+            default False
+        metadata : dict, optional
+            Some additional data to attach to the asset data.
+            Must be a JSON-encodable dict, by default None
+        step : int, optional
+            Used to associate a specific step, by default None
+        """
+        self.experiment.log_asset_data(
+            asset, name, overwrite=overwrite, metadata=metadata, step=step
+        )
+
+    def monitor(self, logdir, model, interval=10):
+        """Monitors the training of the model and submits logs to comet at the
+        given interval.
+
+        To log a Spark NLP annotator, it will need an "outputLogsPath" parameter,
+        as the CometLogger reads the log file generated during the training process.
+
+        If you are not able to monitor the live training, you can still log the
+        training at the end with :meth:`.log_completed_run`.
+
+        Parameters
+        ----------
+        logdir : str
+            Path to the output of the logs
+        model : AnnotatorApproach
+            Annotator to monitor
+        interval : int, optional
+            Interval for refreshing, by default 10
+        """
+        self._watch_file = True
+        self.experiment.log_other("model_uid", model.uid)
+        self.thread = threading.Thread(
+            target=self._monitor_log_file,
+            args=(
+                os.path.join(logdir, f"{model.uid}.log"),
+                interval,
+            ),
+        )
+        self.thread.start()
+
+    def _file_watcher(self, filename, interval):
+        """Generator that yields lines from the model log file.
+
+        Parameters
+        ----------
+        filename : str
+            Path to model log file
+        interval : int
+            Time (seconds) to wait in between checking for file updates
+
+        Yields
+        ------
+        str
+            A single line from the file
+        """
+        fp = open(filename)
+
+        line = ""
+        while self._watch_file:
+            partial_line = fp.readline()
+            if len(partial_line) != 0:
+                line += partial_line
+                if line.endswith("\n"):
+                    yield line
+                    line = ""
+            else:
+                time.sleep(interval)
+
+        fp.close()
+
+    def _monitor_log_file(self, filename, interval):
+        # Wait for file to be created:
+        while not os.path.exists(filename) and self._watch_file:
+            time.sleep(interval)
+
+        watcher = self._file_watcher(filename, interval)
+        for line in watcher:
+            lines = line.split("\n")
+            self._parse_log_entry(lines)
+
+    def _convert_log_entry_to_dict(self, log_entries):
+        output_dict = {}
+        for entry in log_entries:
+            key, value = entry.strip(" ").split(":")
+            output_dict[key] = float(value)
+
+        return output_dict
+
+    def _parse_run_metrics(self, parts):
+        epoch_str, ratio = parts[0].split(" ", 1)
+        epoch, total = ratio.split("/", 1)
+
+        metrics = parts[2:]
+        formatted_metrics = self._convert_log_entry_to_dict(metrics)
+
+        return formatted_metrics, epoch
+
+    def _parse_run_parameters(self, parts):
+        parameters = parts[2:]
+        formatted_parameters = self._convert_log_entry_to_dict(parameters)
+        return formatted_parameters
+
+    def _parse_log_entry(self, lines):
+        for line in lines:
+            parts = line.split("-")
+            if line.startswith("Training started"):
+                parameters = self._parse_run_parameters(parts)
+                self.log_parameters(parameters)
+
+            elif line.startswith("Epoch"):
+                metrics, epoch = self._parse_run_metrics(parts)
+                self.log_metrics(metrics, step=int(epoch), epoch=int(epoch))
+
+    def end(self):
+        """Ends the experiment and the logger. Submits all outstanding logs to
+        comet.
+        """
+        self._watch_file = False
+        self.experiment.end()
+        if self.thread:
+            self.thread.join(timeout=self._monitor_thread_timeout)
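
The hunk above adds the Comet integration module. Its docstring demonstrates the live monitor() flow; as a complement, here is a minimal sketch (not part of the diff) of the offline flow, where training finishes first and the log is replayed with log_completed_run(). It assumes spark-nlp and comet-ml are installed; the f"{model.uid}.log" file name mirrors the convention monitor() uses above, and the commented log-line shapes are inferred from _parse_log_entry(), so treat them as assumptions rather than a documented format.

import os

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import MultiClassifierDLApproach, UniversalSentenceEncoder
from sparknlp.logging.comet import CometLogger
from pyspark.ml import Pipeline

spark = sparknlp.start()
OUTPUT_LOG_PATH = "./run"

# "offline" routes to comet_ml.OfflineExperiment, which writes a local
# archive instead of streaming to comet.ml.
logger = CometLogger(comet_mode="offline")

document = DocumentAssembler().setInputCol("text").setOutputCol("document")
embds = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)
multiClassifier = (
    MultiClassifierDLApproach()
    .setInputCols("sentence_embeddings")
    .setOutputCol("category")
    .setLabelColumn("labels")
    .setEnableOutputLogs(True)           # required: the logger reads this file
    .setOutputLogsPath(OUTPUT_LOG_PATH)
    .setMaxEpochs(1)
)

trainDataset = spark.createDataFrame(
    [("Nice.", ["positive"]), ("That's bad.", ["negative"])],
    schema=["text", "labels"],
)
Pipeline(stages=[document, embds, multiClassifier]).fit(trainDataset)

# Replay the finished training log. _parse_log_entry() expects lines shaped
# roughly like (assumed shapes, derived from the parser above):
#   Training started - epochs: 1 - lr: 0.001 - batch_size: 128
#   Epoch 1/1 - 10.21s - loss: 0.52 - acc: 0.75
logger.log_completed_run(os.path.join(OUTPUT_LOG_PATH, f"{multiClassifier.uid}.log"))
logger.end()

The offline archive can later be uploaded to Comet (comet-ml ships a `comet upload` command for this), which is useful on clusters without outbound network access.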
sparknlp/partition/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module to read various types of documents into chunks"""
+from sparknlp.partition.partition import *
+from sparknlp.partition.partition_transformer import *
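
This __init__ only re-exports the new partition package (partition.py, partition_properties.py, and partition_transformer.py in the listing above). As a rough orientation, a hedged sketch of the reader follows, assuming the Partition(content_type=...).partition(path) API from the Spark NLP 6.x documentation; the content type and sample path are illustrative, not taken from this diff.

import sparknlp
from sparknlp.partition.partition import Partition

spark = sparknlp.start()

# Read a plain-text file into a DataFrame of document chunks.
# content_type and the path below are illustrative assumptions.
chunks_df = Partition(content_type="text/plain").partition("./txt/simple-text.txt")
chunks_df.show(truncate=False)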