spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/training/conll.py
ADDED
@@ -0,0 +1,150 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for CoNLL."""
+
+import pyspark
+
+from sparknlp.common import ReadAs
+from sparknlp.internal import ExtendedJavaWrapper
+
+
+class CoNLL(ExtendedJavaWrapper):
+    """Instantiates the class to read a CoNLL dataset.
+
+    The dataset should be in the format of `CoNLL 2003
+    <https://www.clips.uantwerpen.be/conll2003/ner/>`_ and needs to be specified
+    with :meth:`.readDataset`, which will create a dataframe with the data.
+
+    Can be used to train a :class:`NerDLApproach
+    <sparknlp.annotator.NerDLApproach>`.
+
+    **Input File Format**::
+
+        -DOCSTART- -X- -X- O
+
+        EU NNP B-NP B-ORG
+        rejects VBZ B-VP O
+        German JJ B-NP B-MISC
+        call NN I-NP O
+        to TO B-VP O
+        boycott VB I-VP O
+        British JJ B-NP B-MISC
+        lamb NN I-NP O
+        . . O O
+
+    Parameters
+    ----------
+    documentCol : str, optional
+        Name of the :class:`.DocumentAssembler` column, by default 'document'
+    sentenceCol : str, optional
+        Name of the :class:`.SentenceDetector` column, by default 'sentence'
+    tokenCol : str, optional
+        Name of the :class:`.Tokenizer` column, by default 'token'
+    posCol : str, optional
+        Name of the :class:`.PerceptronModel` column, by default 'pos'
+    conllLabelIndex : int, optional
+        Index of the label column in the dataset, by default 3
+    conllPosIndex : int, optional
+        Index of the POS tags in the dataset, by default 1
+    textCol : str, optional
+        Index of the text column in the dataset, by default 'text'
+    labelCol : str, optional
+        Name of the label column, by default 'label'
+    explodeSentences : bool, optional
+        Whether to explode sentences to separate rows, by default True
+    delimiter: str, optional
+        Delimiter used to separate columns inside CoNLL file
+    includeDocId: bool, optional
+        Whether to try and parse the document id from the third item in the -DOCSTART- line (X if not found)
+
+    Examples
+    --------
+    >>> from sparknlp.training import CoNLL
+    >>> trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
+    >>> trainingData.selectExpr(
+    ...     "text",
+    ...     "token.result as tokens",
+    ...     "pos.result as pos",
+    ...     "label.result as label"
+    ... ).show(3, False)
+    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
+    |text                                            |tokens                                                    |pos                                  |label                                    |
+    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
+    |EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
+    |Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
+    |BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
+    """
+
+    def __init__(self,
+                 documentCol='document',
+                 sentenceCol='sentence',
+                 tokenCol='token',
+                 posCol='pos',
+                 conllLabelIndex=3,
+                 conllPosIndex=1,
+                 conllDocIdCol="doc_id",
+                 textCol='text',
+                 labelCol='label',
+                 explodeSentences=True,
+                 delimiter=' ',
+                 includeDocId=False
+                 ):
+        super(CoNLL, self).__init__("com.johnsnowlabs.nlp.training.CoNLL",
+                                    documentCol,
+                                    sentenceCol,
+                                    tokenCol,
+                                    posCol,
+                                    conllLabelIndex,
+                                    conllPosIndex,
+                                    conllDocIdCol,
+                                    textCol,
+                                    labelCol,
+                                    explodeSentences,
+                                    delimiter,
+                                    includeDocId)
+
+    def readDataset(self, spark, path, read_as=ReadAs.TEXT, partitions=8, storage_level=pyspark.StorageLevel.DISK_ONLY):
+        # ToDo Replace with std pyspark
+        """Reads the dataset from an external resource.
+
+        Parameters
+        ----------
+        spark : :class:`pyspark.sql.SparkSession`
+            Initiated Spark Session with Spark NLP
+        path : str
+            Path to the resource, it can take two forms; a path to a conll file, or a path to a folder containing multiple CoNLL files.
+            When the path points to a folder, the path must end in '*'.
+            Examples:
+                "/path/to/single/file.conll'
+                "/path/to/folder/containing/multiple/files/*'
+
+        read_as : str, optional
+            How to read the resource, by default ReadAs.TEXT
+        partitions : sets the minimum number of partitions for the case of lifting multiple files in parallel into a single dataframe. Defaults to 8.
+        storage_level : sets the persistence level according to PySpark definitions. Defaults to StorageLevel.DISK_ONLY. Applies only when lifting multiple files.
+
+
+        Returns
+        -------
+        :class:`pyspark.sql.DataFrame`
+            Spark Dataframe with the data
+        """
+        jSession = spark._jsparkSession
+
+        jdf = self._java_obj.readDataset(jSession, path, read_as, partitions,
+                                         spark.sparkContext._getJavaStorageLevel(storage_level))
+        dataframe = self.getDataFrame(spark, jdf)
+        return dataframe
+
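The docstring above notes that the resulting DataFrame can be used to train a NerDLApproach. A minimal sketch of that step, assuming a Spark session started with sparknlp.start(), the pretrained glove_100d embeddings, and a local copy of eng.train (the file name and epoch count are placeholders, not part of this diff):

import sparknlp
from sparknlp.training import CoNLL
from sparknlp.annotator import WordEmbeddingsModel, NerDLApproach

spark = sparknlp.start()
training_data = CoNLL().readDataset(spark, "eng.train")

# NerDLApproach expects an "embeddings" column next to sentence/token/label
embeddings = WordEmbeddingsModel.pretrained("glove_100d") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")

ner_tagger = NerDLApproach() \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("label") \
    .setOutputCol("ner") \
    .setMaxEpochs(1)

ner_model = ner_tagger.fit(embeddings.transform(training_data))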
sparknlp/training/conllu.py
ADDED
@@ -0,0 +1,103 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for CoNLLU."""
+
+from sparknlp.common import ReadAs
+from sparknlp.internal import ExtendedJavaWrapper
+
+
+class CoNLLU(ExtendedJavaWrapper):
+    """Instantiates the class to read a CoNLL-U dataset.
+
+    The dataset should be in the format of `CoNLL-U
+    <https://universaldependencies.org/format.html>`_ and needs to be specified
+    with :meth:`.readDataset`, which will create a dataframe with the data.
+
+    Can be used to train a :class:`DependencyParserApproach
+    <sparknlp.annotator.DependencyParserApproach>`
+
+    **Input File Format**::
+
+        # sent_id = 1
+        # text = They buy and sell books.
+        1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
+        2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
+        3 and and CONJ CC _ 4 cc 4:cc _
+        4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
+        5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
+        6 . . PUNCT . _ 2 punct 2:punct _
+
+    Examples
+    --------
+    >>> from sparknlp.training import CoNLLU
+    >>> conlluFile = "src/test/resources/conllu/en.test.conllu"
+    >>> conllDataSet = CoNLLU(False).readDataset(spark, conlluFile)
+    >>> conllDataSet.selectExpr(
+    ...     "text",
+    ...     "form.result as form",
+    ...     "upos.result as upos",
+    ...     "xpos.result as xpos",
+    ...     "lemma.result as lemma"
+    ... ).show(1, False)
+    +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
+    |text                                   |form                                          |upos                                         |xpos                          |lemma                                       |
+    +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
+    |What if Google Morphed Into GoogleOS?  |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
+    +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
+    """
+
+    def __init__(self,
+                 textCol='text',
+                 documentCol='document',
+                 sentenceCol='sentence',
+                 formCol='form',
+                 uposCol='upos',
+                 xposCol='xpos',
+                 lemmaCol='lemma',
+                 explodeSentences=True
+                 ):
+        super(CoNLLU, self).__init__("com.johnsnowlabs.nlp.training.CoNLLU",
+                                     textCol,
+                                     documentCol,
+                                     sentenceCol,
+                                     formCol,
+                                     uposCol,
+                                     xposCol,
+                                     lemmaCol,
+                                     explodeSentences)
+
+    def readDataset(self, spark, path, read_as=ReadAs.TEXT):
+        """Reads the dataset from an external resource.
+
+        Parameters
+        ----------
+        spark : :class:`pyspark.sql.SparkSession`
+            Initiated Spark Session with Spark NLP
+        path : str
+            Path to the resource
+        read_as : str, optional
+            How to read the resource, by default ReadAs.TEXT
+
+        Returns
+        -------
+        :class:`pyspark.sql.DataFrame`
+            Spark Dataframe with the data
+        """
+        # ToDo Replace with std pyspark
+        jSession = spark._jsparkSession
+
+        jdf = self._java_obj.readDataset(jSession, path, read_as)
+        dataframe = self.getDataFrame(spark, jdf)
+        return dataframe
+
sparknlp/training/pos.py
ADDED
@@ -0,0 +1,103 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains helper classes for part-of-speech tagging."""
+
+from sparknlp.internal import ExtendedJavaWrapper
+
+
+class POS(ExtendedJavaWrapper):
+    """Helper class for creating DataFrames for training a part-of-speech
+    tagger.
+
+    The dataset needs to consist of sentences on each line, where each word is
+    delimited with its respective tag.
+
+    **Input File Format**::
+
+        A|DT few|JJ months|NNS ago|RB you|PRP received|VBD a|DT letter|NN
+
+
+    The sentence can then be parsed with :meth:`.readDataset` into a column with
+    annotations of type ``POS``.
+
+    Can be used to train a :class:`PerceptronApproach
+    <sparknlp.annotator.PerceptronApproach>`.
+
+    Examples
+    --------
+    In this example, the file ``test-training.txt`` has the content of the
+    sentence above.
+
+    >>> from sparknlp.training import POS
+    >>> pos = POS()
+    >>> path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
+    >>> posDf = pos.readDataset(spark, path, "|", "tags")
+    >>> posDf.selectExpr("explode(tags) as tags").show(truncate=False)
+    +---------------------------------------------+
+    |tags                                         |
+    +---------------------------------------------+
+    |[pos, 0, 5, NNP, [word -> Pierre], []]       |
+    |[pos, 7, 12, NNP, [word -> Vinken], []]      |
+    |[pos, 14, 14, ,, [word -> ,], []]            |
+    |[pos, 16, 17, CD, [word -> 61], []]          |
+    |[pos, 19, 23, NNS, [word -> years], []]      |
+    |[pos, 25, 27, JJ, [word -> old], []]         |
+    |[pos, 29, 29, ,, [word -> ,], []]            |
+    |[pos, 31, 34, MD, [word -> will], []]        |
+    |[pos, 36, 39, VB, [word -> join], []]        |
+    |[pos, 41, 43, DT, [word -> the], []]         |
+    |[pos, 45, 49, NN, [word -> board], []]       |
+    |[pos, 51, 52, IN, [word -> as], []]          |
+    |[pos, 47, 47, DT, [word -> a], []]           |
+    |[pos, 56, 67, JJ, [word -> nonexecutive], []]|
+    |[pos, 69, 76, NN, [word -> director], []]    |
+    |[pos, 78, 81, NNP, [word -> Nov.], []]       |
+    |[pos, 83, 84, CD, [word -> 29], []]          |
+    |[pos, 81, 81, ., [word -> .], []]            |
+    +---------------------------------------------+
+    """
+
+    def __init__(self):
+        super(POS, self).__init__("com.johnsnowlabs.nlp.training.POS")
+
+    def readDataset(self, spark, path, delimiter="|", outputPosCol="tags", outputDocumentCol="document",
+                    outputTextCol="text"):
+        # ToDo Replace with std pyspark
+        """Reads the dataset from an external resource.
+
+        Parameters
+        ----------
+        spark : :class:`pyspark.sql.SparkSession`
+            Initiated Spark Session with Spark NLP
+        path : str
+            Path to the resource
+        delimiter : str, optional
+            Delimiter of word and POS, by default "|"
+        outputPosCol : str, optional
+            Name of the output POS column, by default "tags"
+        outputDocumentCol : str, optional
+            Name of the output document column, by default "document"
+        outputTextCol : str, optional
+            Name of the output text column, by default "text"
+
+        Returns
+        -------
+        :class:`pyspark.sql.DataFrame`
+            Spark Dataframe with the data
+        """
+        jSession = spark._jsparkSession
+
+        jdf = self._java_obj.readDataset(jSession, path, delimiter, outputPosCol, outputDocumentCol, outputTextCol)
+        dataframe = self.getDataFrame(spark, jdf)
+        return dataframe
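POS().readDataset only produces the labelled DataFrame; the PerceptronApproach mentioned in the docstring is then trained through an ordinary pipeline. A minimal sketch, assuming a Spark session from sparknlp.start() and the delimited corpus file shown above (paths are placeholders):

import sparknlp
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector, Tokenizer, PerceptronApproach
from sparknlp.training import POS

spark = sparknlp.start()
train_df = POS().readDataset(spark, "test-training.txt", delimiter="|", outputPosCol="tags")

document = DocumentAssembler().setInputCol("text").setOutputCol("document")
sentence = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

# The approach reads its gold labels from the "tags" column created by POS()
pos_tagger = PerceptronApproach() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("pos") \
    .setPosColumn("tags")

model = Pipeline(stages=[document, sentence, tokenizer, pos_tagger]).fit(train_df)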
sparknlp/training/pub_tator.py
ADDED
@@ -0,0 +1,76 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains helper classes for PubTator datasets."""
+
+from sparknlp.internal import ExtendedJavaWrapper
+
+
+class PubTator(ExtendedJavaWrapper):
+    """The PubTator format includes medical papers’ titles, abstracts, and
+    tagged chunks.
+
+    For more information see `PubTator Docs
+    <http://bioportal.bioontology.org/ontologies/EDAM?p=classes&conceptid=format_3783>`_
+    and `MedMentions Docs <http://github.com/chanzuckerberg/MedMentions>`_.
+
+    :meth:`.readDataset` is used to create a Spark DataFrame from a PubTator
+    text file.
+
+    **Input File Format**::
+
+        25763772 0 5 DCTN4 T116,T123 C4308010
+        25763772 23 63 chronic Pseudomonas aeruginosa infection T047 C0854135
+        25763772 67 82 cystic fibrosis T047 C0010674
+        25763772 83 120 Pseudomonas aeruginosa (Pa) infection T047 C0854135
+        25763772 124 139 cystic fibrosis T047 C0010674
+
+    Examples
+    --------
+    >>> from sparknlp.training import PubTator
+    >>> pubTatorFile = "./src/test/resources/corpus_pubtator_sample.txt"
+    >>> pubTatorDataSet = PubTator().readDataset(spark, pubTatorFile)
+    >>> pubTatorDataSet.show(1)
+    +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
+    |  doc_id|      finished_token|        finished_pos|        finished_ner|finished_token_metadata|finished_pos_metadata|finished_label_metadata|
+    +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
+    |25763772|[DCTN4, as, a, mo...|[NNP, IN, DT, NN,...|[B-T116, O, O, O,...|   [[sentence, 0], [...| [[word, DCTN4], [...|   [[word, DCTN4], [...|
+    +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
+    """
+
+    def __init__(self):
+        super(PubTator, self).__init__("com.johnsnowlabs.nlp.training.PubTator")
+
+    def readDataset(self, spark, path, isPaddedToken=True):
+        # ToDo Replace with std pyspark
+        """Reads the dataset from an external resource.
+
+        Parameters
+        ----------
+        spark : :class:`pyspark.sql.SparkSession`
+            Initiated Spark Session with Spark NLP
+        path : str
+            Path to the resource
+        isPaddedToken : str, optional
+            Whether tokens are padded, by default True
+
+        Returns
+        -------
+        :class:`pyspark.sql.DataFrame`
+            Spark Dataframe with the data
+        """
+        jSession = spark._jsparkSession
+
+        jdf = self._java_obj.readDataset(jSession, path, isPaddedToken)
+        dataframe = self.getDataFrame(spark, jdf)
+        return dataframe
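The reader returns already "finished" (flattened) columns, so plain PySpark is enough to pair each token with its IOB label. A minimal sketch, assuming an existing Spark session and the sample file from the docstring (the path is a placeholder):

from pyspark.sql import functions as F
from sparknlp.training import PubTator

df = PubTator().readDataset(spark, "corpus_pubtator_sample.txt")

# Zip the parallel token/label arrays and explode them into one row per token
pairs = df.select(
    "doc_id",
    F.explode(F.arrays_zip("finished_token", "finished_ner")).alias("pair")
).select("doc_id", "pair.finished_token", "pair.finished_ner")

pairs.show(10, truncate=False)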
sparknlp/training/spacy_to_annotation.py
ADDED
@@ -0,0 +1,57 @@
+# Copyright 2017-2023 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark.sql import SparkSession
+
+from sparknlp.internal import ExtendedJavaWrapper
+
+
+class SpacyToAnnotation(ExtendedJavaWrapper):
+
+    """Helper class to load a list of tokens/sentences as JSON to Annotation.
+
+    The JSON will be in this format:
+    [
+      {
+        "tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?", "I", "'m", "fine", "thanks", "."],
+        "token_spaces": [true, false, true, true, true, true, false, true, false, true, true, false, false],
+        "sentence_ends": [2, 7, 12]
+      }
+    ]
+
+    Examples
+    --------
+    >>> from sparknlp.training import SpacyToAnnotation
+    >>> result = SpacyToAnnotation().readDataset(spark, "src/test/resources/spacy-to-annotation/multi_doc_tokens.json")
+    >>> result.show(False)
+    +---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |document                                                                               |sentence                                                                                                                                    |token                                                                                                                                                                                                                                                                                                                                                                                                      |
+    +---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |[{document, 0, 55, John went to the store last night. He bought some bread., {}, []}]|[{document, 0, 33, John went to the store last night., {sentence -> 0}, []}, {document, 35, 55, He bought some bread., {sentence -> 1}, []}]|[{token, 0, 3, John, {sentence -> 0}, []}, {token, 5, 8, went, {sentence -> 0}, []}, {token, 10, 11, to, {sentence -> 0}, []}, {token, 13, 15, the, {sentence -> 0}, []}, {token, 17, 21, store, {sentence -> 0}, []}, {token, 23, 26, last, {sentence -> 0}, []}, {token, 28, 32, night, {sentence -> 0}, []}, {token, 33, 33, ., {sentence -> 0}, []}, {token, 35, 36, He, {sentence -> 1}, []}, {token, 38, 43, bought, {sentence -> 1}, []}, {token, 45, 48, some, {sentence -> 1}, []}, {token, 50, 54, bread, {sentence -> 1}, []}, {token, 55, 55, ., {sentence -> 1}, []}]|
+    |[{document, 0, 47, Hello world! How are you today? I'm fine thanks., {}, []}]        |[{document, 0, 11, Hello world!, {sentence -> 0}, []}, {document, 13, 30, How are you today?, {sentence -> 1}, []}, {document, 32, 47, I'm fine thanks., {sentence -> 2}, []}]|[{token, 0, 4, Hello, {sentence -> 0}, []}, {token, 6, 10, world, {sentence -> 0}, []}, {token, 11, 11, !, {sentence -> 0}, []}, {token, 13, 15, How, {sentence -> 1}, []}, {token, 17, 19, are, {sentence -> 1}, []}, {token, 21, 23, you, {sentence -> 1}, []}, {token, 25, 29, today, {sentence -> 1}, []}, {token, 30, 30, ?, {sentence -> 1}, []}, {token, 32, 32, I, {sentence -> 2}, []}, {token, 33, 34, 'm, {sentence -> 2}, []}, {token, 36, 39, fine, {sentence -> 2}, []}, {token, 41, 46, thanks, {sentence -> 2}, []}, {token, 47, 47, ., {sentence -> 2}, []}]|
+    +---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+    """
+
+    def __init__(self):
+        super(SpacyToAnnotation, self).__init__("com.johnsnowlabs.nlp.training.SpacyToAnnotation")
+
+    def readJsonFile(self, spark, jsonFilePath, params=None):
+        if params is None:
+            params = {}
+
+        jSession = spark._jsparkSession
+
+        jdf = self._java_obj.readJsonFileJava(jSession, jsonFilePath, params)
+        annotation_dataset = self.getDataFrame(spark, jdf)
+        return annotation_dataset
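readJsonFile is the only reader this helper defines, so pre-tokenized output (for example from spaCy) first has to be written in the JSON shape shown in the docstring. A minimal sketch with a hand-written single document and an existing Spark session (the file name is a placeholder):

import json
from sparknlp.training import SpacyToAnnotation

# One document, one sentence ending at token index 2 ("!")
docs = [{
    "tokens": ["Hello", "world", "!"],
    "token_spaces": [True, False, False],
    "sentence_ends": [2]
}]

with open("tokens.json", "w") as jsonfile:
    json.dump(docs, jsonfile)

annotations = SpacyToAnnotation().readJsonFile(spark, "tokens.json")
annotations.select("document", "sentence", "token").show(truncate=False)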
sparknlp/upload_to_hub.py
ADDED
@@ -0,0 +1,149 @@
+import requests
+import json
+from typing import List
+import sparknlp
+import os
+import zipfile
+
+
+class PushToHub:
+    list_of_tasks = [  # List of available tasks in Modelhub
+        "Named Entity Recognition",
+        'Text Classification',
+        'Text Generation',
+        'Sentiment Analysis',
+        'Translation',
+        'Question Answering',
+        'Summarization',
+        'Sentence Detection',
+        'Embeddings',
+        'Language Detection',
+        'Stop Words Removal',
+        'Word Segmentation',
+        'Part of Speech Tagging',
+        'Lemmatization',
+        'Chunk Mapping',
+        'Spell Check',
+        'Dependency Parser',
+        'Pipeline Public']
+
+    def zip_directory(folder_path: str, zip_path: str):
+        """Zips folder for pushing to hub.
+
+        folder_path:Path to the folder to zip.
+        zip_path:Path of the zip file to create."""
+
+        with zipfile.ZipFile(zip_path, mode='w') as zipf:
+            len_dir_path = len(folder_path)
+            for root, _, files in os.walk(folder_path):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    zipf.write(file_path, file_path[len_dir_path:])
+
+    def unzip_directory(zip_path: str):
+        """Unzips Model to check for required files for upload.
+
+        Keyword Arguments:
+        zip_path:Zip Path to unzip.
+        """
+
+    def check_for_required_info(model_data: dict):
+        """Checks if the required fields exist in given dictionary and fills any remaining fields.
+
+        Keyword Arguments:
+        model_data: The model data to check.
+        """
+
+        list_of_required_fields = ['name', 'task', 'language', 'pythonCode', 'model_zip_path']
+
+        if model_data['task'] not in PushToHub.list_of_tasks:
+            list_of_tasks_string_version = "\n".join(PushToHub.list_of_tasks)
+            raise ValueError(
+                f"""Invalid task, please pick one of the following tasks\n{list_of_tasks_string_version}""")
+
+        if model_data['model_zip_path'].endswith(".zip"):
+            with zipfile.ZipFile(model_data['model_zip_path']) as modelfile:
+                if 'metadata/part-00000' not in modelfile.namelist():
+                    raise ValueError("The Model is not a Spark Saved Model.")
+        else:
+            if not os.path.exists(f"{model_data['model_zip_path']}/metadata/part-00000"):
+                raise ValueError("The Model is not a Spark Saved Model.")
+
+    def push_to_hub(name: str,
+                    language: str,
+                    model_zip_path: str,
+                    task: str,
+                    pythonCode: str,
+                    GIT_TOKEN: str,
+                    title: str = None,
+                    tags: List[str] = None,
+                    dependencies: str = None,
+                    description: str = None,
+                    predictedEntities: str = None,
+                    sparknlpVersion: str = None,
+                    howToUse: str = None,
+                    liveDemo: str = None,
+                    runInColab: str = None,
+                    scalaCode: str = None,
+                    nluCode: str = None,
+                    results: str = None,
+                    dataSource: str = None,
+                    includedModels: str = None,
+                    benchmarking: str = None,
+                    ) -> str:
+        """Pushes model to Hub.
+
+        Keyword Arguments:
+        model_data:Dictionary containing info about the model such as Name and Language.
+        GIT_TOKEN: Token required for pushing to hub.
+        """
+
+        model_data = {item: value for (item, value) in locals().items() if value is not None}
+        PushToHub.check_for_required_info(model_data)
+        model_data = PushToHub.create_docs(model_data)
+
+        r1 = requests.post('https://modelshub.johnsnowlabs.com/api/v1/models', data=json.dumps(model_data), headers={
+            'Content-type': 'application/json',
+            'Authorization': f'Bearer {GIT_TOKEN}'
+        })
+
+        if r1.status_code == 201:
+            r2 = requests.post(
+                'https://modelshub.johnsnowlabs.com/api/v1/models/%s/file' % r1.json()['id'],
+                data=open(model_data['model_zip_path'], 'rb'), headers={
+                    'Authorization': f'Bearer {GIT_TOKEN}'
+                })
+            if r2.status_code == 200:
+                print(r2.json()['message'])
+                return r2.json()['message']
+        else:
+            print(f"Something Went Wrong During the Upload. Got Status Code: {r1.status_code}")
+            return f"Something Went Wrong During the Upload. Got Status Code: {r1.status_code}"
+
+    def create_docs(dicionary_for_upload: dict) -> dict:
+        """Adds fields in the dictionary for pushing to hub.
+
+        Keyword Arguments:
+        dictionary_for_upload: The dictionary to add keys to.
+        """
+
+        dicionary_for_upload['sparkVersion'] = "3.0"
+        dicionary_for_upload['license'] = 'Open Source'
+        dicionary_for_upload['supported'] = False
+
+        if 'sparknlpVersion' not in dicionary_for_upload.keys():
+            dicionary_for_upload['sparknlpVersion'] = "Spark NLP " + sparknlp.version()
+
+        if 'description' not in dicionary_for_upload.keys():
+            dicionary_for_upload[
+                'description'] = f"This model is used for {dicionary_for_upload['task']} and this model works with {dicionary_for_upload['language']} language"
+
+        if 'title' not in dicionary_for_upload.keys():
+            dicionary_for_upload[
+                'title'] = f"{dicionary_for_upload['task']} for {dicionary_for_upload['language']} language"
+
+        if os.path.isdir(dicionary_for_upload['model_zip_path']):
+            PushToHub.zip_directory(dicionary_for_upload['model_zip_path'],
+                                    f"{dicionary_for_upload['model_zip_path']}.zip")
+            dicionary_for_upload['model_zip_path'] = dicionary_for_upload['model_zip_path'] + '.zip'
+        return dicionary_for_upload
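The methods above are written as plain functions on the class (no self), so they can be called directly off PushToHub. A minimal sketch of an upload with only the required fields; every value below is a placeholder, and the token must come from your own Models Hub account:

from sparknlp.upload_to_hub import PushToHub

PushToHub.push_to_hub(
    name="my_ner_model",
    language="en",
    model_zip_path="/path/to/saved_model",  # Spark-saved model folder or .zip
    task="Named Entity Recognition",        # must appear in PushToHub.list_of_tasks
    pythonCode="model = NerDLModel.load('my_ner_model')",
    GIT_TOKEN="<MODELS_HUB_TOKEN>",
)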