spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from sparknlp.internal import _ResourceHelper_moveFile
|
|
5
|
+
from sparknlp.training._tf_graph_builders.ner_dl.create_graph import create_graph
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class WrongTFVersion(Exception):
    # Raised internally by TFGraphBuilderFactory.build when the installed
    # tensorflow is neither 1.15 nor 2.x; translated into a user-facing
    # Exception by the caller.
    pass
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TensorflowAddonsNeeded(Exception):
    # Raised internally by TFGraphBuilderFactory.build when tensorflow 2.x is
    # installed but the tensorflow_addons package (required for graph
    # generation on TF 2.x) is missing; translated into a user-facing
    # Exception by the caller.
    pass
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TFGraphBuilder:
    """
    Generic base class to create the tensorflow graphs for the 'ner_dl',
    'generic_classifier', 'assertion_dl', 'relation_extraction' annotators
    in spark-nlp healthcare.

    Examples
    --------
    >>> from sparknlp.training.tfgraphs import tf_graph
    >>>
    >>> tf_graph.get_models()

    """

    def __init__(self, build_params):
        # User-supplied graph construction parameters; model-specific
        # defaults live in get_model_build_params().
        self.__build_params = build_params

    def supports_auto_file_name(self):
        """Return True if this builder can derive the output filename itself."""
        return False

    def get_model_filename(self):
        """Return the auto-generated graph filename; subclasses must override."""
        raise Exception("Not implemented.")

    def check_build_params(self):
        """Raise if a required build parameter (default is None) was not supplied."""
        build_params = self.get_build_params()
        required_params = self.get_model_build_params()

        for req_param in required_params:
            if req_param not in build_params:
                # A None default means the parameter has no usable fallback.
                if required_params[req_param] is None:
                    raise Exception(f"You need to specify a value for {req_param} in the build parameters.")

    def get_build_params(self):
        """Return the build parameters passed at construction time."""
        return self.__build_params

    def get_build_params_with_defaults(self):
        """Return the build parameters merged with the model defaults.

        Fix: operates on a copy so the dictionary handed in by the caller is
        never mutated (the previous implementation filled the defaults into
        the caller's dict in place).
        """
        build_params = dict(self.get_build_params())
        req_build_params = self.get_model_build_params()

        for req_param in req_build_params:
            if (req_param not in build_params) and (req_build_params[req_param] is not None):
                build_params[req_param] = req_build_params[req_param]

        return build_params

    def get_build_param(self, build_param):
        """Return the value of one parameter, falling back to the model default.

        Raises
        ------
        Exception
            If the parameter is unknown, or required but not provided.
        """
        build_params = self.get_build_params()

        if build_param in build_params:
            return build_params[build_param]

        required_params = self.get_model_build_params()

        if (build_param in required_params) and (required_params[build_param] is not None):
            return required_params[build_param]

        raise Exception(f"No value for {build_param} found.")

    def get_model_build_params(self):
        """Return ``{param: default}``; a None default marks a required param."""
        return {}

    def get_model_build_param_explanations(self):
        """Return ``{param: human-readable description}`` for print_model_params."""
        return {}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class NerTFGraphBuilder(TFGraphBuilder):
    """
    Class to build the TF graphs for MedicalNerApproach.

    Examples
    --------

    >>> from sparknlp.training.tfgraphs import tf_graph
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>>feat_size = 200
    >>>n_classes = 6
    >>> tf_graph.build("ner_dl", build_params={"embeddings_dim": 200, "nchars": 83,"ntags": 12},model_location="./medical_ner_graphs",model_filename="auto")
    >>> nerTagger = MedicalNerApproach()\
    >>>     .setInputCols(["sentence", "token", "embeddings"])\
    >>>     .setLabelColumn("label")\
    >>>     .setOutputCol("ner")\
    >>>     .setMaxEpochs(2)\
    >>>     .setBatchSize(64)\
    >>>     .setRandomSeed(0)\
    >>>     .setVerbose(1)\
    >>>     .setValidationSplit(0.2)\
    >>>     .setEvaluationLogExtended(True) \
    >>>     .setEnableOutputLogs(True)\
    >>>     .setIncludeConfidence(True)\
    >>>     .setOutputLogsPath('ner_logs')\
    >>>     .setGraphFolder('medical_ner_graphs')\
    >>>     .setEnableMemoryOptimizer(True)
    """

    def supports_auto_file_name(self):
        """NER graph filenames can be derived from the build parameters."""
        return True

    def get_model_filename(self):
        """Return the conventional filename: blstm_<ntags>_<dim>_<lstm>_<nchars>.pb."""
        return "blstm_{}_{}_{}_{}.pb".format(
            self.get_build_param("ntags"),
            self.get_build_param("embeddings_dim"),
            self.get_build_param("lstm_size"),
            self.get_build_param("nchars"),
        )

    def get_model_build_params(self):
        """Defaults for the NER graph; ntags is required (no default)."""
        return {
            "ntags": None,
            "embeddings_dim": 200,
            "nchars": 100,
            "lstm_size": 128,
            "gpu_device": 0
        }

    def get_model_build_param_explanations(self):
        return {
            "ntags": "Number of tags.",
            "embeddings_dim": "Embeddings dimension.",
            "nchars": "Number of chars.",
            "gpu_device": "Device for training.",
            "lstm_size": "Number of LSTM units."
        }

    def _run_create_graph(self, model_location, model_filename):
        # Single point of invocation for create_graph; the previous version
        # duplicated this whole call in both branches of build().
        create_graph(
            model_location=model_location,
            model_filename=model_filename,
            ntags=self.get_build_param("ntags"),
            embeddings_dim=self.get_build_param("embeddings_dim"),
            nchars=self.get_build_param("nchars"),
            lstm_size=self.get_build_param("lstm_size"),
            gpu_device=self.get_build_param("gpu_device"),
            is_medical=False,
        )

    def build(self, model_location, model_filename):
        """Build the NER graph and write it to *model_location*.

        Destinations with a ``<scheme>://`` prefix are built into a local
        temporary directory first and then moved with the Spark resource
        helper; plain paths are written directly.
        """
        if re.match(r'(\w+)://.*', model_location):
            tmp_location = "/tmp/nerModel"
            self._run_create_graph(tmp_location, model_filename)

            file_location = os.path.join(tmp_location, model_filename)
            _ResourceHelper_moveFile(file_location, model_location).apply()
        else:
            self._run_create_graph(model_location, model_filename)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class TFGraphBuilderFactory:
    """
    Factory class to create the different tensorflow graphs for ner_dl, generic_classifier, assertion_dl, relation_extraction annotators in spark-nlp healthcare
    """

    # Registry of graph builders shipped with this distribution.
    __model_builders = {
        "ner_dl": NerTFGraphBuilder
    }

    @staticmethod
    def get_models():
        """
        Method that return the available tf models in spark-nlp healthcare

        Examples
        --------
        >>> from sparknlp.training.tfgraphs import tf_graph
        >>> tf_graph.get_models()
        """
        return list(TFGraphBuilderFactory.__model_builders.keys())

    @staticmethod
    def build(model_name, build_params, model_location, model_filename="auto"):
        """
        Method that create the tf graph.

        Parameters
        ----------
        model_name: str
            The name of the tf model that you want to build.Model availables ner_dl,generic_classifier,assertion_dl and relation_extraction
        build_params: dict
            Configuration params to build the tf graph for the specific model.
        model_location: str
            Path where the model will be saved
        model_filename: str
            Name of the .pb file. If you put auto the filename will be generated.

        Examples
        --------
        >>> from sparknlp.training.tfgraphs import tf_graph
        >>> tf_graph.build("ner_dl", build_params={"embeddings_dim": 200, "nchars": 83,"ntags": 12},model_location="./ner_graphs",model_filename="auto")

        """
        try:
            import tensorflow as tf

            if tf.__version__[0] == '2':
                # On TF 2.x graph generation additionally needs tensorflow_addons.
                try:
                    import tensorflow_addons

                except ModuleNotFoundError:
                    raise TensorflowAddonsNeeded()

            if not (tf.__version__.startswith("1.15") or tf.__version__[0] == '2'):
                raise WrongTFVersion()

        except WrongTFVersion:
            # Fixed: the previous code printed `tf.version` (the module
            # object) instead of the version string.
            print(tf.__version__)
            raise Exception("Tensorflow 2.xx or 1.15 is required to build model graphs.")

        except ModuleNotFoundError:
            raise Exception("You need to install Tensorflow 2.xx or 1.15 to be able to build model graphs")

        except TensorflowAddonsNeeded:
            raise Exception("You need to install tensorflow_addons to be able to generate graphs in Tensorflow 2.x")

        if model_name not in TFGraphBuilderFactory.__model_builders:
            raise Exception(f"Can't build a graph for {model_name}: model not supported.")

        model = TFGraphBuilderFactory.__model_builders[model_name](build_params)
        model.check_build_params()

        if model_filename == "auto":
            if not model.supports_auto_file_name():
                msg = f"""
                {model_name} doesn't support automatic filename generation, please specify the filename of the
                output graph
                """.strip()
                raise Exception(msg)
            else:
                model_filename = model.get_model_filename()

            # Auto-named builders handle remote destinations themselves.
            model.build(model_location, model_filename)
        else:
            # Explicit filename: build remote destinations locally first,
            # then move the file into place with the Spark resource helper.
            if re.match(r'(\w+)://.*', model_location):
                tmp_location = "/tmp/relationModel"
                model.build(tmp_location, model_filename)

                file_location = os.path.join(tmp_location, model_filename)
                _ResourceHelper_moveFile(file_location, model_location).apply()

            else:
                model.build(model_location, model_filename)

        print("{} graph exported to {}/{}".format(model_name, model_location, model_filename))

    @staticmethod
    def print_model_params(model_name):
        """
        Method that return the params allowed for the tf model.This method return the params with the description for every param.

        Parameters
        ----------
        model_name: str
            The name of the tf model name.Model availables ner_dl,generic_classifier,assertion_dl and relation_extraction

        Examples
        --------
        >>> from sparknlp.training import tf_graph
        >>> tf_graph.print_model_params("ner_dl")
        """
        if model_name not in TFGraphBuilderFactory.get_models():
            raise Exception(f"Model {model_name} not supported.")

        model = TFGraphBuilderFactory.__model_builders[model_name]({})
        model_params = model.get_model_build_params()
        model_params_descr = model.get_model_build_param_explanations()

        print(f"{model_name} parameters.")
        print("{:<20} {:<10} {:<20} {}".format("Parameter", "Required", "Default value", "Description"))
        for param in model_params:
            if type(model_params[param]) in [list, tuple]:
                default_value = "[" + ", ".join(map(str, model_params[param])) + "]"
            else:
                default_value = model_params[param]

            # A None default means the parameter is required and has no value.
            print("{:<20} {:<10} {:<20} {}".format(
                param,
                "yes" if default_value is None else "no",
                default_value if default_value is not None else "-",
                model_params_descr[param] if param in model_params_descr else ""
            ))
|
|
File without changes
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import tensorflow.compat.v1 as tf
|
|
2
|
+
|
|
3
|
+
from .ner_model import NerModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def create_graph(
        model_location,
        ntags,
        embeddings_dim,
        nchars,
        lstm_size=128,
        model_filename=None,
        gpu_device=0,
        is_medical=False
):
    """Build the BiLSTM/char-CNN NER graph and serialize it as a .pb file.

    Parameters
    ----------
    model_location : str
        Directory the graph file is written to. ``dbfs:/`` paths are mapped
        to the local ``/dbfs/`` mount before writing.
    ntags : int
        Number of output tags.
    embeddings_dim : int
        Dimension of the pretrained word embeddings.
    nchars : int
        Size of the character vocabulary.
    lstm_size : int
        Number of LSTM units passed to the context layer.
    model_filename : str or None
        Output filename; when None it is derived as
        ``blstm_<ntags>_<embeddings_dim>_<lstm_size>_<nchars>.pb``.
    gpu_device : int
        GPU device index forwarded to NerModel.
    is_medical : bool
        Selects the medical node names for the inference/training ops.
    """
    # Graph construction uses TF1-style sessions; force v1 behavior and
    # start from a clean default graph.
    tf.disable_v2_behavior()
    tf.enable_v2_tensorshape()
    tf.reset_default_graph()

    if model_filename is None:
        model_filename = 'blstm' + '_{}_{}_{}_{}'.format(ntags, embeddings_dim, lstm_size, nchars) + '.pb'

    with tf.Session() as session:
        # NOTE(review): NerModel is constructed with session=None and its own
        # session (ner.session) is what gets serialized below; the enclosing
        # `session` appears unused except for its graph context — confirm.
        ner = NerModel(session=None, use_gpu_device=gpu_device)
        ner.add_cnn_char_repr(nchars, 25, 30)
        ner.add_bilstm_char_repr(nchars, 25, 30)
        ner.add_pretrained_word_embeddings(embeddings_dim)
        ner.add_context_repr(ntags, lstm_size, 3)
        # Output/training node names differ between the medical and
        # open-source graph variants.
        ner.add_inference_layer(True, "predictions" if is_medical else "cond_2/Merge")
        ner.add_training_op(5, "train" if is_medical else None)
        ner.init_variables()
        tf.train.Saver()

        if model_location.startswith("dbfs:"):
            # Databricks: the dbfs: scheme is mounted locally under /dbfs.
            graph_location = model_location.replace("dbfs:/", "/dbfs/")
        else:
            graph_location = model_location

        # Final False = write the GraphDef in binary (not text) format.
        tf.io.write_graph(ner.session.graph, graph_location, model_filename, False)
        ner.close()
        session.close()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import string
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DatasetEncoder:
    """Turns tagged sentences into training features for the NER graph.

    Each input sentence is a list of ``(word, tag)`` tuples. The embeddings
    resolver may split a word into several pieces; only the piece that starts
    a word consumes the sentence tag, the remaining pieces receive the
    special ``piece_tag`` label.
    """

    def __init__(self, embeddings_resolver, tag2id=None, piece_tag='[X]'):
        if tag2id is None:
            tag2id = {'O': 0}
        # Character ids start at 1; built from string.printable.
        self.char2id = {c: i + 1 for i, c in enumerate(string.printable)}
        self.tag2id = tag2id
        self.embeddings_resolver = embeddings_resolver
        self.piece_tag = piece_tag

    def shuffle(self):
        # NOTE(review): self.sentences is never assigned anywhere in this
        # class — presumably set on the instance by a caller before
        # shuffle() is used; verify against call sites.
        random.shuffle(self.sentences)

    @staticmethod
    def normalize(word):
        """Return the whitespace-stripped, lower-cased form of *word*."""
        return word.strip().lower()

    def get_char_indexes(self, word):
        """Map every character of *word* to its id (unknowns share one id)."""
        unknown_id = len(self.char2id) - 1
        return [self.char2id.get(ch, unknown_id) for ch in word]

    def encode(self, sentences, output=False):
        """Yield one feature dict per non-empty sentence.

        The dict holds parallel lists (pieces, tags, tag ids, char ids,
        word-start flags) plus a float16 embedding matrix. New tags are
        added to ``self.tag2id`` as they are encountered.
        """
        for sentence in sentences:
            tokens = [word for (word, tag) in sentence]
            pieces = self.embeddings_resolver.resolve_sentence(tokens)

            words = []
            tags = []
            char_ids = []
            tag_ids = []
            is_word_start = []
            embeddings = []

            next_word = 0

            for piece in pieces:
                words.append(piece.piece)

                if piece.is_word_start:
                    # Each word-start piece must line up with one sentence entry.
                    assert next_word < len(sentence), \
                        'i = {} is more or equal than length of {}, during zip with {}'.format(next_word,
                                                                                               sentence,
                                                                                               pieces)
                    tag = sentence[next_word][1]
                    next_word += 1
                else:
                    tag = self.piece_tag

                # Unknown tags get the next free id and are remembered.
                tag_id = self.tag2id.get(tag, len(self.tag2id))
                self.tag2id[tag] = tag_id

                tags.append(tag)
                tag_ids.append(tag_id)

                embeddings.append(piece.vector)
                is_word_start.append(piece.is_word_start)

                char_ids.append(self.get_char_indexes(piece.piece))

            if len(sentence) > 0:
                yield {
                    "words": words,
                    "tags": tags,
                    "char_ids": char_ids,
                    "tag_ids": tag_ids,
                    "is_word_start": is_word_start,
                    "word_embeddings": np.array(embeddings, dtype=np.float16)
                }
|