spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/internal.py
DELETED
|
@@ -1,288 +0,0 @@
|
|
|
1
|
-
from abc import ABC
|
|
2
|
-
|
|
3
|
-
from pyspark import SparkContext, keyword_only
|
|
4
|
-
from pyspark.ml import PipelineModel
|
|
5
|
-
from pyspark.ml.wrapper import JavaWrapper, JavaTransformer, JavaEstimator, JavaModel
|
|
6
|
-
from pyspark.ml.util import JavaMLWritable, JavaMLReadable, JavaMLReader
|
|
7
|
-
from pyspark.sql.dataframe import DataFrame
|
|
8
|
-
from pyspark.ml.param.shared import Params
|
|
9
|
-
import re
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
# Helper class used to generate the getters for all params
class ParamsGettersSetters(Params):
    """Mixin that auto-generates camelCase getters/setters for every Param.

    On construction it walks ``self.params`` (provided by pyspark's
    ``Params``) and, for a param named e.g. ``output_col`` or ``outputCol``,
    attaches ``getOutputCol``/``setOutputCol`` methods — but only when the
    subclass has not already defined them explicitly.
    """

    # NOTE(review): appears unused within this block; presumably kept for
    # backward compatibility — confirm before removing.
    getter_attrs = []

    def __init__(self):
        super(ParamsGettersSetters, self).__init__()
        for param in self.params:
            param_name = param.name
            # Uppercase the first letter and every letter following an
            # underscore, dropping the underscore: "output_col" -> "OutputCol".
            fg_attr = "get" + re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), param_name)
            fs_attr = "set" + re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), param_name)
            # Generates getter and setter only if not exists
            try:
                getattr(self, fg_attr)
            except AttributeError:
                setattr(self, fg_attr, self.getParamValue(param_name))
            try:
                getattr(self, fs_attr)
            except AttributeError:
                setattr(self, fs_attr, self.setParamValue(param_name))

    def getParamValue(self, paramName):
        """Return a zero-argument closure reading ``paramName``.

        The closure yields ``None`` when the param has neither a set value
        nor a default (``getOrDefault`` raises ``KeyError`` in that case).
        """
        def r():
            try:
                return self.getOrDefault(paramName)
            except KeyError:
                return None
        return r

    def setParamValue(self, paramName):
        """Return a one-argument closure writing ``paramName``.

        The closure returns ``self`` so generated setters can be chained.
        """
        def r(v):
            self.set(self.getParam(paramName), v)
            return self
        return r
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class AnnotatorJavaMLReadable(JavaMLReadable):
    """Mixin making annotators loadable through :class:`AnnotatorJavaMLReader`."""

    @classmethod
    def read(cls):
        """Returns an MLReader instance for this class."""
        instance = cls()
        return AnnotatorJavaMLReader(instance)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
class AnnotatorJavaMLReader(JavaMLReader):
    """JavaMLReader that honors an annotator's explicit ``_java_class_name``."""

    @classmethod
    def _java_loader_class(cls, clazz):
        # Prefer the Java class name pinned on the Python class (set by
        # AnnotatorTransformer); otherwise defer to pyspark's default mapping.
        explicit_name = getattr(clazz, '_java_class_name', None)
        if explicit_name is not None:
            return explicit_name
        return JavaMLReader._java_loader_class(clazz)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
class AnnotatorTransformer(JavaTransformer, AnnotatorJavaMLReadable, JavaMLWritable, ParamsGettersSetters):
    """Base transformer wrapping a JVM-side Spark NLP annotator.

    ``classname`` is the fully qualified Java class to instantiate; it is
    stripped from the keyword args before ``setParams`` and recorded on the
    Python class so :class:`AnnotatorJavaMLReader` can load it back.
    """

    @keyword_only
    def __init__(self, classname):
        super(AnnotatorTransformer, self).__init__()
        # @keyword_only stores all constructor kwargs in self._input_kwargs.
        kwargs = self._input_kwargs
        if 'classname' in kwargs:
            kwargs.pop('classname')
        # NOTE(review): setParams is expected to be provided by the concrete
        # subclass — confirm; it is not defined in this module.
        self.setParams(**kwargs)
        # Class-level (not instance-level) so deserialization can find it.
        self.__class__._java_class_name = classname
        self._java_obj = self._new_java_obj(classname, self.uid)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
class RecursiveEstimator(JavaEstimator, ABC):
    """JavaEstimator whose ``fit`` can receive the partially-built pipeline.

    Spark's own ``Pipeline.fit`` does not expose the stages fitted so far;
    recursive annotators need them, so ``fit`` takes an extra ``pipeline``
    argument which is routed to the Scala side's ``recursiveFit``.
    """

    def _fit_java(self, dataset, pipeline=None):
        # Push Python-side param changes to the JVM object before fitting.
        self._transfer_params_to_java()
        if pipeline:
            # Hand the already-fitted stages to the Scala recursiveFit.
            return self._java_obj.recursiveFit(dataset._jdf, pipeline._to_java())
        else:
            return self._java_obj.fit(dataset._jdf)

    def _fit(self, dataset, pipeline=None):
        java_model = self._fit_java(dataset, pipeline)
        model = self._create_model(java_model)
        # Copy estimator params onto the model, mirroring JavaEstimator._fit.
        return self._copyValues(model)

    def fit(self, dataset, params=None, pipeline=None):
        """Fit on *dataset*; mirrors pyspark ``Estimator.fit`` plus ``pipeline``.

        ``params`` may be a single param map (dict) or a list/tuple of param
        maps (returning one model per map). Raises ``ValueError`` otherwise.
        """
        if params is None:
            params = dict()
        if isinstance(params, (list, tuple)):
            # One model per param map, preserving the input order even though
            # fitMultiple may yield them out of order.
            models = [None] * len(params)
            for index, model in self.fitMultiple(dataset, params):
                models[index] = model
            return models
        elif isinstance(params, dict):
            if params:
                # Fit a copy so the extra params do not mutate self.
                return self.copy(params)._fit(dataset, pipeline=pipeline)
            else:
                return self._fit(dataset, pipeline=pipeline)
        else:
            raise ValueError("Params must be either a param map or a list/tuple of param maps, "
                             "but got %s." % type(params))
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
class RecursiveTransformer(JavaModel):
    """JavaModel whose transform can receive the enclosing recursive pipeline."""

    def _transform_recursive(self, dataset, recursive_pipeline):
        # Push Python-side param changes to the JVM object first.
        self._transfer_params_to_java()
        return DataFrame(self._java_obj.recursiveTransform(dataset._jdf, recursive_pipeline._to_java()), dataset.sql_ctx)

    def transform_recursive(self, dataset, recursive_pipeline, params=None):
        """Transform *dataset* with access to *recursive_pipeline*.

        ``params`` must be a param map (dict) or None; a non-empty map is
        applied to a copy of the model. Raises ``ValueError`` otherwise.
        """
        if params is None:
            params = dict()
        if isinstance(params, dict):
            if params:
                return self.copy(params)._transform_recursive(dataset, recursive_pipeline)
            else:
                return self._transform_recursive(dataset, recursive_pipeline)
        else:
            raise ValueError("Params must be a param map but got %s." % type(params))
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
class ExtendedJavaWrapper(JavaWrapper):
    """JavaWrapper that eagerly instantiates a JVM object (or static call).

    ``java_obj`` is a fully qualified Java class/method path; ``*args`` are
    forwarded to the py4j constructor/invocation. The created object is kept
    on both ``_java_obj`` and ``java_obj``.
    """

    def __init__(self, java_obj, *args):
        super(ExtendedJavaWrapper, self).__init__(java_obj)
        self.sc = SparkContext._active_spark_context
        self._java_obj = self.new_java_obj(java_obj, *args)
        self.java_obj = self._java_obj

    def __del__(self):
        # Deliberately disable JavaWrapper's detach-on-delete behavior.
        pass

    def apply(self):
        """Return the underlying py4j Java object."""
        return self._java_obj

    def new_java_obj(self, java_class, *args):
        return self._new_java_obj(java_class, *args)

    def new_java_array(self, pylist, java_class):
        """
        ToDo: Inspired from spark 2.0. Review if spark changes
        """
        java_array = self.sc._gateway.new_array(java_class, len(pylist))
        for i in range(len(pylist)):
            java_array[i] = pylist[i]
        return java_array

    def new_java_array_string(self, pylist):
        # Convenience wrapper building a java.lang.String[] from a Python list.
        java_array = self._new_java_array(pylist, self.sc._gateway.jvm.java.lang.String)
        return java_array

    def new_java_array_integer(self, pylist):
        # Convenience wrapper building a java.lang.Integer[] from a Python list.
        java_array = self._new_java_array(pylist, self.sc._gateway.jvm.java.lang.Integer)
        return java_array
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
class _RegexRule(ExtendedJavaWrapper):
    """Py4J bridge instantiating ``com.johnsnowlabs.nlp.util.regex.RegexRule``."""

    def __init__(self, rule, identifier):
        jvm_target = "com.johnsnowlabs.nlp.util.regex.RegexRule"
        super(_RegexRule, self).__init__(jvm_target, rule, identifier)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
class _ExternalResource(ExtendedJavaWrapper):
    """Py4J bridge calling ``ExternalResource.fromJava`` on the JVM."""

    def __init__(self, path, read_as, options):
        jvm_target = "com.johnsnowlabs.nlp.util.io.ExternalResource.fromJava"
        super(_ExternalResource, self).__init__(jvm_target, path, read_as, options)
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
class _ConfigLoaderGetter(ExtendedJavaWrapper):
    """Py4J bridge calling ``ConfigLoader.getConfigPath`` on the JVM."""

    def __init__(self):
        jvm_target = "com.johnsnowlabs.util.ConfigLoader.getConfigPath"
        super(_ConfigLoaderGetter, self).__init__(jvm_target)
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
class _DownloadModel(ExtendedJavaWrapper):
    """Py4J bridge calling ``<validator>.downloadModel`` on the JVM."""

    def __init__(self, reader, name, language, remote_loc, validator):
        # The validator class (e.g. PythonResourceDownloader) is chosen by the caller.
        jvm_target = "com.johnsnowlabs.nlp.pretrained." + validator + ".downloadModel"
        super(_DownloadModel, self).__init__(jvm_target, reader, name, language, remote_loc)
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
class _DownloadPipeline(ExtendedJavaWrapper):
    """Py4J bridge calling ``PythonResourceDownloader.downloadPipeline``."""

    def __init__(self, name, language, remote_loc):
        jvm_target = "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline"
        super(_DownloadPipeline, self).__init__(jvm_target, name, language, remote_loc)
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
class _ClearCache(ExtendedJavaWrapper):
    """Py4J bridge calling ``PythonResourceDownloader.clearCache``."""

    def __init__(self, name, language, remote_loc):
        jvm_target = "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.clearCache"
        super(_ClearCache, self).__init__(jvm_target, name, language, remote_loc)
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
class _GetResourceSize(ExtendedJavaWrapper):
    """Py4J bridge calling ``PythonResourceDownloader.getDownloadSize``."""

    def __init__(self, name, language, remote_loc):
        jvm_target = "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize"
        super(_GetResourceSize, self).__init__(jvm_target, name, language, remote_loc)
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
class _ShowUnCategorizedResources(ExtendedJavaWrapper):
    """Py4J bridge calling ``PythonResourceDownloader.showUnCategorizedResources``."""

    def __init__(self):
        jvm_target = "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.showUnCategorizedResources"
        super(_ShowUnCategorizedResources, self).__init__(jvm_target)
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
class _ShowPublicPipelines(ExtendedJavaWrapper):
    """Py4J bridge calling ``PythonResourceDownloader.showPublicPipelines``."""

    def __init__(self):
        jvm_target = "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.showPublicPipelines"
        super(_ShowPublicPipelines, self).__init__(jvm_target)
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
class _ShowPublicModels(ExtendedJavaWrapper):
    """Py4J bridge calling ``PythonResourceDownloader.showPublicModels``."""

    def __init__(self):
        jvm_target = "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.showPublicModels"
        super(_ShowPublicModels, self).__init__(jvm_target)
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
# predefined pipelines
class _DownloadPredefinedPipeline(ExtendedJavaWrapper):
    """Py4J bridge instantiating a caller-supplied predefined-pipeline class."""

    def __init__(self, java_path):
        # Unlike the other wrappers, the JVM target is fully caller-chosen.
        super(_DownloadPredefinedPipeline, self).__init__(java_path)
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
class _LightPipeline(ExtendedJavaWrapper):
    """Py4J bridge wrapping a fitted PipelineModel in a JVM LightPipeline."""

    def __init__(self, pipelineModel, parse_embeddings):
        java_pipeline = pipelineModel._to_java()
        super(_LightPipeline, self).__init__(
            "com.johnsnowlabs.nlp.LightPipeline", java_pipeline, parse_embeddings)
|
|
222
|
-
|
|
223
|
-
# ==================
|
|
224
|
-
# Utils
|
|
225
|
-
# ==================
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
class _StorageHelper(ExtendedJavaWrapper):
    """Py4J bridge calling ``StorageHelper.load`` with the active Spark session."""

    def __init__(self, path, spark, database, storage_ref, within_storage):
        java_session = spark._jsparkSession
        super(_StorageHelper, self).__init__(
            "com.johnsnowlabs.storage.StorageHelper.load",
            path, java_session, database, storage_ref, within_storage)
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
class _CoNLLGeneratorExport(ExtendedJavaWrapper):
    """Py4J bridge calling ``CoNLLGenerator.exportConllFiles`` on the JVM.

    Bug fix: the original class defined ``__init__`` twice, so the second
    definition (``dataframe, output_path``) silently shadowed the first and
    the four-argument form ``(spark, target, pipeline, output_path)`` was
    unreachable dead code. Both positional call shapes are now dispatched
    from a single constructor.
    """

    def __init__(self, *args):
        if len(args) == 2:
            # (dataframe, output_path): export an already-annotated DataFrame.
            dataframe, output_path = args
            super(_CoNLLGeneratorExport, self).__init__(
                "com.johnsnowlabs.util.CoNLLGenerator.exportConllFiles",
                dataframe, output_path)
        else:
            # (spark, target, pipeline, output_path): run *pipeline* over
            # *target* (a DataFrame or an input path) before exporting.
            spark, target, pipeline, output_path = args
            if type(pipeline) == PipelineModel:
                # Unwrap the Python PipelineModel into its JVM counterpart.
                pipeline = pipeline._to_java()
            if type(target) == DataFrame:
                super(_CoNLLGeneratorExport, self).__init__(
                    "com.johnsnowlabs.util.CoNLLGenerator.exportConllFiles",
                    target._jdf, pipeline, output_path)
            else:
                super(_CoNLLGeneratorExport, self).__init__(
                    "com.johnsnowlabs.util.CoNLLGenerator.exportConllFiles",
                    spark._jsparkSession, target, pipeline, output_path)
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
class _EmbeddingsOverallCoverage(ExtendedJavaWrapper):
    """Py4J bridge calling ``WordEmbeddingsModel.overallCoverage``."""

    def __init__(self, dataset, embeddings_col):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel.overallCoverage"
        super(_EmbeddingsOverallCoverage, self).__init__(jvm_target, dataset._jdf, embeddings_col)
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
class _EmbeddingsCoverageColumn(ExtendedJavaWrapper):
    """Py4J bridge calling ``WordEmbeddingsModel.withCoverageColumn``."""

    def __init__(self, dataset, embeddings_col, output_col):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel.withCoverageColumn"
        super(_EmbeddingsCoverageColumn, self).__init__(jvm_target, dataset._jdf, embeddings_col, output_col)
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
class _CoverageResult(ExtendedJavaWrapper):
    """Py4J bridge instantiating a JVM ``CoverageResult``."""

    def __init__(self, covered, total, percentage):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.CoverageResult"
        super(_CoverageResult, self).__init__(jvm_target, covered, total, percentage)
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
class _BertLoader(ExtendedJavaWrapper):
    """Py4J bridge calling ``BertEmbeddings.loadSavedModel``."""

    def __init__(self, path, jspark):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.BertEmbeddings.loadSavedModel"
        super(_BertLoader, self).__init__(jvm_target, path, jspark)
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
class _BertSentenceLoader(ExtendedJavaWrapper):
    """Py4J bridge calling ``BertSentenceEmbeddings.loadSavedModel``."""

    def __init__(self, path, jspark):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings.loadSavedModel"
        super(_BertSentenceLoader, self).__init__(jvm_target, path, jspark)
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
class _USELoader(ExtendedJavaWrapper):
    """Py4J bridge calling ``UniversalSentenceEncoder.loadSavedModel``."""

    def __init__(self, path, jspark):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder.loadSavedModel"
        super(_USELoader, self).__init__(jvm_target, path, jspark)
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
class _ElmoLoader(ExtendedJavaWrapper):
    """Py4J bridge calling ``ElmoEmbeddings.loadSavedModel``."""

    def __init__(self, path, jspark):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings.loadSavedModel"
        super(_ElmoLoader, self).__init__(jvm_target, path, jspark)
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
class _AlbertLoader(ExtendedJavaWrapper):
    """Py4J bridge calling ``AlbertEmbeddings.loadSavedModel``."""

    def __init__(self, path, jspark):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.AlbertEmbeddings.loadSavedModel"
        super(_AlbertLoader, self).__init__(jvm_target, path, jspark)
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
class _XlnetLoader(ExtendedJavaWrapper):
    """Py4J bridge calling ``XlnetEmbeddings.loadSavedModel``."""

    def __init__(self, path, jspark):
        jvm_target = "com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings.loadSavedModel"
        super(_XlnetLoader, self).__init__(jvm_target, path, jspark)
|
sparknlp/internal.pyc
DELETED
|
Binary file
|
sparknlp/pretrained.py
DELETED
|
@@ -1,123 +0,0 @@
|
|
|
1
|
-
import sparknlp.internal as _internal
|
|
2
|
-
import threading
|
|
3
|
-
import time
|
|
4
|
-
from pyspark.sql import DataFrame
|
|
5
|
-
from sparknlp.annotator import *
|
|
6
|
-
from sparknlp.base import LightPipeline
|
|
7
|
-
from pyspark.ml import PipelineModel
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def printProgress(stop):
    """Render a console spinner until ``stop()`` returns a truthy value.

    Parameters
    ----------
    stop : callable
        Zero-argument callable polled once per frame (every 2.5 s).
        A truthy return prints ``[OK!]`` and ends the spinner.

    Notes
    -----
    Intended to run in a background ``threading.Thread``; the caller flips
    the flag that ``stop`` reads and then joins the thread.
    """
    # Fix: this module never imported ``sys`` at the top level; the original
    # code only worked because a wildcard import happened to leak the name.
    import sys

    states = [' | ', ' / ', ' — ', ' \\ ']
    nextc = 0
    while True:
        sys.stdout.write('\r[{}]'.format(states[nextc]))
        sys.stdout.flush()
        time.sleep(2.5)
        # Cycle through the four spinner frames.
        nextc = nextc + 1 if nextc < 3 else 0
        if stop():
            sys.stdout.write('\r[{}]'.format('OK!'))
            sys.stdout.flush()
            break

    sys.stdout.write('\n')
    return
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class ResourceDownloader(object):
    """Static entry points for downloading and managing pretrained resources.

    All methods delegate to JVM helpers exposed through
    :mod:`sparknlp.internal`; a background spinner thread
    (:func:`printProgress`) provides console feedback while a download is
    in flight.
    """

    @staticmethod
    def downloadModel(reader, name, language, remote_loc=None, j_dwn='PythonResourceDownloader'):
        """Download the pretrained model *name* and wrap it with *reader*.

        Parameters
        ----------
        reader : class
            Python annotator class used to wrap the downloaded Java model;
            its ``name`` attribute identifies the reader on the JVM side.
        name : str
            Name of the pretrained model in the repository.
        language : str
            Language code of the model.
        remote_loc : str, optional
            Alternative remote repository location.
        j_dwn : str
            Name of the JVM downloader class to use.

        Returns
        -------
        The wrapped model, or ``None`` when the resource was not found.
        """
        print(name + " download started this may take some time.")
        file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
        if file_size == "-1":
            print("Can not find the model to download please check the name!")
        else:
            print("Approximate size to download " + file_size)
            stop_threads = False
            t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
            t1.start()
            try:
                j_obj = _internal._DownloadModel(reader.name, name, language, remote_loc, j_dwn).apply()
            finally:
                # Stop the spinner even when the download raised.
                stop_threads = True
                t1.join()

            return reader(classname=None, java_model=j_obj)

    @staticmethod
    def downloadPipeline(name, language, remote_loc=None):
        """Download the pretrained pipeline *name* as a ``PipelineModel``.

        Returns ``None`` when the resource was not found.
        """
        print(name + " download started this may take some time.")
        file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
        if file_size == "-1":
            print("Can not find the model to download please check the name!")
        else:
            print("Approx size to download " + file_size)
            stop_threads = False
            t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
            t1.start()
            try:
                j_obj = _internal._DownloadPipeline(name, language, remote_loc).apply()
                jmodel = PipelineModel._from_java(j_obj)
            finally:
                # Stop the spinner even when the download raised.
                stop_threads = True
                t1.join()

            return jmodel

    @staticmethod
    def clearCache(name, language, remote_loc=None):
        """Remove the cached copy of a previously downloaded resource."""
        _internal._ClearCache(name, language, remote_loc).apply()

    @staticmethod
    def showPublicModels():
        """Print the list of publicly available pretrained models."""
        # Fix: removed a leftover debug print("test") that polluted output.
        _internal._ShowPublicModels().apply()

    @staticmethod
    def showPublicPipelines():
        """Print the list of publicly available pretrained pipelines."""
        _internal._ShowPublicPipelines().apply()

    @staticmethod
    def showUnCategorizedResources():
        """Print resources that are not categorized as models or pipelines."""
        _internal._ShowUnCategorizedResources().apply()
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
class PretrainedPipeline:
    """A downloaded (or locally loaded) pretrained pipeline.

    Wraps both the full Spark ``PipelineModel`` (``self.model``) and a
    ``LightPipeline`` view of it (``self.light_model``) for fast in-memory
    annotation of plain strings.
    """

    def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
        """Download pipeline *name*, or load it from *disk_location* if given.

        Parameters
        ----------
        name : str
            Repository name of the pipeline (ignored when *disk_location*
            is provided).
        lang : str
            Language code of the pipeline.
        remote_loc : str, optional
            Alternative remote repository location.
        parse_embeddings : bool
            Forwarded to ``LightPipeline``.
        disk_location : str, optional
            Local path of a previously saved pipeline; when set, no
            download happens.
        """
        if not disk_location:
            self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
        else:
            self.model = PipelineModel.load(disk_location)
        self.light_model = LightPipeline(self.model, parse_embeddings)

    @staticmethod
    def from_disk(path, parse_embeddings=False):
        """Alternate constructor: load a pipeline saved at local *path*."""
        return PretrainedPipeline(None, None, None, parse_embeddings, path)

    def annotate(self, target, column=None):
        """Annotate *target* and return simplified results.

        *target* may be a Spark DataFrame (requires *column*, which is
        renamed to ``text``), a list of strings, or a single string.
        """
        if type(target) is DataFrame:
            if not column:
                raise Exception("annotate() column arg needed when targeting a DataFrame")
            return self.model.transform(target.withColumnRenamed(column, "text"))
        elif type(target) is list or type(target) is str:
            pipeline = self.light_model
            return pipeline.annotate(target)
        else:
            raise Exception("target must be either a spark DataFrame, a list of strings or a string")

    def fullAnnotate(self, target, column=None):
        """Annotate *target* and return full ``Annotation`` results.

        Same input contract as :meth:`annotate`.
        """
        if type(target) is DataFrame:
            if not column:
                # Fix: the message previously referred to annotate(),
                # which misleads callers of fullAnnotate().
                raise Exception("fullAnnotate() column arg needed when targeting a DataFrame")
            return self.model.transform(target.withColumnRenamed(column, "text"))
        elif type(target) is list or type(target) is str:
            pipeline = self.light_model
            return pipeline.fullAnnotate(target)
        else:
            raise Exception("target must be either a spark DataFrame, a list of strings or a string")

    def transform(self, data):
        """Run the full Spark pipeline over DataFrame *data*."""
        return self.model.transform(data)
|
sparknlp/pretrained.pyc
DELETED
|
Binary file
|
sparknlp/storage.py
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
import sparknlp.internal as _internal
|
|
2
|
-
|
|
3
|
-
from pyspark.ml.param import Params
|
|
4
|
-
from pyspark import keyword_only
|
|
5
|
-
import sys
|
|
6
|
-
import threading
|
|
7
|
-
import time
|
|
8
|
-
import sparknlp.pretrained as _pretrained
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
# DONT REMOVE THIS IMPORT
|
|
12
|
-
from sparknlp.annotator import WordEmbeddingsModel
|
|
13
|
-
####
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class RocksDBConnection:
    """Thin holder for a JVM-side RocksDB connection handle.

    Stores the raw Java object as ``jconnection``; no processing happens
    on the Python side.
    """

    def __init__(self, connection):
        # Keep the handle exactly as received.
        self.jconnection = connection
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class StorageHelper:
    """Loads storage-backed resources (e.g. embeddings databases) via the JVM."""

    @classmethod
    def load(cls, path, spark_session, database):
        """Load the storage at *path* and return a :class:`RocksDBConnection`.

        Parameters
        ----------
        path : str
            Location of the storage to load.
        spark_session
            Active Spark session, forwarded to the JVM helper.
        database : str
            Name of the target database.
        """
        print("Loading started this may take some time")
        stop_threads = False
        t1 = threading.Thread(target=_pretrained.printProgress, args=(lambda: stop_threads,))
        t1.start()
        # Fix: the original set stop_threads only after a successful load,
        # so a failure in _StorageHelper left the spinner thread running
        # forever. try/finally guarantees the thread is stopped and joined.
        try:
            jembeddings = _internal._StorageHelper(path, spark_session, database).apply()
        finally:
            stop_threads = True
            t1.join()
        print("Loading done")
        return RocksDBConnection(jembeddings)
|
sparknlp/storage.pyc
DELETED
|
Binary file
|
sparknlp/training.py
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
from sparknlp.internal import ExtendedJavaWrapper
|
|
2
|
-
from sparknlp.common import ExternalResource, ReadAs
|
|
3
|
-
from pyspark.sql import SparkSession, DataFrame
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class CoNLL(ExtendedJavaWrapper):
    """Reader for CoNLL-format training datasets.

    Column names and CoNLL column indices are forwarded to the JVM-side
    ``com.johnsnowlabs.nlp.training.CoNLL`` helper, which performs the
    actual parsing.
    """

    def __init__(self,
                 documentCol='document',
                 sentenceCol='sentence',
                 tokenCol='token',
                 posCol='pos',
                 conllLabelIndex=3,
                 conllPosIndex=1,
                 textCol='text',
                 labelCol='label',
                 explodeSentences=True,
                 ):
        super(CoNLL, self).__init__(
            "com.johnsnowlabs.nlp.training.CoNLL",
            documentCol,
            sentenceCol,
            tokenCol,
            posCol,
            conllLabelIndex,
            conllPosIndex,
            textCol,
            labelCol,
            explodeSentences,
        )

    def readDataset(self, spark, path, read_as=ReadAs.TEXT):
        """Read the CoNLL file(s) at *path* into a Spark DataFrame."""
        # ToDo Replace with std pyspark
        java_session = spark._jsparkSession
        java_df = self._java_obj.readDataset(java_session, path, read_as)
        return DataFrame(java_df, spark._wrapped)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class POS(ExtendedJavaWrapper):
    """Reader for part-of-speech tagged corpora in ``token|tag`` format."""

    def __init__(self):
        super(POS, self).__init__("com.johnsnowlabs.nlp.training.POS")

    def readDataset(self, spark, path, delimiter="|", outputPosCol="tags", outputDocumentCol="document", outputTextCol="text"):
        """Read the tagged corpus at *path* into a Spark DataFrame.

        ``delimiter`` separates token from tag; the ``output*Col``
        arguments name the columns of the resulting DataFrame.
        """
        # ToDo Replace with std pyspark
        java_session = spark._jsparkSession
        java_df = self._java_obj.readDataset(
            java_session, path, delimiter, outputPosCol, outputDocumentCol, outputTextCol)
        return DataFrame(java_df, spark._wrapped)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class PubTator(ExtendedJavaWrapper):
    """Reader for datasets in PubTator format."""

    def __init__(self):
        super(PubTator, self).__init__("com.johnsnowlabs.nlp.training.PubTator")

    def readDataset(self, spark, path):
        """Read the PubTator file(s) at *path* into a Spark DataFrame."""
        # ToDo Replace with std pyspark
        java_session = spark._jsparkSession
        java_df = self._java_obj.readDataset(java_session, path)
        return DataFrame(java_df, spark._wrapped)
|
|
62
|
-
|
sparknlp/training.pyc
DELETED
|
Binary file
|
sparknlp/util.pyc
DELETED
|
Binary file
|
|
File without changes
|