spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
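
The listing above shows the core restructure between 2.6.3rc1 and 6.2.1: the flat top-level modules of 2.6.x (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py, removed at the bottom of the list) are split into packages (sparknlp/annotator/, sparknlp/base/, sparknlp/common/), with embeddings moving under sparknlp/annotator/embeddings/. A minimal sketch of what the split means for user code, assuming the new package __init__ files re-export the public classes (the non-empty sparknlp/base/__init__.py and sparknlp/annotator/__init__.py entries above suggest they do):

    # Sketch only: public imports that should resolve in both the 2.6.x and
    # 6.x layouts, assuming the 6.x __init__ files re-export these classes.
    from sparknlp.base import DocumentAssembler, Finisher, LightPipeline
    from sparknlp.annotator import Tokenizer

    # 6.x-only module paths (new files in this diff); these direct imports
    # are illustrative:
    # from sparknlp.base.document_assembler import DocumentAssembler
    # from sparknlp.annotator.embeddings.bert_embeddings import BertEmbeddings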
sparknlp/annotator.pyc DELETED
Binary file
sparknlp/base.py DELETED
@@ -1,347 +0,0 @@
-from abc import ABC
-
-from pyspark import keyword_only
-from pyspark.ml.wrapper import JavaEstimator
-from pyspark.ml.param.shared import Param, Params, TypeConverters
-from pyspark.ml.pipeline import Pipeline, PipelineModel, Estimator, Transformer
-from sparknlp.common import AnnotatorProperties
-from sparknlp.internal import AnnotatorTransformer, RecursiveEstimator, RecursiveTransformer
-
-from sparknlp.annotation import Annotation
-import sparknlp.internal as _internal
-
-
-class LightPipeline:
-    def __init__(self, pipelineModel, parse_embeddings=False):
-        self.pipeline_model = pipelineModel
-        self._lightPipeline = _internal._LightPipeline(pipelineModel, parse_embeddings).apply()
-
-    @staticmethod
-    def _annotation_from_java(java_annotations):
-        annotations = []
-        for annotation in java_annotations:
-            annotations.append(Annotation(annotation.annotatorType(),
-                                          annotation.begin(),
-                                          annotation.end(),
-                                          annotation.result(),
-                                          annotation.metadata(),
-                                          annotation.embeddings
-                                          )
-                               )
-        return annotations
-
-    def fullAnnotate(self, target):
-        result = []
-        if type(target) is str:
-            target = [target]
-        for row in self._lightPipeline.fullAnnotateJava(target):
-            kas = {}
-            for atype, annotations in row.items():
-                kas[atype] = self._annotation_from_java(annotations)
-            result.append(kas)
-        return result
-
-    def annotate(self, target):
-
-        def reformat(annotations):
-            return {k: list(v) for k, v in annotations.items()}
-
-        annotations = self._lightPipeline.annotateJava(target)
-
-        if type(target) is str:
-            result = reformat(annotations)
-        elif type(target) is list:
-            result = list(map(lambda a: reformat(a), list(annotations)))
-        else:
-            raise TypeError("target for annotation may be 'str' or 'list'")
-
-        return result
-
-    def transform(self, dataframe):
-        return self.pipeline_model.transform(dataframe)
-
-    def setIgnoreUnsupported(self, value):
-        self._lightPipeline.setIgnoreUnsupported(value)
-        return self
-
-    def getIgnoreUnsupported(self):
-        return self._lightPipeline.getIgnoreUnsupported()
-
-
-class RecursivePipeline(Pipeline, JavaEstimator):
-    @keyword_only
-    def __init__(self, *args, **kwargs):
-        super(RecursivePipeline, self).__init__(*args, **kwargs)
-        self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.RecursivePipeline", self.uid)
-        kwargs = self._input_kwargs
-        self.setParams(**kwargs)
-
-    def _fit(self, dataset):
-        stages = self.getStages()
-        for stage in stages:
-            if not (isinstance(stage, Estimator) or isinstance(stage, Transformer)):
-                raise TypeError(
-                    "Cannot recognize a pipeline stage of type %s." % type(stage))
-        indexOfLastEstimator = -1
-        for i, stage in enumerate(stages):
-            if isinstance(stage, Estimator):
-                indexOfLastEstimator = i
-        transformers = []
-        for i, stage in enumerate(stages):
-            if i <= indexOfLastEstimator:
-                if isinstance(stage, Transformer):
-                    transformers.append(stage)
-                    dataset = stage.transform(dataset)
-                elif isinstance(stage, RecursiveEstimator):
-                    model = stage.fit(dataset, pipeline=PipelineModel(transformers))
-                    transformers.append(model)
-                    if i < indexOfLastEstimator:
-                        dataset = model.transform(dataset)
-                else:
-                    model = stage.fit(dataset)
-                    transformers.append(model)
-                    if i < indexOfLastEstimator:
-                        dataset = model.transform(dataset)
-            else:
-                transformers.append(stage)
-        return PipelineModel(transformers)
-
-
-class RecursivePipelineModel(PipelineModel):
-
-    def __init__(self, pipeline_model):
-        super(PipelineModel, self).__init__()
-        self.stages = pipeline_model.stages
-
-    def _transform(self, dataset):
-        for t in self.stages:
-            if isinstance(t, HasRecursiveTransform):
-                # drops current stage from the recursive pipeline within
-                dataset = t.transform_recursive(dataset, PipelineModel(self.stages[:-1]))
-            elif isinstance(t, AnnotatorProperties) and t.getLazyAnnotator():
-                pass
-            else:
-                dataset = t.transform(dataset)
-        return dataset
-
-
-class HasRecursiveFit(RecursiveEstimator, ABC):
-    pass
-
-
-class HasRecursiveTransform(RecursiveTransformer):
-    pass
-
-
-class DocumentAssembler(AnnotatorTransformer):
-
-    inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
-    outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
-    idCol = Param(Params._dummy(), "idCol", "column for setting an id to such string in row", typeConverter=TypeConverters.toString)
-    metadataCol = Param(Params._dummy(), "metadataCol", "String to String map column to use as metadata", typeConverter=TypeConverters.toString)
-    calculationsCol = Param(Params._dummy(), "calculationsCol", "String to Float vector map column to use as embeddigns and other representations", typeConverter=TypeConverters.toString)
-    cleanupMode = Param(Params._dummy(), "cleanupMode", "possible values: disabled, inplace, inplace_full, shrink, shrink_full, each, each_full, delete_full", typeConverter=TypeConverters.toString)
-    name = 'DocumentAssembler'
-
-    @keyword_only
-    def __init__(self):
-        super(DocumentAssembler, self).__init__(classname="com.johnsnowlabs.nlp.DocumentAssembler")
-        self._setDefault(outputCol="document", cleanupMode='disabled')
-
-    @keyword_only
-    def setParams(self):
-        kwargs = self._input_kwargs
-        return self._set(**kwargs)
-
-    def setInputCol(self, value):
-        return self._set(inputCol=value)
-
-    def setOutputCol(self, value):
-        return self._set(outputCol=value)
-
-    def setIdCol(self, value):
-        return self._set(idCol=value)
-
-    def setMetadataCol(self, value):
-        return self._set(metadataCol=value)
-
-    def setCalculationsCol(self, value):
-        return self._set(metadataCol=value)
-
-    def setCleanupMode(self, value):
-        if value.strip().lower() not in ['disabled', 'inplace', 'inplace_full', 'shrink', 'shrink_full', 'each', 'each_full', 'delete_full']:
-            raise Exception("Cleanup mode possible values: disabled, inplace, inplace_full, shrink, shrink_full, each, each_full, delete_full")
-        return self._set(cleanupMode=value)
-
-
-class TokenAssembler(AnnotatorTransformer, AnnotatorProperties):
-
-    name = "TokenAssembler"
-    preservePosition = Param(Params._dummy(), "preservePosition", "whether to preserve the actual position of the tokens or reduce them to one space", typeConverter=TypeConverters.toBoolean)
-
-    @keyword_only
-    def __init__(self):
-        super(TokenAssembler, self).__init__(classname="com.johnsnowlabs.nlp.TokenAssembler")
-
-    @keyword_only
-    def setParams(self):
-        kwargs = self._input_kwargs
-        return self._set(**kwargs)
-
-    def setPreservePosition(self, value):
-        return self._set(preservePosition=value)
-
-
-class Doc2Chunk(AnnotatorTransformer, AnnotatorProperties):
-
-    chunkCol = Param(Params._dummy(), "chunkCol", "column that contains string. Must be part of DOCUMENT", typeConverter=TypeConverters.toString)
-    startCol = Param(Params._dummy(), "startCol", "column that has a reference of where chunk begins", typeConverter=TypeConverters.toString)
-    startColByTokenIndex = Param(Params._dummy(), "startColByTokenIndex", "whether start col is by whitespace tokens", typeConverter=TypeConverters.toBoolean)
-    isArray = Param(Params._dummy(), "isArray", "whether the chunkCol is an array of strings", typeConverter=TypeConverters.toBoolean)
-    failOnMissing = Param(Params._dummy(), "failOnMissing", "whether to fail the job if a chunk is not found within document. return empty otherwise", typeConverter=TypeConverters.toBoolean)
-    lowerCase = Param(Params._dummy(), "lowerCase", "whether to lower case for matching case", typeConverter=TypeConverters.toBoolean)
-    name = "Doc2Chunk"
-
-    @keyword_only
-    def __init__(self):
-        super(Doc2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.Doc2Chunk")
-        self._setDefault(
-            isArray=False
-        )
-
-    @keyword_only
-    def setParams(self):
-        kwargs = self._input_kwargs
-        return self._set(**kwargs)
-
-    def setChunkCol(self, value):
-        return self._set(chunkCol=value)
-
-    def setIsArray(self, value):
-        return self._set(isArray=value)
-
-    def setStartCol(self, value):
-        return self._set(startCol=value)
-
-    def setStartColByTokenIndex(self, value):
-        return self._set(startColByTokenIndex=value)
-
-    def setFailOnMissing(self, value):
-        return self._set(failOnMissing=value)
-
-    def setLowerCase(self, value):
-        return self._set(lowerCase=value)
-
-
-class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties):
-
-    name = "Chunk2Doc"
-
-    @keyword_only
-    def __init__(self):
-        super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.Chunk2Doc")
-
-    @keyword_only
-    def setParams(self):
-        kwargs = self._input_kwargs
-        return self._set(**kwargs)
-
-
-class Finisher(AnnotatorTransformer):
-
-    inputCols = Param(Params._dummy(), "inputCols", "input annotations", typeConverter=TypeConverters.toListString)
-    outputCols = Param(Params._dummy(), "outputCols", "output finished annotation cols", typeConverter=TypeConverters.toListString)
-    valueSplitSymbol = Param(Params._dummy(), "valueSplitSymbol", "character separating annotations", typeConverter=TypeConverters.toString)
-    annotationSplitSymbol = Param(Params._dummy(), "annotationSplitSymbol", "character separating annotations", typeConverter=TypeConverters.toString)
-    cleanAnnotations = Param(Params._dummy(), "cleanAnnotations", "whether to remove annotation columns", typeConverter=TypeConverters.toBoolean)
-    includeMetadata = Param(Params._dummy(), "includeMetadata", "annotation metadata format", typeConverter=TypeConverters.toBoolean)
-    outputAsArray = Param(Params._dummy(), "outputAsArray", "finisher generates an Array with the results instead of string", typeConverter=TypeConverters.toBoolean)
-    parseEmbeddingsVectors = Param(Params._dummy(), "parseEmbeddingsVectors", "whether to include embeddings vectors in the process", typeConverter=TypeConverters.toBoolean)
-
-    name = "Finisher"
-
-    @keyword_only
-    def __init__(self):
-        super(Finisher, self).__init__(classname="com.johnsnowlabs.nlp.Finisher")
-        self._setDefault(
-            cleanAnnotations=True,
-            includeMetadata=False,
-            outputAsArray=True,
-            parseEmbeddingsVectors=False
-        )
-
-    @keyword_only
-    def setParams(self):
-        kwargs = self._input_kwargs
-        return self._set(**kwargs)
-
-    def setInputCols(self, *value):
-        if len(value) == 1 and type(value[0]) == list:
-            return self._set(inputCols=value[0])
-        else:
-            return self._set(inputCols=list(value))
-
-    def setOutputCols(self, *value):
-        if len(value) == 1 and type(value[0]) == list:
-            return self._set(outputCols=value[0])
-        else:
-            return self._set(outputCols=list(value))
-
-    def setValueSplitSymbol(self, value):
-        return self._set(valueSplitSymbol=value)
-
-    def setAnnotationSplitSymbol(self, value):
-        return self._set(annotationSplitSymbol=value)
-
-    def setCleanAnnotations(self, value):
-        return self._set(cleanAnnotations=value)
-
-    def setIncludeMetadata(self, value):
-        return self._set(includeMetadata=value)
-
-    def setOutputAsArray(self, value):
-        return self._set(outputAsArray=value)
-
-    def setParseEmbeddingsVectors(self, value):
-        return self._set(parseEmbeddingsVectors=value)
-
-
-class EmbeddingsFinisher(AnnotatorTransformer):
-
-    inputCols = Param(Params._dummy(), "inputCols", "name of input annotation cols containing embeddings", typeConverter=TypeConverters.toListString)
-    outputCols = Param(Params._dummy(), "outputCols", "output EmbeddingsFinisher ouput cols", typeConverter=TypeConverters.toListString)
-    cleanAnnotations = Param(Params._dummy(), "cleanAnnotations", "whether to remove all the existing annotation columns", typeConverter=TypeConverters.toBoolean)
-    outputAsVector = Param(Params._dummy(), "outputAsVector", "if enabled it will output the embeddings as Vectors instead of arrays", typeConverter=TypeConverters.toBoolean)
-
-    name = "EmbeddingsFinisher"
-
-    @keyword_only
-    def __init__(self):
-        super(EmbeddingsFinisher, self).__init__(classname="com.johnsnowlabs.nlp.EmbeddingsFinisher")
-        self._setDefault(
-            cleanAnnotations=False,
-            outputAsVector=False
-        )
-
-    @keyword_only
-    def setParams(self):
-        kwargs = self._input_kwargs
-        return self._set(**kwargs)
-
-    def setInputCols(self, *value):
-        if len(value) == 1 and type(value[0]) == list:
-            return self._set(inputCols=value[0])
-        else:
-            return self._set(inputCols=list(value))
-
-    def setOutputCols(self, *value):
-        if len(value) == 1 and type(value[0]) == list:
-            return self._set(outputCols=value[0])
-        else:
-            return self._set(outputCols=list(value))
-
-    def setCleanAnnotations(self, value):
-        return self._set(cleanAnnotations=value)
-
-    def setOutputAsVector(self, value):
-        return self._set(outputAsVector=value)
sparknlp/base.pyc DELETED
Binary file
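
For context on what the deleted sparknlp/base.py provided: LightPipeline wraps a fitted PipelineModel so plain strings can be annotated in memory, without building a DataFrame, and the file list above shows the class surviving in 6.x as sparknlp/base/light_pipeline.py. A minimal usage sketch of the API as defined in the deleted code; the two-stage pipeline is an illustrative assumption, not something taken from this diff:

    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler, LightPipeline
    from sparknlp.annotator import Tokenizer

    spark = sparknlp.start()  # Spark session with the Spark NLP jar on the classpath

    # Illustrative pipeline; any fitted PipelineModel works as input.
    document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    pipeline_model = Pipeline(stages=[document_assembler, tokenizer]) \
        .fit(spark.createDataFrame([[""]], ["text"]))

    light = LightPipeline(pipeline_model)
    # annotate() accepts a str or a list of str (see the type check above) and
    # returns {outputCol: [string results]} dicts instead of a DataFrame.
    print(light.annotate("Spark NLP annotates text in memory"))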
sparknlp/common.py DELETED
@@ -1,193 +0,0 @@
-from pyspark.ml.util import JavaMLWritable
-from pyspark.ml.wrapper import JavaModel, JavaEstimator
-from pyspark.ml.param.shared import Param, TypeConverters
-from pyspark.ml.param import Params
-from pyspark import keyword_only
-import sparknlp.internal as _internal
-
-
-class AnnotatorProperties(Params):
-
-    inputCols = Param(Params._dummy(),
-                      "inputCols",
-                      "previous annotations columns, if renamed",
-                      typeConverter=TypeConverters.toListString)
-    outputCol = Param(Params._dummy(),
-                      "outputCol",
-                      "output annotation column. can be left default.",
-                      typeConverter=TypeConverters.toString)
-    lazyAnnotator = Param(Params._dummy(),
-                          "lazyAnnotator",
-                          "Whether this AnnotatorModel acts as lazy in RecursivePipelines",
-                          typeConverter=TypeConverters.toBoolean
-                          )
-
-    def setInputCols(self, *value):
-        if len(value) == 1 and type(value[0]) == list:
-            return self._set(inputCols=value[0])
-        else:
-            return self._set(inputCols=list(value))
-
-    def getInputCols(self):
-        self.getOrDefault(self.inputCols)
-
-    def setOutputCol(self, value):
-        return self._set(outputCol=value)
-
-    def getOutputCol(self):
-        self.getOrDefault(self.outputCol)
-
-    def setLazyAnnotator(self, value):
-        return self._set(lazyAnnotator=value)
-
-    def getLazyAnnotator(self):
-        self.getOrDefault(self.lazyAnnotator)
-
-
-class AnnotatorModel(JavaModel, _internal.AnnotatorJavaMLReadable, JavaMLWritable, AnnotatorProperties, _internal.ParamsGettersSetters):
-
-    @keyword_only
-    def setParams(self):
-        kwargs = self._input_kwargs
-        return self._set(**kwargs)
-
-    @keyword_only
-    def __init__(self, classname, java_model=None):
-        super(AnnotatorModel, self).__init__(java_model=java_model)
-        if classname and not java_model:
-            self.__class__._java_class_name = classname
-            self._java_obj = self._new_java_obj(classname, self.uid)
-        if java_model is not None:
-            self._transfer_params_from_java()
-        self._setDefault(lazyAnnotator=False)
-
-
-class HasEmbeddingsProperties(Params):
-    dimension = Param(Params._dummy(),
-                      "dimension",
-                      "Number of embedding dimensions",
-                      typeConverter=TypeConverters.toInt)
-
-    def setDimension(self, value):
-        return self._set(dimension=value)
-
-    def getDimension(self):
-        return self.getOrDefault(self.dimension)
-
-
-class HasStorageRef:
-
-    storageRef = Param(Params._dummy(), "storageRef",
-                       "unique reference name for identification",
-                       TypeConverters.toString)
-
-    def setStorageRef(self, value):
-        return self._set(storageRef=value)
-
-    def getStorageRef(self):
-        return self.getOrDefault("storageRef")
-
-
-class HasCaseSensitiveProperties:
-    caseSensitive = Param(Params._dummy(),
-                          "caseSensitive",
-                          "whether to ignore case in tokens for embeddings matching",
-                          typeConverter=TypeConverters.toBoolean)
-
-    def setCaseSensitive(self, value):
-        return self._set(caseSensitive=value)
-
-    def getCaseSensitive(self):
-        return self.getOrDefault(self.caseSensitive)
-
-
-class HasExcludableStorage:
-
-    includeStorage = Param(Params._dummy(),
-                           "includeStorage",
-                           "whether to include indexed storage in trained model",
-                           typeConverter=TypeConverters.toBoolean)
-
-    def setIncludeStorage(self, value):
-        return self._set(includeStorage=value)
-
-    def getIncludeStorage(self):
-        return self.getOrDefault("includeStorage")
-
-
-class HasStorage(HasStorageRef, HasCaseSensitiveProperties, HasExcludableStorage):
-
-    storagePath = Param(Params._dummy(),
-                        "storagePath",
-                        "path to file",
-                        typeConverter=TypeConverters.identity)
-
-    def setStoragePath(self, path, read_as):
-        return self._set(storagePath=ExternalResource(path, read_as, {}))
-
-    def getStoragePath(self):
-        return self.getOrDefault("storagePath")
-
-
-class HasStorageModel(HasStorageRef, HasCaseSensitiveProperties, HasExcludableStorage):
-
-    def saveStorage(self, path, spark):
-        self._transfer_params_to_java()
-        self._java_obj.saveStorage(path, spark._jsparkSession, False)
-
-    @staticmethod
-    def loadStorage(path, spark, storage_ref):
-        raise NotImplementedError("AnnotatorModel with HasStorageModel did not implement 'loadStorage'")
-
-    @staticmethod
-    def loadStorages(path, spark, storage_ref, databases):
-        for database in databases:
-            _internal._StorageHelper(path, spark, database, storage_ref, within_storage=False)
-
-
-class AnnotatorApproach(JavaEstimator, JavaMLWritable, _internal.AnnotatorJavaMLReadable, AnnotatorProperties,
-                        _internal.ParamsGettersSetters):
-
-    @keyword_only
-    def __init__(self, classname):
-        _internal.ParamsGettersSetters.__init__(self)
-        self.__class__._java_class_name = classname
-        self._java_obj = self._new_java_obj(classname, self.uid)
-        self._setDefault(lazyAnnotator=False)
-
-    def _create_model(self, java_model):
-        raise NotImplementedError('Please implement _create_model in %s' % self)
-
-
-class RecursiveAnnotatorApproach(_internal.RecursiveEstimator, JavaMLWritable, _internal.AnnotatorJavaMLReadable, AnnotatorProperties,
-                                 _internal.ParamsGettersSetters):
-    @keyword_only
-    def __init__(self, classname):
-        _internal.ParamsGettersSetters.__init__(self)
-        self.__class__._java_class_name = classname
-        self._java_obj = self._new_java_obj(classname, self.uid)
-        self._setDefault(lazyAnnotator=False)
-
-    def _create_model(self, java_model):
-        raise NotImplementedError('Please implement _create_model in %s' % self)
-
-
-def RegexRule(rule, identifier):
-    return _internal._RegexRule(rule, identifier).apply()
-
-
-class ReadAs(object):
-    TEXT = "TEXT"
-    SPARK = "SPARK"
-    BINARY = "BINARY"
-
-
-def ExternalResource(path, read_as=ReadAs.TEXT, options={}):
-    return _internal._ExternalResource(path, read_as, options).apply()
-
-
-class CoverageResult:
-    def __init__(self, cov_obj):
-        self.covered = cov_obj.covered()
-        self.total = cov_obj.total()
-        self.percentage = cov_obj.percentage()
sparknlp/common.pyc DELETED
Binary file
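
One detail worth noting in the deleted sparknlp/common.py: mixins such as HasStorageRef and HasCaseSensitiveProperties declare pyspark Params without themselves inheriting pyspark.ml.param.Params; they only work when mixed into a class that does (as AnnotatorModel and AnnotatorApproach do via AnnotatorProperties). A self-contained sketch of that pattern, with a hypothetical ToyAnnotator standing in for a real annotator:

    from pyspark.ml.param import Param, Params, TypeConverters

    class HasCaseSensitiveProperties:
        # Param declared on a plain mixin; _set/getOrDefault come from Params,
        # which the concrete class must also inherit.
        caseSensitive = Param(Params._dummy(), "caseSensitive",
                              "whether to ignore case in tokens for embeddings matching",
                              typeConverter=TypeConverters.toBoolean)

        def setCaseSensitive(self, value):
            return self._set(caseSensitive=value)

        def getCaseSensitive(self):
            return self.getOrDefault(self.caseSensitive)

    class ToyAnnotator(Params, HasCaseSensitiveProperties):  # hypothetical
        pass

    toy = ToyAnnotator()
    print(toy.setCaseSensitive(True).getCaseSensitive())  # True

In 6.2.1 these mixins live one concern per module under sparknlp/common/ (annotator_properties.py, storage.py, properties.py, and so on), per the file list above.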
sparknlp/embeddings.py DELETED
@@ -1,40 +0,0 @@
-import sparknlp.internal as _internal
-
-from pyspark.ml.param import Params
-from pyspark import keyword_only
-import sys
-import threading
-import time
-import sparknlp.pretrained as _pretrained
-
-
-# DONT REMOVE THIS IMPORT
-from sparknlp.annotator import WordEmbeddingsModel
-####
-
-
-class Embeddings:
-    def __init__(self, embeddings):
-        self.jembeddings = embeddings
-
-
-class EmbeddingsHelper:
-    @classmethod
-    def load(cls, path, spark_session, embeddings_format, embeddings_ref, embeddings_dim, embeddings_casesens=False):
-        print("Loading started this may take some time")
-        stop_threads = False
-        t1 = threading.Thread(target=_pretrained.printProgress, args=(lambda: stop_threads,))
-        t1.start()
-        jembeddings = _internal._EmbeddingsHelperLoad(path, spark_session, embeddings_format, embeddings_ref, embeddings_dim, embeddings_casesens).apply()
-        stop_threads = True
-        t1.join()
-        print("Loading done")
-        return Embeddings(jembeddings)
-
-    @classmethod
-    def save(cls, path, embeddings, spark_session):
-        return _internal._EmbeddingsHelperSave(path, embeddings, spark_session).apply()
-
-    @classmethod
-    def getFromAnnotator(cls, annotator):
-        return _internal._EmbeddingsHelperFromAnnotator(annotator).apply()
sparknlp/embeddings.pyc DELETED
Binary file
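
The deleted sparknlp/embeddings.py (the 2.x Embeddings/EmbeddingsHelper wrappers for loading standalone embedding stores) has no one-to-one replacement in 6.2.1: embedding annotators now live under sparknlp/annotator/embeddings/ (see the file list above). A minimal sketch of the modern equivalent of loading word embeddings, assuming the standard pretrained() download API; the model name and session setup are illustrative:

    import sparknlp
    from sparknlp.annotator import WordEmbeddingsModel

    spark = sparknlp.start()

    # In 6.x, embeddings are ordinary annotators fetched via pretrained()
    # rather than loaded through an EmbeddingsHelper.
    glove = WordEmbeddingsModel.pretrained("glove_100d", lang="en") \
        .setInputCols(["document", "token"]) \
        .setOutputCol("embeddings")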