spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
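The file list shows the flat 2.6.x modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py, ...) removed and replaced by the sparknlp.annotator, sparknlp.base, and sparknlp.common subpackages, along with dropped Python 3.6 bytecode caches. As a minimal sketch of the 6.x import surface, assuming spark-nlp 6.2.1 and a compatible pyspark are installed (the class names below are re-exported by the new subpackages; this is a spot check, not an exhaustive compatibility claim):

```python
# Minimal smoke test of the 6.x package layout shown in the file list above;
# sparknlp.start() is the library's own helper for creating a Spark session
# with Spark NLP on the classpath.
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer

spark = sparknlp.start()

# The same fluent setters the flat 2.6.x modules exposed still resolve.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
```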
sparknlp/annotator/document_normalizer.py (added)

```diff
@@ -0,0 +1,235 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the DocumentNormalizer"""
+from sparknlp.common import *
+
+
+class DocumentNormalizer(AnnotatorModel):
+    """Annotator which normalizes raw text from tagged text, e.g. scraped web
+    pages or xml documents, from document type columns into Sentence.
+
+    Removes all dirty characters from text following one or more input regex
+    patterns. Can apply not wanted character removal with a specific policy.
+    Can apply lower case normalization.
+
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb
+    >`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    action
+        action to perform before applying regex patterns on text, by default
+        "clean"
+    patterns
+        normalization regex patterns which match will be removed from document,
+        by default ['<[^>]*>']
+    replacement
+        replacement string to apply when regexes match, by default " "
+    lowercase
+        whether to convert strings to lowercase, by default False
+    policy
+        policy to remove pattern from text, by default "pretty_all"
+    encoding
+        file encoding to apply on normalized documents, by default "UTF-8"
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> cleanUpPatterns = ["<[^>]*>"]
+    >>> documentNormalizer = DocumentNormalizer() \\
+    ...     .setInputCols("document") \\
+    ...     .setOutputCol("normalizedDocument") \\
+    ...     .setAction("clean") \\
+    ...     .setPatterns(cleanUpPatterns) \\
+    ...     .setReplacement(" ") \\
+    ...     .setPolicy("pretty_all") \\
+    ...     .setLowercase(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     documentNormalizer
+    ... ])
+    >>> text = \"\"\"
+    ... <div id="theworldsgreatest" class='my-right my-hide-small my-wide toptext' style="font-family:'Segoe UI',Arial,sans-serif">
+    ...     THE WORLD'S LARGEST WEB DEVELOPER SITE
+    ...     <h1 style="font-size:300%;">THE WORLD'S LARGEST WEB DEVELOPER SITE</h1>
+    ...     <p style="font-size:160%;">Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum..</p>
+    ... </div>
+    ... </div>\"\"\"
+    >>> data = spark.createDataFrame([[text]]).toDF("text")
+    >>> pipelineModel = pipeline.fit(data)
+    >>> result = pipelineModel.transform(data)
+    >>> result.selectExpr("normalizedDocument.result").show(truncate=False)
+    +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |result                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+    +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |[ the world's largest web developer site the world's largest web developer site lorem ipsum is simply dummy text of the printing and typesetting industry. lorem ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. it has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. it was popularised in the 1960s with the release of letraset sheets containing lorem ipsum passages, and more recently with desktop publishing software like aldus pagemaker including versions of lorem ipsum..]|
+    +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    """
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    action = Param(Params._dummy(),
+                   "action",
+                   "action to perform applying regex patterns on text",
+                   typeConverter=TypeConverters.toString)
+
+    patterns = Param(Params._dummy(),
+                     "patterns",
+                     "normalization regex patterns which match will be removed from document. Defaults is <[^>]*>",
+                     typeConverter=TypeConverters.toListString)
+
+    replacement = Param(Params._dummy(),
+                        "replacement",
+                        "replacement string to apply when regexes match",
+                        typeConverter=TypeConverters.toString)
+
+    lowercase = Param(Params._dummy(),
+                      "lowercase",
+                      "whether to convert strings to lowercase",
+                      typeConverter=TypeConverters.toBoolean)
+
+    policy = Param(Params._dummy(),
+                   "policy",
+                   "policy to remove pattern from text",
+                   typeConverter=TypeConverters.toString)
+
+    encoding = Param(Params._dummy(),
+                     "encoding",
+                     "file encoding to apply on normalized documents",
+                     typeConverter=TypeConverters.toString)
+
+    presetPattern = Param(
+        Params._dummy(),
+        "presetPattern",
+        "Selects a single text cleaning function from the functional presets (e.g., 'CLEAN_BULLETS', 'CLEAN_DASHES', etc.).",
+        typeConverter=TypeConverters.toString
+    )
+
+    autoMode = Param(
+        Params._dummy(),
+        "autoMode",
+        "Enables a predefined cleaning mode combining multiple text cleaner functions (e.g., 'light_clean', 'document_clean', 'html_clean', 'full_auto').",
+        typeConverter=TypeConverters.toString
+    )
+
+
+    @keyword_only
+    def __init__(self):
+        super(DocumentNormalizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.DocumentNormalizer")
+        self._setDefault(
+            action="clean",
+            patterns=["<[^>]*>"],
+            replacement=" ",
+            lowercase=False,
+            policy="pretty_all",
+            encoding="UTF-8"
+        )
+
+    def setAction(self, value):
+        """Sets action to perform before applying regex patterns on text, by
+        default "clean".
+
+        Parameters
+        ----------
+        value : str
+            Action to perform before applying regex patterns
+        """
+        return self._set(action=value)
+
+    def setPatterns(self, value):
+        """Sets normalization regex patterns which match will be removed from
+        document, by default ['<[^>]*>'].
+
+        Parameters
+        ----------
+        value : List[str]
+            Normalization regex patterns which match will be removed from
+            document
+        """
+        return self._set(patterns=value)
+
+    def setReplacement(self, value):
+        """Sets replacement string to apply when regexes match, by default " ".
+
+        Parameters
+        ----------
+        value : str
+            Replacement string to apply when regexes match
+        """
+        return self._set(replacement=value)
+
+    def setLowercase(self, value):
+        """Sets whether to convert strings to lowercase, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to convert strings to lowercase, by default False
+        """
+        return self._set(lowercase=value)
+
+    def setPolicy(self, value):
+        """Sets policy to remove pattern from text, by default "pretty_all".
+
+        Parameters
+        ----------
+        value : str
+            Policy to remove pattern from text, by default "pretty_all"
+        """
+        return self._set(policy=value)
+
+    def setEncoding(self, value):
+        """Sets file encoding to apply on normalized documents, by default
+        "UTF-8".
+
+        Parameters
+        ----------
+        value : str
+            File encoding to apply on normalized documents, by default "UTF-8"
+        """
+        return self._set(encoding=value)
+
+    def setPresetPattern(self, value):
+        """Sets a single text cleaning preset pattern.
+
+        Parameters
+        ----------
+        value : str
+            Preset cleaning pattern name, e.g., 'CLEAN_BULLETS', 'CLEAN_DASHES'.
+        """
+        return self._set(presetPattern=value)
+
+
+    def setAutoMode(self, value):
+        """Sets an automatic text cleaning mode using predefined groups of cleaning functions.
+
+        Parameters
+        ----------
+        value : str
+            Auto cleaning mode, e.g., 'light_clean', 'document_clean', 'social_clean', 'html_clean', 'full_auto'.
+        """
+        return self._set(autoMode=value)
```
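The 6.x DocumentNormalizer adds the presetPattern and autoMode parameters on top of the regex-based cleaning. A hedged sketch of how they would be set, using only the value names quoted in the Param docs above; the preset implementations themselves live on the Scala side and are not part of this diff:

```python
from sparknlp.annotator import DocumentNormalizer

# Apply one named preset cleaner; 'CLEAN_BULLETS' is taken from the
# presetPattern Param doc above.
presetNormalizer = DocumentNormalizer() \
    .setInputCols(["document"]) \
    .setOutputCol("cleanedDocument") \
    .setPresetPattern("CLEAN_BULLETS")

# Or enable a predefined combination of cleaners; 'document_clean' is taken
# from the autoMode Param doc above.
autoNormalizer = DocumentNormalizer() \
    .setInputCols(["document"]) \
    .setOutputCol("cleanedDocument") \
    .setAutoMode("document_clean")
```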
sparknlp/annotator/document_token_splitter.py (added)

```diff
@@ -0,0 +1,175 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the DocumentTokenSplitter"""
+from sparknlp.common import *
+
+
+class DocumentTokenSplitter(AnnotatorModel):
+    """Annotator that splits large documents into smaller documents based on the number of tokens in
+    the text.
+
+    Currently, DocumentTokenSplitter splits the text by whitespaces to create the tokens. The
+    number of these tokens will then be used as a measure of the text length. In the future, other
+    tokenization techniques will be supported.
+
+    For example, given 3 tokens and overlap 1:
+
+    .. code-block:: python
+
+        He was, I take it, the most perfect reasoning and observing machine that the world has seen.
+
+        ["He was, I", "I take it,", "it, the most", "most perfect reasoning", "reasoning and observing", "observing machine that", "that the world", "world has seen."]
+
+
+    Additionally, you can set
+
+    - whether to trim whitespaces with setTrimWhitespace
+    - whether to explode the splits to individual rows with setExplodeSplits
+
+    For extended examples of usage, see the
+    `DocumentTokenSplitterTest <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+
+    numTokens
+        Limit of the number of tokens in a text
+    tokenOverlap
+        Length of the token overlap between text chunks, by default `0`.
+    explodeSplits
+        Whether to explode split chunks to separate rows, by default `False`.
+    trimWhitespace
+        Whether to trim whitespaces of extracted chunks, by default `True`.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> textDF = spark.read.text(
+    ...     "sherlockholmes.txt",
+    ...     wholetext=True
+    ... ).toDF("text")
+    >>> documentAssembler = DocumentAssembler().setInputCol("text")
+    >>> textSplitter = DocumentTokenSplitter() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("splits") \\
+    ...     .setNumTokens(512) \\
+    ...     .setTokenOverlap(10) \\
+    ...     .setExplodeSplits(True)
+    >>> pipeline = Pipeline().setStages([documentAssembler, textSplitter])
+    >>> result = pipeline.fit(textDF).transform(textDF)
+    >>> result.selectExpr(
+    ...     "splits.result as result",
+    ...     "splits[0].begin as begin",
+    ...     "splits[0].end as end",
+    ...     "splits[0].end - splits[0].begin as length",
+    ...     "splits[0].metadata.numTokens as tokens") \\
+    ...     .show(8, truncate = 80)
+    +--------------------------------------------------------------------------------+-----+-----+------+------+
+    |                                                                          result|begin|  end|length|tokens|
+    +--------------------------------------------------------------------------------+-----+-----+------+------+
+    |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...|    0| 3018|  3018|   512|
+    |[study of crime, and occupied his\\nimmense faculties and extraordinary powers...| 2950| 5707|  2757|   512|
+    |[but as I have changed my clothes I can't imagine how you\\ndeduce it. As to M...| 5659| 8483|  2824|   512|
+    |[quarters received. Be in your chamber then at that hour, and do\\nnot take it...| 8427|11241|  2814|   512|
+    |[a pity\\nto miss it."\\n\\n"But your client--"\\n\\n"Never mind him. I may want y...|11188|13970|  2782|   512|
+    |[person who employs me wishes his agent to be unknown to\\nyou, and I may conf...|13918|16898|  2980|   512|
+    |[letters back."\\n\\n"Precisely so. But how--"\\n\\n"Was there a secret marriage?...|16836|19744|  2908|   512|
+    |[seven hundred in\\nnotes," he said.\\n\\nHolmes scribbled a receipt upon a shee...|19683|22551|  2868|   512|
+    +--------------------------------------------------------------------------------+-----+-----+------+------+
+
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    numTokens = Param(Params._dummy(),
+                      "numTokens",
+                      "Limit of the number of tokens in a text",
+                      typeConverter=TypeConverters.toInt)
+    tokenOverlap = Param(Params._dummy(),
+                         "tokenOverlap",
+                         "Length of the token overlap between text chunks",
+                         typeConverter=TypeConverters.toInt)
+    explodeSplits = Param(Params._dummy(),
+                          "explodeSplits",
+                          "Whether to explode split chunks to separate rows",
+                          typeConverter=TypeConverters.toBoolean)
+    trimWhitespace = Param(Params._dummy(),
+                           "trimWhitespace",
+                           "Whether to trim whitespaces of extracted chunks",
+                           typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self):
+        super(DocumentTokenSplitter, self).__init__(
+            classname="com.johnsnowlabs.nlp.annotators.DocumentTokenSplitter")
+        self._setDefault(
+            tokenOverlap=0,
+            explodeSplits=False,
+            trimWhitespace=True
+        )
+
+    def setNumTokens(self, value):
+        """Sets the limit of the number of tokens in a text
+
+        Parameters
+        ----------
+        value : int
+            Number of tokens in a text
+        """
+        if value < 1:
+            raise ValueError("Number of tokens should be larger than 0.")
+        return self._set(numTokens=value)
+
+    def setTokenOverlap(self, value):
+        """Length of the token overlap between text chunks, by default `0`.
+
+        Parameters
+        ----------
+        value : int
+            Length of the token overlap between text chunks
+        """
+        if value > self.getOrDefault(self.numTokens):
+            raise ValueError("Token overlap can't be larger than number of tokens.")
+        return self._set(tokenOverlap=value)
+
+    def setExplodeSplits(self, value):
+        """Sets whether to explode split chunks to separate rows, by default `False`.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to explode split chunks to separate rows
+        """
+        return self._set(explodeSplits=value)
+
+    def setTrimWhitespace(self, value):
+        """Sets whether to trim whitespaces of extracted chunks, by default `True`.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to trim whitespaces of extracted chunks
+        """
+        return self._set(trimWhitespace=value)
```
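The split rule the docstring describes is a sliding window over whitespace tokens with stride numTokens - tokenOverlap. Below is a minimal pure-Python sketch of that logic, for illustration only; the annotator itself runs on the JVM, and the sketch assumes 0 <= token_overlap < num_tokens:

```python
def split_by_tokens(text, num_tokens, token_overlap):
    """Sliding-window split over whitespace tokens; stride = num_tokens - token_overlap."""
    tokens = text.split()
    stride = num_tokens - token_overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(" ".join(tokens[start:start + num_tokens]))
        if start + num_tokens >= len(tokens):
            # the current window already reaches the end of the text
            break
    return chunks

sentence = ("He was, I take it, the most perfect reasoning and observing "
            "machine that the world has seen.")
print(split_by_tokens(sentence, num_tokens=3, token_overlap=1))
# ['He was, I', 'I take it,', 'it, the most', 'most perfect reasoning',
#  'reasoning and observing', 'observing machine that', 'that the world',
#  'world has seen.']
```

This reproduces the docstring example above: each chunk carries three tokens and repeats the last token of the previous chunk.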
sparknlp/annotator/document_token_splitter_test.py (added)

```diff
@@ -0,0 +1,85 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.util import SparkSessionForTest
+
+
+@pytest.mark.fast
+class DocumentTokenSplitterTestSpec(unittest.TestCase):
+    def setUp(self):
+        self.data = SparkSessionForTest.spark.createDataFrame(
+            [
+                [
+                    (
+                        "All emotions, and that\none particularly, were abhorrent to his cold, precise"
+                        " but\nadmirably balanced mind.\n\nHe was, I take it, the most perfect\nreasoning"
+                        " and observing machine that the world has seen."
+                    )
+                ]
+            ]
+        ).toDF("text")
+
+    def test_run(self):
+        df = self.data
+
+        document_assembler = (
+            DocumentAssembler().setInputCol("text").setOutputCol("document")
+        )
+
+        document_token_splitter = (
+            DocumentTokenSplitter()
+            .setInputCols("document")
+            .setOutputCol("splits")
+            .setNumTokens(3)
+            .setTokenOverlap(1)
+            .setExplodeSplits(True)
+            .setTrimWhitespace(True)
+        )
+
+        pipeline = Pipeline().setStages([document_assembler, document_token_splitter])
+
+        pipeline_df = pipeline.fit(df).transform(df)
+
+        results = pipeline_df.select("splits").collect()
+
+        splits = [
+            row["splits"][0].result.replace("\n\n", " ").replace("\n", " ")
+            for row in results
+        ]
+
+        expected = [
+            "All emotions, and",
+            "and that one",
+            "one particularly, were",
+            "were abhorrent to",
+            "to his cold,",
+            "cold, precise but",
+            "but admirably balanced",
+            "balanced mind. He",
+            "He was, I",
+            "I take it,",
+            "it, the most",
+            "most perfect reasoning",
+            "reasoning and observing",
+            "observing machine that",
+            "that the world",
+            "world has seen.",
+        ]
+
+        assert splits == expected
```
sparknlp/annotator/embeddings/__init__.py (added)

```diff
@@ -0,0 +1,45 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module of annotators for text embeddings."""
+from sparknlp.annotator.embeddings.albert_embeddings import *
+from sparknlp.annotator.embeddings.bert_embeddings import *
+from sparknlp.annotator.embeddings.bert_sentence_embeddings import *
+from sparknlp.annotator.embeddings.camembert_embeddings import *
+from sparknlp.annotator.embeddings.chunk_embeddings import *
+from sparknlp.annotator.embeddings.deberta_embeddings import *
+from sparknlp.annotator.embeddings.distil_bert_embeddings import *
+from sparknlp.annotator.embeddings.doc2vec import *
+from sparknlp.annotator.embeddings.elmo_embeddings import *
+from sparknlp.annotator.embeddings.e5_embeddings import *
+from sparknlp.annotator.embeddings.instructor_embeddings import *
+from sparknlp.annotator.embeddings.longformer_embeddings import *
+from sparknlp.annotator.embeddings.minilm_embeddings import *
+from sparknlp.annotator.embeddings.mpnet_embeddings import *
+from sparknlp.annotator.embeddings.roberta_embeddings import *
+from sparknlp.annotator.embeddings.roberta_sentence_embeddings import *
+from sparknlp.annotator.embeddings.sentence_embeddings import *
+from sparknlp.annotator.embeddings.universal_sentence_encoder import *
+from sparknlp.annotator.embeddings.word2vec import *
+from sparknlp.annotator.embeddings.word_embeddings import *
+from sparknlp.annotator.embeddings.xlm_roberta_embeddings import *
+from sparknlp.annotator.embeddings.xlm_roberta_sentence_embeddings import *
+from sparknlp.annotator.embeddings.xlnet_embeddings import *
+from sparknlp.annotator.embeddings.bge_embeddings import *
+from sparknlp.annotator.embeddings.uae_embeddings import *
+from sparknlp.annotator.embeddings.mxbai_embeddings import *
+from sparknlp.annotator.embeddings.snowflake_embeddings import *
+from sparknlp.annotator.embeddings.nomic_embeddings import *
+from sparknlp.annotator.embeddings.auto_gguf_embeddings import *
+from sparknlp.annotator.embeddings.e5v_embeddings import *
```