spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff shows the content of publicly released versions of the package as they appear in the supported public registries; it is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
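
The listing above shows the main structural change between 2.6.x and 6.x: the flat modules of the old wheel (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, ...) are removed in favour of per-feature subpackages, and the stale .pyc/__pycache__ artifacts are dropped from the wheel. The snippet below is a minimal sketch of a pipeline against that 6.x layout; it is not taken from this diff and assumes a local PySpark installation and that sparknlp.start() still bootstraps the Spark session as in earlier releases.

    # Minimal sketch against the 6.x package layout shown above (assumed usage,
    # not part of this diff): build and run a small tokenizer pipeline.
    import sparknlp
    from sparknlp.base import DocumentAssembler, Finisher
    from sparknlp.annotator import Tokenizer
    from pyspark.ml import Pipeline

    spark = sparknlp.start()  # assumes Spark NLP can create its own session

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    finisher = Finisher().setInputCols(["token"])

    data = spark.createDataFrame([["Spark NLP 6.x keeps the same pipeline API."]]).toDF("text")
    pipeline = Pipeline(stages=[documentAssembler, tokenizer, finisher])
    pipeline.fit(data).transform(data).show(truncate=False)
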
sparknlp/annotator/dependency/typed_dependency_parser.py (new file)

@@ -0,0 +1,318 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the TypedDependencyParser."""
+
+
+from sparknlp.common import *
+
+
+class TypedDependencyParserApproach(AnnotatorApproach):
+    """Labeled parser that finds a grammatical relation between two words in a
+    sentence. Its input is either a CoNLL2009 or ConllU dataset.
+
+    For instantiated/pretrained models, see
+    :class:`.TypedDependencyParserModel`.
+
+    Dependency parsers provide information about word relationship. For example,
+    dependency parsing can tell you what the subjects and objects of a verb are,
+    as well as which words are modifying (describing) the subject. This can help
+    you find precise answers to specific questions.
+
+    The parser requires the dependant tokens beforehand with e.g.
+    DependencyParser. The required training data can be set in two different
+    ways (only one can be chosen for a particular model):
+
+    - Dataset in the `CoNLL 2009 format
+      <https://ufal.mff.cuni.cz/conll2009-st/trial-data.html>`__ set with
+      :meth:`.setConll2009`
+    - Dataset in the `CoNLL-U format
+      <https://universaldependencies.org/format.html>`__ set with
+      :meth:`.setConllU`
+
+    Apart from that, no additional training data is needed.
+
+    ========================== ======================
+    Input Annotation types     Output Annotation type
+    ========================== ======================
+    ``TOKEN, POS, DEPENDENCY`` ``LABELED_DEPENDENCY``
+    ========================== ======================
+
+    Parameters
+    ----------
+    conll2009
+        Path to file with CoNLL 2009 format
+    conllU
+        Universal Dependencies source files
+    numberOfIterations
+        Number of iterations in training, converges to better accuracy
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> posTagger = PerceptronModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("pos")
+    >>> dependencyParser = DependencyParserModel.pretrained() \\
+    ...     .setInputCols(["sentence", "pos", "token"]) \\
+    ...     .setOutputCol("dependency")
+    >>> typedDependencyParser = TypedDependencyParserApproach() \\
+    ...     .setInputCols(["dependency", "pos", "token"]) \\
+    ...     .setOutputCol("dependency_type") \\
+    ...     .setConllU("src/test/resources/parser/labeled/train_small.conllu.txt") \\
+    ...     .setNumberOfIterations(1)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentence,
+    ...     tokenizer,
+    ...     posTagger,
+    ...     dependencyParser,
+    ...     typedDependencyParser
+    ... ])
+
+    Additional training data is not needed, the dependency parser relies on
+    CoNLL-U only.
+
+    >>> emptyDataSet = spark.createDataFrame([[""]]).toDF("text")
+    >>> pipelineModel = pipeline.fit(emptyDataSet)
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.DEPENDENCY]
+
+    outputAnnotatorType = AnnotatorType.LABELED_DEPENDENCY
+
+    conll2009 = Param(Params._dummy(),
+                      "conll2009",
+                      "Path to file with CoNLL 2009 format",
+                      typeConverter=TypeConverters.identity)
+
+    conllU = Param(Params._dummy(),
+                   "conllU",
+                   "Universal Dependencies source files",
+                   typeConverter=TypeConverters.identity)
+
+    numberOfIterations = Param(Params._dummy(),
+                               "numberOfIterations",
+                               "Number of iterations in training, converges to better accuracy",
+                               typeConverter=TypeConverters.toInt)
+
+    @keyword_only
+    def __init__(self):
+        super(TypedDependencyParserApproach,
+              self).__init__(classname="com.johnsnowlabs.nlp.annotators.parser.typdep.TypedDependencyParserApproach")
+
+    def setConll2009(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
+        """Sets path to file with CoNLL 2009 format.
+
+        Parameters
+        ----------
+        path : str
+            Path to the resource
+        read_as : str, optional
+            How to read the resource, by default ReadAs.TEXT
+        options : dict, optional
+            Options for reading the resource, by default {"key": "value"}
+        """
+        opts = options.copy()
+        return self._set(conll2009=ExternalResource(path, read_as, opts))
+
+    def setConllU(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
+        """Sets path to Universal Dependencies source files.
+
+        Parameters
+        ----------
+        path : str
+            Path to the resource
+        read_as : str, optional
+            How to read the resource, by default ReadAs.TEXT
+        options : dict, optional
+            Options for reading the resource, by default {"key": "value"}
+        """
+        opts = options.copy()
+        return self._set(conllU=ExternalResource(path, read_as, opts))
+
+    def setNumberOfIterations(self, value):
+        """Sets Number of iterations in training, converges to better accuracy.
+
+        Parameters
+        ----------
+        value : int
+            Number of iterations in training
+
+        Returns
+        -------
+        [type]
+            [description]
+        """
+        return self._set(numberOfIterations=value)
+
+    def _create_model(self, java_model):
+        return TypedDependencyParserModel(java_model=java_model)
+
+
+class TypedDependencyParserModel(AnnotatorModel):
+    """Labeled parser that finds a grammatical relation between two words in a
+    sentence. Its input is either a CoNLL2009 or ConllU dataset.
+
+    Dependency parsers provide information about word relationship. For example,
+    dependency parsing can tell you what the subjects and objects of a verb are,
+    as well as which words are modifying (describing) the subject. This can help
+    you find precise answers to specific questions.
+
+    The parser requires the dependant tokens beforehand with e.g.
+    DependencyParser.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> typedDependencyParser = TypedDependencyParserModel.pretrained() \\
+    ...     .setInputCols(["dependency", "pos", "token"]) \\
+    ...     .setOutputCol("dependency_type")
+
+    The default model is ``"dependency_typed_conllu"``, if no name is provided.
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models>`__.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb>`__.
+
+    ========================== ======================
+    Input Annotation types     Output Annotation type
+    ========================== ======================
+    ``TOKEN, POS, DEPENDENCY`` ``LABELED_DEPENDENCY``
+    ========================== ======================
+
+    Parameters
+    ----------
+    None
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> posTagger = PerceptronModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("pos")
+    >>> dependencyParser = DependencyParserModel.pretrained() \\
+    ...     .setInputCols(["sentence", "pos", "token"]) \\
+    ...     .setOutputCol("dependency")
+    >>> typedDependencyParser = TypedDependencyParserModel.pretrained() \\
+    ...     .setInputCols(["dependency", "pos", "token"]) \\
+    ...     .setOutputCol("dependency_type")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentence,
+    ...     tokenizer,
+    ...     posTagger,
+    ...     dependencyParser,
+    ...     typedDependencyParser
+    ... ])
+    >>> data = spark.createDataFrame([[
+    ...     "Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent " +
+    ...     "firm Federal Mogul."
+    ... ]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(arrays_zip(token.result, dependency.result, dependency_type.result)) as cols") \\
+    ...     .selectExpr("cols['0'] as token", "cols['1'] as dependency", "cols['2'] as dependency_type") \\
+    ...     .show(8, truncate = False)
+    +------------+------------+---------------+
+    |token       |dependency  |dependency_type|
+    +------------+------------+---------------+
+    |Unions      |ROOT        |root           |
+    |representing|workers     |amod           |
+    |workers     |Unions      |flat           |
+    |at          |Turner      |case           |
+    |Turner      |workers     |flat           |
+    |Newall      |say         |nsubj          |
+    |say         |Unions      |parataxis      |
+    |they        |disappointed|nsubj          |
+    +------------+------------+---------------+
+    """
+
+    name = "TypedDependencyParserModel"
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.DEPENDENCY]
+
+    outputAnnotatorType = AnnotatorType.LABELED_DEPENDENCY
+
+    trainOptions = Param(Params._dummy(),
+                         "trainOptions",
+                         "Training Options",
+                         typeConverter=TypeConverters.identity)
+
+    trainParameters = Param(Params._dummy(),
+                            "trainParameters",
+                            "Training Parameters",
+                            typeConverter=TypeConverters.identity)
+
+    trainDependencyPipe = Param(Params._dummy(),
+                                "trainDependencyPipe",
+                                "Training dependency pipe",
+                                typeConverter=TypeConverters.identity)
+
+    conllFormat = Param(Params._dummy(),
+                        "conllFormat",
+                        "CoNLL Format",
+                        typeConverter=TypeConverters.toString)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.parser.typdep.TypedDependencyParserModel",
+                 java_model=None):
+        super(TypedDependencyParserModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+
+    @staticmethod
+    def pretrained(name="dependency_typed_conllu", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "dependency_typed_conllu"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        TypedDependencyParserModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(TypedDependencyParserModel, name, lang, remote_loc)
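
The docstrings above drive the typed parser through a DataFrame. For quick, single-string inspection the same fitted pipeline can also be wrapped in a LightPipeline (sparknlp/base/light_pipeline.py in the listing above). The sketch below is not part of the diff: it reuses the pipeline and data variables from the TypedDependencyParserModel example and assumes LightPipeline.annotate returns a dict keyed by output column, as in earlier Spark NLP releases.

    # Hedged sketch, not part of the diff: reuses `pipeline` and `data`
    # from the TypedDependencyParserModel docstring example above.
    from sparknlp.base import LightPipeline

    pipeline_model = pipeline.fit(data)
    light = LightPipeline(pipeline_model)

    annotated = light.annotate("Unions representing workers at Turner Newall say they are 'disappointed'.")
    for token, head, label in zip(annotated["token"], annotated["dependency"], annotated["dependency_type"]):
        print(token, "->", head, label)
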
sparknlp/annotator/document_character_text_splitter.py (new file)

@@ -0,0 +1,228 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the DocumentNormalizer"""
+from sparknlp.common import *
+
+
+class DocumentCharacterTextSplitter(AnnotatorModel):
+    """Annotator which splits large documents into chunks of roughly given size.
+
+    DocumentCharacterTextSplitter takes a list of separators. It takes the separators in order and
+    splits subtexts if they are over the chunk length, considering optional overlap of the chunks.
+
+    For example, given chunk size 20 and overlap 5:
+
+    .. code-block:: python
+
+        "He was, I take it, the most perfect reasoning and observing machine that the world has seen."
+
+        ["He was, I take it,", "it, the most", "most perfect", "reasoning and", "and observing", "machine that the", "the world has seen."]
+
+
+    Additionally, you can set
+
+    - custom patterns with setSplitPatterns
+    - whether patterns should be interpreted as regex with setPatternsAreRegex
+    - whether to keep the separators with setKeepSeparators
+    - whether to trim whitespaces with setTrimWhitespace
+    - whether to explode the splits to individual rows with setExplodeSplits
+
+    For extended examples of usage, see the
+    `DocumentCharacterTextSplitterTest <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+
+    chunkSize
+        Size of each chunk of text.
+    chunkOverlap
+        Length of the overlap between text chunks , by default `0`.
+    splitPatterns
+        Patterns to separate the text by in decreasing priority , by default `["\\n\\n", "\\n", " ", ""]`.
+    patternsAreRegex
+        Whether to interpret the split patterns as regular expressions , by default `False`.
+    keepSeparators
+        Whether to keep the separators in the final result , by default `True`.
+    explodeSplits
+        Whether to explode split chunks to separate rows , by default `False`.
+    trimWhitespace
+        Whether to trim whitespaces of extracted chunks , by default `True`.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> textDF = spark.read.text(
+    ...    "sherlockholmes.txt",
+    ...    wholetext=True
+    ... ).toDF("text")
+    >>> documentAssembler = DocumentAssembler().setInputCol("text")
+    >>> textSplitter = DocumentCharacterTextSplitter() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("splits") \\
+    ...     .setChunkSize(20000) \\
+    ...     .setChunkOverlap(200) \\
+    ...     .setExplodeSplits(True)
+    >>> pipeline = Pipeline().setStages([documentAssembler, textSplitter])
+    >>> result = pipeline.fit(textDF).transform(textDF)
+    >>> result.selectExpr(
+    ...       "splits.result",
+    ...       "splits[0].begin",
+    ...       "splits[0].end",
+    ...       "splits[0].end - splits[0].begin as length") \\
+    ...     .show(8, truncate = 80)
+    +--------------------------------------------------------------------------------+---------------+-------------+------+
+    |                                                                          result|splits[0].begin|splits[0].end|length|
+    +--------------------------------------------------------------------------------+---------------+-------------+------+
+    |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...|              0|        19994| 19994|
+    |["And Mademoiselle's address?" he asked.\\n\\n"Is Briony Lodge, Serpentine Aven...|          19798|        39395| 19597|
+    |["How did that help you?"\\n\\n"It was all-important. When a woman thinks that ...|          39371|        59242| 19871|
+    |["'But,' said I, 'there would be millions of red-headed men who\\nwould apply....|          59166|        77833| 18667|
+    |[My friend was an enthusiastic musician, being himself not only a\\nvery capab...|          77835|        97769| 19934|
+    |["And yet I am not convinced of it," I answered. "The cases which\\ncome to li...|          97771|       117248| 19477|
+    |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\\nfeather of...|         117250|       137242| 19992|
+    |["That sounds a little paradoxical."\\n\\n"But it is profoundly True. Singulari...|         137244|       157171| 19927|
+    +--------------------------------------------------------------------------------+---------------+-------------+------+
+
+    """
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    chunkSize = Param(Params._dummy(),
+                      "chunkSize",
+                      "Size of each chunk of text",
+                      typeConverter=TypeConverters.toInt)
+    chunkOverlap = Param(Params._dummy(),
+                         "chunkOverlap",
+                         "Length of the overlap between text chunks",
+                         typeConverter=TypeConverters.toInt)
+    splitPatterns = Param(Params._dummy(),
+                          "splitPatterns",
+                          "Patterns to separate the text by in decreasing priority",
+                          typeConverter=TypeConverters.toListString)
+    patternsAreRegex = Param(Params._dummy(),
+                             "patternsAreRegex",
+                             "Whether to interpret the split patterns as regular expressions",
+                             typeConverter=TypeConverters.toBoolean)
+    keepSeparators = Param(Params._dummy(),
+                           "keepSeparators",
+                           "Whether to keep the separators in the final result",
+                           typeConverter=TypeConverters.toBoolean)
+    explodeSplits = Param(Params._dummy(),
+                          "explodeSplits",
+                          "Whether to explode split chunks to separate rows",
+                          typeConverter=TypeConverters.toBoolean)
+    trimWhitespace = Param(Params._dummy(),
+                           "trimWhitespace",
+                           "Whether to trim whitespaces of extracted chunks",
+                           typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self):
+        super(DocumentCharacterTextSplitter, self).__init__(
+            classname="com.johnsnowlabs.nlp.annotators.DocumentCharacterTextSplitter")
+        self._setDefault(
+            chunkOverlap=0,
+            explodeSplits=False,
+            keepSeparators=True,
+            patternsAreRegex=False,
+            splitPatterns=["\n\n", "\n", " ", ""],
+            trimWhitespace=True
+        )
+
+    def setChunkSize(self, value):
+        """Sets size of each chunk of text.
+
+        Parameters
+        ----------
+        value : int
+            Size of each chunk of text
+        """
+        if value < 1:
+            raise ValueError("Chunk size should be larger than 0.")
+        return self._set(chunkSize=value)
+
+    def setChunkOverlap(self, value):
+        """Sets length of the overlap between text chunks , by default `0`.
+
+        Parameters
+        ----------
+        value : int
+            Length of the overlap between text chunks
+        """
+        if value > self.getOrDefault(self.chunkSize):
+            raise ValueError("Chunk overlap can't be larger than chunk size.")
+        return self._set(chunkOverlap=value)
+
+    def setSplitPatterns(self, value):
+        """Sets patterns to separate the text by in decreasing priority , by default `["\n\n", "\n", " ", ""]`.
+
+        Parameters
+        ----------
+        value : List[str]
+            Patterns to separate the text by in decreasing priority
+        """
+        if len(value) == 0:
+            raise ValueError("Patterns are empty")
+
+        return self._set(splitPatterns=value)
+
+    def setPatternsAreRegex(self, value):
+        """Sets whether to interpret the split patterns as regular expressions , by default `False`.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to interpret the split patterns as regular expressions
+        """
+        return self._set(patternsAreRegex=value)
+
+    def setKeepSeparators(self, value):
+        """Sets whether to keep the separators in the final result , by default `True`.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to keep the separators in the final result
+        """
+        return self._set(keepSeparators=value)
+
+    def setExplodeSplits(self, value):
+        """Sets whether to explode split chunks to separate rows , by default `False`.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to explode split chunks to separate rows
+        """
+        return self._set(explodeSplits=value)
+
+    def setTrimWhitespace(self, value):
+        """Sets whether to trim whitespaces of extracted chunks , by default `True`.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to trim whitespaces of extracted chunks
+        """
+        return self._set(trimWhitespace=value)