spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
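Note on upgrading: this diff spans two major releases (4.2.6 → 6.2.1). The sketch below is not part of the diff; it assumes the package is installed from PyPI under the name spark-nlp and uses the existing sparknlp.start() and sparknlp.version() entry points to confirm which version is active.

    # Not part of the diff: a minimal upgrade check, assuming installation from PyPI,
    # e.g. `pip install --upgrade spark-nlp==6.2.1`.
    import sparknlp

    spark = sparknlp.start()       # starts a Spark session with the matching Spark NLP jar
    print(sparknlp.version())      # should report 6.2.1 after upgrading the wheel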
sparknlp/annotator/document_character_text_splitter.py
@@ -0,0 +1,228 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the DocumentNormalizer"""
+ from sparknlp.common import *
+
+
+ class DocumentCharacterTextSplitter(AnnotatorModel):
+ """Annotator which splits large documents into chunks of roughly given size.
+
+ DocumentCharacterTextSplitter takes a list of separators. It takes the separators in order and
+ splits subtexts if they are over the chunk length, considering optional overlap of the chunks.
+
+ For example, given chunk size 20 and overlap 5:
+
+ .. code-block:: python
+
+ "He was, I take it, the most perfect reasoning and observing machine that the world has seen."
+
+ ["He was, I take it,", "it, the most", "most perfect", "reasoning and", "and observing", "machine that the", "the world has seen."]
+
+
+ Additionally, you can set
+
+ - custom patterns with setSplitPatterns
+ - whether patterns should be interpreted as regex with setPatternsAreRegex
+ - whether to keep the separators with setKeepSeparators
+ - whether to trim whitespaces with setTrimWhitespace
+ - whether to explode the splits to individual rows with setExplodeSplits
+
+ For extended examples of usage, see the
+ `DocumentCharacterTextSplitterTest <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala>`__.
+
+ ====================== ======================
+ Input Annotation types Output Annotation type
+ ====================== ======================
+ ``DOCUMENT`` ``DOCUMENT``
+ ====================== ======================
+
+ Parameters
+ ----------
+
+ chunkSize
+ Size of each chunk of text.
+ chunkOverlap
+ Length of the overlap between text chunks , by default `0`.
+ splitPatterns
+ Patterns to separate the text by in decreasing priority , by default `["\\n\\n", "\\n", " ", ""]`.
+ patternsAreRegex
+ Whether to interpret the split patterns as regular expressions , by default `False`.
+ keepSeparators
+ Whether to keep the separators in the final result , by default `True`.
+ explodeSplits
+ Whether to explode split chunks to separate rows , by default `False`.
+ trimWhitespace
+ Whether to trim whitespaces of extracted chunks , by default `True`.
+
+ Examples
+ --------
+ >>> import sparknlp
+ >>> from sparknlp.base import *
+ >>> from sparknlp.annotator import *
+ >>> from pyspark.ml import Pipeline
+ >>> textDF = spark.read.text(
+ ... "sherlockholmes.txt",
+ ... wholetext=True
+ ... ).toDF("text")
+ >>> documentAssembler = DocumentAssembler().setInputCol("text")
+ >>> textSplitter = DocumentCharacterTextSplitter() \\
+ ... .setInputCols(["document"]) \\
+ ... .setOutputCol("splits") \\
+ ... .setChunkSize(20000) \\
+ ... .setChunkOverlap(200) \\
+ ... .setExplodeSplits(True)
+ >>> pipeline = Pipeline().setStages([documentAssembler, textSplitter])
+ >>> result = pipeline.fit(textDF).transform(textDF)
+ >>> result.selectExpr(
+ ... "splits.result",
+ ... "splits[0].begin",
+ ... "splits[0].end",
+ ... "splits[0].end - splits[0].begin as length") \\
+ ... .show(8, truncate = 80)
+ +--------------------------------------------------------------------------------+---------------+-------------+------+
+ | result|splits[0].begin|splits[0].end|length|
+ +--------------------------------------------------------------------------------+---------------+-------------+------+
+ |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 19994| 19994|
+ |["And Mademoiselle's address?" he asked.\\n\\n"Is Briony Lodge, Serpentine Aven...| 19798| 39395| 19597|
+ |["How did that help you?"\\n\\n"It was all-important. When a woman thinks that ...| 39371| 59242| 19871|
+ |["'But,' said I, 'there would be millions of red-headed men who\\nwould apply....| 59166| 77833| 18667|
+ |[My friend was an enthusiastic musician, being himself not only a\\nvery capab...| 77835| 97769| 19934|
+ |["And yet I am not convinced of it," I answered. "The cases which\\ncome to li...| 97771| 117248| 19477|
+ |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\\nfeather of...| 117250| 137242| 19992|
+ |["That sounds a little paradoxical."\\n\\n"But it is profoundly True. Singulari...| 137244| 157171| 19927|
+ +--------------------------------------------------------------------------------+---------------+-------------+------+
+
+ """
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+ outputAnnotatorType = AnnotatorType.DOCUMENT
+
+ chunkSize = Param(Params._dummy(),
+ "chunkSize",
+ "Size of each chunk of text",
+ typeConverter=TypeConverters.toInt)
+ chunkOverlap = Param(Params._dummy(),
+ "chunkOverlap",
+ "Length of the overlap between text chunks",
+ typeConverter=TypeConverters.toInt)
+ splitPatterns = Param(Params._dummy(),
+ "splitPatterns",
+ "Patterns to separate the text by in decreasing priority",
+ typeConverter=TypeConverters.toListString)
+ patternsAreRegex = Param(Params._dummy(),
+ "patternsAreRegex",
+ "Whether to interpret the split patterns as regular expressions",
+ typeConverter=TypeConverters.toBoolean)
+ keepSeparators = Param(Params._dummy(),
+ "keepSeparators",
+ "Whether to keep the separators in the final result",
+ typeConverter=TypeConverters.toBoolean)
+ explodeSplits = Param(Params._dummy(),
+ "explodeSplits",
+ "Whether to explode split chunks to separate rows",
+ typeConverter=TypeConverters.toBoolean)
+ trimWhitespace = Param(Params._dummy(),
+ "trimWhitespace",
+ "Whether to trim whitespaces of extracted chunks",
+ typeConverter=TypeConverters.toBoolean)
+
+ @keyword_only
+ def __init__(self):
+ super(DocumentCharacterTextSplitter, self).__init__(
+ classname="com.johnsnowlabs.nlp.annotators.DocumentCharacterTextSplitter")
+ self._setDefault(
+ chunkOverlap=0,
+ explodeSplits=False,
+ keepSeparators=True,
+ patternsAreRegex=False,
+ splitPatterns=["\n\n", "\n", " ", ""],
+ trimWhitespace=True
+ )
+
+ def setChunkSize(self, value):
+ """Sets size of each chunk of text.
+
+ Parameters
+ ----------
+ value : int
+ Size of each chunk of text
+ """
+ if value < 1:
+ raise ValueError("Chunk size should be larger than 0.")
+ return self._set(chunkSize=value)
+
+ def setChunkOverlap(self, value):
+ """Sets length of the overlap between text chunks , by default `0`.
+
+ Parameters
+ ----------
+ value : int
+ Length of the overlap between text chunks
+ """
+ if value > self.getOrDefault(self.chunkSize):
+ raise ValueError("Chunk overlap can't be larger than chunk size.")
+ return self._set(chunkOverlap=value)
+
+ def setSplitPatterns(self, value):
+ """Sets patterns to separate the text by in decreasing priority , by default `["\n\n", "\n", " ", ""]`.
+
+ Parameters
+ ----------
+ value : List[str]
+ Patterns to separate the text by in decreasing priority
+ """
+ if len(value) == 0:
+ raise ValueError("Patterns are empty")
+
+ return self._set(splitPatterns=value)
+
+ def setPatternsAreRegex(self, value):
+ """Sets whether to interpret the split patterns as regular expressions , by default `False`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to interpret the split patterns as regular expressions
+ """
+ return self._set(patternsAreRegex=value)
+
+ def setKeepSeparators(self, value):
+ """Sets whether to keep the separators in the final result , by default `True`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to keep the separators in the final result
+ """
+ return self._set(keepSeparators=value)
+
+ def setExplodeSplits(self, value):
+ """Sets whether to explode split chunks to separate rows , by default `False`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to explode split chunks to separate rows
+ """
+ return self._set(explodeSplits=value)
+
+ def setTrimWhitespace(self, value):
+ """Sets whether to trim whitespaces of extracted chunks , by default `True`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to trim whitespaces of extracted chunks
+ """
+ return self._set(trimWhitespace=value)
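The hunk above adds DocumentCharacterTextSplitter. Below is a minimal usage sketch (not part of the diff), mirroring the docstring example but with smaller chunks; note from the setters above that the value passed to setChunkOverlap must not exceed the chunk size, otherwise a ValueError is raised. The column names are illustrative.

    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import *

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

    # setChunkSize comes first; setChunkOverlap validates against it and must stay smaller.
    textSplitter = (
        DocumentCharacterTextSplitter()
        .setInputCols(["document"])
        .setOutputCol("splits")
        .setChunkSize(1000)
        .setChunkOverlap(100)
        .setExplodeSplits(True)
    )

    pipeline = Pipeline().setStages([documentAssembler, textSplitter])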
sparknlp/annotator/document_normalizer.py
@@ -23,7 +23,8 @@ class DocumentNormalizer(AnnotatorModel):
  patterns. Can apply not wanted character removal with a specific policy.
  Can apply lower case normalization.

- For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+ For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb
+ >`__.

  ====================== ======================
  Input Annotation types Output Annotation type
@@ -121,6 +122,21 @@ class DocumentNormalizer(AnnotatorModel):
  "file encoding to apply on normalized documents",
  typeConverter=TypeConverters.toString)

+ presetPattern = Param(
+ Params._dummy(),
+ "presetPattern",
+ "Selects a single text cleaning function from the functional presets (e.g., 'CLEAN_BULLETS', 'CLEAN_DASHES', etc.).",
+ typeConverter=TypeConverters.toString
+ )
+
+ autoMode = Param(
+ Params._dummy(),
+ "autoMode",
+ "Enables a predefined cleaning mode combining multiple text cleaner functions (e.g., 'light_clean', 'document_clean', 'html_clean', 'full_auto').",
+ typeConverter=TypeConverters.toString
+ )
+
+
  @keyword_only
  def __init__(self):
  super(DocumentNormalizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.DocumentNormalizer")
@@ -197,3 +213,23 @@ class DocumentNormalizer(AnnotatorModel):
  """
  return self._set(encoding=value)

+ def setPresetPattern(self, value):
+ """Sets a single text cleaning preset pattern.
+
+ Parameters
+ ----------
+ value : str
+ Preset cleaning pattern name, e.g., 'CLEAN_BULLETS', 'CLEAN_DASHES'.
+ """
+ return self._set(presetPattern=value)
+
+
+ def setAutoMode(self, value):
+ """Sets an automatic text cleaning mode using predefined groups of cleaning functions.
+
+ Parameters
+ ----------
+ value : str
+ Auto cleaning mode, e.g., 'light_clean', 'document_clean', 'social_clean', 'html_clean', 'full_auto'.
+ """
+ return self._set(autoMode=value)
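The hunks above add presetPattern and autoMode to DocumentNormalizer. The sketch below (not part of the diff) shows how the two new setters might be used; the preset and mode names are taken from the parameter descriptions above, and whether the two settings can be combined in a single stage is not shown in the diff, so each normalizer here uses only one of them.

    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import DocumentNormalizer

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

    # Single functional preset (names per the presetPattern description above).
    presetNormalizer = (
        DocumentNormalizer()
        .setInputCols(["document"])
        .setOutputCol("normalized")
        .setPresetPattern("CLEAN_BULLETS")
    )

    # Predefined combination of cleaners (names per the autoMode description above).
    autoNormalizer = (
        DocumentNormalizer()
        .setInputCols(["document"])
        .setOutputCol("normalized")
        .setAutoMode("document_clean")
    )

    pipeline = Pipeline().setStages([documentAssembler, autoNormalizer])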
sparknlp/annotator/document_token_splitter.py
@@ -0,0 +1,175 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the DocumentNormalizer"""
+ from sparknlp.common import *
+
+
+ class DocumentTokenSplitter(AnnotatorModel):
+ """Annotator that splits large documents into smaller documents based on the number of tokens in
+ the text.
+
+ Currently, DocumentTokenSplitter splits the text by whitespaces to create the tokens. The
+ number of these tokens will then be used as a measure of the text length. In the future, other
+ tokenization techniques will be supported.
+
+ For example, given 3 tokens and overlap 1:
+
+ .. code-block:: python
+
+ He was, I take it, the most perfect reasoning and observing machine that the world has seen.
+
+ ["He was, I", "I take it,", "it, the most", "most perfect reasoning", "reasoning and observing", "observing machine that", "that the world", "world has seen."]
+
+
+ Additionally, you can set
+
+ - whether to trim whitespaces with setTrimWhitespace
+ - whether to explode the splits to individual rows with setExplodeSplits
+
+ For extended examples of usage, see the
+ `DocumentTokenSplitterTest <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala>`__.
+
+ ====================== ======================
+ Input Annotation types Output Annotation type
+ ====================== ======================
+ ``DOCUMENT`` ``DOCUMENT``
+ ====================== ======================
+
+ Parameters
+ ----------
+
+ numTokens
+ Limit of the number of tokens in a text
+ tokenOverlap
+ Length of the token overlap between text chunks, by default `0`.
+ explodeSplits
+ Whether to explode split chunks to separate rows, by default `False`.
+ trimWhitespace
+ Whether to trim whitespaces of extracted chunks, by default `True`.
+
+ Examples
+ --------
+ >>> import sparknlp
+ >>> from sparknlp.base import *
+ >>> from sparknlp.annotator import *
+ >>> from pyspark.ml import Pipeline
+ >>> textDF = spark.read.text(
+ ... "sherlockholmes.txt",
+ ... wholetext=True
+ ... ).toDF("text")
+ >>> documentAssembler = DocumentAssembler().setInputCol("text")
+ >>> textSplitter = DocumentTokenSplitter() \\
+ ... .setInputCols(["document"]) \\
+ ... .setOutputCol("splits") \\
+ ... .setNumTokens(512) \\
+ ... .setTokenOverlap(10) \\
+ ... .setExplodeSplits(True)
+ >>> pipeline = Pipeline().setStages([documentAssembler, textSplitter])
+ >>> result = pipeline.fit(textDF).transform(textDF)
+ >>> result.selectExpr(
+ ... "splits.result as result",
+ ... "splits[0].begin as begin",
+ ... "splits[0].end as end",
+ ... "splits[0].end - splits[0].begin as length",
+ ... "splits[0].metadata.numTokens as tokens") \\
+ ... .show(8, truncate = 80)
+ +--------------------------------------------------------------------------------+-----+-----+------+------+
+ | result|begin| end|length|tokens|
+ +--------------------------------------------------------------------------------+-----+-----+------+------+
+ |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 3018| 3018| 512|
+ |[study of crime, and occupied his\\nimmense faculties and extraordinary powers...| 2950| 5707| 2757| 512|
+ |[but as I have changed my clothes I can't imagine how you\\ndeduce it. As to M...| 5659| 8483| 2824| 512|
+ |[quarters received. Be in your chamber then at that hour, and do\\nnot take it...| 8427|11241| 2814| 512|
+ |[a pity\\nto miss it."\\n\\n"But your client--"\\n\\n"Never mind him. I may want y...|11188|13970| 2782| 512|
+ |[person who employs me wishes his agent to be unknown to\\nyou, and I may conf...|13918|16898| 2980| 512|
+ |[letters back."\\n\\n"Precisely so. But how--"\\n\\n"Was there a secret marriage?...|16836|19744| 2908| 512|
+ |[seven hundred in\\nnotes," he said.\\n\\nHolmes scribbled a receipt upon a shee...|19683|22551| 2868| 512|
+ +--------------------------------------------------------------------------------+-----+-----+------+------+
+
+ """
+
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+ outputAnnotatorType = AnnotatorType.DOCUMENT
+
+ numTokens = Param(Params._dummy(),
+ "numTokens",
+ "Limit of the number of tokens in a text",
+ typeConverter=TypeConverters.toInt)
+ tokenOverlap = Param(Params._dummy(),
+ "tokenOverlap",
+ "Length of the token overlap between text chunks",
+ typeConverter=TypeConverters.toInt)
+ explodeSplits = Param(Params._dummy(),
+ "explodeSplits",
+ "Whether to explode split chunks to separate rows",
+ typeConverter=TypeConverters.toBoolean)
+ trimWhitespace = Param(Params._dummy(),
+ "trimWhitespace",
+ "Whether to trim whitespaces of extracted chunks",
+ typeConverter=TypeConverters.toBoolean)
+
+ @keyword_only
+ def __init__(self):
+ super(DocumentTokenSplitter, self).__init__(
+ classname="com.johnsnowlabs.nlp.annotators.DocumentTokenSplitter")
+ self._setDefault(
+ tokenOverlap=0,
+ explodeSplits=False,
+ trimWhitespace=True
+ )
+
+ def setNumTokens(self, value):
+ """Sets the limit of the number of tokens in a text
+
+ Parameters
+ ----------
+ value : int
+ Number of tokens in a text
+ """
+ if value < 1:
+ raise ValueError("Number of tokens should be larger than 0.")
+ return self._set(numTokens=value)
+
+ def setTokenOverlap(self, value):
+ """Length of the token overlap between text chunks, by default `0`.
+
+ Parameters
+ ----------
+ value : int
+ Length of the token overlap between text chunks
+ """
+ if value > self.getOrDefault(self.numTokens):
+ raise ValueError("Token overlap can't be larger than number of tokens.")
+ return self._set(tokenOverlap=value)
+
+ def setExplodeSplits(self, value):
+ """Sets whether to explode split chunks to separate rows, by default `False`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to explode split chunks to separate rows
+ """
+ return self._set(explodeSplits=value)
+
+ def setTrimWhitespace(self, value):
+ """Sets whether to trim whitespaces of extracted chunks, by default `True`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to trim whitespaces of extracted chunks
+ """
+ return self._set(trimWhitespace=value)
sparknlp/annotator/document_token_splitter_test.py
@@ -0,0 +1,85 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import unittest
+
+ import pytest
+
+ from sparknlp.annotator import *
+ from sparknlp.base import *
+ from test.util import SparkSessionForTest
+
+
+ @pytest.mark.fast
+ class DocumentTokenSplitterTestSpec(unittest.TestCase):
+ def setUp(self):
+ self.data = SparkSessionForTest.spark.createDataFrame(
+ [
+ [
+ (
+ "All emotions, and that\none particularly, were abhorrent to his cold, precise"
+ " but\nadmirably balanced mind.\n\nHe was, I take it, the most perfect\nreasoning"
+ " and observing machine that the world has seen."
+ )
+ ]
+ ]
+ ).toDF("text")
+
+ def test_run(self):
+ df = self.data
+
+ document_assembler = (
+ DocumentAssembler().setInputCol("text").setOutputCol("document")
+ )
+
+ document_token_splitter = (
+ DocumentTokenSplitter()
+ .setInputCols("document")
+ .setOutputCol("splits")
+ .setNumTokens(3)
+ .setTokenOverlap(1)
+ .setExplodeSplits(True)
+ .setTrimWhitespace(True)
+ )
+
+ pipeline = Pipeline().setStages([document_assembler, document_token_splitter])
+
+ pipeline_df = pipeline.fit(df).transform(df)
+
+ results = pipeline_df.select("splits").collect()
+
+ splits = [
+ row["splits"][0].result.replace("\n\n", " ").replace("\n", " ")
+ for row in results
+ ]
+
+ expected = [
+ "All emotions, and",
+ "and that one",
+ "one particularly, were",
+ "were abhorrent to",
+ "to his cold,",
+ "cold, precise but",
+ "but admirably balanced",
+ "balanced mind. He",
+ "He was, I",
+ "I take it,",
+ "it, the most",
+ "most perfect reasoning",
+ "reasoning and observing",
+ "observing machine that",
+ "that the world",
+ "world has seen.",
+ ]
+
+ assert splits == expected
sparknlp/annotator/embeddings/__init__.py
@@ -22,7 +22,11 @@ from sparknlp.annotator.embeddings.deberta_embeddings import *
  from sparknlp.annotator.embeddings.distil_bert_embeddings import *
  from sparknlp.annotator.embeddings.doc2vec import *
  from sparknlp.annotator.embeddings.elmo_embeddings import *
+ from sparknlp.annotator.embeddings.e5_embeddings import *
+ from sparknlp.annotator.embeddings.instructor_embeddings import *
  from sparknlp.annotator.embeddings.longformer_embeddings import *
+ from sparknlp.annotator.embeddings.minilm_embeddings import *
+ from sparknlp.annotator.embeddings.mpnet_embeddings import *
  from sparknlp.annotator.embeddings.roberta_embeddings import *
  from sparknlp.annotator.embeddings.roberta_sentence_embeddings import *
  from sparknlp.annotator.embeddings.sentence_embeddings import *
@@ -32,3 +36,10 @@ from sparknlp.annotator.embeddings.word_embeddings import *
  from sparknlp.annotator.embeddings.xlm_roberta_embeddings import *
  from sparknlp.annotator.embeddings.xlm_roberta_sentence_embeddings import *
  from sparknlp.annotator.embeddings.xlnet_embeddings import *
+ from sparknlp.annotator.embeddings.bge_embeddings import *
+ from sparknlp.annotator.embeddings.uae_embeddings import *
+ from sparknlp.annotator.embeddings.mxbai_embeddings import *
+ from sparknlp.annotator.embeddings.snowflake_embeddings import *
+ from sparknlp.annotator.embeddings.nomic_embeddings import *
+ from sparknlp.annotator.embeddings.auto_gguf_embeddings import *
+ from sparknlp.annotator.embeddings.e5v_embeddings import *
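With the wildcard imports above, the newer embedding annotators (E5, Instructor, MiniLM, MPNet, BGE, UAE, MxBAI, Snowflake, Nomic, AutoGGUF, E5V) resolve directly from sparknlp.annotator. A minimal sketch (not part of the diff), assuming E5Embeddings follows the usual pretrained() pattern of the other annotators; the model name "e5_small" is a placeholder and should be checked against the Models Hub.

    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import E5Embeddings

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

    # "e5_small" is a placeholder model name; the annotator produces sentence
    # embeddings for each input document.
    embeddings = (
        E5Embeddings.pretrained("e5_small", "en")
        .setInputCols(["document"])
        .setOutputCol("e5_embeddings")
    )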
sparknlp/annotator/embeddings/albert_embeddings.py
@@ -21,7 +21,8 @@ class AlbertEmbeddings(AnnotatorModel,
  HasCaseSensitiveProperties,
  HasStorageRef,
  HasBatchedAnnotate,
- HasEngine):
+ HasEngine,
+ HasMaxSentenceLengthLimit):
  """ALBERT: A Lite Bert For Self-Supervised Learning Of Language
  Representations - Google Research, Toyota Technological Institute at Chicago

@@ -53,8 +54,8 @@ class AlbertEmbeddings(AnnotatorModel,

  The default model is ``"albert_base_uncased"``, if no name is provided.

- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_albert.ipynb>`__.
+ For extended examples of usage, see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_albert.ipynb>`__.
  To see which models are compatible and how to import them see
  `Import Transformers into Spark NLP 🚀
  <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
@@ -163,11 +164,6 @@ class AlbertEmbeddings(AnnotatorModel,
  "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
  TypeConverters.toListInt)

- maxSentenceLength = Param(Params._dummy(),
- "maxSentenceLength",
- "Max sentence length to process",
- typeConverter=TypeConverters.toInt)
-
  def setConfigProtoBytes(self, b):
  """Sets configProto from tensorflow, serialized into byte array.

@@ -178,16 +174,6 @@ class AlbertEmbeddings(AnnotatorModel,
  """
  return self._set(configProtoBytes=b)

- def setMaxSentenceLength(self, value):
- """Sets max sentence length to process.
-
- Parameters
- ----------
- value : int
- Max sentence length to process
- """
- return self._set(maxSentenceLength=value)
-
  @keyword_only
  def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.AlbertEmbeddings", java_model=None):
  super(AlbertEmbeddings, self).__init__(
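The removals above do not drop functionality: AlbertEmbeddings now mixes in HasMaxSentenceLengthLimit (see the first hunk of this file), which is expected to supply the maxSentenceLength parameter and its setter with an enforced upper bound. A sketch (not part of the diff) of the unchanged call site, using the default model name quoted in the docstring above:

    from sparknlp.annotator import AlbertEmbeddings

    # setMaxSentenceLength is now inherited from the HasMaxSentenceLengthLimit mixin
    # rather than being defined on AlbertEmbeddings itself; the call looks the same.
    embeddings = (
        AlbertEmbeddings.pretrained("albert_base_uncased", "en")
        .setInputCols(["document", "token"])
        .setOutputCol("embeddings")
        .setMaxSentenceLength(512)
    )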