PyPI - spark-nlp - Versions diffs - 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl - Mend

spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (221)

com/johnsnowlabs/ml/__init__.py +0 -0
com/johnsnowlabs/ml/ai/__init__.py +10 -0
spark_nlp-6.2.1.dist-info/METADATA +362 -0
spark_nlp-6.2.1.dist-info/RECORD +292 -0
{spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
sparknlp/__init__.py +81 -28
sparknlp/annotation.py +3 -2
sparknlp/annotator/__init__.py +6 -0
sparknlp/annotator/audio/__init__.py +2 -0
sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
sparknlp/{base → annotator}/chunk2_doc.py +4 -7
sparknlp/annotator/chunker.py +1 -2
sparknlp/annotator/classifier_dl/__init__.py +17 -0
sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
sparknlp/annotator/cleaners/__init__.py +15 -0
sparknlp/annotator/cleaners/cleaner.py +202 -0
sparknlp/annotator/cleaners/extractor.py +191 -0
sparknlp/annotator/coref/spanbert_coref.py +4 -18
sparknlp/annotator/cv/__init__.py +15 -0
sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
sparknlp/annotator/cv/florence2_transformer.py +180 -0
sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
sparknlp/annotator/dataframe_optimizer.py +216 -0
sparknlp/annotator/date2_chunk.py +88 -0
sparknlp/annotator/dependency/dependency_parser.py +2 -3
sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
sparknlp/annotator/document_character_text_splitter.py +228 -0
sparknlp/annotator/document_normalizer.py +37 -1
sparknlp/annotator/document_token_splitter.py +175 -0
sparknlp/annotator/document_token_splitter_test.py +85 -0
sparknlp/annotator/embeddings/__init__.py +11 -0
sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
sparknlp/annotator/embeddings/doc2vec.py +7 -1
sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
sparknlp/annotator/embeddings/word2vec.py +7 -1
sparknlp/annotator/embeddings/word_embeddings.py +4 -5
sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
sparknlp/annotator/er/entity_ruler.py +37 -23
sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
sparknlp/annotator/lemmatizer.py +3 -4
sparknlp/annotator/matcher/date_matcher.py +35 -3
sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
sparknlp/annotator/matcher/regex_matcher.py +3 -3
sparknlp/annotator/matcher/text_matcher.py +2 -3
sparknlp/annotator/n_gram_generator.py +1 -2
sparknlp/annotator/ner/__init__.py +3 -1
sparknlp/annotator/ner/ner_converter.py +18 -0
sparknlp/annotator/ner/ner_crf.py +4 -5
sparknlp/annotator/ner/ner_dl.py +10 -5
sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
sparknlp/annotator/ner/ner_overwriter.py +2 -2
sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
sparknlp/annotator/normalizer.py +2 -2
sparknlp/annotator/openai/__init__.py +16 -0
sparknlp/annotator/openai/openai_completion.py +349 -0
sparknlp/annotator/openai/openai_embeddings.py +106 -0
sparknlp/annotator/pos/perceptron.py +6 -7
sparknlp/annotator/sentence/sentence_detector.py +2 -2
sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
sparknlp/annotator/seq2seq/__init__.py +17 -0
sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
sparknlp/annotator/similarity/__init__.py +0 -0
sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
sparknlp/annotator/stemmer.py +2 -3
sparknlp/annotator/stop_words_cleaner.py +3 -4
sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
sparknlp/annotator/token/__init__.py +0 -1
sparknlp/annotator/token/recursive_tokenizer.py +2 -3
sparknlp/annotator/token/tokenizer.py +2 -3
sparknlp/annotator/ws/word_segmenter.py +35 -10
sparknlp/base/__init__.py +2 -3
sparknlp/base/doc2_chunk.py +0 -3
sparknlp/base/document_assembler.py +5 -5
sparknlp/base/embeddings_finisher.py +14 -2
sparknlp/base/finisher.py +15 -4
sparknlp/base/gguf_ranking_finisher.py +234 -0
sparknlp/base/image_assembler.py +69 -0
sparknlp/base/light_pipeline.py +53 -21
sparknlp/base/multi_document_assembler.py +9 -13
sparknlp/base/prompt_assembler.py +207 -0
sparknlp/base/token_assembler.py +1 -2
sparknlp/common/__init__.py +2 -0
sparknlp/common/annotator_type.py +1 -0
sparknlp/common/completion_post_processing.py +37 -0
sparknlp/common/match_strategy.py +33 -0
sparknlp/common/properties.py +914 -9
sparknlp/internal/__init__.py +841 -116
sparknlp/internal/annotator_java_ml.py +1 -1
sparknlp/internal/annotator_transformer.py +3 -0
sparknlp/logging/comet.py +2 -2
sparknlp/partition/__init__.py +16 -0
sparknlp/partition/partition.py +244 -0
sparknlp/partition/partition_properties.py +902 -0
sparknlp/partition/partition_transformer.py +200 -0
sparknlp/pretrained/pretrained_pipeline.py +1 -1
sparknlp/pretrained/resource_downloader.py +126 -2
sparknlp/reader/__init__.py +15 -0
sparknlp/reader/enums.py +19 -0
sparknlp/reader/pdf_to_text.py +190 -0
sparknlp/reader/reader2doc.py +124 -0
sparknlp/reader/reader2image.py +136 -0
sparknlp/reader/reader2table.py +44 -0
sparknlp/reader/reader_assembler.py +159 -0
sparknlp/reader/sparknlp_reader.py +461 -0
sparknlp/training/__init__.py +1 -0
sparknlp/training/conll.py +8 -2
sparknlp/training/spacy_to_annotation.py +57 -0
sparknlp/util.py +26 -0
spark_nlp-4.2.6.dist-info/METADATA +0 -1256
spark_nlp-4.2.6.dist-info/RECORD +0 -196
{spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0

sparknlp/annotator/er/entity_ruler.py CHANGED Viewed

@@ -27,9 +27,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
     to be set as the "format" field in the ``option`` parameter map and
     depending on the file type, additional parameters might need to be set.
-    To enable regex extraction, ``setEnablePatternRegex(True)`` needs to be
-    called.
     If the file is in a JSON format, then the rule definitions need to be given
     in a list with the fields "id", "label" and "patterns"::
@@ -71,8 +68,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
     ----------
     patternsResource
         Resource in JSON or CSV format to map entities to patterns
-    enablePatternRegex
-        Enables regex pattern match
     useStorage
         Whether to use RocksDB storage to serialize patterns
@@ -106,8 +101,7 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
     ...       "patterns.csv",
     ...       ReadAs.TEXT,
     ...       {"format": "csv", "delimiter": "\\\\|"}
-    ...     ) \\
-    ...     .setEnablePatternRegex(True)
+    ...     )
     >>> pipeline = Pipeline().setStages([
     ...     documentAssembler,
     ...     tokenizer,
@@ -135,11 +129,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
                              "Resource in JSON or CSV format to map entities to patterns",
                              typeConverter=TypeConverters.identity)
-    enablePatternRegex = Param(Params._dummy(),
-                               "enablePatternRegex",
-                               "Enables regex pattern match",
-                               typeConverter=TypeConverters.toBoolean)
     useStorage = Param(Params._dummy(),
                        "useStorage",
                        "Whether to use RocksDB storage to serialize patterns",
@@ -174,16 +163,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
         """
         return self._set(patternsResource=ExternalResource(path, read_as, options))
-    def setEnablePatternRegex(self, value):
-        """Sets whether to enable regex pattern matching.
-        Parameters
-        ----------
-        value : bool
-            Whether to enable regex pattern matching.
-        """
-        return self._set(enablePatternRegex=value)
     def setUseStorage(self, value):
         """Sets whether to use RocksDB storage to serialize patterns.
@@ -236,6 +215,20 @@ class EntityRulerModel(AnnotatorModel, HasStorageModel):
     outputAnnotatorType = AnnotatorType.CHUNK
+    autoMode = Param(
+        Params._dummy(),
+        "autoMode",
+        "Enable built-in regex presets that combine related entity patterns (e.g., 'communication_entities', 'network_entities', 'media_entities', etc.).",
+        typeConverter=TypeConverters.toString
+    )
+    extractEntities = Param(
+        Params._dummy(),
+        "extractEntities",
+        "List of entity types to extract. If not set, all entities in the active autoMode or from regexPatterns are used.",
+        typeConverter=TypeConverters.toListString
+    )
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.er.EntityRulerModel", java_model=None):
         super(EntityRulerModel, self).__init__(
             classname=classname,
@@ -249,5 +242,26 @@ class EntityRulerModel(AnnotatorModel, HasStorageModel):
     @staticmethod
     def loadStorage(path, spark, storage_ref):
-        HasStorageModel.loadStorages(path, spark, storage_ref, EntityRulerModel.databases)
+        HasStorageModel.loadStorages(path, spark, storage_ref, EntityRulerModel.database)
+    def setAutoMode(self, value):
+        """Sets the auto mode for predefined regex entity groups.
+        Parameters
+        ----------
+        value : str
+            Name of the auto mode to activate (e.g., 'communication_entities', 'network_entities', etc.)
+        """
+        return self._set(autoMode=value)
+    def setExtractEntities(self, value):
+        """Sets specific entities to extract, filtering only those defined in regexPatterns or autoMode.
+        Parameters
+        ----------
+        value : list[str]
+            List of entity names to extract, e.g., ['EMAIL_ADDRESS_PATTERN', 'IPV4_PATTERN'].
+        """
+        return self._set(extractEntities=value)

sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py CHANGED Viewed

@@ -44,8 +44,8 @@ class YakeKeywordExtraction(AnnotatorModel):
     lower the score better the keyword). Therefore to filter the keywords, an
     upper bound for the score can be set with :meth:`.setThreshold`.
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/8.Keyword_Extraction_YAKE.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -268,4 +268,3 @@ class YakeKeywordExtraction(AnnotatorModel):
         from pyspark.ml.wrapper import _jvm
         stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover
         return list(stopWordsObj.loadDefaultStopWords(language))

sparknlp/annotator/ld_dl/language_detector_dl.py CHANGED Viewed

@@ -37,9 +37,9 @@ class LanguageDetectorDL(AnnotatorModel, HasStorageRef, HasEngine):
     The default model is ``"ld_wiki_tatoeba_cnn_21"``, default language is
     ``"xx"`` (meaning multi-lingual), if no values are provided.
-    For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models?task=Language+Detection>`__.
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Language+Detection>`__.
-    For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/language-detection/Language_Detection_and_Indentification.ipynb>`__.
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type

sparknlp/annotator/lemmatizer.py CHANGED Viewed

@@ -24,8 +24,8 @@ class Lemmatizer(AnnotatorApproach):
     For instantiated/pretrained models, see :class:`.LemmatizerModel`.
-    For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models?task=Lemmatization>`__.
-    For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -194,7 +194,7 @@ class LemmatizerModel(AnnotatorModel):
     ...     .setInputCols(["token"]) \\
     ...     .setOutputCol("lemma")
-    For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models?task=Lemmatization>`__.
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -248,4 +248,3 @@ class LemmatizerModel(AnnotatorModel):
         """
         from sparknlp.pretrained import ResourceDownloader
         return ResourceDownloader.downloadModel(LemmatizerModel, name, lang, remote_loc)

sparknlp/annotator/matcher/date_matcher.py CHANGED Viewed

@@ -67,6 +67,16 @@ class DateMatcherUtils(Params):
                            "source language for explicit translation",
                            typeConverter=TypeConverters.toString)
+    relaxedFactoryStrategy = Param(Params._dummy(),
+                                   "relaxedFactoryStrategy",
+                                   "Matched Strategy to searches relaxed dates",
+                                   typeConverter=TypeConverters.toString)
+    aggressiveMatching = Param(Params._dummy(),
+                               "aggressiveMatching",
+                               "Whether to aggressively attempt to find date matches, even in ambiguous or less common formats",
+                               typeConverter=TypeConverters.toBoolean)
     def setInputFormats(self, value):
         """Sets input formats patterns to match in the documents.
@@ -159,6 +169,29 @@ class DateMatcherUtils(Params):
         """
         return self._set(anchorDateDay=value)
+    def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
+        """ Sets matched strategy to search relaxed dates by ordered rules by more exhaustive to less Strategy.
+        Not all of the date information needs to be included. For example
+        ``"YYYY"`` is also a valid input.
+        Parameters
+        ----------
+        matchStrategy : MatchStrategy
+            Matched strategy to search relaxed dates by ordered rules by more exhaustive to less Strategy
+        """
+        return self._set(relaxedFactoryStrategy=matchStrategy)
+    def setAggressiveMatching(self, value):
+        """ Sets whether to aggressively attempt to find date matches, even in ambiguous or less common formats
+        Parameters
+        ----------
+        aggressiveMatching : Boolean
+            Whether to aggressively attempt to find date matches, even in ambiguous or less common formats
+        """
+        return self._set(aggressiveMatching=value)
 class DateMatcher(AnnotatorModel, DateMatcherUtils):
     """Matches standard date formats into a provided format
@@ -184,10 +217,10 @@ class DateMatcher(AnnotatorModel, DateMatcherUtils):
     ``2008/04/31``.
     Pretrained pipelines are available for this module, see
-    `Pipelines <https://nlp.johnsnowlabs.com/docs/en/pipelines>`__.
+    `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
     For extended examples of usage, see the
-    `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -268,4 +301,3 @@ class DateMatcher(AnnotatorModel, DateMatcherUtils):
             anchorDateMonth=-1,
             anchorDateDay=-1
         )

sparknlp/annotator/matcher/multi_date_matcher.py CHANGED Viewed

@@ -33,7 +33,7 @@ class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
     For example ``"The 31st of April in the year 2008"`` will be converted into
     ``2008/04/31``.
-    For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -107,4 +107,3 @@ class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
             readMonthFirst=True,
             defaultDayWhenMissing=1
         )

sparknlp/annotator/matcher/regex_matcher.py CHANGED Viewed

@@ -32,10 +32,10 @@ class RegexMatcher(AnnotatorApproach):
     delimited text file.
     Pretrained pipelines are available for this module, see `Pipelines
-    <https://nlp.johnsnowlabs.com/docs/en/pipelines>`__.
+    <https://sparknlp.org/docs/en/pipelines>`__.
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type

sparknlp/annotator/matcher/text_matcher.py CHANGED Viewed

@@ -24,8 +24,8 @@ class TextMatcher(AnnotatorApproach):
     A text file of predefined phrases must be provided with
     :meth:`.setEntities`.
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -288,4 +288,3 @@ class TextMatcherModel(AnnotatorModel):
         """
         from sparknlp.pretrained import ResourceDownloader
         return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)

sparknlp/annotator/n_gram_generator.py CHANGED Viewed

@@ -27,7 +27,7 @@ class NGramGenerator(AnnotatorModel):
     length is less than n (number of elements per n-gram), no n-grams are
     returned.
-    For more extended examples see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/chunking/NgramGenerator.ipynb>`__.
+    For more extended examples see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -139,4 +139,3 @@ class NGramGenerator(AnnotatorModel):
         if len(value) > 1:
             raise Exception("Delimiter should have length == 1")
         return self._set(delimiter=value)

sparknlp/annotator/ner/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-#  Copyright 2017-2022 John Snow Labs
+#  Copyright 2017-2023 John Snow Labs
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -16,4 +16,6 @@ from sparknlp.annotator.ner.ner_approach import *
 from sparknlp.annotator.ner.ner_converter import *
 from sparknlp.annotator.ner.ner_crf import *
 from sparknlp.annotator.ner.ner_dl import *
+from sparknlp.annotator.ner.ner_dl_graph_checker import *
 from sparknlp.annotator.ner.ner_overwriter import *
+from sparknlp.annotator.ner.zero_shot_ner_model import *

sparknlp/annotator/ner/ner_converter.py CHANGED Viewed

@@ -98,6 +98,13 @@ class NerConverter(AnnotatorModel):
         typeConverter=TypeConverters.toBoolean
     )
+    nerHasNoSchema = Param(
+        Params._dummy(),
+        "nerHasNoSchema",
+        "set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema",
+        typeConverter=TypeConverters.toBoolean
+    )
     def setWhiteList(self, entities):
         """Sets list of entities to process. The rest will be ignored.
@@ -124,6 +131,17 @@ class NerConverter(AnnotatorModel):
         """
         return self._set(preservePosition=value)
+    def setNerHasNoSchema(self, value):
+        """
+        set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema
+        Parameters
+        ----------
+        value : bool
+            set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema
+        """
+        return self._set(nerHasNoSchema=value)
     @keyword_only
     def __init__(self):
         super(NerConverter, self).__init__(

sparknlp/annotator/ner/ner_crf.py CHANGED Viewed

@@ -39,7 +39,7 @@ class NerCrfApproach(AnnotatorApproach, NerApproach):
     Optionally the user can provide an entity dictionary file with
     :meth:`.setExternalFeatures` for better accuracy.
-    For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/crf-ner/ner_dl_crf.ipynb>`__.
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
     ========================================= ======================
     Input Annotation types                    Output Annotation type
@@ -278,10 +278,10 @@ class NerCrfModel(AnnotatorModel):
     The default model is ``"ner_crf"``, if no name is provided. For available
     pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Named+Entity+Recognition>`__.
+    <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/model-downloader/Running_Pretrained_pipelines.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
     ========================================= ======================
     Input Annotation types                    Output Annotation type
@@ -395,4 +395,3 @@ class NerCrfModel(AnnotatorModel):
         """
         from sparknlp.pretrained import ResourceDownloader
         return ResourceDownloader.downloadModel(NerCrfModel, name, lang, remote_loc)

sparknlp/annotator/ner/ner_dl.py CHANGED Viewed

@@ -41,6 +41,11 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
     - a WordEmbeddingsModel (any embeddings can be chosen, e.g. BertEmbeddings
       for BERT based embeddings).
+    By default, collects all data points into memory for training. For larger datasets, use
+    ``setEnableMemoryOptimizer(true)``. This will optimize memory usage during training at the cost
+    of speed. Note that this annotator will use as much memory as the largest partition of the
+    input dataset, so we recommend repartitioning to batch sizes.
     Setting a test dataset to monitor model metrics can be done with
     ``.setTestDataset``. The method expects a path to a parquet file containing a
     dataframe that has the same required columns as the training dataframe. The
@@ -72,7 +77,7 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
     ...     .setOutputCol("ner") \\
     ...     .setTestDataset("test_data")
-    For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/tree/master/jupyter/training/english/dl-ner>`__.
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner>`__.
     ==================================== ======================
     Input Annotation types               Output Annotation type
@@ -420,16 +425,16 @@ class NerDLModel(AnnotatorModel, HasStorageRef, HasBatchedAnnotate, HasEngine):
     The default model is ``"ner_dl"``, if no name is provided.
     For available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Named+Entity+Recognition>`__.
+    <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
     Additionally, pretrained pipelines are available for this module, see
-    `Pipelines <https://nlp.johnsnowlabs.com/docs/en/pipelines>`__.
+    `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
     Note that some pretrained models require specific types of embeddings,
     depending on which they were trained on. For example, the default model
     ``"ner_dl"`` requires the WordEmbeddings ``"glove_100d"``.
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/3.SparkNLP_Pretrained_Models.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb>`__.
     ==================================== ======================
     Input Annotation types               Output Annotation type

spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl