spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +81 -28
- sparknlp/annotation.py +3 -2
- sparknlp/annotator/__init__.py +6 -0
- sparknlp/annotator/audio/__init__.py +2 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/{base → annotator}/chunk2_doc.py +4 -7
- sparknlp/annotator/chunker.py +1 -2
- sparknlp/annotator/classifier_dl/__init__.py +17 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/spanbert_coref.py +4 -18
- sparknlp/annotator/cv/__init__.py +15 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/dependency_parser.py +2 -3
- sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +37 -1
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +11 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
- sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
- sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
- sparknlp/annotator/embeddings/doc2vec.py +7 -1
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
- sparknlp/annotator/embeddings/word2vec.py +7 -1
- sparknlp/annotator/embeddings/word_embeddings.py +4 -5
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
- sparknlp/annotator/er/entity_ruler.py +37 -23
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
- sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
- sparknlp/annotator/lemmatizer.py +3 -4
- sparknlp/annotator/matcher/date_matcher.py +35 -3
- sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
- sparknlp/annotator/matcher/regex_matcher.py +3 -3
- sparknlp/annotator/matcher/text_matcher.py +2 -3
- sparknlp/annotator/n_gram_generator.py +1 -2
- sparknlp/annotator/ner/__init__.py +3 -1
- sparknlp/annotator/ner/ner_converter.py +18 -0
- sparknlp/annotator/ner/ner_crf.py +4 -5
- sparknlp/annotator/ner/ner_dl.py +10 -5
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +2 -2
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +2 -2
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/pos/perceptron.py +6 -7
- sparknlp/annotator/sentence/sentence_detector.py +2 -2
- sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
- sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
- sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
- sparknlp/annotator/seq2seq/__init__.py +17 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
- sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
- sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
- sparknlp/annotator/stemmer.py +2 -3
- sparknlp/annotator/stop_words_cleaner.py +3 -4
- sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
- sparknlp/annotator/token/__init__.py +0 -1
- sparknlp/annotator/token/recursive_tokenizer.py +2 -3
- sparknlp/annotator/token/tokenizer.py +2 -3
- sparknlp/annotator/ws/word_segmenter.py +35 -10
- sparknlp/base/__init__.py +2 -3
- sparknlp/base/doc2_chunk.py +0 -3
- sparknlp/base/document_assembler.py +5 -5
- sparknlp/base/embeddings_finisher.py +14 -2
- sparknlp/base/finisher.py +15 -4
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/image_assembler.py +69 -0
- sparknlp/base/light_pipeline.py +53 -21
- sparknlp/base/multi_document_assembler.py +9 -13
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/token_assembler.py +1 -2
- sparknlp/common/__init__.py +2 -0
- sparknlp/common/annotator_type.py +1 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +914 -9
- sparknlp/internal/__init__.py +841 -116
- sparknlp/internal/annotator_java_ml.py +1 -1
- sparknlp/internal/annotator_transformer.py +3 -0
- sparknlp/logging/comet.py +2 -2
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/pretrained_pipeline.py +1 -1
- sparknlp/pretrained/resource_downloader.py +126 -2
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +1 -0
- sparknlp/training/conll.py +8 -2
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/util.py +26 -0
- spark_nlp-4.2.6.dist-info/METADATA +0 -1256
- spark_nlp-4.2.6.dist-info/RECORD +0 -196
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
- /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
|
@@ -27,9 +27,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
|
|
|
27
27
|
to be set as the "format" field in the ``option`` parameter map and
|
|
28
28
|
depending on the file type, additional parameters might need to be set.
|
|
29
29
|
|
|
30
|
-
To enable regex extraction, ``setEnablePatternRegex(True)`` needs to be
|
|
31
|
-
called.
|
|
32
|
-
|
|
33
30
|
If the file is in a JSON format, then the rule definitions need to be given
|
|
34
31
|
in a list with the fields "id", "label" and "patterns"::
|
|
35
32
|
|
|
@@ -71,8 +68,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
|
|
|
71
68
|
----------
|
|
72
69
|
patternsResource
|
|
73
70
|
Resource in JSON or CSV format to map entities to patterns
|
|
74
|
-
enablePatternRegex
|
|
75
|
-
Enables regex pattern match
|
|
76
71
|
useStorage
|
|
77
72
|
Whether to use RocksDB storage to serialize patterns
|
|
78
73
|
|
|
@@ -106,8 +101,7 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
|
|
|
106
101
|
... "patterns.csv",
|
|
107
102
|
... ReadAs.TEXT,
|
|
108
103
|
... {"format": "csv", "delimiter": "\\\\|"}
|
|
109
|
-
... )
|
|
110
|
-
... .setEnablePatternRegex(True)
|
|
104
|
+
... )
|
|
111
105
|
>>> pipeline = Pipeline().setStages([
|
|
112
106
|
... documentAssembler,
|
|
113
107
|
... tokenizer,
|
|
@@ -135,11 +129,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
|
|
|
135
129
|
"Resource in JSON or CSV format to map entities to patterns",
|
|
136
130
|
typeConverter=TypeConverters.identity)
|
|
137
131
|
|
|
138
|
-
enablePatternRegex = Param(Params._dummy(),
|
|
139
|
-
"enablePatternRegex",
|
|
140
|
-
"Enables regex pattern match",
|
|
141
|
-
typeConverter=TypeConverters.toBoolean)
|
|
142
|
-
|
|
143
132
|
useStorage = Param(Params._dummy(),
|
|
144
133
|
"useStorage",
|
|
145
134
|
"Whether to use RocksDB storage to serialize patterns",
|
|
@@ -174,16 +163,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
|
|
|
174
163
|
"""
|
|
175
164
|
return self._set(patternsResource=ExternalResource(path, read_as, options))
|
|
176
165
|
|
|
177
|
-
def setEnablePatternRegex(self, value):
|
|
178
|
-
"""Sets whether to enable regex pattern matching.
|
|
179
|
-
|
|
180
|
-
Parameters
|
|
181
|
-
----------
|
|
182
|
-
value : bool
|
|
183
|
-
Whether to enable regex pattern matching.
|
|
184
|
-
"""
|
|
185
|
-
return self._set(enablePatternRegex=value)
|
|
186
|
-
|
|
187
166
|
def setUseStorage(self, value):
|
|
188
167
|
"""Sets whether to use RocksDB storage to serialize patterns.
|
|
189
168
|
|
|
@@ -236,6 +215,20 @@ class EntityRulerModel(AnnotatorModel, HasStorageModel):
|
|
|
236
215
|
|
|
237
216
|
outputAnnotatorType = AnnotatorType.CHUNK
|
|
238
217
|
|
|
218
|
+
autoMode = Param(
|
|
219
|
+
Params._dummy(),
|
|
220
|
+
"autoMode",
|
|
221
|
+
"Enable built-in regex presets that combine related entity patterns (e.g., 'communication_entities', 'network_entities', 'media_entities', etc.).",
|
|
222
|
+
typeConverter=TypeConverters.toString
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
extractEntities = Param(
|
|
226
|
+
Params._dummy(),
|
|
227
|
+
"extractEntities",
|
|
228
|
+
"List of entity types to extract. If not set, all entities in the active autoMode or from regexPatterns are used.",
|
|
229
|
+
typeConverter=TypeConverters.toListString
|
|
230
|
+
)
|
|
231
|
+
|
|
239
232
|
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.er.EntityRulerModel", java_model=None):
|
|
240
233
|
super(EntityRulerModel, self).__init__(
|
|
241
234
|
classname=classname,
|
|
@@ -249,5 +242,26 @@ class EntityRulerModel(AnnotatorModel, HasStorageModel):
|
|
|
249
242
|
|
|
250
243
|
@staticmethod
|
|
251
244
|
def loadStorage(path, spark, storage_ref):
|
|
252
|
-
HasStorageModel.loadStorages(path, spark, storage_ref, EntityRulerModel.
|
|
245
|
+
HasStorageModel.loadStorages(path, spark, storage_ref, EntityRulerModel.database)
|
|
246
|
+
|
|
253
247
|
|
|
248
|
+
def setAutoMode(self, value):
|
|
249
|
+
"""Sets the auto mode for predefined regex entity groups.
|
|
250
|
+
|
|
251
|
+
Parameters
|
|
252
|
+
----------
|
|
253
|
+
value : str
|
|
254
|
+
Name of the auto mode to activate (e.g., 'communication_entities', 'network_entities', etc.)
|
|
255
|
+
"""
|
|
256
|
+
return self._set(autoMode=value)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def setExtractEntities(self, value):
|
|
260
|
+
"""Sets specific entities to extract, filtering only those defined in regexPatterns or autoMode.
|
|
261
|
+
|
|
262
|
+
Parameters
|
|
263
|
+
----------
|
|
264
|
+
value : list[str]
|
|
265
|
+
List of entity names to extract, e.g., ['EMAIL_ADDRESS_PATTERN', 'IPV4_PATTERN'].
|
|
266
|
+
"""
|
|
267
|
+
return self._set(extractEntities=value)
|
|
@@ -44,8 +44,8 @@ class YakeKeywordExtraction(AnnotatorModel):
|
|
|
44
44
|
lower the score better the keyword). Therefore to filter the keywords, an
|
|
45
45
|
upper bound for the score can be set with :meth:`.setThreshold`.
|
|
46
46
|
|
|
47
|
-
For extended examples of usage, see the `
|
|
48
|
-
<https://github.com/JohnSnowLabs/spark-nlp
|
|
47
|
+
For extended examples of usage, see the `Examples
|
|
48
|
+
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb>`__.
|
|
49
49
|
|
|
50
50
|
====================== ======================
|
|
51
51
|
Input Annotation types Output Annotation type
|
|
@@ -268,4 +268,3 @@ class YakeKeywordExtraction(AnnotatorModel):
|
|
|
268
268
|
from pyspark.ml.wrapper import _jvm
|
|
269
269
|
stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover
|
|
270
270
|
return list(stopWordsObj.loadDefaultStopWords(language))
|
|
271
|
-
|
|
@@ -37,9 +37,9 @@ class LanguageDetectorDL(AnnotatorModel, HasStorageRef, HasEngine):
|
|
|
37
37
|
The default model is ``"ld_wiki_tatoeba_cnn_21"``, default language is
|
|
38
38
|
``"xx"`` (meaning multi-lingual), if no values are provided.
|
|
39
39
|
|
|
40
|
-
For available pretrained models please see the `Models Hub <https://
|
|
40
|
+
For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Language+Detection>`__.
|
|
41
41
|
|
|
42
|
-
For extended examples of usage, see the `
|
|
42
|
+
For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb>`__.
|
|
43
43
|
|
|
44
44
|
====================== ======================
|
|
45
45
|
Input Annotation types Output Annotation type
|
sparknlp/annotator/lemmatizer.py
CHANGED
|
@@ -24,8 +24,8 @@ class Lemmatizer(AnnotatorApproach):
|
|
|
24
24
|
|
|
25
25
|
For instantiated/pretrained models, see :class:`.LemmatizerModel`.
|
|
26
26
|
|
|
27
|
-
For available pretrained models please see the `Models Hub <https://
|
|
28
|
-
For extended examples of usage, see the `
|
|
27
|
+
For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
|
|
28
|
+
For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb>`__.
|
|
29
29
|
|
|
30
30
|
====================== ======================
|
|
31
31
|
Input Annotation types Output Annotation type
|
|
@@ -194,7 +194,7 @@ class LemmatizerModel(AnnotatorModel):
|
|
|
194
194
|
... .setInputCols(["token"]) \\
|
|
195
195
|
... .setOutputCol("lemma")
|
|
196
196
|
|
|
197
|
-
For available pretrained models please see the `Models Hub <https://
|
|
197
|
+
For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
|
|
198
198
|
|
|
199
199
|
====================== ======================
|
|
200
200
|
Input Annotation types Output Annotation type
|
|
@@ -248,4 +248,3 @@ class LemmatizerModel(AnnotatorModel):
|
|
|
248
248
|
"""
|
|
249
249
|
from sparknlp.pretrained import ResourceDownloader
|
|
250
250
|
return ResourceDownloader.downloadModel(LemmatizerModel, name, lang, remote_loc)
|
|
251
|
-
|
|
@@ -67,6 +67,16 @@ class DateMatcherUtils(Params):
|
|
|
67
67
|
"source language for explicit translation",
|
|
68
68
|
typeConverter=TypeConverters.toString)
|
|
69
69
|
|
|
70
|
+
relaxedFactoryStrategy = Param(Params._dummy(),
|
|
71
|
+
"relaxedFactoryStrategy",
|
|
72
|
+
"Matched Strategy to searches relaxed dates",
|
|
73
|
+
typeConverter=TypeConverters.toString)
|
|
74
|
+
|
|
75
|
+
aggressiveMatching = Param(Params._dummy(),
|
|
76
|
+
"aggressiveMatching",
|
|
77
|
+
"Whether to aggressively attempt to find date matches, even in ambiguous or less common formats",
|
|
78
|
+
typeConverter=TypeConverters.toBoolean)
|
|
79
|
+
|
|
70
80
|
def setInputFormats(self, value):
|
|
71
81
|
"""Sets input formats patterns to match in the documents.
|
|
72
82
|
|
|
@@ -159,6 +169,29 @@ class DateMatcherUtils(Params):
|
|
|
159
169
|
"""
|
|
160
170
|
return self._set(anchorDateDay=value)
|
|
161
171
|
|
|
172
|
+
def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
|
|
173
|
+
""" Sets matched strategy to search relaxed dates by ordered rules by more exhaustive to less Strategy.
|
|
174
|
+
|
|
175
|
+
Not all of the date information needs to be included. For example
|
|
176
|
+
``"YYYY"`` is also a valid input.
|
|
177
|
+
|
|
178
|
+
Parameters
|
|
179
|
+
----------
|
|
180
|
+
matchStrategy : MatchStrategy
|
|
181
|
+
Matching strategy for searching relaxed dates, using rules ordered from most to least exhaustive
|
|
182
|
+
"""
|
|
183
|
+
return self._set(relaxedFactoryStrategy=matchStrategy)
|
|
184
|
+
|
|
185
|
+
def setAggressiveMatching(self, value):
|
|
186
|
+
""" Sets whether to aggressively attempt to find date matches, even in ambiguous or less common formats
|
|
187
|
+
|
|
188
|
+
Parameters
|
|
189
|
+
----------
|
|
190
|
+
value : bool
|
|
191
|
+
Whether to aggressively attempt to find date matches, even in ambiguous or less common formats
|
|
192
|
+
"""
|
|
193
|
+
return self._set(aggressiveMatching=value)
|
|
194
|
+
|
|
162
195
|
|
|
163
196
|
class DateMatcher(AnnotatorModel, DateMatcherUtils):
|
|
164
197
|
"""Matches standard date formats into a provided format
|
|
@@ -184,10 +217,10 @@ class DateMatcher(AnnotatorModel, DateMatcherUtils):
|
|
|
184
217
|
``2008/04/31``.
|
|
185
218
|
|
|
186
219
|
Pretrained pipelines are available for this module, see
|
|
187
|
-
`Pipelines <https://
|
|
220
|
+
`Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
|
|
188
221
|
|
|
189
222
|
For extended examples of usage, see the
|
|
190
|
-
`
|
|
223
|
+
`Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.
|
|
191
224
|
|
|
192
225
|
====================== ======================
|
|
193
226
|
Input Annotation types Output Annotation type
|
|
@@ -268,4 +301,3 @@ class DateMatcher(AnnotatorModel, DateMatcherUtils):
|
|
|
268
301
|
anchorDateMonth=-1,
|
|
269
302
|
anchorDateDay=-1
|
|
270
303
|
)
|
|
271
|
-
|
|
@@ -33,7 +33,7 @@ class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
|
|
|
33
33
|
For example ``"The 31st of April in the year 2008"`` will be converted into
|
|
34
34
|
``2008/04/31``.
|
|
35
35
|
|
|
36
|
-
For extended examples of usage, see the `
|
|
36
|
+
For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.
|
|
37
37
|
|
|
38
38
|
====================== ======================
|
|
39
39
|
Input Annotation types Output Annotation type
|
|
@@ -107,4 +107,3 @@ class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
|
|
|
107
107
|
readMonthFirst=True,
|
|
108
108
|
defaultDayWhenMissing=1
|
|
109
109
|
)
|
|
110
|
-
|
|
@@ -32,10 +32,10 @@ class RegexMatcher(AnnotatorApproach):
|
|
|
32
32
|
delimited text file.
|
|
33
33
|
|
|
34
34
|
Pretrained pipelines are available for this module, see `Pipelines
|
|
35
|
-
<https://
|
|
35
|
+
<https://sparknlp.org/docs/en/pipelines>`__.
|
|
36
36
|
|
|
37
|
-
For extended examples of usage, see the `
|
|
38
|
-
<https://github.com/JohnSnowLabs/spark-nlp
|
|
37
|
+
For extended examples of usage, see the `Examples
|
|
38
|
+
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb>`__.
|
|
39
39
|
|
|
40
40
|
====================== ======================
|
|
41
41
|
Input Annotation types Output Annotation type
|
|
@@ -24,8 +24,8 @@ class TextMatcher(AnnotatorApproach):
|
|
|
24
24
|
A text file of predefined phrases must be provided with
|
|
25
25
|
:meth:`.setEntities`.
|
|
26
26
|
|
|
27
|
-
For extended examples of usage, see the `
|
|
28
|
-
<https://github.com/JohnSnowLabs/spark-nlp
|
|
27
|
+
For extended examples of usage, see the `Examples
|
|
28
|
+
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb>`__.
|
|
29
29
|
|
|
30
30
|
====================== ======================
|
|
31
31
|
Input Annotation types Output Annotation type
|
|
@@ -288,4 +288,3 @@ class TextMatcherModel(AnnotatorModel):
|
|
|
288
288
|
"""
|
|
289
289
|
from sparknlp.pretrained import ResourceDownloader
|
|
290
290
|
return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
|
|
291
|
-
|
|
@@ -27,7 +27,7 @@ class NGramGenerator(AnnotatorModel):
|
|
|
27
27
|
length is less than n (number of elements per n-gram), no n-grams are
|
|
28
28
|
returned.
|
|
29
29
|
|
|
30
|
-
For more extended examples see the `
|
|
30
|
+
For more extended examples see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb>`__.
|
|
31
31
|
|
|
32
32
|
====================== ======================
|
|
33
33
|
Input Annotation types Output Annotation type
|
|
@@ -139,4 +139,3 @@ class NGramGenerator(AnnotatorModel):
|
|
|
139
139
|
if len(value) > 1:
|
|
140
140
|
raise Exception("Delimiter should have length == 1")
|
|
141
141
|
return self._set(delimiter=value)
|
|
142
|
-
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright 2017-
|
|
1
|
+
# Copyright 2017-2023 John Snow Labs
|
|
2
2
|
#
|
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
4
|
# you may not use this file except in compliance with the License.
|
|
@@ -16,4 +16,6 @@ from sparknlp.annotator.ner.ner_approach import *
|
|
|
16
16
|
from sparknlp.annotator.ner.ner_converter import *
|
|
17
17
|
from sparknlp.annotator.ner.ner_crf import *
|
|
18
18
|
from sparknlp.annotator.ner.ner_dl import *
|
|
19
|
+
from sparknlp.annotator.ner.ner_dl_graph_checker import *
|
|
19
20
|
from sparknlp.annotator.ner.ner_overwriter import *
|
|
21
|
+
from sparknlp.annotator.ner.zero_shot_ner_model import *
|
|
@@ -98,6 +98,13 @@ class NerConverter(AnnotatorModel):
|
|
|
98
98
|
typeConverter=TypeConverters.toBoolean
|
|
99
99
|
)
|
|
100
100
|
|
|
101
|
+
nerHasNoSchema = Param(
|
|
102
|
+
Params._dummy(),
|
|
103
|
+
"nerHasNoSchema",
|
|
104
|
+
"set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema",
|
|
105
|
+
typeConverter=TypeConverters.toBoolean
|
|
106
|
+
)
|
|
107
|
+
|
|
101
108
|
def setWhiteList(self, entities):
|
|
102
109
|
"""Sets list of entities to process. The rest will be ignored.
|
|
103
110
|
|
|
@@ -124,6 +131,17 @@ class NerConverter(AnnotatorModel):
|
|
|
124
131
|
"""
|
|
125
132
|
return self._set(preservePosition=value)
|
|
126
133
|
|
|
134
|
+
def setNerHasNoSchema(self, value):
|
|
135
|
+
"""
|
|
136
|
+
Set this to True if your NER tags come from a model that does not have an IOB/IOB2 schema.
|
|
137
|
+
|
|
138
|
+
Parameters
|
|
139
|
+
----------
|
|
140
|
+
value : bool
|
|
141
|
+
Set this to True if your NER tags come from a model that does not have an IOB/IOB2 schema
|
|
142
|
+
"""
|
|
143
|
+
return self._set(nerHasNoSchema=value)
|
|
144
|
+
|
|
127
145
|
@keyword_only
|
|
128
146
|
def __init__(self):
|
|
129
147
|
super(NerConverter, self).__init__(
|
|
@@ -39,7 +39,7 @@ class NerCrfApproach(AnnotatorApproach, NerApproach):
|
|
|
39
39
|
Optionally the user can provide an entity dictionary file with
|
|
40
40
|
:meth:`.setExternalFeatures` for better accuracy.
|
|
41
41
|
|
|
42
|
-
For extended examples of usage, see the `
|
|
42
|
+
For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
|
|
43
43
|
|
|
44
44
|
========================================= ======================
|
|
45
45
|
Input Annotation types Output Annotation type
|
|
@@ -278,10 +278,10 @@ class NerCrfModel(AnnotatorModel):
|
|
|
278
278
|
|
|
279
279
|
The default model is ``"ner_crf"``, if no name is provided. For available
|
|
280
280
|
pretrained models please see the `Models Hub
|
|
281
|
-
<https://
|
|
281
|
+
<https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
|
|
282
282
|
|
|
283
|
-
For extended examples of usage, see the `
|
|
284
|
-
<https://github.com/JohnSnowLabs/spark-nlp
|
|
283
|
+
For extended examples of usage, see the `Examples
|
|
284
|
+
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
|
|
285
285
|
|
|
286
286
|
========================================= ======================
|
|
287
287
|
Input Annotation types Output Annotation type
|
|
@@ -395,4 +395,3 @@ class NerCrfModel(AnnotatorModel):
|
|
|
395
395
|
"""
|
|
396
396
|
from sparknlp.pretrained import ResourceDownloader
|
|
397
397
|
return ResourceDownloader.downloadModel(NerCrfModel, name, lang, remote_loc)
|
|
398
|
-
|
sparknlp/annotator/ner/ner_dl.py
CHANGED
|
@@ -41,6 +41,11 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
|
|
|
41
41
|
- a WordEmbeddingsModel (any embeddings can be chosen, e.g. BertEmbeddings
|
|
42
42
|
for BERT based embeddings).
|
|
43
43
|
|
|
44
|
+
By default, collects all data points into memory for training. For larger datasets, use
|
|
45
|
+
``setEnableMemoryOptimizer(True)``. This will optimize memory usage during training at the cost
|
|
46
|
+
of speed. Note that this annotator will use as much memory as the largest partition of the
|
|
47
|
+
input dataset, so we recommend repartitioning to batch sizes.
|
|
48
|
+
|
|
44
49
|
Setting a test dataset to monitor model metrics can be done with
|
|
45
50
|
``.setTestDataset``. The method expects a path to a parquet file containing a
|
|
46
51
|
dataframe that has the same required columns as the training dataframe. The
|
|
@@ -72,7 +77,7 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
|
|
|
72
77
|
... .setOutputCol("ner") \\
|
|
73
78
|
... .setTestDataset("test_data")
|
|
74
79
|
|
|
75
|
-
For extended examples of usage, see the `
|
|
80
|
+
For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner>`__.
|
|
76
81
|
|
|
77
82
|
==================================== ======================
|
|
78
83
|
Input Annotation types Output Annotation type
|
|
@@ -420,16 +425,16 @@ class NerDLModel(AnnotatorModel, HasStorageRef, HasBatchedAnnotate, HasEngine):
|
|
|
420
425
|
The default model is ``"ner_dl"``, if no name is provided.
|
|
421
426
|
|
|
422
427
|
For available pretrained models please see the `Models Hub
|
|
423
|
-
<https://
|
|
428
|
+
<https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
|
|
424
429
|
Additionally, pretrained pipelines are available for this module, see
|
|
425
|
-
`Pipelines <https://
|
|
430
|
+
`Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
|
|
426
431
|
|
|
427
432
|
Note that some pretrained models require specific types of embeddings,
|
|
428
433
|
depending on which they were trained on. For example, the default model
|
|
429
434
|
``"ner_dl"`` requires the WordEmbeddings ``"glove_100d"``.
|
|
430
435
|
|
|
431
|
-
For extended examples of usage, see the `
|
|
432
|
-
<https://github.com/JohnSnowLabs/spark-nlp
|
|
436
|
+
For extended examples of usage, see the `Examples
|
|
437
|
+
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb>`__.
|
|
433
438
|
|
|
434
439
|
==================================== ======================
|
|
435
440
|
Input Annotation types Output Annotation type
|