spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
@@ -27,9 +27,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
27
27
  to be set as the "format" field in the ``option`` parameter map and
28
28
  depending on the file type, additional parameters might need to be set.
29
29
 
30
- To enable regex extraction, ``setEnablePatternRegex(True)`` needs to be
31
- called.
32
-
33
30
  If the file is in a JSON format, then the rule definitions need to be given
34
31
  in a list with the fields "id", "label" and "patterns"::
35
32
 
@@ -71,8 +68,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
71
68
  ----------
72
69
  patternsResource
73
70
  Resource in JSON or CSV format to map entities to patterns
74
- enablePatternRegex
75
- Enables regex pattern match
76
71
  useStorage
77
72
  Whether to use RocksDB storage to serialize patterns
78
73
 
@@ -106,8 +101,7 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
106
101
  ... "patterns.csv",
107
102
  ... ReadAs.TEXT,
108
103
  ... {"format": "csv", "delimiter": "\\\\|"}
109
- ... ) \\
110
- ... .setEnablePatternRegex(True)
104
+ ... )
111
105
  >>> pipeline = Pipeline().setStages([
112
106
  ... documentAssembler,
113
107
  ... tokenizer,
@@ -135,11 +129,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
135
129
  "Resource in JSON or CSV format to map entities to patterns",
136
130
  typeConverter=TypeConverters.identity)
137
131
 
138
- enablePatternRegex = Param(Params._dummy(),
139
- "enablePatternRegex",
140
- "Enables regex pattern match",
141
- typeConverter=TypeConverters.toBoolean)
142
-
143
132
  useStorage = Param(Params._dummy(),
144
133
  "useStorage",
145
134
  "Whether to use RocksDB storage to serialize patterns",
@@ -174,16 +163,6 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
174
163
  """
175
164
  return self._set(patternsResource=ExternalResource(path, read_as, options))
176
165
 
177
- def setEnablePatternRegex(self, value):
178
- """Sets whether to enable regex pattern matching.
179
-
180
- Parameters
181
- ----------
182
- value : bool
183
- Whether to enable regex pattern matching.
184
- """
185
- return self._set(enablePatternRegex=value)
186
-
187
166
  def setUseStorage(self, value):
188
167
  """Sets whether to use RocksDB storage to serialize patterns.
189
168
 
@@ -236,6 +215,20 @@ class EntityRulerModel(AnnotatorModel, HasStorageModel):
236
215
 
237
216
  outputAnnotatorType = AnnotatorType.CHUNK
238
217
 
218
+ autoMode = Param(
219
+ Params._dummy(),
220
+ "autoMode",
221
+ "Enable built-in regex presets that combine related entity patterns (e.g., 'communication_entities', 'network_entities', 'media_entities', etc.).",
222
+ typeConverter=TypeConverters.toString
223
+ )
224
+
225
+ extractEntities = Param(
226
+ Params._dummy(),
227
+ "extractEntities",
228
+ "List of entity types to extract. If not set, all entities in the active autoMode or from regexPatterns are used.",
229
+ typeConverter=TypeConverters.toListString
230
+ )
231
+
239
232
  def __init__(self, classname="com.johnsnowlabs.nlp.annotators.er.EntityRulerModel", java_model=None):
240
233
  super(EntityRulerModel, self).__init__(
241
234
  classname=classname,
@@ -249,5 +242,26 @@ class EntityRulerModel(AnnotatorModel, HasStorageModel):
249
242
 
250
243
  @staticmethod
251
244
  def loadStorage(path, spark, storage_ref):
252
- HasStorageModel.loadStorages(path, spark, storage_ref, EntityRulerModel.databases)
245
+ HasStorageModel.loadStorages(path, spark, storage_ref, EntityRulerModel.database)
246
+
253
247
 
248
+ def setAutoMode(self, value):
249
+ """Sets the auto mode for predefined regex entity groups.
250
+
251
+ Parameters
252
+ ----------
253
+ value : str
254
+ Name of the auto mode to activate (e.g., 'communication_entities', 'network_entities', etc.)
255
+ """
256
+ return self._set(autoMode=value)
257
+
258
+
259
+ def setExtractEntities(self, value):
260
+ """Sets specific entities to extract, filtering only those defined in regexPatterns or autoMode.
261
+
262
+ Parameters
263
+ ----------
264
+ value : list[str]
265
+ List of entity names to extract, e.g., ['EMAIL_ADDRESS_PATTERN', 'IPV4_PATTERN'].
266
+ """
267
+ return self._set(extractEntities=value)
@@ -44,8 +44,8 @@ class YakeKeywordExtraction(AnnotatorModel):
44
44
  lower the score better the keyword). Therefore to filter the keywords, an
45
45
  upper bound for the score can be set with :meth:`.setThreshold`.
46
46
 
47
- For extended examples of usage, see the `Spark NLP Workshop
48
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/8.Keyword_Extraction_YAKE.ipynb>`__.
47
+ For extended examples of usage, see the `Examples
48
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/keyword-extraction/Keyword_Extraction_YAKE.ipynb>`__.
49
49
 
50
50
  ====================== ======================
51
51
  Input Annotation types Output Annotation type
@@ -268,4 +268,3 @@ class YakeKeywordExtraction(AnnotatorModel):
268
268
  from pyspark.ml.wrapper import _jvm
269
269
  stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover
270
270
  return list(stopWordsObj.loadDefaultStopWords(language))
271
-
@@ -37,9 +37,9 @@ class LanguageDetectorDL(AnnotatorModel, HasStorageRef, HasEngine):
37
37
  The default model is ``"ld_wiki_tatoeba_cnn_21"``, default language is
38
38
  ``"xx"`` (meaning multi-lingual), if no values are provided.
39
39
 
40
- For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models?task=Language+Detection>`__.
40
+ For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Language+Detection>`__.
41
41
 
42
- For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/language-detection/Language_Detection_and_Indentification.ipynb>`__.
42
+ For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb>`__.
43
43
 
44
44
  ====================== ======================
45
45
  Input Annotation types Output Annotation type
@@ -24,8 +24,8 @@ class Lemmatizer(AnnotatorApproach):
24
24
 
25
25
  For instantiated/pretrained models, see :class:`.LemmatizerModel`.
26
26
 
27
- For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models?task=Lemmatization>`__.
28
- For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
27
+ For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
28
+ For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Train-Lemmatizer-Italian.ipynb>`__.
29
29
 
30
30
  ====================== ======================
31
31
  Input Annotation types Output Annotation type
@@ -194,7 +194,7 @@ class LemmatizerModel(AnnotatorModel):
194
194
  ... .setInputCols(["token"]) \\
195
195
  ... .setOutputCol("lemma")
196
196
 
197
- For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models?task=Lemmatization>`__.
197
+ For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Lemmatization>`__.
198
198
 
199
199
  ====================== ======================
200
200
  Input Annotation types Output Annotation type
@@ -248,4 +248,3 @@ class LemmatizerModel(AnnotatorModel):
248
248
  """
249
249
  from sparknlp.pretrained import ResourceDownloader
250
250
  return ResourceDownloader.downloadModel(LemmatizerModel, name, lang, remote_loc)
251
-
@@ -67,6 +67,16 @@ class DateMatcherUtils(Params):
67
67
  "source language for explicit translation",
68
68
  typeConverter=TypeConverters.toString)
69
69
 
70
+ relaxedFactoryStrategy = Param(Params._dummy(),
71
+ "relaxedFactoryStrategy",
72
+ "Matched Strategy to searches relaxed dates",
73
+ typeConverter=TypeConverters.toString)
74
+
75
+ aggressiveMatching = Param(Params._dummy(),
76
+ "aggressiveMatching",
77
+ "Whether to aggressively attempt to find date matches, even in ambiguous or less common formats",
78
+ typeConverter=TypeConverters.toBoolean)
79
+
70
80
  def setInputFormats(self, value):
71
81
  """Sets input formats patterns to match in the documents.
72
82
 
@@ -159,6 +169,29 @@ class DateMatcherUtils(Params):
159
169
  """
160
170
  return self._set(anchorDateDay=value)
161
171
 
172
+ def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
173
+ """ Sets matched strategy to search relaxed dates by ordered rules by more exhaustive to less Strategy.
174
+
175
+ Not all of the date information needs to be included. For example
176
+ ``"YYYY"`` is also a valid input.
177
+
178
+ Parameters
179
+ ----------
180
+ matchStrategy : MatchStrategy
181
+ Strategy used to match relaxed dates, with rules ordered from most to least exhaustive
182
+ """
183
+ return self._set(relaxedFactoryStrategy=matchStrategy)
184
+
185
+ def setAggressiveMatching(self, value):
186
+ """ Sets whether to aggressively attempt to find date matches, even in ambiguous or less common formats
187
+
188
+ Parameters
189
+ ----------
190
+ value : bool
191
+ Whether to aggressively attempt to find date matches, even in ambiguous or less common formats
192
+ """
193
+ return self._set(aggressiveMatching=value)
194
+
162
195
 
163
196
  class DateMatcher(AnnotatorModel, DateMatcherUtils):
164
197
  """Matches standard date formats into a provided format
@@ -184,10 +217,10 @@ class DateMatcher(AnnotatorModel, DateMatcherUtils):
184
217
  ``2008/04/31``.
185
218
 
186
219
  Pretrained pipelines are available for this module, see
187
- `Pipelines <https://nlp.johnsnowlabs.com/docs/en/pipelines>`__.
220
+ `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
188
221
 
189
222
  For extended examples of usage, see the
190
- `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
223
+ `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.
191
224
 
192
225
  ====================== ======================
193
226
  Input Annotation types Output Annotation type
@@ -268,4 +301,3 @@ class DateMatcher(AnnotatorModel, DateMatcherUtils):
268
301
  anchorDateMonth=-1,
269
302
  anchorDateDay=-1
270
303
  )
271
-
@@ -33,7 +33,7 @@ class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
33
33
  For example ``"The 31st of April in the year 2008"`` will be converted into
34
34
  ``2008/04/31``.
35
35
 
36
- For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
36
+ For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.
37
37
 
38
38
  ====================== ======================
39
39
  Input Annotation types Output Annotation type
@@ -107,4 +107,3 @@ class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
107
107
  readMonthFirst=True,
108
108
  defaultDayWhenMissing=1
109
109
  )
110
-
@@ -32,10 +32,10 @@ class RegexMatcher(AnnotatorApproach):
32
32
  delimited text file.
33
33
 
34
34
  Pretrained pipelines are available for this module, see `Pipelines
35
- <https://nlp.johnsnowlabs.com/docs/en/pipelines>`__.
35
+ <https://sparknlp.org/docs/en/pipelines>`__.
36
36
 
37
- For extended examples of usage, see the `Spark NLP Workshop
38
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
37
+ For extended examples of usage, see the `Examples
38
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb>`__.
39
39
 
40
40
  ====================== ======================
41
41
  Input Annotation types Output Annotation type
@@ -24,8 +24,8 @@ class TextMatcher(AnnotatorApproach):
24
24
  A text file of predefined phrases must be provided with
25
25
  :meth:`.setEntities`.
26
26
 
27
- For extended examples of usage, see the `Spark NLP Workshop
28
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
27
+ For extended examples of usage, see the `Examples
28
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb>`__.
29
29
 
30
30
  ====================== ======================
31
31
  Input Annotation types Output Annotation type
@@ -288,4 +288,3 @@ class TextMatcherModel(AnnotatorModel):
288
288
  """
289
289
  from sparknlp.pretrained import ResourceDownloader
290
290
  return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
291
-
@@ -27,7 +27,7 @@ class NGramGenerator(AnnotatorModel):
27
27
  length is less than n (number of elements per n-gram), no n-grams are
28
28
  returned.
29
29
 
30
- For more extended examples see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/chunking/NgramGenerator.ipynb>`__.
30
+ For more extended examples see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb>`__.
31
31
 
32
32
  ====================== ======================
33
33
  Input Annotation types Output Annotation type
@@ -139,4 +139,3 @@ class NGramGenerator(AnnotatorModel):
139
139
  if len(value) > 1:
140
140
  raise Exception("Delimiter should have length == 1")
141
141
  return self._set(delimiter=value)
142
-
@@ -1,4 +1,4 @@
1
- # Copyright 2017-2022 John Snow Labs
1
+ # Copyright 2017-2023 John Snow Labs
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -16,4 +16,6 @@ from sparknlp.annotator.ner.ner_approach import *
16
16
  from sparknlp.annotator.ner.ner_converter import *
17
17
  from sparknlp.annotator.ner.ner_crf import *
18
18
  from sparknlp.annotator.ner.ner_dl import *
19
+ from sparknlp.annotator.ner.ner_dl_graph_checker import *
19
20
  from sparknlp.annotator.ner.ner_overwriter import *
21
+ from sparknlp.annotator.ner.zero_shot_ner_model import *
@@ -98,6 +98,13 @@ class NerConverter(AnnotatorModel):
98
98
  typeConverter=TypeConverters.toBoolean
99
99
  )
100
100
 
101
+ nerHasNoSchema = Param(
102
+ Params._dummy(),
103
+ "nerHasNoSchema",
104
+ "set this to true if your NER tags coming from a model that does not have a IOB/IOB2 schema",
105
+ typeConverter=TypeConverters.toBoolean
106
+ )
107
+
101
108
  def setWhiteList(self, entities):
102
109
  """Sets list of entities to process. The rest will be ignored.
103
110
 
@@ -124,6 +131,17 @@ class NerConverter(AnnotatorModel):
124
131
  """
125
132
  return self._set(preservePosition=value)
126
133
 
134
+ def setNerHasNoSchema(self, value):
135
+ """
136
+ Set this to True if your NER tags come from a model that does not use an IOB/IOB2 schema.
137
+
138
+ Parameters
139
+ ----------
140
+ value : bool
141
+ Set this to True if your NER tags come from a model that does not use an IOB/IOB2 schema
142
+ """
143
+ return self._set(nerHasNoSchema=value)
144
+
127
145
  @keyword_only
128
146
  def __init__(self):
129
147
  super(NerConverter, self).__init__(
@@ -39,7 +39,7 @@ class NerCrfApproach(AnnotatorApproach, NerApproach):
39
39
  Optionally the user can provide an entity dictionary file with
40
40
  :meth:`.setExternalFeatures` for better accuracy.
41
41
 
42
- For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/crf-ner/ner_dl_crf.ipynb>`__.
42
+ For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
43
43
 
44
44
  ========================================= ======================
45
45
  Input Annotation types Output Annotation type
@@ -278,10 +278,10 @@ class NerCrfModel(AnnotatorModel):
278
278
 
279
279
  The default model is ``"ner_crf"``, if no name is provided. For available
280
280
  pretrained models please see the `Models Hub
281
- <https://nlp.johnsnowlabs.com/models?task=Named+Entity+Recognition>`__.
281
+ <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
282
282
 
283
- For extended examples of usage, see the `Spark NLP Workshop
284
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/model-downloader/Running_Pretrained_pipelines.ipynb>`__.
283
+ For extended examples of usage, see the `Examples
284
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
285
285
 
286
286
  ========================================= ======================
287
287
  Input Annotation types Output Annotation type
@@ -395,4 +395,3 @@ class NerCrfModel(AnnotatorModel):
395
395
  """
396
396
  from sparknlp.pretrained import ResourceDownloader
397
397
  return ResourceDownloader.downloadModel(NerCrfModel, name, lang, remote_loc)
398
-
@@ -41,6 +41,11 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
41
41
  - a WordEmbeddingsModel (any embeddings can be chosen, e.g. BertEmbeddings
42
42
  for BERT based embeddings).
43
43
 
44
+ By default, collects all data points into memory for training. For larger datasets, use
45
+ ``setEnableMemoryOptimizer(True)``. This will optimize memory usage during training at the cost
46
+ of speed. Note that this annotator will use as much memory as the largest partition of the
47
+ input dataset, so we recommend repartitioning to batch sizes.
48
+
44
49
  Setting a test dataset to monitor model metrics can be done with
45
50
  ``.setTestDataset``. The method expects a path to a parquet file containing a
46
51
  dataframe that has the same required columns as the training dataframe. The
@@ -72,7 +77,7 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
72
77
  ... .setOutputCol("ner") \\
73
78
  ... .setTestDataset("test_data")
74
79
 
75
- For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/tree/master/jupyter/training/english/dl-ner>`__.
80
+ For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner>`__.
76
81
 
77
82
  ==================================== ======================
78
83
  Input Annotation types Output Annotation type
@@ -420,16 +425,16 @@ class NerDLModel(AnnotatorModel, HasStorageRef, HasBatchedAnnotate, HasEngine):
420
425
  The default model is ``"ner_dl"``, if no name is provided.
421
426
 
422
427
  For available pretrained models please see the `Models Hub
423
- <https://nlp.johnsnowlabs.com/models?task=Named+Entity+Recognition>`__.
428
+ <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
424
429
  Additionally, pretrained pipelines are available for this module, see
425
- `Pipelines <https://nlp.johnsnowlabs.com/docs/en/pipelines>`__.
430
+ `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
426
431
 
427
432
  Note that some pretrained models require specific types of embeddings,
428
433
  depending on which they were trained on. For example, the default model
429
434
  ``"ner_dl"`` requires the WordEmbeddings ``"glove_100d"``.
430
435
 
431
- For extended examples of usage, see the `Spark NLP Workshop
432
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/3.SparkNLP_Pretrained_Models.ipynb>`__.
436
+ For extended examples of usage, see the `Examples
437
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb>`__.
433
438
 
434
439
  ==================================== ======================
435
440
  Input Annotation types Output Annotation type