spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,303 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for the DateMatcher."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
+ class DateMatcherUtils(Params):
20
+ """Base class for DateMatcher Annotators
21
+ """
22
+ inputFormats = Param(Params._dummy(),
23
+ "inputFormats",
24
+ "input formats list of patterns to match",
25
+ typeConverter=TypeConverters.toListString)
26
+
27
+ outputFormat = Param(Params._dummy(),
28
+ "outputFormat",
29
+ "desired output format for dates extracted",
30
+ typeConverter=TypeConverters.toString)
31
+
32
+ readMonthFirst = Param(Params._dummy(),
33
+ "readMonthFirst",
34
+ "Whether to parse july 07/05/2015 or as 05/07/2015",
35
+ typeConverter=TypeConverters.toBoolean
36
+ )
37
+
38
+ defaultDayWhenMissing = Param(Params._dummy(),
39
+ "defaultDayWhenMissing",
40
+ "which day to set when it is missing from parsed input",
41
+ typeConverter=TypeConverters.toInt
42
+ )
43
+
44
+ anchorDateYear = Param(Params._dummy(),
45
+ "anchorDateYear",
46
+ "Add an anchor year for the relative dates such as a day after tomorrow. If not set it "
47
+ "will use the current year. Example: 2021",
48
+ typeConverter=TypeConverters.toInt
49
+ )
50
+
51
+ anchorDateMonth = Param(Params._dummy(),
52
+ "anchorDateMonth",
53
+ "Add an anchor month for the relative dates such as a day after tomorrow. If not set it "
54
+ "will use the current month. Example: 1 which means January",
55
+ typeConverter=TypeConverters.toInt
56
+ )
57
+
58
+ anchorDateDay = Param(Params._dummy(),
59
+ "anchorDateDay",
60
+ "Add an anchor day of the day for the relative dates such as a day after tomorrow. If not "
61
+ "set it will use the current day. Example: 11",
62
+ typeConverter=TypeConverters.toInt
63
+ )
64
+
65
+ sourceLanguage = Param(Params._dummy(),
66
+ "sourceLanguage",
67
+ "source language for explicit translation",
68
+ typeConverter=TypeConverters.toString)
69
+
70
+ relaxedFactoryStrategy = Param(Params._dummy(),
71
+ "relaxedFactoryStrategy",
72
+ "Matched Strategy to searches relaxed dates",
73
+ typeConverter=TypeConverters.toString)
74
+
75
+ aggressiveMatching = Param(Params._dummy(),
76
+ "aggressiveMatching",
77
+ "Whether to aggressively attempt to find date matches, even in ambiguous or less common formats",
78
+ typeConverter=TypeConverters.toBoolean)
79
+
80
+ def setInputFormats(self, value):
81
+ """Sets input formats patterns to match in the documents.
82
+
83
+ Parameters
84
+ ----------
85
+ value : List[str]
86
+ Input formats regex patterns to match dates in documents
87
+ """
88
+ return self._set(inputFormats=value)
89
+
90
+ def setOutputFormat(self, value):
91
+ """Sets desired output format for extracted dates, by default yyyy/MM/dd.
92
+
93
+ Not all of the date information needs to be included. For example
94
+ ``"YYYY"`` is also a valid input.
95
+
96
+ Parameters
97
+ ----------
98
+ value : str
99
+ Desired output format for dates extracted.
100
+ """
101
+ return self._set(outputFormat=value)
102
+
103
+ def setReadMonthFirst(self, value):
104
+ """Sets whether to parse the date in mm/dd/yyyy format instead of
105
+ dd/mm/yyyy, by default True.
106
+
107
+ For example July 5th 2015, would be parsed as 07/05/2015 instead of
108
+ 05/07/2015.
109
+
110
+ Parameters
111
+ ----------
112
+ value : bool
113
+ Whether to parse the date in mm/dd/yyyy format instead of
114
+ dd/mm/yyyy.
115
+ """
116
+ return self._set(readMonthFirst=value)
117
+
118
+ def setDefaultDayWhenMissing(self, value):
119
+ """Sets which day to set when it is missing from parsed input,
120
+ by default 1.
121
+
122
+ Parameters
123
+ ----------
124
+ value : int
125
+ [description]
126
+ """
127
+ return self._set(defaultDayWhenMissing=value)
128
+
129
+ def setAnchorDateYear(self, value):
130
+ """Sets an anchor year for the relative dates such as a day after
131
+ tomorrow. If not set it will use the current year.
132
+
133
+ Example: 2021
134
+
135
+ Parameters
136
+ ----------
137
+ value : int
138
+ The anchor year for relative dates
139
+ """
140
+ return self._set(anchorDateYear=value)
141
+
142
+ def setAnchorDateMonth(self, value):
143
+ """Sets an anchor month for the relative dates such as a day after
144
+ tomorrow. If not set it will use the current month.
145
+
146
+ Example: 1 which means January
147
+
148
+ Parameters
149
+ ----------
150
+ value : int
151
+ The anchor month for relative dates
152
+ """
153
+ normalizedMonth = value - 1
154
+ return self._set(anchorDateMonth=normalizedMonth)
155
+
156
+ def setSourceLanguage(self, value):
157
+ return self._set(sourceLanguage=value)
158
+
159
+ def setAnchorDateDay(self, value):
160
+ """Sets an anchor day of the day for the relative dates such as a day
161
+ after tomorrow. If not set it will use the current day.
162
+
163
+ Example: 11
164
+
165
+ Parameters
166
+ ----------
167
+ value : int
168
+ The anchor day for relative dates
169
+ """
170
+ return self._set(anchorDateDay=value)
171
+
172
+ def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
173
+ """ Sets matched strategy to search relaxed dates by ordered rules by more exhaustive to less Strategy.
174
+
175
+ Not all of the date information needs to be included. For example
176
+ ``"YYYY"`` is also a valid input.
177
+
178
+ Parameters
179
+ ----------
180
+ matchStrategy : MatchStrategy
181
+ Matched strategy to search relaxed dates by ordered rules by more exhaustive to less Strategy
182
+ """
183
+ return self._set(relaxedFactoryStrategy=matchStrategy)
184
+
185
+ def setAggressiveMatching(self, value):
186
+ """ Sets whether to aggressively attempt to find date matches, even in ambiguous or less common formats
187
+
188
+ Parameters
189
+ ----------
190
+ aggressiveMatching : Boolean
191
+ Whether to aggressively attempt to find date matches, even in ambiguous or less common formats
192
+ """
193
+ return self._set(aggressiveMatching=value)
194
+
195
+
196
class DateMatcher(AnnotatorModel, DateMatcherUtils):
    """Matches standard date formats and rewrites them into a single
    configurable output format.

    The annotator reads many textual forms of dates and times and converts
    them to the configured date format. Only **one** date is extracted per
    document — combine with a sentence detector to get one match per
    sentence, or use :class:`.MultiDateMatcher` to extract several dates
    from one document.

    Recognized inputs include::

        "1978-01-28", "1984/04/02,1/02/1980", "2/28/79",
        "The 31st of April in the year 2008", "Fri, 21 Nov 1997", "Jan 21,
        ‘97", "Sun", "Nov 21", "jan 1st", "next thursday", "last wednesday",
        "today", "tomorrow", "yesterday", "next week", "next month",
        "next year", "day after", "the day before", "0600h", "06:00 hours",
        "6pm", "5:30 a.m.", "at 5", "12:59", "23:59", "1988/11/23 6pm",
        "next week at 7.30", "5 am tomorrow"

    For example ``"The 31st of April in the year 2008"`` becomes
    ``2008/04/31``.

    Pretrained pipelines are available for this module, see
    `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.

    For extended examples of usage, see the
    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DATE``
    ====================== ======================

    Parameters
    ----------
    dateFormat
        Desired format for dates extracted, by default yyyy/MM/dd.
    readMonthFirst
        Whether to parse the date in mm/dd/yyyy format instead of dd/mm/yyyy,
        by default True.
    defaultDayWhenMissing
        Which day to set when it is missing from parsed input, by default 1.
    anchorDateYear
        Anchor year for relative dates such as "a day after tomorrow".
        If not set the current year is used. Example: 2021
    anchorDateMonth
        Anchor month for relative dates such as "a day after tomorrow".
        If not set the current month is used. Example: 1 which means January
    anchorDateDay
        Anchor day of month for relative dates such as "a day after
        tomorrow". If not set the current day is used. Example: 11

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> date = DateMatcher() \\
    ...     .setInputCols("document") \\
    ...     .setOutputCol("date") \\
    ...     .setAnchorDateYear(2020) \\
    ...     .setAnchorDateMonth(1) \\
    ...     .setAnchorDateDay(11) \\
    ...     .setOutputFormat("yyyy/MM/dd")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     date
    ... ])
    >>> data = spark.createDataFrame([["Fri, 21 Nov 1997"], ["next week at 7.30"], ["see you a day after"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("date").show(truncate=False)
    +-------------------------------------------------+
    |date                                             |
    +-------------------------------------------------+
    |[[date, 5, 15, 1997/11/21, [sentence -> 0], []]] |
    |[[date, 0, 8, 2020/01/18, [sentence -> 0], []]]  |
    |[[date, 10, 18, 2020/01/12, [sentence -> 0], []]]|
    +-------------------------------------------------+

    See Also
    --------
    MultiDateMatcher : for matching multiple dates in a document
    """

    name = "DateMatcher"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DATE

    @keyword_only
    def __init__(self):
        super(DateMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.DateMatcher")
        # -1 anchors mean "use the current date component" on the JVM side.
        self._setDefault(
            inputFormats=[""],
            outputFormat="yyyy/MM/dd",
            readMonthFirst=True,
            defaultDayWhenMissing=1,
            anchorDateYear=-1,
            anchorDateMonth=-1,
            anchorDateDay=-1,
        )
@@ -0,0 +1,109 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for MultiDateMatcher."""
15
+
16
+ from sparknlp.common import *
17
+ from sparknlp.annotator.matcher.date_matcher import DateMatcherUtils
18
+
19
+
20
class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
    """Finds every standard-format date occurring in a document and rewrites
    each match into one configurable output format.

    The following kinds of date expressions are recognized::

        "1978-01-28", "1984/04/02,1/02/1980", "2/28/79",
        "The 31st of April in the year 2008", "Fri, 21 Nov 1997", "Jan 21,
        ‘97", "Sun", "Nov 21", "jan 1st", "next thursday", "last wednesday",
        "today", "tomorrow", "yesterday", "next week", "next month",
        "next year", "day after", "the day before", "0600h", "06:00 hours",
        "6pm", "5:30 a.m.", "at 5", "12:59", "23:59", "1988/11/23 6pm",
        "next week at 7.30", "5 am tomorrow"

    As an example, ``"The 31st of April in the year 2008"`` is emitted as
    ``2008/04/31``.

    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DATE``
    ====================== ======================

    Parameters
    ----------
    dateFormat
        Format applied to every extracted date, by default yyyy/MM/dd.
    readMonthFirst
        Parse ambiguous dates as mm/dd/yyyy rather than dd/mm/yyyy,
        by default True.
    defaultDayWhenMissing
        Day substituted when the parsed input has no day component,
        by default 1.
    anchorDateYear
        Anchor year used to resolve relative expressions such as "a day
        after tomorrow". Falls back to the current year when unset.
        Example: 2021
    anchorDateMonth
        Anchor month for relative expressions. Falls back to the current
        month when unset. Example: 1 (January)
    anchorDateDay
        Anchor day for relative expressions. Falls back to the current day
        when unset. Example: 11

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> date = MultiDateMatcher() \\
    ...     .setInputCols("document") \\
    ...     .setOutputCol("date") \\
    ...     .setAnchorDateYear(2020) \\
    ...     .setAnchorDateMonth(1) \\
    ...     .setAnchorDateDay(11) \\
    ...     .setOutputFormat("yyyy/MM/dd")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     date
    ... ])
    >>> data = spark.createDataFrame([["I saw him yesterday and he told me that he will visit us next week"]]) \\
    ...     .toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(date) as dates").show(truncate=False)
    +-----------------------------------------------+
    |dates                                          |
    +-----------------------------------------------+
    |[date, 57, 65, 2020/01/18, [sentence -> 0], []]|
    |[date, 10, 18, 2020/01/10, [sentence -> 0], []]|
    +-----------------------------------------------+
    """

    # Contract of this annotator: DOCUMENT in, DATE out.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DATE

    name = "MultiDateMatcher"

    @keyword_only
    def __init__(self):
        """Bind to the JVM-side MultiDateMatcher and apply Python defaults."""
        super(MultiDateMatcher, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.MultiDateMatcher"
        )
        # Mirror the Scala-side defaults; [""] means "auto-detect formats".
        default_params = dict(
            inputFormats=[""],
            outputFormat="yyyy/MM/dd",
            readMonthFirst=True,
            defaultDayWhenMissing=1,
        )
        self._setDefault(**default_params)
@@ -0,0 +1,221 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for the RegexMatcher."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
class RegexMatcher(AnnotatorApproach):
    """Uses rules to match a set of regular expressions and associate them with a
    provided identifier.

    A rule consists of a regex pattern and an identifier, delimited by a character of
    choice. An example could be `"\\d{4}\\/\\d\\d\\/\\d\\d,date"` which will match
    strings like `"1970/01/01"` to the identifier `"date"`.

    Rules must be provided by either :meth:`.setRules` (followed by
    :meth:`.setDelimiter`) or an external file.

    To use an external file, a dictionary of predefined regular expressions must be
    provided with :meth:`.setExternalRules`. The dictionary can be set in the form of a
    delimited text file.

    Pretrained pipelines are available for this module, see `Pipelines
    <https://sparknlp.org/docs/en/pipelines>`__.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    strategy
        Can be either MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE, by default
        "MATCH_ALL"
    rules
        Regex rules to match the identifier with
    delimiter
        Delimiter for rules provided with setRules
    externalRules
        external resource to rules, needs 'delimiter' in options

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    In this example, the ``rules.txt`` has the form of::

        the\\s\\w+, followed by 'the'
        ceremonies, ceremony

    where each regex is separated by the identifier ``","``

    >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    >>> sentence = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
    >>> regexMatcher = RegexMatcher() \\
    ...     .setExternalRules("src/test/resources/regex-matcher/rules.txt",  ",") \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("regex") \\
    ...     .setStrategy("MATCH_ALL")
    >>> pipeline = Pipeline().setStages([documentAssembler, sentence, regexMatcher])
    >>> data = spark.createDataFrame([[
    ...     "My first sentence with the first rule. This is my second sentence with ceremonies rule."
    ... ]]).toDF("text")
    >>> results = pipeline.fit(data).transform(data)
    >>> results.selectExpr("explode(regex) as result").show(truncate=False)
    +--------------------------------------------------------------------------------------------+
    |result                                                                                      |
    +--------------------------------------------------------------------------------------------+
    |[chunk, 23, 31, the first, [identifier -> followed by 'the', sentence -> 0, chunk -> 0], []]|
    |[chunk, 71, 80, ceremonies, [identifier -> ceremony, sentence -> 1, chunk -> 0], []]        |
    +--------------------------------------------------------------------------------------------+
    """

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    strategy = Param(Params._dummy(),
                     "strategy",
                     "MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
                     typeConverter=TypeConverters.toString)
    externalRules = Param(Params._dummy(),
                          "externalRules",
                          "external resource to rules, needs 'delimiter' in options",
                          typeConverter=TypeConverters.identity)
    rules = Param(Params._dummy(),
                  "rules",
                  "Regex rules to match the identifier with",
                  typeConverter=TypeConverters.toListString)
    delimiter = Param(Params._dummy(),
                      "delimiter",
                      "Delimiter for rules",
                      typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self):
        """Bind to the JVM-side RegexMatcher and set default matching strategy."""
        super(RegexMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexMatcher")
        self._setDefault(
            strategy="MATCH_ALL"
        )

    def setStrategy(self, value):
        """Sets matching strategy, by default "MATCH_ALL".

        Can be either MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE.

        Parameters
        ----------
        value : str
            Matching Strategy
        """
        return self._set(strategy=value)

    def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options=None):
        """Sets external resource to rules, needs 'delimiter' in options.

        Only one of either parameter `rules` or `externalRules` must be set.

        Parameters
        ----------
        path : str
            Path to the source files
        delimiter : str
            Delimiter for the dictionary file. Can also be set in `options`.
        read_as : str, optional
            How to read the file, by default ReadAs.TEXT
        options : dict, optional
            Options to read the resource, by default {"format": "text"}
        """
        # A mutable default argument ({"format": "text"}) would be shared
        # across all calls; use None as sentinel and build a fresh dict here.
        opts = {"format": "text"} if options is None else dict(options)
        # An explicit "delimiter" entry in options takes precedence over the
        # `delimiter` argument.
        if "delimiter" not in opts:
            opts["delimiter"] = delimiter
        return self._set(externalRules=ExternalResource(path, read_as, opts))

    def setRules(self, value):
        """Sets the regex rules to match the identifier with.

        The rules must consist of a regex pattern and an identifier for that pattern. The regex
        pattern and the identifier must be delimited by a character that will also have to set with
        `setDelimiter`.

        Only one of either parameter `rules` or `externalRules` must be set.

        Examples
        --------
        >>> regexMatcher = RegexMatcher() \\
        ...     .setRules(["\\d{4}\\/\\d\\d\\/\\d\\d,date", "\\d{2}\\/\\d\\d\\/\\d\\d,short_date"]) \\
        ...     .setDelimiter(",") \\
        ...     .setInputCols(["sentence"]) \\
        ...     .setOutputCol("regex") \\
        ...     .setStrategy("MATCH_ALL")

        Parameters
        ----------
        value : List[str]
            List of rules
        """
        return self._set(rules=value)

    def setDelimiter(self, value):
        """Sets the delimiter for rules.

        Parameters
        ----------
        value : str
            Delimiter for the rules
        """
        return self._set(delimiter=value)

    def _create_model(self, java_model):
        # Wrap the fitted JVM model in its Python-side counterpart.
        return RegexMatcherModel(java_model=java_model)
192
+
193
+
194
class RegexMatcherModel(AnnotatorModel):
    """Fitted counterpart of the :class:`.RegexMatcher` approach.

    Instances of this class are produced by fitting a :class:`.RegexMatcher`;
    see that class for configuration and training details.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    None
    """

    name = "RegexMatcherModel"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RegexMatcherModel", java_model=None):
        """Wrap an existing JVM model, or create a fresh one from `classname`."""
        super(RegexMatcherModel, self).__init__(classname=classname, java_model=java_model)