spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
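
The most consequential change in this list is structural: the 2.6.x monolithic modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py) are removed and replaced by per-feature subpackages under sparknlp/annotator/, sparknlp/base/, and sparknlp/common/. Below is a minimal hedged smoke-test sketch of the 6.x layout; it assumes spark-nlp and pyspark are installed with a working Java environment, and the printed version string is only what this wheel is expected to report.

# Hedged smoke test for the 6.x package layout shown in the file list above.
# sparknlp.start() launches a local Spark session with the Spark NLP jar.
import sparknlp

# Old wildcard-style imports still resolve, now via the package __init__ files:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer

# The same class is also importable from its new per-feature module
# (sparknlp/annotator/token/tokenizer.py in the list above):
from sparknlp.annotator.token.tokenizer import Tokenizer as TokenizerByModule

spark = sparknlp.start()
print(sparknlp.version())  # expected to report 6.2.1 for this wheel
# Should hold, since the package __init__ re-exports the same class object:
assert Tokenizer is TokenizerByModule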
sparknlp/annotator/matcher/text_matcher.py
@@ -0,0 +1,290 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the TextMatcher."""
+
+
+from sparknlp.common import *
+
+
+class TextMatcher(AnnotatorApproach):
+    """Annotator to match exact phrases (by token) provided in a file against a
+    Document.
+
+    A text file of predefined phrases must be provided with
+    :meth:`.setEntities`.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-matcher-pipeline/extractor.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    entities
+        ExternalResource for entities
+    caseSensitive
+        Whether to match regardless of case, by default True
+    mergeOverlapping
+        Whether to merge overlapping matched chunks, by default False
+    entityValue
+        Value for the entity metadata field
+    buildFromTokens
+        Whether the TextMatcher should take the CHUNK from TOKEN or not
+
+    Examples
+    --------
+    In this example, the entities file is of the form::
+
+        ...
+        dolore magna aliqua
+        lorem ipsum dolor. sit
+        laborum
+        ...
+
+    where each line represents an entity phrase to be extracted.
+
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> data = spark.createDataFrame([["Hello dolore magna aliqua. Lorem ipsum dolor. sit in laborum"]]).toDF("text")
+    >>> entityExtractor = TextMatcher() \\
+    ...     .setInputCols(["document", "token"]) \\
+    ...     .setEntities("src/test/resources/entity-extractor/test-phrases.txt", ReadAs.TEXT) \\
+    ...     .setOutputCol("entity") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([documentAssembler, tokenizer, entityExtractor])
+    >>> results = pipeline.fit(data).transform(data)
+    >>> results.selectExpr("explode(entity) as result").show(truncate=False)
+    +------------------------------------------------------------------------------------------+
+    |result                                                                                    |
+    +------------------------------------------------------------------------------------------+
+    |[chunk, 6, 24, dolore magna aliqua, [entity -> entity, sentence -> 0, chunk -> 0], []]    |
+    |[chunk, 27, 48, Lorem ipsum dolor. sit, [entity -> entity, sentence -> 0, chunk -> 1], []]|
+    |[chunk, 53, 59, laborum, [entity -> entity, sentence -> 0, chunk -> 2], []]               |
+    +------------------------------------------------------------------------------------------+
+
+    See Also
+    --------
+    BigTextMatcher : to match large amounts of text
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    entities = Param(Params._dummy(),
+                     "entities",
+                     "ExternalResource for entities",
+                     typeConverter=TypeConverters.identity)
+
+    caseSensitive = Param(Params._dummy(),
+                          "caseSensitive",
+                          "whether to match regardless of case. Defaults true",
+                          typeConverter=TypeConverters.toBoolean)
+
+    mergeOverlapping = Param(Params._dummy(),
+                             "mergeOverlapping",
+                             "whether to merge overlapping matched chunks. Defaults false",
+                             typeConverter=TypeConverters.toBoolean)
+
+    entityValue = Param(Params._dummy(),
+                        "entityValue",
+                        "value for the entity metadata field",
+                        typeConverter=TypeConverters.toString)
+
+    buildFromTokens = Param(Params._dummy(),
+                            "buildFromTokens",
+                            "whether the TextMatcher should take the CHUNK from TOKEN or not",
+                            typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self):
+        super(TextMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.TextMatcher")
+        self._setDefault(inputCols=[AnnotatorType.DOCUMENT, AnnotatorType.TOKEN])
+        self._setDefault(caseSensitive=True)
+        self._setDefault(mergeOverlapping=False)
+
+    def _create_model(self, java_model):
+        return TextMatcherModel(java_model=java_model)
+
+    def setEntities(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
+        """Sets the external resource for the entities.
+
+        Parameters
+        ----------
+        path : str
+            Path to the external resource
+        read_as : str, optional
+            How to read the resource, by default ReadAs.TEXT
+        options : dict, optional
+            Options for reading the resource, by default {"format": "text"}
+        """
+        return self._set(entities=ExternalResource(path, read_as, options.copy()))
+
+    def setCaseSensitive(self, b):
+        """Sets whether to match regardless of case, by default True.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to match regardless of case
+        """
+        return self._set(caseSensitive=b)
+
+    def setMergeOverlapping(self, b):
+        """Sets whether to merge overlapping matched chunks, by default False.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to merge overlapping matched chunks
+        """
+        return self._set(mergeOverlapping=b)
+
+    def setEntityValue(self, b):
+        """Sets the value for the entity metadata field.
+
+        Parameters
+        ----------
+        b : str
+            Value for the entity metadata field
+        """
+        return self._set(entityValue=b)
+
+    def setBuildFromTokens(self, b):
+        """Sets whether the TextMatcher should take the CHUNK from TOKEN or not.
+
+        Parameters
+        ----------
+        b : bool
+            Whether the TextMatcher should take the CHUNK from TOKEN or not
+        """
+        return self._set(buildFromTokens=b)
+
+
+class TextMatcherModel(AnnotatorModel):
+    """Instantiated model of the TextMatcher.
+
+    This is the instantiated model of the :class:`.TextMatcher`. For training
+    your own model, please see the documentation of that class.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    mergeOverlapping
+        Whether to merge overlapping matched chunks, by default False
+    entityValue
+        Value for the entity metadata field
+    buildFromTokens
+        Whether the TextMatcher should take the CHUNK from TOKEN or not
+    """
+    name = "TextMatcherModel"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    mergeOverlapping = Param(Params._dummy(),
+                             "mergeOverlapping",
+                             "whether to merge overlapping matched chunks. Defaults false",
+                             typeConverter=TypeConverters.toBoolean)
+
+    searchTrie = Param(Params._dummy(),
+                       "searchTrie",
+                       "searchTrie",
+                       typeConverter=TypeConverters.identity)
+
+    entityValue = Param(Params._dummy(),
+                        "entityValue",
+                        "value for the entity metadata field",
+                        typeConverter=TypeConverters.toString)
+
+    buildFromTokens = Param(Params._dummy(),
+                            "buildFromTokens",
+                            "whether the TextMatcher should take the CHUNK from TOKEN or not",
+                            typeConverter=TypeConverters.toBoolean)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.TextMatcherModel", java_model=None):
+        super(TextMatcherModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+
+    def setMergeOverlapping(self, b):
+        """Sets whether to merge overlapping matched chunks, by default False.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to merge overlapping matched chunks
+        """
+        return self._set(mergeOverlapping=b)
+
+    def setEntityValue(self, b):
+        """Sets the value for the entity metadata field.
+
+        Parameters
+        ----------
+        b : str
+            Value for the entity metadata field
+        """
+        return self._set(entityValue=b)
+
+    def setBuildFromTokens(self, b):
+        """Sets whether the TextMatcher should take the CHUNK from TOKEN or not.
+
+        Parameters
+        ----------
+        b : bool
+            Whether the TextMatcher should take the CHUNK from TOKEN or not
+        """
+        return self._set(buildFromTokens=b)
+
+    @staticmethod
+    def pretrained(name, lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        TextMatcherModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
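
To complement the docstring example above, here is a hedged end-to-end sketch that fits the TextMatcher, persists the resulting TextMatcherModel, and reuses it through a LightPipeline (also part of this release; see sparknlp/base/light_pipeline.py in the file list). The phrase list and file paths are illustrative, not from the source.

# Hedged sketch: write a local phrase file, fit the matcher, save and
# reload the fitted model, then run it on a plain string off-cluster.
import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import Tokenizer, TextMatcher, TextMatcherModel
from sparknlp.common import ReadAs
from pyspark.ml import Pipeline

spark = sparknlp.start()

with open("phrases.txt", "w") as f:  # illustrative entities file
    f.write("dolore magna aliqua\nlaborum\n")

document = DocumentAssembler().setInputCol("text").setOutputCol("document")
token = Tokenizer().setInputCols(["document"]).setOutputCol("token")
matcher = TextMatcher() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("entity") \
    .setEntities("phrases.txt", ReadAs.TEXT) \
    .setCaseSensitive(False)

data = spark.createDataFrame([["Hello dolore magna aliqua in laborum"]]).toDF("text")
model = Pipeline(stages=[document, token, matcher]).fit(data)

# The fitted stage is a TextMatcherModel; it persists like any Spark ML
# model (path is illustrative).
model.stages[-1].write().overwrite().save("/tmp/textmatcher_model")
reloaded = TextMatcherModel.load("/tmp/textmatcher_model")

# LightPipeline runs the fitted pipeline on plain strings, off-cluster.
light = LightPipeline(model)
print(light.annotate("dolore magna aliqua and laborum")["entity"])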
sparknlp/annotator/n_gram_generator.py
@@ -0,0 +1,141 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the NGramGenerator."""
+from sparknlp.common import *
+
+
+class NGramGenerator(AnnotatorModel):
+    """A feature transformer that converts the input array of strings
+    (annotatorType ``TOKEN``) into an array of n-grams (annotatorType
+    ``CHUNK``).
+
+    Null values in the input array are ignored. It returns an array of n-grams
+    where each n-gram is represented by a space-separated string of words.
+
+    When the input is empty, an empty array is returned. When the input array
+    length is less than n (number of elements per n-gram), no n-grams are
+    returned.
+
+    For more extended examples see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``TOKEN``              ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    n
+        Number of elements per n-gram (>= 1), by default 2
+    enableCumulative
+        Whether to calculate all n-grams from 1 through n instead of just the actual n-grams, by default False
+    delimiter
+        Character to use to join the tokens, by default " "
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> nGrams = NGramGenerator() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("ngrams") \\
+    ...     .setN(2)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentence,
+    ...     tokenizer,
+    ...     nGrams
+    ... ])
+    >>> data = spark.createDataFrame([["This is my sentence."]]).toDF("text")
+    >>> results = pipeline.fit(data).transform(data)
+    >>> results.selectExpr("explode(ngrams) as result").show(truncate=False)
+    +------------------------------------------------------------+
+    |result                                                      |
+    +------------------------------------------------------------+
+    |[chunk, 0, 6, This is, [sentence -> 0, chunk -> 0], []]     |
+    |[chunk, 5, 9, is my, [sentence -> 0, chunk -> 1], []]       |
+    |[chunk, 8, 18, my sentence, [sentence -> 0, chunk -> 2], []]|
+    |[chunk, 11, 19, sentence ., [sentence -> 0, chunk -> 3], []]|
+    +------------------------------------------------------------+
+    """
+
+    name = "NGramGenerator"
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    @keyword_only
+    def __init__(self):
+        super(NGramGenerator, self).__init__(classname="com.johnsnowlabs.nlp.annotators.NGramGenerator")
+        self._setDefault(
+            n=2,
+            enableCumulative=False
+        )
+
+    n = Param(Params._dummy(), "n", "number of elements per n-gram (>= 1)", typeConverter=TypeConverters.toInt)
+    enableCumulative = Param(Params._dummy(), "enableCumulative", "whether to calculate just the actual n-grams " +
+                             "or all n-grams from 1 through n", typeConverter=TypeConverters.toBoolean)
+
+    delimiter = Param(Params._dummy(), "delimiter", "character to use to join the tokens",
+                      typeConverter=TypeConverters.toString)
+
+    def setN(self, value):
+        """Sets the number of elements per n-gram (>= 1), by default 2.
+
+        Parameters
+        ----------
+        value : int
+            Number of elements per n-gram (>= 1)
+        """
+        return self._set(n=value)
+
+    def setEnableCumulative(self, value):
+        """Sets whether to calculate all n-grams from 1 through n, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to calculate all n-grams from 1 through n instead of just the actual n-grams
+        """
+        return self._set(enableCumulative=value)
+
+    def setDelimiter(self, value):
+        """Sets the character used to join the tokens, by default " ".
+
+        Parameters
+        ----------
+        value : str
+            Character to use to join the tokens
+
+        Raises
+        ------
+        Exception
+            If the delimiter is not exactly one character long
+        """
+        if len(value) != 1:
+            raise Exception("Delimiter should have length == 1")
+        return self._set(delimiter=value)
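
A short hedged sketch of the two options the docstring example leaves at their defaults; the column names reuse the pipeline above, and the underscore delimiter is illustrative.

# With cumulative generation enabled, the annotator emits all n-grams from
# 1 through n rather than only n-grams of exactly length n; the delimiter
# must be a single character, otherwise setDelimiter raises.
from sparknlp.annotator import NGramGenerator

trigrams = NGramGenerator() \
    .setInputCols(["token"]) \
    .setOutputCol("ngrams") \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter("_")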
sparknlp/annotator/ner/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2017-2023 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module of annotators for named entity recognition."""
+from sparknlp.annotator.ner.ner_approach import *
+from sparknlp.annotator.ner.ner_converter import *
+from sparknlp.annotator.ner.ner_crf import *
+from sparknlp.annotator.ner.ner_dl import *
+from sparknlp.annotator.ner.ner_dl_graph_checker import *
+from sparknlp.annotator.ner.ner_overwriter import *
+from sparknlp.annotator.ner.zero_shot_ner_model import *
sparknlp/annotator/ner/ner_approach.py
@@ -0,0 +1,94 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains base classes for NER Annotators."""
+
+from sparknlp.common import *
+
+
+class NerApproach(Params):
+    """Base class for Ner*Approach annotators.
+    """
+    labelColumn = Param(Params._dummy(),
+                        "labelColumn",
+                        "Column with the label for each token",
+                        typeConverter=TypeConverters.toString)
+
+    entities = Param(Params._dummy(), "entities", "Entities to recognize", TypeConverters.toListString)
+
+    minEpochs = Param(Params._dummy(), "minEpochs", "Minimum number of epochs to train", TypeConverters.toInt)
+
+    maxEpochs = Param(Params._dummy(), "maxEpochs", "Maximum number of epochs to train", TypeConverters.toInt)
+
+    randomSeed = Param(Params._dummy(), "randomSeed", "Random seed", TypeConverters.toInt)
+
+    def setLabelColumn(self, value):
+        """Sets the name of the column for data labels.
+
+        Parameters
+        ----------
+        value : str
+            Column for data labels
+        """
+        return self._set(labelColumn=value)
+
+    def setEntities(self, tags):
+        """Sets the entities to recognize.
+
+        Parameters
+        ----------
+        tags : List[str]
+            List of entities
+        """
+        return self._set(entities=tags)
+
+    def setMinEpochs(self, epochs):
+        """Sets the minimum number of epochs to train.
+
+        Parameters
+        ----------
+        epochs : int
+            Minimum number of epochs to train
+        """
+        return self._set(minEpochs=epochs)
+
+    def setMaxEpochs(self, epochs):
+        """Sets the maximum number of epochs to train.
+
+        Parameters
+        ----------
+        epochs : int
+            Maximum number of epochs to train
+        """
+        return self._set(maxEpochs=epochs)
+
+    def setRandomSeed(self, seed):
+        """Sets the random seed for shuffling.
+
+        Parameters
+        ----------
+        seed : int
+            Random seed for shuffling
+        """
+        return self._set(randomSeed=seed)
+
+    def getLabelColumn(self):
+        """Gets the column with the label for each token.
+
+        Returns
+        -------
+        str
+            Column with the label for each token
+        """
+        return self.getOrDefault(self.labelColumn)
+
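
NerApproach is a Param mixin rather than a standalone annotator: trainable NER annotators such as NerDLApproach and NerCrfApproach (both in the file list above) inherit these setters. A hedged configuration sketch follows, with illustrative column names.

# The mixin's setters (setLabelColumn, setMinEpochs, setMaxEpochs,
# setRandomSeed) surface directly on the concrete trainers.
from sparknlp.annotator import NerDLApproach

ner_trainer = NerDLApproach() \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner") \
    .setLabelColumn("label") \
    .setMinEpochs(1) \
    .setMaxEpochs(5) \
    .setRandomSeed(0)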
sparknlp/annotator/ner/ner_converter.py
@@ -0,0 +1,148 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the NerConverter."""
+
+from sparknlp.common import *
+
+
+class NerConverter(AnnotatorModel):
+    """Converts an IOB or IOB2 representation of NER to a user-friendly one, by
+    associating the tokens of recognized entities and their label. Results in
+    ``CHUNK`` Annotation type.
+
+    NER chunks can then be filtered by setting a whitelist with
+    ``setWhiteList``. Chunks with no associated entity (tagged "O") are
+    filtered out.
+
+    See also `Inside–outside–beginning (tagging)
+    <https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>`__
+    for more information.
+
+    ================================= ======================
+    Input Annotation types            Output Annotation type
+    ================================= ======================
+    ``DOCUMENT, TOKEN, NAMED_ENTITY`` ``CHUNK``
+    ================================= ======================
+
+    Parameters
+    ----------
+    whiteList
+        If defined, list of entities to process. The rest will be ignored. Do
+        not include the IOB prefix on labels
+    preservePosition
+        Whether to preserve the original position of the tokens in the original document
+        or use the modified tokens, by default `True`
+
+    Examples
+    --------
+    This is a continuation of the example of the :class:`.NerDLModel`. See that
+    class on how to extract the entities. The output of the NerDLModel follows
+    the Annotator schema and can be converted like so:
+
+    >>> result.selectExpr("explode(ner)").show(truncate=False)
+    +----------------------------------------------------+
+    |col                                                 |
+    +----------------------------------------------------+
+    |[named_entity, 0, 2, B-ORG, [word -> U.N], []]      |
+    |[named_entity, 3, 3, O, [word -> .], []]            |
+    |[named_entity, 5, 12, O, [word -> official], []]    |
+    |[named_entity, 14, 18, B-PER, [word -> Ekeus], []]  |
+    |[named_entity, 20, 24, O, [word -> heads], []]      |
+    |[named_entity, 26, 28, O, [word -> for], []]        |
+    |[named_entity, 30, 36, B-LOC, [word -> Baghdad], []]|
+    |[named_entity, 37, 37, O, [word -> .], []]          |
+    +----------------------------------------------------+
+
+    After the converter is used:
+
+    >>> converter = NerConverter() \\
+    ...     .setInputCols(["sentence", "token", "ner"]) \\
+    ...     .setOutputCol("entities")
+    >>> converter.transform(result).selectExpr("explode(entities)").show(truncate=False)
+    +------------------------------------------------------------------------+
+    |col                                                                     |
+    +------------------------------------------------------------------------+
+    |[chunk, 0, 2, U.N, [entity -> ORG, sentence -> 0, chunk -> 0], []]      |
+    |[chunk, 14, 18, Ekeus, [entity -> PER, sentence -> 0, chunk -> 1], []]  |
+    |[chunk, 30, 36, Baghdad, [entity -> LOC, sentence -> 0, chunk -> 2], []]|
+    +------------------------------------------------------------------------+
+    """
+    name = 'NerConverter'
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.NAMED_ENTITY]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    whiteList = Param(
+        Params._dummy(),
+        "whiteList",
+        "If defined, list of entities to process. The rest will be ignored. Do not include the IOB prefix on labels",
+        typeConverter=TypeConverters.toListString
+    )
+
+    preservePosition = Param(
+        Params._dummy(),
+        "preservePosition",
+        "Whether to preserve the original position of the tokens in the original document or use the modified tokens",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    nerHasNoSchema = Param(
+        Params._dummy(),
+        "nerHasNoSchema",
+        "set this to true if your NER tags come from a model that does not have an IOB/IOB2 schema",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setWhiteList(self, entities):
+        """Sets the list of entities to process. The rest will be ignored.
+
+        Do not include the IOB prefix on labels.
+
+        Parameters
+        ----------
+        entities : List[str]
+            If defined, list of entities to process. The rest will be ignored.
+
+        """
+        return self._set(whiteList=entities)
+
+    def setPreservePosition(self, value):
+        """
+        Sets whether to preserve the original position of the tokens in the
+        original document or use the modified tokens, by default `True`.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to preserve the original position of the tokens in the original
+            document or use the modified tokens
+        """
+        return self._set(preservePosition=value)
+
+    def setNerHasNoSchema(self, value):
+        """
+        Sets whether the NER tags come from a model that does not have an
+        IOB/IOB2 schema.
+
+        Parameters
+        ----------
+        value : bool
+            Set this to true if your NER tags come from a model that does not
+            have an IOB/IOB2 schema
+        """
+        return self._set(nerHasNoSchema=value)
+
+    @keyword_only
+    def __init__(self):
+        super(NerConverter, self).__init__(
+            classname="com.johnsnowlabs.nlp.annotators.ner.NerConverter")