spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
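
The listing above shows the 2.6.x flat modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py and their compiled .pyc artifacts) removed and replaced by the sparknlp.annotator, sparknlp.base, and sparknlp.common subpackages. A minimal sketch of what the new layout means for user code, assuming spark-nlp 6.2.1 is installed; the wildcard imports mirror the style used in the docstrings below, and sparknlp.start() is the usual session helper:

>>> import sparknlp
>>> spark = sparknlp.start()                    # builds a Spark session with Spark NLP on the classpath
>>> from sparknlp.base import *                 # DocumentAssembler, Finisher, ...
>>> from sparknlp.annotator import *            # Tokenizer, Normalizer, ... re-exported by the new package
>>> from sparknlp.annotator.token.tokenizer import Tokenizer   # direct import from the new subpackage also works

Selected hunks from the new files follow.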
sparknlp/annotator/token/tokenizer.py
@@ -0,0 +1,561 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the Tokenizer."""
+
+
+ from sparknlp.common import *
+
+
+ class Tokenizer(AnnotatorApproach):
+     """Tokenizes raw text in document type columns into ``TokenizedSentence``.
+
+     This class represents a non-fitted tokenizer. Fitting it will cause the
+     internal RuleFactory to construct the rules for tokenizing from the input
+     configuration.
+
+     Identifies tokens with tokenization open standards. A few rules will help
+     customizing it if defaults do not fit user needs.
+
+     For extended examples of usage see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     targetPattern
+         Pattern to grab from text as token candidates, by default ``\\S+``
+     prefixPattern
+         Regex with groups and begins with ``\\A`` to match target prefix, by
+         default ``\\A([^\\s\\w\\$\\.]*)``
+     suffixPattern
+         Regex with groups and ends with ``\\z`` to match target suffix, by
+         default ``([^\\s\\w]?)([^\\s\\w]*)\\z``
+     infixPatterns
+         Regex patterns that match tokens within a single target. Groups
+         identify different sub-tokens. Multiple defaults are provided.
+     exceptions
+         Words that won't be affected by tokenization rules
+     exceptionsPath
+         Path to file containing list of exceptions
+     caseSensitiveExceptions
+         Whether to care for case sensitiveness in exceptions, by default True
+     contextChars
+         Character list used to separate from token boundaries, by default ['.',
+         ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"]
+     splitPattern
+         Pattern to separate from the inside of tokens. Takes priority over
+         splitChars.
+     splitChars
+         Character list used to separate from the inside of tokens
+     minLength
+         Set the minimum allowed length for each token, by default 0
+     maxLength
+         Set the maximum allowed length for each token, by default 99999
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> data = spark.createDataFrame([["I'd like to say we didn't expect that. Jane's boyfriend."]]).toDF("text")
+     >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
+     >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token").fit(data)
+     >>> pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)
+     >>> result = pipeline.transform(data)
+     >>> result.selectExpr("token.result").show(truncate=False)
+     +-----------------------------------------------------------------------+
+     |result                                                                 |
+     +-----------------------------------------------------------------------+
+     |[I'd, like, to, say, we, didn't, expect, that, ., Jane's, boyfriend, .]|
+     +-----------------------------------------------------------------------+
+     """
+
+     name = 'Tokenizer'
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     targetPattern = Param(Params._dummy(),
+                           "targetPattern",
+                           "pattern to grab from text as token candidates. Defaults to \S+",
+                           typeConverter=TypeConverters.toString)
+
+     prefixPattern = Param(Params._dummy(),
+                           "prefixPattern",
+                           "regex with groups and begins with \A to match target prefix. Defaults to \A([^\s\w\$\.]*)",
+                           typeConverter=TypeConverters.toString)
+
+     suffixPattern = Param(Params._dummy(),
+                           "suffixPattern",
+                           "regex with groups and ends with \z to match target suffix. Defaults to ([^\s\w]?)([^\s\w]*)\z",
+                           typeConverter=TypeConverters.toString)
+
+     infixPatterns = Param(Params._dummy(),
+                           "infixPatterns",
+                           "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults",
+                           typeConverter=TypeConverters.toListString)
+
+     exceptions = Param(Params._dummy(),
+                        "exceptions",
+                        "Words that won't be affected by tokenization rules",
+                        typeConverter=TypeConverters.toListString)
+
+     exceptionsPath = Param(Params._dummy(),
+                            "exceptionsPath",
+                            "path to file containing list of exceptions",
+                            typeConverter=TypeConverters.identity)
+
+     caseSensitiveExceptions = Param(Params._dummy(),
+                                     "caseSensitiveExceptions",
+                                     "Whether to care for case sensitiveness in exceptions",
+                                     typeConverter=TypeConverters.toBoolean)
+
+     contextChars = Param(Params._dummy(),
+                          "contextChars",
+                          "character list used to separate from token boundaries",
+                          typeConverter=TypeConverters.toListString)
+
+     splitPattern = Param(Params._dummy(),
+                          "splitPattern",
+                          "pattern used to separate from the inside of tokens. Takes priority over splitChars",
+                          typeConverter=TypeConverters.toString)
+
+     splitChars = Param(Params._dummy(),
+                        "splitChars",
+                        "character list used to separate from the inside of tokens",
+                        typeConverter=TypeConverters.toListString)
+
+     minLength = Param(Params._dummy(),
+                       "minLength",
+                       "Set the minimum allowed length for each token",
+                       typeConverter=TypeConverters.toInt)
+
+     maxLength = Param(Params._dummy(),
+                       "maxLength",
+                       "Set the maximum allowed length for each token",
+                       typeConverter=TypeConverters.toInt)
+
+     @keyword_only
+     def __init__(self):
+         super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Tokenizer")
+         self._setDefault(
+             targetPattern="\\S+",
+             contextChars=[".", ",", ";", ":", "!", "?", "*", "-", "(", ")", "\"", "'"],
+             caseSensitiveExceptions=True,
+             minLength=0,
+             maxLength=99999
+         )
+
+     def getInfixPatterns(self):
+         """Gets regex patterns that match tokens within a single target. Groups
+         identify different sub-tokens.
+
+         Returns
+         -------
+         List[str]
+             The infix patterns
+         """
+         return self.getOrDefault("infixPatterns")
+
+     def getSuffixPattern(self):
+         """Gets regex with groups and ends with ``\\z`` to match target suffix.
+
+         Returns
+         -------
+         str
+             The suffix pattern
+         """
+         return self.getOrDefault("suffixPattern")
+
+     def getPrefixPattern(self):
+         """Gets regex with groups and begins with ``\\A`` to match target
+         prefix.
+
+         Returns
+         -------
+         str
+             The prefix pattern
+         """
+         return self.getOrDefault("prefixPattern")
+
+     def getContextChars(self):
+         """Gets character list used to separate from token boundaries.
+
+         Returns
+         -------
+         List[str]
+             Character list used to separate from token boundaries
+         """
+         return self.getOrDefault("contextChars")
+
+     def getSplitChars(self):
+         """Gets character list used to separate from the inside of tokens.
+
+         Returns
+         -------
+         List[str]
+             Character list used to separate from the inside of tokens
+         """
+         return self.getOrDefault("splitChars")
+
+     def setTargetPattern(self, value):
+         """Sets pattern to grab from text as token candidates, by default
+         ``\\S+``.
+
+         Parameters
+         ----------
+         value : str
+             Pattern to grab from text as token candidates
+         """
+         return self._set(targetPattern=value)
+
+     def setPrefixPattern(self, value):
+         """Sets regex with groups and begins with ``\\A`` to match target
+         prefix, by default ``\\A([^\\s\\w\\$\\.]*)``.
+
+         Parameters
+         ----------
+         value : str
+             Regex with groups and begins with ``\\A`` to match target prefix
+         """
+         return self._set(prefixPattern=value)
+
+     def setSuffixPattern(self, value):
+         """Sets regex with groups and ends with ``\\z`` to match target suffix,
+         by default ``([^\\s\\w]?)([^\\s\\w]*)\\z``.
+
+         Parameters
+         ----------
+         value : str
+             Regex with groups and ends with ``\\z`` to match target suffix
+         """
+         return self._set(suffixPattern=value)
+
+     def setInfixPatterns(self, value):
+         """Sets regex patterns that match tokens within a single target. Groups
+         identify different sub-tokens.
+
+         Parameters
+         ----------
+         value : List[str]
+             Regex patterns that match tokens within a single target
+         """
+         return self._set(infixPatterns=value)
+
+     def addInfixPattern(self, value):
+         """Adds an additional regex pattern that matches tokens within a single
+         target. Groups identify different sub-tokens.
+
+         Parameters
+         ----------
+         value : str
+             Regex pattern that matches tokens within a single target
+         """
+         try:
+             infix_patterns = self.getInfixPatterns()
+         except KeyError:
+             infix_patterns = []
+         infix_patterns.insert(0, value)
+         return self._set(infixPatterns=infix_patterns)
+
+     def setExceptions(self, value):
+         """Sets words that won't be affected by tokenization rules.
+
+         Parameters
+         ----------
+         value : List[str]
+             Words that won't be affected by tokenization rules
+         """
+         return self._set(exceptions=value)
+
+     def getExceptions(self):
+         """Gets words that won't be affected by tokenization rules.
+
+         Returns
+         -------
+         List[str]
+             Words that won't be affected by tokenization rules
+         """
+         return self.getOrDefault("exceptions")
+
+     def setExceptionsPath(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
+         """Sets path to a txt file with the list of token exceptions.
+
+         Parameters
+         ----------
+         path : str
+             Path to the source file
+         read_as : str, optional
+             How to read the file, by default ReadAs.TEXT
+         options : dict, optional
+             Options to read the resource, by default {"format": "text"}
+         """
+         opts = options.copy()
+         return self._set(exceptionsPath=ExternalResource(path, read_as, opts))
+
+     def addException(self, value):
+         """Adds an additional word that won't be affected by tokenization
+         rules.
+
+         Parameters
+         ----------
+         value : str
+             Additional word that won't be affected by tokenization rules
+         """
+         try:
+             exception_tokens = self.getExceptions()
+         except KeyError:
+             exception_tokens = []
+         exception_tokens.append(value)
+         return self._set(exceptions=exception_tokens)
+
+     def setCaseSensitiveExceptions(self, value):
+         """Sets whether to care for case sensitiveness in exceptions, by
+         default True.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to care for case sensitiveness in exceptions
+         """
+         return self._set(caseSensitiveExceptions=value)
+
+     def getCaseSensitiveExceptions(self):
+         """Gets whether to care for case sensitiveness in exceptions.
+
+         Returns
+         -------
+         bool
+             Whether to care for case sensitiveness in exceptions
+         """
+         return self.getOrDefault("caseSensitiveExceptions")
+
+     def setContextChars(self, value):
+         """Sets character list used to separate from token boundaries, by
+         default ['.', ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"].
+
+         Parameters
+         ----------
+         value : List[str]
+             Character list used to separate from token boundaries
+         """
+         return self._set(contextChars=value)
+
+     def addContextChars(self, value):
+         """Adds an additional character to the list used to separate from token
+         boundaries.
+
+         Parameters
+         ----------
+         value : str
+             Additional context character
+         """
+         try:
+             context_chars = self.getContextChars()
+         except KeyError:
+             context_chars = []
+         context_chars.append(value)
+         return self._set(contextChars=context_chars)
+
+     def setSplitPattern(self, value):
+         """Sets pattern to separate from the inside of tokens. Takes priority
+         over splitChars.
+
+         Parameters
+         ----------
+         value : str
+             Pattern used to separate from the inside of tokens
+         """
+         return self._set(splitPattern=value)
+
+     def setSplitChars(self, value):
+         """Sets character list used to separate from the inside of tokens.
+
+         Parameters
+         ----------
+         value : List[str]
+             Character list used to separate from the inside of tokens
+         """
+         return self._set(splitChars=value)
+
+     def addSplitChars(self, value):
+         """Adds an additional character to separate from the inside of tokens.
+
+         Parameters
+         ----------
+         value : str
+             Additional character to separate from the inside of tokens
+         """
+         try:
+             split_chars = self.getSplitChars()
+         except KeyError:
+             split_chars = []
+         split_chars.append(value)
+         return self._set(splitChars=split_chars)
+
+     def setMinLength(self, value):
+         """Sets the minimum allowed length for each token, by default 0.
+
+         Parameters
+         ----------
+         value : int
+             Minimum allowed length for each token
+         """
+         return self._set(minLength=value)
+
+     def setMaxLength(self, value):
+         """Sets the maximum allowed length for each token, by default 99999.
+
+         Parameters
+         ----------
+         value : int
+             Maximum allowed length for each token
+         """
+         return self._set(maxLength=value)
+
+     def _create_model(self, java_model):
+         return TokenizerModel(java_model=java_model)
+
+
+ class TokenizerModel(AnnotatorModel):
+     """Tokenizes raw text into word pieces, tokens. Identifies tokens with
+     tokenization open standards. A few rules will help customizing it if
+     defaults do not fit user needs.
+
+     This class represents an already fitted :class:`.Tokenizer`.
+
+     See the main class Tokenizer for more examples of usage.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     splitPattern
+         Pattern to separate from the inside of tokens. Takes priority over
+         splitChars.
+     splitChars
+         Character list used to separate from the inside of tokens
+     """
+     name = "TokenizerModel"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     exceptions = Param(Params._dummy(),
+                        "exceptions",
+                        "Words that won't be affected by tokenization rules",
+                        typeConverter=TypeConverters.toListString)
+
+     caseSensitiveExceptions = Param(Params._dummy(),
+                                     "caseSensitiveExceptions",
+                                     "Whether to care for case sensitiveness in exceptions",
+                                     typeConverter=TypeConverters.toBoolean)
+
+     targetPattern = Param(Params._dummy(),
+                           "targetPattern",
+                           "pattern to grab from text as token candidates. Defaults to \S+",
+                           typeConverter=TypeConverters.toString)
+
+     rules = Param(Params._dummy(),
+                   "rules",
+                   "Rules structure factory containing pre-processed regex rules",
+                   typeConverter=TypeConverters.identity)
+
+     splitPattern = Param(Params._dummy(),
+                          "splitPattern",
+                          "pattern used to separate from the inside of tokens. Takes priority over splitChars",
+                          typeConverter=TypeConverters.toString)
+
+     splitChars = Param(Params._dummy(),
+                        "splitChars",
+                        "character list used to separate from the inside of tokens",
+                        typeConverter=TypeConverters.toListString)
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.TokenizerModel", java_model=None):
+         super(TokenizerModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             targetPattern="\\S+",
+             caseSensitiveExceptions=True
+         )
+
+     def setSplitPattern(self, value):
+         """Sets pattern to separate from the inside of tokens. Takes priority
+         over splitChars.
+
+         Parameters
+         ----------
+         value : str
+             Pattern used to separate from the inside of tokens
+         """
+         return self._set(splitPattern=value)
+
+     def setSplitChars(self, value):
+         """Sets character list used to separate from the inside of tokens.
+
+         Parameters
+         ----------
+         value : List[str]
+             Character list used to separate from the inside of tokens
+         """
+         return self._set(splitChars=value)
+
+     def addSplitChars(self, value):
+         """Adds an additional character to separate from the inside of tokens.
+
+         Parameters
+         ----------
+         value : str
+             Additional character to separate from the inside of tokens
+         """
+         try:
+             split_chars = self.getSplitChars()
+         except KeyError:
+             split_chars = []
+         split_chars.append(value)
+         return self._set(splitChars=split_chars)
+
+     @staticmethod
+     def pretrained(name="token_rules", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "token_rules"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         TokenizerModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(TokenizerModel, name, lang, remote_loc)
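
The Tokenizer docstring above documents exceptions, splitChars, and the token length bounds, but its example uses only defaults. Below is a hedged sketch exercising those setters, assuming `data` and the `document` column exist as in the docstring example; the sample exception and split character are illustrative only:

>>> tokenizer = Tokenizer() \
...     .setInputCols(["document"]) \
...     .setOutputCol("token") \
...     .setSplitChars(["-"]) \
...     .setMinLength(1) \
...     .setMaxLength(50) \
...     .addException("New York")     # keep the bigram intact as a single token
>>> tokenizer_model = tokenizer.fit(data)  # fitting builds the internal RuleFactory rules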
sparknlp/annotator/token2_chunk.py
@@ -0,0 +1,76 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for Token2Chunk."""
+
+
+ from sparknlp.common import *
+
+
+ class Token2Chunk(AnnotatorModel):
+     """Converts ``TOKEN`` type Annotations to ``CHUNK`` type.
+
+     This can be useful if entities have already been extracted as ``TOKEN``
+     and following annotators require ``CHUNK`` types.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> token2chunk = Token2Chunk() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("chunk")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     token2chunk
+     ... ])
+     >>> data = spark.createDataFrame([["One Two Three Four"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(chunk) as result").show(truncate=False)
+     +------------------------------------------+
+     |result                                    |
+     +------------------------------------------+
+     |[chunk, 0, 2, One, [sentence -> 0], []]   |
+     |[chunk, 4, 6, Two, [sentence -> 0], []]   |
+     |[chunk, 8, 12, Three, [sentence -> 0], []]|
+     |[chunk, 14, 17, Four, [sentence -> 0], []]|
+     +------------------------------------------+
+     """
+     name = "Token2Chunk"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     def __init__(self):
+         super(Token2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Token2Chunk")
+
sparknlp/annotator/ws/__init__.py
@@ -0,0 +1,16 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Module of annotators for word segmentation."""
+ from sparknlp.annotator.ws.word_segmenter import *
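
This last hunk only re-exports sparknlp.annotator.ws.word_segmenter (added as file 202 in the listing above). A sketch of loading a word segmenter through the new subpackage path; the pretrained model name "wordseg_pku" is an assumption based on Spark NLP's public models hub, not something this diff confirms:

>>> from sparknlp.annotator.ws import *
>>> word_segmenter = WordSegmenterModel.pretrained("wordseg_pku", "zh")  # hypothetical model name
>>> word_segmenter.setInputCols(["document"]).setOutputCol("token")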