spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/training/conll.py
@@ -0,0 +1,150 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for CoNLL."""
+
+ import pyspark
+
+ from sparknlp.common import ReadAs
+ from sparknlp.internal import ExtendedJavaWrapper
+
+
+ class CoNLL(ExtendedJavaWrapper):
+     """Instantiates the class to read a CoNLL dataset.
+
+     The dataset should be in the format of `CoNLL 2003
+     <https://www.clips.uantwerpen.be/conll2003/ner/>`_ and needs to be specified
+     with :meth:`.readDataset`, which will create a dataframe with the data.
+
+     Can be used to train a :class:`NerDLApproach
+     <sparknlp.annotator.NerDLApproach>`.
+
+     **Input File Format**::
+
+         -DOCSTART- -X- -X- O
+
+         EU NNP B-NP B-ORG
+         rejects VBZ B-VP O
+         German JJ B-NP B-MISC
+         call NN I-NP O
+         to TO B-VP O
+         boycott VB I-VP O
+         British JJ B-NP B-MISC
+         lamb NN I-NP O
+         . . O O
+
+     Parameters
+     ----------
+     documentCol : str, optional
+         Name of the :class:`.DocumentAssembler` column, by default 'document'
+     sentenceCol : str, optional
+         Name of the :class:`.SentenceDetector` column, by default 'sentence'
+     tokenCol : str, optional
+         Name of the :class:`.Tokenizer` column, by default 'token'
+     posCol : str, optional
+         Name of the :class:`.PerceptronModel` column, by default 'pos'
+     conllLabelIndex : int, optional
+         Index of the label column in the dataset, by default 3
+     conllPosIndex : int, optional
+         Index of the POS tags in the dataset, by default 1
+     textCol : str, optional
+         Name of the text column in the dataset, by default 'text'
+     labelCol : str, optional
+         Name of the label column, by default 'label'
+     explodeSentences : bool, optional
+         Whether to explode sentences to separate rows, by default True
+     delimiter : str, optional
+         Delimiter used to separate columns inside the CoNLL file
+     includeDocId : bool, optional
+         Whether to try to parse the document id from the third item in the -DOCSTART- line (X if not found)
+
+     Examples
+     --------
+     >>> from sparknlp.training import CoNLL
+     >>> trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
+     >>> trainingData.selectExpr(
+     ...     "text",
+     ...     "token.result as tokens",
+     ...     "pos.result as pos",
+     ...     "label.result as label"
+     ... ).show(3, False)
+     +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
+     |text |tokens |pos |label |
+     +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
+     |EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
+     |Peter Blackburn |[Peter, Blackburn] |[NNP, NNP] |[B-PER, I-PER] |
+     |BRUSSELS 1996-08-22 |[BRUSSELS, 1996-08-22] |[NNP, CD] |[B-LOC, O] |
+     +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
+     """
+
+     def __init__(self,
+                  documentCol='document',
+                  sentenceCol='sentence',
+                  tokenCol='token',
+                  posCol='pos',
+                  conllLabelIndex=3,
+                  conllPosIndex=1,
+                  conllDocIdCol="doc_id",
+                  textCol='text',
+                  labelCol='label',
+                  explodeSentences=True,
+                  delimiter=' ',
+                  includeDocId=False
+                  ):
+         super(CoNLL, self).__init__("com.johnsnowlabs.nlp.training.CoNLL",
+                                     documentCol,
+                                     sentenceCol,
+                                     tokenCol,
+                                     posCol,
+                                     conllLabelIndex,
+                                     conllPosIndex,
+                                     conllDocIdCol,
+                                     textCol,
+                                     labelCol,
+                                     explodeSentences,
+                                     delimiter,
+                                     includeDocId)
+
+     def readDataset(self, spark, path, read_as=ReadAs.TEXT, partitions=8, storage_level=pyspark.StorageLevel.DISK_ONLY):
+         # ToDo Replace with std pyspark
+         """Reads the dataset from an external resource.
+
+         Parameters
+         ----------
+         spark : :class:`pyspark.sql.SparkSession`
+             Initiated Spark Session with Spark NLP
+         path : str
+             Path to the resource. It can take two forms: a path to a single CoNLL file, or a path to a folder containing multiple CoNLL files.
+             When the path points to a folder, the path must end in '*'.
+             Examples:
+             "/path/to/single/file.conll"
+             "/path/to/folder/containing/multiple/files/*"
+         read_as : str, optional
+             How to read the resource, by default ReadAs.TEXT
+         partitions : int, optional
+             Minimum number of partitions when lifting multiple files in parallel into a single dataframe, by default 8
+         storage_level : :class:`pyspark.StorageLevel`, optional
+             Persistence level according to PySpark definitions, by default StorageLevel.DISK_ONLY. Applies only when lifting multiple files.
+
+
+         Returns
+         -------
+         :class:`pyspark.sql.DataFrame`
+             Spark Dataframe with the data
+         """
+         jSession = spark._jsparkSession
+
+         jdf = self._java_obj.readDataset(jSession, path, read_as, partitions,
+                                          spark.sparkContext._getJavaStorageLevel(storage_level))
+         dataframe = self.getDataFrame(spark, jdf)
+         return dataframe
+
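For orientation, a minimal usage sketch of the CoNLL reader added above, following its docstring example (the training file path is a placeholder and `spark` is assumed to be an active Spark session started with sparknlp.start()):

    from sparknlp.training import CoNLL

    # Read a CoNLL 2003 formatted file into a DataFrame with document, sentence,
    # token, pos and label columns, ready to feed a NerDLApproach.
    training_data = CoNLL().readDataset(spark, "eng.train")
    training_data.selectExpr("text", "label.result as label").show(3, truncate=False)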
sparknlp/training/conllu.py
@@ -0,0 +1,103 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for CoNLLU."""
+
+ from sparknlp.common import ReadAs
+ from sparknlp.internal import ExtendedJavaWrapper
+
+
+ class CoNLLU(ExtendedJavaWrapper):
+     """Instantiates the class to read a CoNLL-U dataset.
+
+     The dataset should be in the format of `CoNLL-U
+     <https://universaldependencies.org/format.html>`_ and needs to be specified
+     with :meth:`.readDataset`, which will create a dataframe with the data.
+
+     Can be used to train a :class:`DependencyParserApproach
+     <sparknlp.annotator.DependencyParserApproach>`
+
+     **Input File Format**::
+
+         # sent_id = 1
+         # text = They buy and sell books.
+         1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
+         2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
+         3 and and CONJ CC _ 4 cc 4:cc _
+         4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
+         5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
+         6 . . PUNCT . _ 2 punct 2:punct _
+
+     Examples
+     --------
+     >>> from sparknlp.training import CoNLLU
+     >>> conlluFile = "src/test/resources/conllu/en.test.conllu"
+     >>> conllDataSet = CoNLLU(False).readDataset(spark, conlluFile)
+     >>> conllDataSet.selectExpr(
+     ...     "text",
+     ...     "form.result as form",
+     ...     "upos.result as upos",
+     ...     "xpos.result as xpos",
+     ...     "lemma.result as lemma"
+     ... ).show(1, False)
+     +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
+     |text |form |upos |xpos |lemma |
+     +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
+     |What if Google Morphed Into GoogleOS? |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
+     +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
+     """
+
+     def __init__(self,
+                  textCol='text',
+                  documentCol='document',
+                  sentenceCol='sentence',
+                  formCol='form',
+                  uposCol='upos',
+                  xposCol='xpos',
+                  lemmaCol='lemma',
+                  explodeSentences=True
+                  ):
+         super(CoNLLU, self).__init__("com.johnsnowlabs.nlp.training.CoNLLU",
+                                      textCol,
+                                      documentCol,
+                                      sentenceCol,
+                                      formCol,
+                                      uposCol,
+                                      xposCol,
+                                      lemmaCol,
+                                      explodeSentences)
+
+     def readDataset(self, spark, path, read_as=ReadAs.TEXT):
+         """Reads the dataset from an external resource.
+
+         Parameters
+         ----------
+         spark : :class:`pyspark.sql.SparkSession`
+             Initiated Spark Session with Spark NLP
+         path : str
+             Path to the resource
+         read_as : str, optional
+             How to read the resource, by default ReadAs.TEXT
+
+         Returns
+         -------
+         :class:`pyspark.sql.DataFrame`
+             Spark Dataframe with the data
+         """
+         # ToDo Replace with std pyspark
+         jSession = spark._jsparkSession
+
+         jdf = self._java_obj.readDataset(jSession, path, read_as)
+         dataframe = self.getDataFrame(spark, jdf)
+         return dataframe
+
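Similarly, a minimal sketch of reading a CoNLL-U file with the class above (the file path is a placeholder; `spark` is an active Spark NLP session). The resulting columns can feed a DependencyParserApproach, as noted in the docstring:

    from sparknlp.training import CoNLLU

    # Produces a DataFrame with text, document, sentence, form, upos, xpos and lemma columns.
    conllu_data = CoNLLU().readDataset(spark, "en.test.conllu")
    conllu_data.selectExpr("form.result as form", "upos.result as upos", "lemma.result as lemma").show(1, truncate=False)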
sparknlp/training/pos.py
@@ -0,0 +1,103 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains helper classes for part-of-speech tagging."""
+
+ from sparknlp.internal import ExtendedJavaWrapper
+
+
+ class POS(ExtendedJavaWrapper):
+     """Helper class for creating DataFrames for training a part-of-speech
+     tagger.
+
+     The dataset needs to consist of sentences on each line, where each word is
+     delimited with its respective tag.
+
+     **Input File Format**::
+
+         A|DT few|JJ months|NNS ago|RB you|PRP received|VBD a|DT letter|NN
+
+
+     The sentence can then be parsed with :meth:`.readDataset` into a column with
+     annotations of type ``POS``.
+
+     Can be used to train a :class:`PerceptronApproach
+     <sparknlp.annotator.PerceptronApproach>`.
+
+     Examples
+     --------
+     In this example, the file ``test-training.txt`` has the content of the
+     sentence above.
+
+     >>> from sparknlp.training import POS
+     >>> pos = POS()
+     >>> path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
+     >>> posDf = pos.readDataset(spark, path, "|", "tags")
+     >>> posDf.selectExpr("explode(tags) as tags").show(truncate=False)
+     +---------------------------------------------+
+     |tags |
+     +---------------------------------------------+
+     |[pos, 0, 5, NNP, [word -> Pierre], []] |
+     |[pos, 7, 12, NNP, [word -> Vinken], []] |
+     |[pos, 14, 14, ,, [word -> ,], []] |
+     |[pos, 16, 17, CD, [word -> 61], []] |
+     |[pos, 19, 23, NNS, [word -> years], []] |
+     |[pos, 25, 27, JJ, [word -> old], []] |
+     |[pos, 29, 29, ,, [word -> ,], []] |
+     |[pos, 31, 34, MD, [word -> will], []] |
+     |[pos, 36, 39, VB, [word -> join], []] |
+     |[pos, 41, 43, DT, [word -> the], []] |
+     |[pos, 45, 49, NN, [word -> board], []] |
+     |[pos, 51, 52, IN, [word -> as], []] |
+     |[pos, 47, 47, DT, [word -> a], []] |
+     |[pos, 56, 67, JJ, [word -> nonexecutive], []]|
+     |[pos, 69, 76, NN, [word -> director], []] |
+     |[pos, 78, 81, NNP, [word -> Nov.], []] |
+     |[pos, 83, 84, CD, [word -> 29], []] |
+     |[pos, 81, 81, ., [word -> .], []] |
+     +---------------------------------------------+
+     """
+
+     def __init__(self):
+         super(POS, self).__init__("com.johnsnowlabs.nlp.training.POS")
+
+     def readDataset(self, spark, path, delimiter="|", outputPosCol="tags", outputDocumentCol="document",
+                     outputTextCol="text"):
+         # ToDo Replace with std pyspark
+         """Reads the dataset from an external resource.
+
+         Parameters
+         ----------
+         spark : :class:`pyspark.sql.SparkSession`
+             Initiated Spark Session with Spark NLP
+         path : str
+             Path to the resource
+         delimiter : str, optional
+             Delimiter of word and POS, by default "|"
+         outputPosCol : str, optional
+             Name of the output POS column, by default "tags"
+         outputDocumentCol : str, optional
+             Name of the output document column, by default "document"
+         outputTextCol : str, optional
+             Name of the output text column, by default "text"
+
+         Returns
+         -------
+         :class:`pyspark.sql.DataFrame`
+             Spark Dataframe with the data
+         """
+         jSession = spark._jsparkSession
+
+         jdf = self._java_obj.readDataset(jSession, path, delimiter, outputPosCol, outputDocumentCol, outputTextCol)
+         dataframe = self.getDataFrame(spark, jdf)
+         return dataframe
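A minimal sketch of the POS helper above, reading a "word|TAG" delimited corpus (the corpus path is a placeholder; `spark` is an active Spark NLP session):

    from sparknlp.training import POS

    # Produces a DataFrame with "text", "document" and "tags" columns,
    # suitable for training a PerceptronApproach.
    pos_df = POS().readDataset(spark, "test-training.txt", delimiter="|", outputPosCol="tags")
    pos_df.selectExpr("explode(tags) as tags").show(truncate=False)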
sparknlp/training/pub_tator.py
@@ -0,0 +1,76 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains helper classes for PubTator datasets."""
+
+ from sparknlp.internal import ExtendedJavaWrapper
+
+
+ class PubTator(ExtendedJavaWrapper):
+     """The PubTator format includes medical papers’ titles, abstracts, and
+     tagged chunks.
+
+     For more information see `PubTator Docs
+     <http://bioportal.bioontology.org/ontologies/EDAM?p=classes&conceptid=format_3783>`_
+     and `MedMentions Docs <http://github.com/chanzuckerberg/MedMentions>`_.
+
+     :meth:`.readDataset` is used to create a Spark DataFrame from a PubTator
+     text file.
+
+     **Input File Format**::
+
+         25763772 0 5 DCTN4 T116,T123 C4308010
+         25763772 23 63 chronic Pseudomonas aeruginosa infection T047 C0854135
+         25763772 67 82 cystic fibrosis T047 C0010674
+         25763772 83 120 Pseudomonas aeruginosa (Pa) infection T047 C0854135
+         25763772 124 139 cystic fibrosis T047 C0010674
+
+     Examples
+     --------
+     >>> from sparknlp.training import PubTator
+     >>> pubTatorFile = "./src/test/resources/corpus_pubtator_sample.txt"
+     >>> pubTatorDataSet = PubTator().readDataset(spark, pubTatorFile)
+     >>> pubTatorDataSet.show(1)
+     +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
+     | doc_id| finished_token| finished_pos| finished_ner|finished_token_metadata|finished_pos_metadata|finished_label_metadata|
+     +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
+     |25763772|[DCTN4, as, a, mo...|[NNP, IN, DT, NN,...|[B-T116, O, O, O,...| [[sentence, 0], [...| [[word, DCTN4], [...| [[word, DCTN4], [...|
+     +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
+     """
+
+     def __init__(self):
+         super(PubTator, self).__init__("com.johnsnowlabs.nlp.training.PubTator")
+
+     def readDataset(self, spark, path, isPaddedToken=True):
+         # ToDo Replace with std pyspark
+         """Reads the dataset from an external resource.
+
+         Parameters
+         ----------
+         spark : :class:`pyspark.sql.SparkSession`
+             Initiated Spark Session with Spark NLP
+         path : str
+             Path to the resource
+         isPaddedToken : bool, optional
+             Whether tokens are padded, by default True
+
+         Returns
+         -------
+         :class:`pyspark.sql.DataFrame`
+             Spark Dataframe with the data
+         """
+         jSession = spark._jsparkSession
+
+         jdf = self._java_obj.readDataset(jSession, path, isPaddedToken)
+         dataframe = self.getDataFrame(spark, jdf)
+         return dataframe
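A minimal sketch of the PubTator reader above (the corpus path is a placeholder; `spark` is an active Spark NLP session):

    from sparknlp.training import PubTator

    # Reads a PubTator-formatted corpus into a DataFrame with finished token,
    # POS and NER columns, one row per document.
    pubtator_df = PubTator().readDataset(spark, "corpus_pubtator_sample.txt")
    pubtator_df.select("doc_id", "finished_token", "finished_ner").show(1)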
sparknlp/training/spacy_to_annotation.py
@@ -0,0 +1,57 @@
+ # Copyright 2017-2023 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark.sql import SparkSession
+
+ from sparknlp.internal import ExtendedJavaWrapper
+
+
+ class SpacyToAnnotation(ExtendedJavaWrapper):
+
+     """Helper class to load a list of tokens/sentences as JSON to Annotation.
+
+     The JSON will be in this format:
+
+         [
+           {
+             "tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?", "I", "'m", "fine", "thanks", "."],
+             "token_spaces": [true, false, true, true, true, true, false, true, false, true, true, false, false],
+             "sentence_ends": [2, 7, 12]
+           }
+         ]
+
+     Examples
+     --------
+     >>> from sparknlp.training import SpacyToAnnotation
+     >>> result = SpacyToAnnotation().readJsonFile(spark, "src/test/resources/spacy-to-annotation/multi_doc_tokens.json")
+     >>> result.show(False)
+     +-------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+     |document |sentence |token |
+     +-------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+     |[{document, 0, 55, John went to the store last night. He bought some bread., {}, []}]|[{document, 0, 33, John went to the store last night., {sentence -> 0}, []}, {document, 35, 55, He bought some bread., {sentence -> 1}, []}] |[{token, 0, 3, John, {sentence -> 0}, []}, {token, 5, 8, went, {sentence -> 0}, []}, {token, 10, 11, to, {sentence -> 0}, []}, {token, 13, 15, the, {sentence -> 0}, []}, {token, 17, 21, store, {sentence -> 0}, []}, {token, 23, 26, last, {sentence -> 0}, []}, {token, 28, 32, night, {sentence -> 0}, []}, {token, 33, 33, ., {sentence -> 0}, []}, {token, 35, 36, He, {sentence -> 1}, []}, {token, 38, 43, bought, {sentence -> 1}, []}, {token, 45, 48, some, {sentence -> 1}, []}, {token, 50, 54, bread, {sentence -> 1}, []}, {token, 55, 55, ., {sentence -> 1}, []}]|
+     |[{document, 0, 47, Hello world! How are you today? I'm fine thanks., {}, []}] |[{document, 0, 11, Hello world!, {sentence -> 0}, []}, {document, 13, 30, How are you today?, {sentence -> 1}, []}, {document, 32, 47, I'm fine thanks., {sentence -> 2}, []}]|[{token, 0, 4, Hello, {sentence -> 0}, []}, {token, 6, 10, world, {sentence -> 0}, []}, {token, 11, 11, !, {sentence -> 0}, []}, {token, 13, 15, How, {sentence -> 1}, []}, {token, 17, 19, are, {sentence -> 1}, []}, {token, 21, 23, you, {sentence -> 1}, []}, {token, 25, 29, today, {sentence -> 1}, []}, {token, 30, 30, ?, {sentence -> 1}, []}, {token, 32, 32, I, {sentence -> 2}, []}, {token, 33, 34, 'm, {sentence -> 2}, []}, {token, 36, 39, fine, {sentence -> 2}, []}, {token, 41, 46, thanks, {sentence -> 2}, []}, {token, 47, 47, ., {sentence -> 2}, []}] |
+     +-------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+     """
+
+     def __init__(self):
+         super(SpacyToAnnotation, self).__init__("com.johnsnowlabs.nlp.training.SpacyToAnnotation")
+
+     def readJsonFile(self, spark, jsonFilePath, params=None):
+         if params is None:
+             params = {}
+
+         jSession = spark._jsparkSession
+
+         jdf = self._java_obj.readJsonFileJava(jSession, jsonFilePath, params)
+         annotation_dataset = self.getDataFrame(spark, jdf)
+         return annotation_dataset
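A minimal sketch of the helper above, loading exported spaCy tokens and sentence boundaries as Spark NLP annotations (the JSON path is a placeholder; `spark` is an active Spark NLP session):

    from sparknlp.training import SpacyToAnnotation

    # readJsonFile returns a DataFrame with document, sentence and token annotation columns.
    annotations_df = SpacyToAnnotation().readJsonFile(spark, "multi_doc_tokens.json")
    annotations_df.select("document", "sentence", "token").show(truncate=False)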
sparknlp/training/tfgraphs.py
@@ -0,0 +1,5 @@
+ from ._tf_graph_builders.graph_builders import TFGraphBuilderFactory
+ from ._tf_graph_builders_1x.graph_builders import TFGraphBuilderFactory as TFGraphBuilderFactory1x
+
+ tf_graph = TFGraphBuilderFactory()
+ tf_graph_1x = TFGraphBuilderFactory1x()
sparknlp/upload_to_hub.py
@@ -0,0 +1,149 @@
+ import requests
+ import json
+ from typing import List
+ import sparknlp
+ import os
+ import zipfile
+
+
+ class PushToHub:
+     list_of_tasks = [  # List of available tasks in Modelhub
+         "Named Entity Recognition",
+         'Text Classification',
+         'Text Generation',
+         'Sentiment Analysis',
+         'Translation',
+         'Question Answering',
+         'Summarization',
+         'Sentence Detection',
+         'Embeddings',
+         'Language Detection',
+         'Stop Words Removal',
+         'Word Segmentation',
+         'Part of Speech Tagging',
+         'Lemmatization',
+         'Chunk Mapping',
+         'Spell Check',
+         'Dependency Parser',
+         'Pipeline Public']
+
+     def zip_directory(folder_path: str, zip_path: str):
+         """Zips folder for pushing to hub.
+
+         folder_path: Path to the folder to zip.
+         zip_path: Path of the zip file to create."""
+
+         with zipfile.ZipFile(zip_path, mode='w') as zipf:
+             len_dir_path = len(folder_path)
+             for root, _, files in os.walk(folder_path):
+                 for file in files:
+                     file_path = os.path.join(root, file)
+                     zipf.write(file_path, file_path[len_dir_path:])
+
+     def unzip_directory(zip_path: str):
+         """Unzips Model to check for required files for upload.
+
+         Keyword Arguments:
+         zip_path: Zip Path to unzip.
+         """
+
+     def check_for_required_info(model_data: dict):
+         """Checks if the required fields exist in given dictionary and fills any remaining fields.
+
+         Keyword Arguments:
+         model_data: The model data to check.
+         """
+
+         list_of_required_fields = ['name', 'task', 'language', 'pythonCode', 'model_zip_path']
+
+         if model_data['task'] not in PushToHub.list_of_tasks:
+             list_of_tasks_string_version = "\n".join(PushToHub.list_of_tasks)
+             raise ValueError(
+                 f"""Invalid task, please pick one of the following tasks\n{list_of_tasks_string_version}""")
+
+         if model_data['model_zip_path'].endswith(".zip"):
+             with zipfile.ZipFile(model_data['model_zip_path']) as modelfile:
+                 if 'metadata/part-00000' not in modelfile.namelist():
+                     raise ValueError("The Model is not a Spark Saved Model.")
+         else:
+             if not os.path.exists(f"{model_data['model_zip_path']}/metadata/part-00000"):
+                 raise ValueError("The Model is not a Spark Saved Model.")
+
+     def push_to_hub(name: str,
+                     language: str,
+                     model_zip_path: str,
+                     task: str,
+                     pythonCode: str,
+                     GIT_TOKEN: str,
+                     title: str = None,
+                     tags: List[str] = None,
+                     dependencies: str = None,
+                     description: str = None,
+                     predictedEntities: str = None,
+                     sparknlpVersion: str = None,
+                     howToUse: str = None,
+                     liveDemo: str = None,
+                     runInColab: str = None,
+                     scalaCode: str = None,
+                     nluCode: str = None,
+                     results: str = None,
+                     dataSource: str = None,
+                     includedModels: str = None,
+                     benchmarking: str = None,
+                     ) -> str:
+         """Pushes model to Hub.
+
+         Keyword Arguments:
+         model_data: Dictionary containing info about the model such as Name and Language.
+         GIT_TOKEN: Token required for pushing to hub.
+         """
+
+         model_data = {item: value for (item, value) in locals().items() if value is not None}
+         PushToHub.check_for_required_info(model_data)
+         model_data = PushToHub.create_docs(model_data)
+
+         r1 = requests.post('https://modelshub.johnsnowlabs.com/api/v1/models', data=json.dumps(model_data), headers={
+             'Content-type': 'application/json',
+             'Authorization': f'Bearer {GIT_TOKEN}'
+         })
+
+         if r1.status_code == 201:
+             r2 = requests.post(
+                 'https://modelshub.johnsnowlabs.com/api/v1/models/%s/file' % r1.json()['id'],
+                 data=open(model_data['model_zip_path'], 'rb'), headers={
+                     'Authorization': f'Bearer {GIT_TOKEN}'
+                 })
+             if r2.status_code == 200:
+                 print(r2.json()['message'])
+                 return r2.json()['message']
+         else:
+             print(f"Something Went Wrong During the Upload. Got Status Code: {r1.status_code}")
+             return f"Something Went Wrong During the Upload. Got Status Code: {r1.status_code}"
+
+     def create_docs(dicionary_for_upload: dict) -> dict:
+         """Adds fields in the dictionary for pushing to hub.
+
+         Keyword Arguments:
+         dictionary_for_upload: The dictionary to add keys to.
+         """
+
+         dicionary_for_upload['sparkVersion'] = "3.0"
+         dicionary_for_upload['license'] = 'Open Source'
+         dicionary_for_upload['supported'] = False
+
+         if 'sparknlpVersion' not in dicionary_for_upload.keys():
+             dicionary_for_upload['sparknlpVersion'] = "Spark NLP " + sparknlp.version()
+
+         if 'description' not in dicionary_for_upload.keys():
+             dicionary_for_upload[
+                 'description'] = f"This model is used for {dicionary_for_upload['task']} and this model works with {dicionary_for_upload['language']} language"
+
+         if 'title' not in dicionary_for_upload.keys():
+             dicionary_for_upload[
+                 'title'] = f"{dicionary_for_upload['task']} for {dicionary_for_upload['language']} language"
+
+         if os.path.isdir(dicionary_for_upload['model_zip_path']):
+             PushToHub.zip_directory(dicionary_for_upload['model_zip_path'],
+                                     f"{dicionary_for_upload['model_zip_path']}.zip")
+             dicionary_for_upload['model_zip_path'] = dicionary_for_upload['model_zip_path'] + '.zip'
+         return dicionary_for_upload
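A minimal sketch of pushing a locally saved model with the helper above; every value below is a placeholder, GIT_TOKEN must be a valid Models Hub token, and the task string must be one of PushToHub.list_of_tasks:

    from sparknlp.upload_to_hub import PushToHub

    PushToHub.push_to_hub(
        name="my_ner_model",                     # model name shown on the hub (placeholder)
        language="en",
        model_zip_path="/tmp/my_ner_model",      # folder or .zip of a Spark-saved model (placeholder)
        task="Named Entity Recognition",
        pythonCode="<usage snippet>",
        GIT_TOKEN="<models-hub-token>",
    )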