spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,385 @@
1
+ # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ """Module implementing RNN Cells that used to be in core.
16
+
17
+ @@EmbeddingWrapper
18
+ @@InputProjectionWrapper
19
+ @@OutputProjectionWrapper
20
+ """
21
+ from __future__ import absolute_import
22
+ from __future__ import division
23
+ from __future__ import print_function
24
+
25
+ import math
26
+
27
+ from tensorflow.python.framework import constant_op
28
+ from tensorflow.python.framework import dtypes
29
+ from tensorflow.python.framework import ops
30
+ from tensorflow.python.ops import array_ops
31
+ from tensorflow.python.ops import embedding_ops
32
+ from tensorflow.python.ops import init_ops
33
+ from tensorflow.python.ops import math_ops
34
+ from tensorflow.python.ops import nn_ops
35
+ from tensorflow.python.ops import rnn_cell_impl
36
+ from tensorflow.python.ops import variable_scope as vs
37
+ from tensorflow.python.platform import tf_logging as logging
38
+ from tensorflow.python.util import nest
39
+
40
+ # pylint: disable=protected-access,invalid-name
41
+ RNNCell = rnn_cell_impl.RNNCell
42
+ _WEIGHTS_VARIABLE_NAME = rnn_cell_impl._WEIGHTS_VARIABLE_NAME
43
+ _BIAS_VARIABLE_NAME = rnn_cell_impl._BIAS_VARIABLE_NAME
44
+
45
+
46
+ # pylint: enable=protected-access,invalid-name
47
+
48
+
49
class _Linear(object):
  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

  Args:
    args: a 2D Tensor or a list of 2D, batch, n, Tensors.
    output_size: int, second dimension of weight variable.
    build_bias: boolean, whether to build a bias variable.
    bias_initializer: starting value to initialize the bias
      (default is all zeros).
    kernel_initializer: starting value to initialize the weight.

  Raises:
    ValueError: if inputs_shape is wrong.
  """

  def __init__(self,
               args,
               output_size,
               build_bias,
               bias_initializer=None,
               kernel_initializer=None):
    self._build_bias = build_bias

    if args is None or (nest.is_sequence(args) and not args):
      raise ValueError("`args` must be specified")

    # Remember whether the caller passed a sequence; __call__ accepts the
    # same shape of input later on.
    self._is_sequence = nest.is_sequence(args)
    if not self._is_sequence:
      args = [args]

    # Every argument must be a statically-known 2D shape; the weight's first
    # dimension is the sum of all second dimensions.
    arg_shapes = [a.get_shape() for a in args]
    total_arg_size = 0
    for shape in arg_shapes:
      if shape.ndims != 2:
        raise ValueError("linear is expecting 2D arguments: %s" % arg_shapes)
      if shape.dims[1].value is None:
        raise ValueError("linear expects shape[1] to be provided for shape %s, "
                         "but saw %s" % (shape, shape[1]))
      total_arg_size += shape.dims[1].value

    dtype = args[0].dtype

    scope = vs.get_variable_scope()
    with vs.variable_scope(scope) as outer_scope:
      self._weights = vs.get_variable(
          _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
          dtype=dtype,
          initializer=kernel_initializer)
      if not build_bias:
        return
      # The bias lives in an unpartitioned child scope so a partitioner on
      # the outer scope never splits it.
      with vs.variable_scope(outer_scope) as inner_scope:
        inner_scope.set_partitioner(None)
        if bias_initializer is None:
          bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
        self._biases = vs.get_variable(
            _BIAS_VARIABLE_NAME, [output_size],
            dtype=dtype,
            initializer=bias_initializer)

  def __call__(self, args):
    """Apply the linear map (and bias, if built) to `args`."""
    if not self._is_sequence:
      args = [args]

    if len(args) == 1:
      result = math_ops.matmul(args[0], self._weights)
    else:
      # Explicitly creating a one for a minor performance improvement.
      one = constant_op.constant(1, dtype=dtypes.int32)
      result = math_ops.matmul(array_ops.concat(args, one), self._weights)
    if self._build_bias:
      result = nn_ops.bias_add(result, self._biases)
    return result
124
+
125
+
126
# TODO(xpan): Remove this function in a follow up.
def _linear(args,
            output_size,
            bias,
            bias_initializer=None,
            kernel_initializer=None):
  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

  Args:
    args: a 2D Tensor or a list of 2D, batch, n, Tensors.
    output_size: int, second dimension of W[i].
    bias: boolean, whether to add a bias term or not.
    bias_initializer: starting value to initialize the bias
      (default is all zeros).
    kernel_initializer: starting value to initialize the weight.

  Returns:
    A 2D Tensor with shape `[batch, output_size]` equal to
    sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

  Raises:
    ValueError: if some of the arguments has unspecified or wrong shape.
  """
  if args is None or (nest.is_sequence(args) and not args):
    raise ValueError("`args` must be specified")
  if not nest.is_sequence(args):
    args = [args]

  # Validate that every argument is a statically-known 2D shape and sum
  # the sizes along dimension 1.
  arg_shapes = [a.get_shape() for a in args]
  total_arg_size = 0
  for shape in arg_shapes:
    if shape.ndims != 2:
      raise ValueError("linear is expecting 2D arguments: %s" % arg_shapes)
    if shape.dims[1].value is None:
      raise ValueError("linear expects shape[1] to be provided for shape %s, "
                       "but saw %s" % (shape, shape[1]))
    total_arg_size += shape.dims[1].value

  dtype = args[0].dtype

  # Now the computation.
  scope = vs.get_variable_scope()
  with vs.variable_scope(scope) as outer_scope:
    weights = vs.get_variable(
        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
        dtype=dtype,
        initializer=kernel_initializer)
    if len(args) == 1:
      res = math_ops.matmul(args[0], weights)
    else:
      res = math_ops.matmul(array_ops.concat(args, 1), weights)
    if not bias:
      return res
    # Bias goes in an unpartitioned child scope so a partitioner on the
    # outer scope never splits it.
    with vs.variable_scope(outer_scope) as inner_scope:
      inner_scope.set_partitioner(None)
      if bias_initializer is None:
        bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
      biases = vs.get_variable(
          _BIAS_VARIABLE_NAME, [output_size],
          dtype=dtype,
          initializer=bias_initializer)
    return nn_ops.bias_add(res, biases)
190
+
191
+
192
class EmbeddingWrapper(RNNCell):
  """Operator adding input embedding to the given cell.

  Note: in many cases it may be more efficient to not use this wrapper,
  but instead concatenate the whole sequence of your inputs in time,
  do the embedding on this batch-concatenated sequence, then split it and
  feed into your RNN.
  """

  def __init__(self,
               cell,
               embedding_classes,
               embedding_size,
               initializer=None,
               reuse=None):
    """Create a cell with an added input embedding.

    Args:
      cell: an RNNCell, an embedding will be put before its inputs.
      embedding_classes: integer, how many symbols will be embedded.
      embedding_size: integer, the size of the vectors we embed into.
      initializer: an initializer to use when creating the embedding;
        if None, the initializer from variable scope or a default one is used.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope. If not `True`, and the existing scope already has
        the given variables, an error is raised.

    Raises:
      TypeError: if cell is not an RNNCell.
      ValueError: if embedding_classes is not positive.
    """
    super(EmbeddingWrapper, self).__init__(_reuse=reuse)
    rnn_cell_impl.assert_like_rnncell("cell", cell)
    if embedding_classes <= 0 or embedding_size <= 0:
      raise ValueError("Both embedding_classes and embedding_size must be > 0: "
                       "%d, %d." % (embedding_classes, embedding_size))
    self._cell = cell
    self._embedding_classes = embedding_classes
    self._embedding_size = embedding_size
    self._initializer = initializer

  @property
  def state_size(self):
    # Delegates to the wrapped cell; the embedding adds no state.
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def call(self, inputs, state):
    """Run the cell on embedded inputs."""
    with ops.device("/cpu:0"):
      # Pick the explicit initializer, then the scope's, then a default.
      initializer = self._initializer
      if not initializer:
        initializer = vs.get_variable_scope().initializer
      if not initializer:
        # Default initializer for embeddings should have variance=1.
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)

      data_type = state[0].dtype if isinstance(state, tuple) else state.dtype

      embedding = vs.get_variable(
          "embedding", [self._embedding_classes, self._embedding_size],
          initializer=initializer,
          dtype=data_type)
      embedded = embedding_ops.embedding_lookup(embedding,
                                                array_ops.reshape(inputs, [-1]))

      return self._cell(embedded, state)
270
+
271
+
272
class InputProjectionWrapper(RNNCell):
  """Operator adding an input projection to the given cell.

  Note: in many cases it may be more efficient to not use this wrapper,
  but instead concatenate the whole sequence of your inputs in time,
  do the projection on this batch-concatenated sequence, then split it.
  """

  def __init__(self,
               cell,
               num_proj,
               activation=None,
               input_size=None,
               reuse=None):
    """Create a cell with input projection.

    Args:
      cell: an RNNCell, a projection of inputs is added before it.
      num_proj: Python integer. The dimension to project to.
      activation: (optional) an optional activation function.
      input_size: Deprecated and unused.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope. If not `True`, and the existing scope already has
        the given variables, an error is raised.

    Raises:
      TypeError: if cell is not an RNNCell.
    """
    super(InputProjectionWrapper, self).__init__(_reuse=reuse)
    if input_size is not None:
      logging.warn("%s: The input_size parameter is deprecated.", self)
    rnn_cell_impl.assert_like_rnncell("cell", cell)
    self._cell = cell
    self._num_proj = num_proj
    self._activation = activation
    # The projection layer is built lazily on first call, once the input
    # shape is known.
    self._linear = None

  @property
  def state_size(self):
    # Delegates to the wrapped cell; the projection adds no state.
    return self._cell.state_size

  @property
  def output_size(self):
    return self._cell.output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def call(self, inputs, state):
    """Run the input projection and then the cell."""
    # Default scope: "InputProjectionWrapper"
    if self._linear is None:
      self._linear = _Linear(inputs, self._num_proj, True)
    projected = self._linear(inputs)
    if self._activation:
      projected = self._activation(projected)
    return self._cell(projected, state)
330
+
331
+
332
class OutputProjectionWrapper(RNNCell):
  """Operator adding an output projection to the given cell.

  Note: in many cases it may be more efficient to not use this wrapper,
  but instead concatenate the whole sequence of your outputs in time,
  do the projection on this batch-concatenated sequence, then split it
  if needed or directly feed into a softmax.
  """

  def __init__(self, cell, output_size, activation=None, reuse=None):
    """Create a cell with output projection.

    Args:
      cell: an RNNCell, a projection to output_size is added to it.
      output_size: integer, the size of the output after projection.
      activation: (optional) an optional activation function.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope. If not `True`, and the existing scope already has
        the given variables, an error is raised.

    Raises:
      TypeError: if cell is not an RNNCell.
      ValueError: if output_size is not positive.
    """
    super(OutputProjectionWrapper, self).__init__(_reuse=reuse)
    rnn_cell_impl.assert_like_rnncell("cell", cell)
    if output_size < 1:
      raise ValueError("Parameter output_size must be > 0: %d." % output_size)
    self._cell = cell
    self._output_size = output_size
    self._activation = activation
    # The projection layer is built lazily on first call, once the cell's
    # output shape is known.
    self._linear = None

  @property
  def state_size(self):
    # Delegates to the wrapped cell; the projection adds no state.
    return self._cell.state_size

  @property
  def output_size(self):
    return self._output_size

  def zero_state(self, batch_size, dtype):
    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
      return self._cell.zero_state(batch_size, dtype)

  def call(self, inputs, state):
    """Run the cell and output projection on inputs, starting from state."""
    output, res_state = self._cell(inputs, state)
    if self._linear is None:
      self._linear = _Linear(output, self._output_size, True)
    projected = self._linear(output)
    if self._activation:
      projected = self._activation(projected)
    return projected, res_state
@@ -0,0 +1,183 @@
1
+ # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ """Module for constructing fused RNN cells."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import abc
22
+
23
+ import six
24
+ from tensorflow.python.ops import array_ops
25
+ from tensorflow.python.ops import rnn
26
+
27
+
28
@six.add_metaclass(abc.ABCMeta)
class FusedRNNCell(object):
  """Abstract base class for fused RNN cells.

  A fused RNN cell represents the whole RNN unrolled over the time dimension;
  in effect it is an entire recurrent network. Unlike `rnn_cell.RNNCell`
  subclasses, which compute one time step, a `FusedRNNCell` processes the
  full time sequence in a single call by keeping the time loop inside the
  cell. Implementations are typically much faster, at the cost of complexity
  and flexibility.

  Every concrete `FusedRNNCell` must implement `__call__` with the signature
  below.
  """

  @abc.abstractmethod
  def __call__(self, inputs, initial_state=None, dtype=None,
               sequence_length=None, scope=None):
    """Run this fused RNN on `inputs`, starting from `initial_state`.

    Args:
      inputs: `3-D` tensor shaped `[time_len x batch_size x input_size]`, or
        a list of `time_len` tensors shaped `[batch_size x input_size]`.
      initial_state: a tensor shaped `[batch_size x state_size]`, or — for
        cells with tuple state — a tuple with shapes
        `[batch_size x s] for s in state_size`. When omitted, the cell is
        expected to create a zero initial state of type `dtype`.
      dtype: data type of the initial state and expected output. Required
        when `initial_state` is not given or the RNN state has a
        heterogeneous dtype.
      sequence_length: `int32`/`int64` vector of size `[batch_size]` with
        values in `[0, time_len)` giving each sequence's length. Defaults to
        `time_len` for every element.
      scope: `VariableScope` or `string` for the created subgraph; defaults
        to the class name.

    Returns:
      A pair of:

      - Output: `3-D` tensor shaped `[time_len x batch_size x output_size]`,
        or a list of `time_len` tensors shaped `[batch_size x output_size]`,
        matching the form of `inputs`.
      - Final state: a single `2-D` tensor, or a tuple of tensors matching
        the arity and shapes of `initial_state`.
    """
    pass
79
+
80
+
81
class FusedRNNCellAdaptor(FusedRNNCell):
  """Adaptor exposing an `rnn_cell.RNNCell` through the `FusedRNNCell` API."""

  def __init__(self, cell, use_dynamic_rnn=False):
    """Initialize the adaptor.

    Args:
      cell: an instance of a subclass of a `rnn_cell.RNNCell`.
      use_dynamic_rnn: whether to drive the cell with dynamic (vs static) RNN.
    """
    self._cell = cell
    self._use_dynamic_rnn = use_dynamic_rnn

  def __call__(self, inputs, initial_state=None, dtype=None,
               sequence_length=None, scope=None):
    got_list = isinstance(inputs, list)
    if self._use_dynamic_rnn:
      # dynamic_rnn expects a single time-major tensor.
      dyn_inputs = array_ops.stack(inputs) if got_list else inputs
      outputs, state = rnn.dynamic_rnn(
          self._cell,
          dyn_inputs,
          sequence_length=sequence_length,
          initial_state=initial_state,
          dtype=dtype,
          time_major=True,
          scope=scope)
      if got_list:
        # Mirror the input form: hand back a per-step list.
        outputs = array_ops.unstack(outputs)
    else:
      # static_rnn expects a Python list of per-step tensors.
      static_inputs = inputs if got_list else array_ops.unstack(inputs)
      outputs, state = rnn.static_rnn(
          self._cell,
          static_inputs,
          initial_state=initial_state,
          dtype=dtype,
          sequence_length=sequence_length,
          scope=scope)
      if not got_list:
        # Mirror the input form: hand back a single stacked tensor.
        outputs = array_ops.stack(outputs)

    return outputs, state
130
+
131
+
132
class TimeReversedFusedRNN(FusedRNNCell):
  """Adaptor that runs a `FusedRNNCell` backwards in time.

  For example,

  ```python
  cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(10)
  fw_lstm = tf.contrib.rnn.FusedRNNCellAdaptor(cell, use_dynamic_rnn=True)
  bw_lstm = tf.contrib.rnn.TimeReversedFusedRNN(fw_lstm)
  fw_out, fw_state = fw_lstm(inputs)
  bw_out, bw_state = bw_lstm(inputs)
  ```
  """

  def __init__(self, cell):
    self._cell = cell

  def _reverse(self, t, lengths):
    """Time-reverse `t`, assuming the leading dimension is time.

    Args:
      t: 3-D tensor, or list of 2-D tensors, to be reversed.
      lengths: 1-D tensor of per-sequence lengths, or `None`.

    Returns:
      The reversed tensor or list of tensors.
    """
    if isinstance(t, list):
      # NOTE(review): for list inputs, per-sequence `lengths` are ignored and
      # the whole list is reversed — matches upstream behavior.
      return list(reversed(t))
    if lengths is None:
      return array_ops.reverse_v2(t, [0])
    # Reverse each sequence individually up to its own length
    # (time axis 0, batch axis 1).
    return array_ops.reverse_sequence(t, lengths, 0, 1)

  def __call__(self, inputs, initial_state=None, dtype=None,
               sequence_length=None, scope=None):
    reversed_inputs = self._reverse(inputs, sequence_length)
    outputs, state = self._cell(
        reversed_inputs,
        initial_state=initial_state,
        dtype=dtype,
        sequence_length=sequence_length,
        scope=scope)
    # Flip the outputs back so they line up with the original time order.
    return self._reverse(outputs, sequence_length), state