spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
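The most visible change in this list is structural: the monolithic 2.6.x modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py, sparknlp/training.py) are removed in favor of the sparknlp/annotator, sparknlp/base, sparknlp/common, and sparknlp/training packages, each re-exporting its classes from an __init__.py. A minimal sketch of what this means for user code, under the assumption (suggested by the +93-line sparknlp/annotator/__init__.py) that the old top-level names remain importable:

import sparknlp

# Starting Spark NLP gives us the JVM-backed session the annotators need
spark = sparknlp.start()

# Imports that targeted the old monolithic modules resolve against the new packages
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")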
sparknlp/reader/sparknlp_reader.py  +461 -0
@@ -0,0 +1,461 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from sparknlp.internal import ExtendedJavaWrapper
+
+
+ class SparkNLPReader(ExtendedJavaWrapper):
+     """Instantiates class to read documents in various formats.
+
+     Parameters
+     ----------
+     spark : SparkSession
+         Active Spark session
+     params : dict, optional
+         Dictionary with custom configuration
+
+     Notes
+     -----
+     This class can read HTML, email, PDF, MS Word, Excel, PowerPoint, text, XML, Markdown, and CSV files.
+
+     Examples
+     --------
+     >>> from sparknlp.reader import SparkNLPReader
+     >>> reader = SparkNLPReader(spark)
+
+     Reading HTML
+
+     >>> html_df = reader.html("https://www.wikipedia.org")
+     >>> # Or with shorthand
+     >>> import sparknlp
+     >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
+
+     Reading PDF
+
+     >>> pdf_df = reader.pdf("home/user/pdfs-directory")
+     >>> # Or with shorthand
+     >>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
+
+     Reading Email
+
+     >>> email_df = reader.email("home/user/emails-directory")
+     >>> # Or with shorthand
+     >>> email_df = sparknlp.read().email("home/user/emails-directory")
+     """
+
+     def __init__(self, spark, params=None, headers=None):
+         if params is None:
+             params = {}
+         super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params, headers)
+         self.spark = spark
+
+     def html(self, htmlPath):
+         """Reads HTML files or URLs and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         htmlPath : str or list of str
+             Path(s) to HTML file(s) or a list of URLs.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing the parsed HTML content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
+         >>> html_df.show(truncate=False)
+
+         +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+         |url                 |html                                                                                                                                                                                                                                                                                                                            |
+         +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+         |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}]    |
+         +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+         >>> html_df.printSchema()
+         root
+          |-- url: string (nullable = true)
+          |-- html: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if not isinstance(htmlPath, (str, list)) or (isinstance(htmlPath, list) and not all(isinstance(item, str) for item in htmlPath)):
+             raise TypeError("htmlPath must be a string or a list of strings")
+         jdf = self._java_obj.html(htmlPath)
+         dataframe = self.getDataFrame(self.spark, jdf)
+         return dataframe
+
+     def email(self, filePath):
+         """Reads email files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         filePath : str
+             Path to an email file or a directory containing emails.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed email data.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> email_df = sparknlp.read().email("home/user/emails-directory")
+         >>> email_df.show()
+         +---------------------------------------------------+
+         |email                                              |
+         +---------------------------------------------------+
+         |[{Title, Email Text Attachments, {sent_to -> Danilo|
+         +---------------------------------------------------+
+         >>> email_df.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- content: array (nullable = true)
+          |-- email: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+
+         """
+         if not isinstance(filePath, str):
+             raise TypeError("filePath must be a string")
+         jdf = self._java_obj.email(filePath)
+         dataframe = self.getDataFrame(self.spark, jdf)
+         return dataframe
+
+     def doc(self, docPath):
+         """Reads Word document files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         docPath : str
+             Path to a Word document file.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed document content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> doc_df = SparkNLPReader(spark).doc("home/user/word-directory")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> doc_df = sparknlp.read().doc("home/user/word-directory")
+         >>> doc_df.show()
+         +-------------------------------------------------+
+         |doc                                              |
+         +-------------------------------------------------+
+         |[{Table, Header Col 1, {}}, {Table, Header Col 2,|
+         +-------------------------------------------------+
+
+         >>> doc_df.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- content: array (nullable = true)
+          |-- doc: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+
+         """
+         if not isinstance(docPath, str):
+             raise TypeError("docPath must be a string")
+         jdf = self._java_obj.doc(docPath)
+         dataframe = self.getDataFrame(self.spark, jdf)
+         return dataframe
+
+     def pdf(self, pdfPath):
+         """Reads PDF files and returns a Spark DataFrame."""
+         if not isinstance(pdfPath, str):
+             raise TypeError("pdfPath must be a string")
+         jdf = self._java_obj.pdf(pdfPath)
+         dataframe = self.getDataFrame(self.spark, jdf)
+         return dataframe
+
+     def xls(self, docPath):
+         """Reads Excel document files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         docPath : str
+             Path to an Excel document file.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed document content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> xlsDf = SparkNLPReader(spark).xls("home/user/excel-directory")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
+         >>> xlsDf.show()
+         +--------------------------------------------+
+         |xls                                         |
+         +--------------------------------------------+
+         |[{Title, Financial performance, {SheetNam}}]|
+         +--------------------------------------------+
+
+         >>> xlsDf.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- content: binary (nullable = true)
+          |-- xls: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if not isinstance(docPath, str):
+             raise TypeError("docPath must be a string")
+         jdf = self._java_obj.xls(docPath)
+         dataframe = self.getDataFrame(self.spark, jdf)
+         return dataframe
+
+     def ppt(self, docPath):
+         """Reads PowerPoint document files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         docPath : str
+             Path to a PowerPoint document file.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed document content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> pptDf = SparkNLPReader(spark).ppt("home/user/powerpoint-directory")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
+         >>> pptDf.show(truncate=False)
+         +-------------------------------------+
+         |ppt                                  |
+         +-------------------------------------+
+         |[{Title, Adding a Bullet Slide, {}},]|
+         +-------------------------------------+
+         """
+         if not isinstance(docPath, str):
+             raise TypeError("docPath must be a string")
+         jdf = self._java_obj.ppt(docPath)
+         dataframe = self.getDataFrame(self.spark, jdf)
+         return dataframe
+
+     def txt(self, docPath):
+         """Reads TXT files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         docPath : str
+             Path to a TXT file.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed document content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> txtDf = SparkNLPReader(spark).txt("home/user/txt/files")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> txtDf = sparknlp.read().txt("home/user/txt/files")
+         >>> txtDf.show(truncate=False)
+         +-----------------------------------------------+
+         |txt                                            |
+         +-----------------------------------------------+
+         |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}]|
+         +-----------------------------------------------+
+         """
+         if not isinstance(docPath, str):
+             raise TypeError("docPath must be a string")
+         jdf = self._java_obj.txt(docPath)
+         return self.getDataFrame(self.spark, jdf)
+
+     def xml(self, docPath):
+         """Reads XML files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         docPath : str
+             Path to an XML file or a directory containing XML files.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed XML content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> xml_df = sparknlp.read().xml("home/user/xml-directory")
+         >>> xml_df.show(truncate=False)
+         +-----------------------------------------------------------+
+         |xml                                                        |
+         +-----------------------------------------------------------+
+         |[{Title, John Smith, {elementId -> ..., tag -> title}}]   |
+         +-----------------------------------------------------------+
+
+         >>> xml_df.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- xml: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if not isinstance(docPath, str):
+             raise TypeError("docPath must be a string")
+         jdf = self._java_obj.xml(docPath)
+         return self.getDataFrame(self.spark, jdf)
+
+     def md(self, filePath):
+         """Reads Markdown files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         filePath : str
+             Path to a Markdown file or a directory containing Markdown files.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed Markdown content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> md_df = SparkNLPReader(spark).md("home/user/markdown-directory")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> md_df = sparknlp.read().md("home/user/markdown-directory")
+         >>> md_df.show(truncate=False)
+         +---------------------------------------------------------------------+
+         |md                                                                   |
+         +---------------------------------------------------------------------+
+         |[{Title, Sample Markdown Document, {elementId -> ..., tag -> title}}]|
+         +---------------------------------------------------------------------+
+
+         >>> md_df.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- md: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if not isinstance(filePath, str):
+             raise TypeError("filePath must be a string")
+         jdf = self._java_obj.md(filePath)
+         return self.getDataFrame(self.spark, jdf)
+
+     def csv(self, csvPath):
+         """Reads CSV files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         csvPath : str
+             Path to a CSV file or a directory containing CSV files.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed CSV content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> csv_df = SparkNLPReader(spark).csv("home/user/csv-directory")
+
+         You can also use SparkNLP to simplify the process:
+
+         >>> import sparknlp
+         >>> csv_df = sparknlp.read().csv("home/user/csv-directory")
+         >>> csv_df.show(truncate=False)
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+         |csv                                                                                                                                      |
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+         |[{NarrativeText, Alice 100 Bob 95, {}}, {Table, <table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>, {}}]  |
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+
+         >>> csv_df.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- csv: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if not isinstance(csvPath, str):
+             raise TypeError("csvPath must be a string")
+         jdf = self._java_obj.csv(csvPath)
+         return self.getDataFrame(self.spark, jdf)
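Taken together, the methods above make up the reader API added in this file. A minimal end-to-end sketch, using only calls that appear in this diff plus sparknlp.start(); the directory paths are the hypothetical ones from the docstrings:

import sparknlp
from sparknlp.reader import SparkNLPReader

# Assumes a local Spark environment; sparknlp.start() creates the session
spark = sparknlp.start()

# Shorthand form, as used throughout the docstrings above
html_df = sparknlp.read().html("https://www.wikipedia.org")
html_df.printSchema()

# Explicit form, binding a reader to an existing session with an
# (optional) custom configuration dict
reader = SparkNLPReader(spark, params={})
email_df = reader.email("home/user/emails-directory")  # hypothetical path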
sparknlp/training/__init__.py  +20 -0
@@ -0,0 +1,20 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Module of classes for handling training data."""
+
+ from sparknlp.training.conll import *
+ from sparknlp.training.conllu import *
+ from sparknlp.training.pos import *
+ from sparknlp.training.pub_tator import *
+ from sparknlp.training.spacy_to_annotation import *
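The training package re-exported above bundles dataset readers for common annotation formats (CoNLL, CoNLL-U, POS, PubTator, spaCy). A brief sketch of the CoNLL reader, assuming a CoNLL-2003 formatted file at a hypothetical path:

import sparknlp
from sparknlp.training import CoNLL

spark = sparknlp.start()

# CoNLL.readDataset parses a CoNLL-2003 file into a DataFrame with
# document, sentence, token, pos and label columns, ready for NER training
training_data = CoNLL().readDataset(spark, "home/user/eng.train")  # hypothetical path
training_data.selectExpr("text", "label.result").show(3, truncate=False)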