spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/partition/partition_transformer.py
@@ -0,0 +1,200 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains the PartitionTransformer class for reading various types of documents into chunks."""
+ from sparknlp.common import *
+ from sparknlp.partition.partition_properties import *
+
+
+ class PartitionTransformer(
+     AnnotatorModel,
+     HasEmailReaderProperties,
+     HasExcelReaderProperties,
+     HasHTMLReaderProperties,
+     HasPowerPointProperties,
+     HasTextReaderProperties,
+     HasChunkerProperties
+ ):
+     """
+     The PartitionTransformer annotator lets you use the Partition feature
+     within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+     It supports reading from files, URLs, in-memory strings, or byte arrays, and
+     works within a Spark NLP pipeline.
+
+     Supported formats include:
+     - Plain text
+     - HTML
+     - Word (.doc/.docx)
+     - Excel (.xls/.xlsx)
+     - PowerPoint (.ppt/.pptx)
+     - Email files (.eml, .msg)
+     - PDFs
+
+     Parameters
+     ----------
+     inputCols : list of str
+         Names of input columns (typically from DocumentAssembler).
+     outputCol : str
+         Name of the column to store the output.
+     contentType : str
+         The type of content: e.g., "text", "url", "file", etc.
+     headers : dict, optional
+         Headers to be used if content type is a URL.
+
+     Examples
+     --------
+     >>> dataset = spark.createDataFrame([
+     ...     ("https://www.blizzard.com",),
+     ...     ("https://www.google.com",),
+     ... ], ["text"])
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+
+     >>> partition = PartitionTransformer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("partition") \\
+     ...     .setContentType("url") \\
+     ...     .setHeaders({"Accept-Language": "es-ES"})
+
+     >>> pipeline = Pipeline(stages=[documentAssembler, partition])
+     >>> pipelineModel = pipeline.fit(dataset)
+     >>> resultDf = pipelineModel.transform(dataset)
+     >>> resultDf.show()
+     +--------------------+--------------------+--------------------+
+     |                text|            document|           partition|
+     +--------------------+--------------------+--------------------+
+     |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
+     |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
+     +--------------------+--------------------+--------------------+
+     """
+
+     name = "PartitionTransformer"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     contentPath = Param(
+         Params._dummy(),
+         "contentPath",
+         "Path to the content source",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentPath(self, value):
+         return self._set(contentPath=value)
+
+     def getContentPath(self):
+         return self.getOrDefault(self.contentPath)
+
+     contentType = Param(
+         Params._dummy(),
+         "contentType",
+         "Set the content type to load following MIME specification",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentType(self, value):
+         return self._set(contentType=value)
+
+     def getContentType(self):
+         return self.getOrDefault(self.contentType)
+
+     storeContent = Param(
+         Params._dummy(),
+         "storeContent",
+         "Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreContent(self, value):
+         return self._set(storeContent=value)
+
+     def getStoreContent(self):
+         return self.getOrDefault(self.storeContent)
+
+     titleFontSize = Param(
+         Params._dummy(),
+         "titleFontSize",
+         "Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized).",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setTitleFontSize(self, value):
+         return self._set(titleFontSize=value)
+
+     def getTitleFontSize(self):
+         return self.getOrDefault(self.titleFontSize)
+
+     inferTableStructure = Param(
+         Params._dummy(),
+         "inferTableStructure",
+         "Whether to generate an HTML table representation from structured table content. When enabled, a full <table> element is added alongside cell-level elements, based on row and column layout.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setInferTableStructure(self, value):
+         return self._set(inferTableStructure=value)
+
+     def getInferTableStructure(self):
+         return self.getOrDefault(self.inferTableStructure)
+
+     includePageBreaks = Param(
+         Params._dummy(),
+         "includePageBreaks",
+         "Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIncludePageBreaks(self, value):
+         return self._set(includePageBreaks=value)
+
+     def getIncludePageBreaks(self):
+         return self.getOrDefault(self.includePageBreaks)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
+                  java_model=None):
+         super(PartitionTransformer, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         DOUBLE_PARAGRAPH_PATTERN = r"(?:\s*\n\s*){2,}"
+
+         self._setDefault(
+             contentPath="",
+             contentType="text/plain",
+             storeContent=False,
+             titleFontSize=9,
+             inferTableStructure=False,
+             includePageBreaks=False,
+             addAttachmentContent=False,
+             cellSeparator="\t",
+             appendCells=False,
+             timeout=0,
+             includeSlideNotes=False,
+             titleLengthSize=50,
+             groupBrokenParagraphs=False,
+             paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
+             shortLineWordThreshold=5,
+             maxLineCount=2000,
+             threshold=0.1,
+             chunkingStrategy="",
+             maxCharacters=100,
+             newAfterNChars=-1,
+             overlap=0,
+             combineTextUnderNChars=0,
+             overlapAll=False
+         )
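
For readers skimming the diff, here is a minimal sketch of driving the new PartitionTransformer over local files instead of URLs. The /tmp/docs folder is a hypothetical stand-in, and the sketch assumes Spark NLP 6.x is installed and importable:

    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.partition.partition_transformer import PartitionTransformer

    spark = sparknlp.start()
    # When content comes from contentPath, an empty frame is enough to fit the pipeline.
    empty_df = spark.createDataFrame([("",)], ["text"])

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # "/tmp/docs" is a hypothetical directory of plain-text files.
    partition = PartitionTransformer() \
        .setInputCols(["document"]) \
        .setOutputCol("partition") \
        .setContentPath("/tmp/docs") \
        .setContentType("text/plain")

    model = Pipeline(stages=[documentAssembler, partition]).fit(empty_df)
    model.transform(empty_df).select("partition").show(truncate=False)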
sparknlp/pretrained/__init__.py
@@ -0,0 +1,17 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Module for pretrained pipelines and resources."""
+ from sparknlp.pretrained.pretrained_pipeline import *
+ from sparknlp.pretrained.resource_downloader import *
+ from sparknlp.pretrained.utils import *
sparknlp/pretrained/pretrained_pipeline.py
@@ -0,0 +1,158 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the PretrainedPipeline."""
+
+ from pyspark.ml import PipelineModel
+ from pyspark.sql import DataFrame
+
+ from sparknlp.base import LightPipeline
+ from sparknlp.pretrained.resource_downloader import ResourceDownloader
+
+
+ class PretrainedPipeline:
+     """Represents a fully constructed and trained Spark NLP pipeline, ready
+     to be used.
+
+     This way, a whole pipeline can be defined in 1 line. Additionally, the
+     :class:`.LightPipeline` version of the model can be retrieved with member
+     :attr:`.light_model`.
+
+     For more extended examples see the `Pipelines page
+     <https://sparknlp.org/docs/en/pipelines>`_ and our `Github Model
+     Repository <https://github.com/JohnSnowLabs/spark-nlp-models>`_ for
+     available pipeline models.
+
+     Parameters
+     ----------
+     name : str
+         Name of the PretrainedPipeline. These can be gathered from the Pipelines
+         Page.
+     lang : str, optional
+         Language of the model, by default 'en'
+     remote_loc : str, optional
+         Link to the remote location of the model (if it was already downloaded),
+         by default None
+     parse_embeddings : bool, optional
+         Whether to parse embeddings, by default False
+     disk_location : str, optional
+         Path to locally stored PretrainedPipeline, by default None
+     """
+
+     def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
+         if not disk_location:
+             self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
+         else:
+             self.model = PipelineModel.load(disk_location)
+         self.light_model = LightPipeline(self.model, parse_embeddings)
+
+     @staticmethod
+     def from_disk(path, parse_embeddings=False):
+         return PretrainedPipeline(None, None, None, parse_embeddings, path)
+
+     def annotate(self, target, column=None):
+         """Annotates the data provided, extracting the results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         target : list or str
+             The data to be annotated
+
+         Returns
+         -------
+         List[dict] or dict
+             The result of the annotation
+
+         Examples
+         --------
+         >>> from sparknlp.pretrained import PretrainedPipeline
+         >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+         >>> result = explain_document_pipeline.annotate('U.N. official Ekeus heads for Baghdad.')
+         >>> result.keys()
+         dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])
+         >>> result["ner"]
+         ['B-ORG', 'O', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'O']
+         """
+
+         annotations = self.light_model.annotate(target)
+         return annotations
+
+     def fullAnnotate(self, target, optional_target=""):
+         """Annotates the data provided into `Annotation` type results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         target : list or str
+             The data to be annotated
+         optional_target : str, optional
+             Second input for annotators that take two inputs (for example,
+             question answering), by default ""
+
+         Returns
+         -------
+         List[Annotation]
+             The result of the annotation
+
+         Examples
+         --------
+         >>> from sparknlp.pretrained import PretrainedPipeline
+         >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+         >>> result = explain_document_pipeline.fullAnnotate('U.N. official Ekeus heads for Baghdad.')
+         >>> result[0].keys()
+         dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])
+         >>> result[0]["ner"]
+         [Annotation(named_entity, 0, 2, B-ORG, {'word': 'U.N'}),
+         Annotation(named_entity, 3, 3, O, {'word': '.'}),
+         Annotation(named_entity, 5, 12, O, {'word': 'official'}),
+         Annotation(named_entity, 14, 18, B-PER, {'word': 'Ekeus'}),
+         Annotation(named_entity, 20, 24, O, {'word': 'heads'}),
+         Annotation(named_entity, 26, 28, O, {'word': 'for'}),
+         Annotation(named_entity, 30, 36, B-LOC, {'word': 'Baghdad'}),
+         Annotation(named_entity, 37, 37, O, {'word': '.'})]
+         """
+         annotations = self.light_model.fullAnnotate(target, optional_target)
+         return annotations
+
+     def fullAnnotateImage(self, path_to_image):
+         """Annotates the image(s) provided into `AnnotationImage` type results.
+
+         The data should be either a list or a str.
+
+         Parameters
+         ----------
+         path_to_image : list or str
+             Source path of an image, or list of paths to images
+
+         Returns
+         -------
+         List[AnnotationImage]
+             The result of the annotation
+         """
+         pipeline = self.light_model
+         return pipeline.fullAnnotateImage(path_to_image)
+
+     def transform(self, data):
+         """Transforms the input dataset with Spark.
+
+         Parameters
+         ----------
+         data : :class:`pyspark.sql.DataFrame`
+             input dataset
+
+         Returns
+         -------
+         :class:`pyspark.sql.DataFrame`
+             transformed dataset
+         """
+         return self.model.transform(data)
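
Taken together, the class reduces a pretrained workflow to a few lines. A short usage sketch (the first run downloads explain_document_dl, so it needs network access):

    import sparknlp
    from sparknlp.pretrained import PretrainedPipeline

    spark = sparknlp.start()
    pipeline = PretrainedPipeline("explain_document_dl", lang="en")

    # Driver-side annotation through the wrapped LightPipeline:
    print(pipeline.annotate("U.N. official Ekeus heads for Baghdad.")["ner"])

    # Distributed annotation of a DataFrame through transform():
    df = spark.createDataFrame([("U.N. official Ekeus heads for Baghdad.",)], ["text"])
    pipeline.transform(df).select("ner.result").show(truncate=False)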
sparknlp/pretrained/resource_downloader.py
@@ -0,0 +1,216 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the ResourceDownloader."""
+
+ import sys
+ import threading
+
+ from py4j.protocol import Py4JJavaError
+ from pyspark.ml import PipelineModel
+
+ import sparknlp.internal as _internal
+ from sparknlp.pretrained.utils import printProgress
+
+
+ class ResourceDownloader(object):
+     """Downloads and manages resources, pretrained models/pipelines.
+
+     Usually you will not need to use this class directly. It is called by the
+     `pretrained()` function of annotators.
+
+     However, you can use this class to list the available pretrained resources.
+
+     Examples
+     --------
+     If you want to list all NerDLModels for the English language you can run:
+
+     >>> ResourceDownloader.showPublicModels("NerDLModel", "en")
+     +-------------+------+---------+
+     | Model       | lang | version |
+     +-------------+------+---------+
+     | onto_100    | en   | 2.1.0   |
+     | onto_300    | en   | 2.1.0   |
+     | ner_dl_bert | en   | 2.2.0   |
+     | ...         | ...  | ...     |
+
+     Similarly for Pipelines:
+
+     >>> ResourceDownloader.showPublicPipelines("en")
+     +------------------+------+---------+
+     | Pipeline         | lang | version |
+     +------------------+------+---------+
+     | dependency_parse | en   | 2.0.2   |
+     | check_spelling   | en   | 2.1.0   |
+     | match_datetime   | en   | 2.1.0   |
+     | ...              | ...  | ...     |
+
+     """
+
+     @staticmethod
+     def downloadModel(reader, name, language, remote_loc=None, j_dwn='PythonResourceDownloader'):
+         """Downloads and loads a model with the default downloader. Usually this
+         method does not need to be called directly, as it is called by the
+         `pretrained()` method of the annotator.
+
+         Parameters
+         ----------
+         reader : obj
+             Class to read the model for
+         name : str
+             Name of the pretrained model
+         language : str
+             Language of the model
+         remote_loc : str, optional
+             Directory of the remote Spark NLP Folder, by default None
+         j_dwn : str, optional
+             Which java downloader to use, by default 'PythonResourceDownloader'
+
+         Returns
+         -------
+         AnnotatorModel
+             Loaded pretrained annotator/pipeline
+         """
+         print(name + " download started this may take some time.")
+         file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
+         if file_size == "-1":
+             print("Can not find the model to download please check the name!")
+         else:
+             print("Approximate size to download " + file_size)
+             stop_threads = False
+             t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
+             t1.start()
+             try:
+                 j_obj = _internal._DownloadModel(reader.name, name, language, remote_loc, j_dwn).apply()
+             except Py4JJavaError as e:
+                 sys.stdout.write("\n" + str(e))
+                 raise e
+             finally:
+                 stop_threads = True
+                 t1.join()
+
+             return reader(classname=None, java_model=j_obj)
+
+     @staticmethod
+     def downloadModelDirectly(name, remote_loc="public/models", unzip=True):
+         """Downloads a model directly to the cache folder.
+
+         You can use this method with an s3 URI copied from the Models Hub to
+         download the model. For available s3 URIs and models, please see the
+         `Models Hub <https://sparknlp.org/models>`__.
+
+         Parameters
+         ----------
+         name : str
+             Name of the model or s3 URI
+         remote_loc : str, optional
+             Directory of the remote Spark NLP Folder, by default "public/models"
+         unzip : bool, optional
+             Whether to unzip the model, by default True
+         """
+         _internal._DownloadModelDirectly(name, remote_loc, unzip).apply()
+
+     @staticmethod
+     def downloadPipeline(name, language, remote_loc=None):
+         """Downloads and loads a pipeline with the default downloader.
+
+         Parameters
+         ----------
+         name : str
+             Name of the pipeline
+         language : str
+             Language of the pipeline
+         remote_loc : str, optional
+             Directory of the remote Spark NLP Folder, by default None
+
+         Returns
+         -------
+         PipelineModel
+             The loaded pipeline
+         """
+         print(name + " download started this may take some time.")
+         file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
+         if file_size == "-1":
+             print("Can not find the model to download please check the name!")
+         else:
+             print("Approx size to download " + file_size)
+             stop_threads = False
+             t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
+             t1.start()
+             try:
+                 j_obj = _internal._DownloadPipeline(name, language, remote_loc).apply()
+                 jmodel = PipelineModel._from_java(j_obj)
+             finally:
+                 stop_threads = True
+                 t1.join()
+
+             return jmodel
+
+     @staticmethod
+     def clearCache(name, language, remote_loc=None):
+         """Clears the cache entry of a model.
+
+         Parameters
+         ----------
+         name : str
+             Name of the model
+         language : str
+             Language of the model
+         remote_loc : str, optional
+             Directory of the remote Spark NLP Folder, by default None
+         """
+         _internal._ClearCache(name, language, remote_loc).apply()
+
+     @staticmethod
+     def showPublicModels(annotator=None, lang=None, version=None):
+         """Prints all pretrained models for a particular annotator that are
+         compatible with a version of Spark NLP. If any of the optional arguments
+         are not set, the filter is not considered.
+
+         Parameters
+         ----------
+         annotator : str, optional
+             Name of the annotator to filter, by default None
+         lang : str, optional
+             Language of the models to filter, by default None
+         version : str, optional
+             Version of Spark NLP to filter, by default None
+         """
+         print(_internal._ShowPublicModels(annotator, lang, version).apply())
+
+     @staticmethod
+     def showPublicPipelines(lang=None, version=None):
+         """Prints all pretrained pipelines that are compatible with a version of
+         Spark NLP. If any of the optional arguments are not set, the filter is
+         not considered.
+
+         Parameters
+         ----------
+         lang : str, optional
+             Language of the pipelines to filter, by default None
+         version : str, optional
+             Version of Spark NLP to filter, by default None
+         """
+         print(_internal._ShowPublicPipelines(lang, version).apply())
+
+     @staticmethod
+     def showUnCategorizedResources():
+         """Shows models or pipelines in the metadata which have not been
+         categorized yet.
+         """
+         print(_internal._ShowUnCategorizedResources().apply())
+
+     @staticmethod
+     def showAvailableAnnotators():
+         """Shows all available annotators in Spark NLP.
+         """
+         print(_internal._ShowAvailableAnnotators().apply())
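
Since the listing helpers are the part of ResourceDownloader intended for direct use, here is a small sketch of browsing the public catalog. A Spark NLP session must exist first, because each call goes through the JVM:

    import sparknlp
    from sparknlp.pretrained import ResourceDownloader

    sparknlp.start()

    ResourceDownloader.showPublicModels("NerDLModel", "en")  # models for one annotator and language
    ResourceDownloader.showPublicPipelines("en")             # all public English pipelines
    ResourceDownloader.showAvailableAnnotators()             # every annotator Spark NLP knows about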
sparknlp/pretrained/utils.py
@@ -0,0 +1,35 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains utilities for pretrained annotators and pipelines."""
+
+ import sys
+ import time
+
+
+ def printProgress(stop):
+     states = [' | ', ' / ', ' — ', ' \\ ']
+     nextc = 0
+     while True:
+         sys.stdout.write('\r[{}]'.format(states[nextc]))
+         sys.stdout.flush()
+         time.sleep(2.5)
+         nextc = nextc + 1 if nextc < 3 else 0
+         if stop():
+             sys.stdout.write('\r[{}]'.format('OK!'))
+             sys.stdout.flush()
+             break
+
+     sys.stdout.write('\n')
+     return
+
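
Note that `stop` is a zero-argument callable rather than a boolean, which is why the downloader above passes `lambda: stop_threads`: the spinner thread re-evaluates the flag on every iteration. A standalone sketch of the same pattern:

    import threading
    import time

    from sparknlp.pretrained.utils import printProgress

    stop_threads = False
    t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
    t1.start()
    time.sleep(6)        # stand-in for a long-running download
    stop_threads = True  # the lambda picks up the new value on its next loop
    t1.join()            # the thread prints [OK!] and a newline, then exits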
sparknlp/reader/__init__.py
@@ -0,0 +1,15 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Module for reading different file types."""
+ from sparknlp.reader.sparknlp_reader import *
sparknlp/reader/enums.py
@@ -0,0 +1,19 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from enum import Enum
+
+ class TextStripperType(Enum):
+     """Text Stripper Type"""
+     PDF_TEXT_STRIPPER = "PDFTextStripper"
+     PDF_LAYOUT_TEXT_STRIPPER = "PDFLayoutTextStripper"
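
The enum values appear to mirror the names of the underlying PDF text-stripper classes on the JVM side, with the new pdf_to_text.py reader in this diff as the intended consumer. Inspecting them requires no Spark session:

    from sparknlp.reader.enums import TextStripperType

    print(TextStripperType.PDF_TEXT_STRIPPER.value)         # PDFTextStripper
    print(TextStripperType.PDF_LAYOUT_TEXT_STRIPPER.value)  # PDFLayoutTextStripper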