spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/partition/partition_transformer.py
@@ -0,0 +1,200 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains the PartitionTransformer class for reading various types of documents into chunks."""
+ from sparknlp.common import *
+ from sparknlp.partition.partition_properties import *
+
+
+ class PartitionTransformer(
+     AnnotatorModel,
+     HasEmailReaderProperties,
+     HasExcelReaderProperties,
+     HasHTMLReaderProperties,
+     HasPowerPointProperties,
+     HasTextReaderProperties,
+     HasChunkerProperties
+ ):
+     """
+     The PartitionTransformer annotator allows you to use the Partition feature more smoothly
+     within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+     It supports reading from files, URLs, in-memory strings, or byte arrays, and works
+     within a Spark NLP pipeline.
+
+     Supported formats include:
+     - Plain text
+     - HTML
+     - Word (.doc/.docx)
+     - Excel (.xls/.xlsx)
+     - PowerPoint (.ppt/.pptx)
+     - Email files (.eml, .msg)
+     - PDFs
+
+     Parameters
+     ----------
+     inputCols : list of str
+         Names of input columns (typically from DocumentAssembler).
+     outputCol : str
+         Name of the column to store the output.
+     contentType : str
+         The type of content: e.g., "text", "url", "file", etc.
+     headers : dict, optional
+         Headers to be used if content type is a URL.
+
+     Examples
+     --------
+     >>> dataset = spark.createDataFrame([
+     ...     ("https://www.blizzard.com",),
+     ... ], ["text"])
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+
+     >>> partition = PartitionTransformer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("partition") \\
+     ...     .setContentType("url") \\
+     ...     .setHeaders({"Accept-Language": "es-ES"})
+
+     >>> pipeline = Pipeline(stages=[documentAssembler, partition])
+     >>> pipelineModel = pipeline.fit(dataset)
+     >>> resultDf = pipelineModel.transform(dataset)
+     >>> resultDf.show()
+     +--------------------+--------------------+--------------------+
+     |                text|            document|           partition|
+     +--------------------+--------------------+--------------------+
+     |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
+     |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
+     +--------------------+--------------------+--------------------+
+     """
+
+     name = "PartitionTransformer"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     contentPath = Param(
+         Params._dummy(),
+         "contentPath",
+         "Path to the content source",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentPath(self, value):
+         return self._set(contentPath=value)
+
+     def getContentPath(self):
+         return self.getOrDefault(self.contentPath)
+
+     contentType = Param(
+         Params._dummy(),
+         "contentType",
+         "Set the content type to load following MIME specification",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentType(self, value):
+         return self._set(contentType=value)
+
+     def getContentType(self):
+         return self.getOrDefault(self.contentType)
+
+     storeContent = Param(
+         Params._dummy(),
+         "storeContent",
+         "Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreContent(self, value):
+         return self._set(storeContent=value)
+
+     def getStoreContent(self):
+         return self.getOrDefault(self.storeContent)
+
+     titleFontSize = Param(
+         Params._dummy(),
+         "titleFontSize",
+         "Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized).",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setTitleFontSize(self, value):
+         return self._set(titleFontSize=value)
+
+     def getTitleFontSize(self):
+         return self.getOrDefault(self.titleFontSize)
+
+     inferTableStructure = Param(
+         Params._dummy(),
+         "inferTableStructure",
+         "Whether to generate an HTML table representation from structured table content. When enabled, a full <table> element is added alongside cell-level elements, based on row and column layout.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setInferTableStructure(self, value):
+         return self._set(inferTableStructure=value)
+
+     def getInferTableStructure(self):
+         return self.getOrDefault(self.inferTableStructure)
+
+     includePageBreaks = Param(
+         Params._dummy(),
+         "includePageBreaks",
+         "Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIncludePageBreaks(self, value):
+         return self._set(includePageBreaks=value)
+
+     def getIncludePageBreaks(self):
+         return self.getOrDefault(self.includePageBreaks)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
+                  java_model=None):
+         super(PartitionTransformer, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         DOUBLE_PARAGRAPH_PATTERN = r"(?:\s*\n\s*){2,}"
+
+         self._setDefault(
+             contentPath="",
+             contentType="text/plain",
+             storeContent=False,
+             titleFontSize=9,
+             inferTableStructure=False,
+             includePageBreaks=False,
+             addAttachmentContent=False,
+             cellSeparator="\t",
+             appendCells=False,
+             timeout=0,
+             includeSlideNotes=False,
+             titleLengthSize=50,
+             groupBrokenParagraphs=False,
+             paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
+             shortLineWordThreshold=5,
+             maxLineCount=2000,
+             threshold=0.1,
+             chunkingStrategy="",
+             maxCharacters=100,
+             newAfterNChars=-1,
+             overlap=0,
+             combineTextUnderNChars=0,
+             overlapAll=False
+         )
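
The hunk above adds the new PartitionTransformer annotator. As orientation only (not part of the diff), here is a minimal sketch of how it slots into a pipeline for plain-text input, assuming an active SparkSession `spark`; the sample data and column names are illustrative:

from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.partition.partition_transformer import PartitionTransformer

# Illustrative input; any DataFrame with a string column works the same way.
data = spark.createDataFrame(
    [("A title\n\nFollowed by a short paragraph of plain text.",)], ["text"]
)

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# contentType defaults to "text/plain"; setStoreContent(True) would also keep
# the raw input in a separate 'content' column, per the Param description above.
partition = PartitionTransformer() \
    .setInputCols(["document"]) \
    .setOutputCol("partition") \
    .setContentType("text/plain")

model = Pipeline(stages=[document_assembler, partition]).fit(data)
model.transform(data).select("partition").show(truncate=False)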
sparknlp/pretrained/pretrained_pipeline.py
@@ -29,7 +29,7 @@ class PretrainedPipeline:
      :attr:`.light_model`.
  
      For more extended examples see the `Pipelines page
-     <https://nlp.johnsnowlabs.com/docs/en/pipelines>`_ and our `Github Model
+     <https://sparknlp.org/docs/en/pipelines>`_ and our `Github Model
      Repository <https://github.com/JohnSnowLabs/spark-nlp-models>`_ for
      available pipeline models.
  
sparknlp/pretrained/resource_downloader.py
@@ -24,9 +24,64 @@ from sparknlp.pretrained.utils import printProgress
  
  
  class ResourceDownloader(object):
+     """Downloads and manages resources, pretrained models/pipelines.
+
+     Usually you will not need to use this class directly. It is called by the
+     `pretrained()` function of annotators.
+
+     However, you can use this class to list the available pretrained resources.
+
+     Examples
+     --------
+     If you want to list all NerDLModels for the English language you can run:
+
+     >>> ResourceDownloader.showPublicModels("NerDLModel", "en")
+     +-------------+------+---------+
+     | Model       | lang | version |
+     +-------------+------+---------+
+     | onto_100    | en   | 2.1.0   |
+     | onto_300    | en   | 2.1.0   |
+     | ner_dl_bert | en   | 2.2.0   |
+     | ...         | ...  | ...     |
+
+
+     Similarly for Pipelines:
+
+     >>> ResourceDownloader.showPublicPipelines("en")
+     +------------------+------+---------+
+     | Pipeline         | lang | version |
+     +------------------+------+---------+
+     | dependency_parse | en   | 2.0.2   |
+     | check_spelling   | en   | 2.1.0   |
+     | match_datetime   | en   | 2.1.0   |
+     | ...              | ...  | ...     |
+
+     """
  
      @staticmethod
      def downloadModel(reader, name, language, remote_loc=None, j_dwn='PythonResourceDownloader'):
+         """Downloads and loads a model with the default downloader. Usually this method
+         does not need to be called directly, as it is called by the `pretrained()`
+         method of the annotator.
+
+         Parameters
+         ----------
+         reader : obj
+             Class to read the model for
+         name : str
+             Name of the pretrained model
+         language : str
+             Language of the model
+         remote_loc : str, optional
+             Directory of the Spark NLP Folder, by default None
+         j_dwn : str, optional
+             Which java downloader to use, by default 'PythonResourceDownloader'
+
+         Returns
+         -------
+         AnnotatorModel
+             Loaded pretrained annotator/pipeline
+         """
          print(name + " download started this may take some time.")
          file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
          if file_size == "-1":
@@ -46,12 +101,42 @@ class ResourceDownloader(object):
              t1.join()
  
          return reader(classname=None, java_model=j_obj)
+
      @staticmethod
-     def downloadModelDirectly(name, remote_loc="public/models"):
-         _internal._DownloadModelDirectly(name, remote_loc).apply()
+     def downloadModelDirectly(name, remote_loc="public/models", unzip=True):
+         """Downloads a model directly to the cache folder.
+         You can use this to copy-paste the s3 URI from the Models Hub and download the model.
+         For available s3 URIs and models, please see the `Models Hub <https://sparknlp.org/models>`__.
+         Parameters
+         ----------
+         name : str
+             Name of the model or s3 URI
+         remote_loc : str, optional
+             Directory of the remote Spark NLP Folder, by default "public/models"
+         unzip : bool, optional
+             Whether to unzip the downloaded model, by default True
+         """
+         _internal._DownloadModelDirectly(name, remote_loc, unzip).apply()
+
  
      @staticmethod
      def downloadPipeline(name, language, remote_loc=None):
+         """Downloads and loads a pipeline with the default downloader.
+
+         Parameters
+         ----------
+         name : str
+             Name of the pipeline
+         language : str
+             Language of the pipeline
+         remote_loc : str, optional
+             Directory of the remote Spark NLP Folder, by default None
+
+         Returns
+         -------
+         PipelineModel
+             The loaded pipeline
+         """
          print(name + " download started this may take some time.")
          file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
          if file_size == "-1":
@@ -72,21 +157,60 @@ class ResourceDownloader(object):
  
      @staticmethod
      def clearCache(name, language, remote_loc=None):
+         """Clears the cache entry of a model.
+
+         Parameters
+         ----------
+         name : str
+             Name of the model
+         language : str
+             Language of the model
+         remote_loc : str, optional
+             Directory of the remote Spark NLP Folder, by default None
+         """
          _internal._ClearCache(name, language, remote_loc).apply()
  
      @staticmethod
      def showPublicModels(annotator=None, lang=None, version=None):
+         """Prints all pretrained models for a particular annotator model, that are
+         compatible with a version of Spark NLP. If any of the optional arguments are not
+         set, the filter is not considered.
+
+         Parameters
+         ----------
+         annotator : str, optional
+             Name of the annotator to filter, by default None
+         lang : str, optional
+             Language of the models to filter, by default None
+         version : str, optional
+             Version of Spark NLP to filter, by default None
+         """
          print(_internal._ShowPublicModels(annotator, lang, version).apply())
  
      @staticmethod
      def showPublicPipelines(lang=None, version=None):
+         """Prints all pretrained pipelines that are compatible with a version of
+         Spark NLP. If any of the optional arguments are not set, the filter is not
+         considered.
+
+         Parameters
+         ----------
+         lang : str, optional
+             Language of the pipelines to filter, by default None
+         version : str, optional
+             Version of Spark NLP to filter, by default None
+         """
          print(_internal._ShowPublicPipelines(lang, version).apply())
  
      @staticmethod
      def showUnCategorizedResources():
+         """Shows models or pipelines in the metadata which have not been categorized yet.
+         """
          print(_internal._ShowUnCategorizedResources().apply())
  
      @staticmethod
      def showAvailableAnnotators():
+         """Shows all available annotators in Spark NLP.
+         """
          print(_internal._ShowAvailableAnnotators().apply())
  
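For quick reference (not part of the diff), a sketch of the ResourceDownloader helpers documented above, including the new `unzip` flag on downloadModelDirectly; the archive path is a placeholder, not a verified Models Hub S3 URI:

from sparknlp.pretrained.resource_downloader import ResourceDownloader

# List pretrained resources; each filter argument is optional.
ResourceDownloader.showPublicModels("NerDLModel", "en")
ResourceDownloader.showPublicPipelines(lang="en")
ResourceDownloader.showAvailableAnnotators()

# New in this version: fetch an archive into the cache folder without extracting it.
# "<model_name>" is a placeholder to be replaced with an s3 URI from the Models Hub.
ResourceDownloader.downloadModelDirectly("public/models/<model_name>.zip", unzip=False)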
sparknlp/reader/__init__.py
@@ -0,0 +1,15 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Module for reading different file types."""
+ from sparknlp.reader.sparknlp_reader import *
sparknlp/reader/enums.py
@@ -0,0 +1,19 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from enum import Enum
+
+ class TextStripperType(Enum):
+     """Text Stripper Type"""
+     PDF_TEXT_STRIPPER = "PDFTextStripper"
+     PDF_LAYOUT_TEXT_STRIPPER = "PDFLayoutTextStripper"
sparknlp/reader/pdf_to_text.py
@@ -0,0 +1,190 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark import keyword_only
+ from pyspark.ml.param import Param, Params, TypeConverters
+ from pyspark.ml.param.shared import HasInputCol, HasOutputCol
+ from pyspark.ml.util import JavaMLReadable, JavaMLWritable
+ from pyspark.ml.wrapper import JavaTransformer
+
+ from sparknlp.reader.enums import TextStripperType
+
+
+ class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
+                 JavaMLReadable, JavaMLWritable):
+     """
+     Extract text from PDF documents as either a single string or multiple strings per page.
+     Input is a column with binary content of PDF files. Output is a column with extracted text,
+     with options to include page numbers or split pages.
+
+     Parameters
+     ----------
+     pageNumCol : str, optional
+         Page number output column name.
+     partitionNum : int, optional
+         Number of partitions (default is 0).
+     storeSplittedPdf : bool, optional
+         Whether to store content of split PDFs (default is False).
+     splitPage : bool, optional
+         Enable/disable splitting per page (default is True).
+     onlyPageNum : bool, optional
+         Whether to extract only page numbers (default is False).
+     textStripper : str or TextStripperType, optional
+         Defines layout and formatting type.
+     sort : bool, optional
+         Enable/disable sorting content per page (default is False).
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.reader import *
+     >>> from pyspark.ml import Pipeline
+     >>> pdf_path = "Documents/files/pdf"
+     >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
+     >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
+     >>> pipeline = Pipeline(stages=[pdf_to_text])
+     >>> pipeline_model = pipeline.fit(data_frame)
+     >>> pdf_df = pipeline_model.transform(data_frame)
+     >>> pdf_df.show()
+     +--------------------+--------------------+
+     |                path|    modificationTime|
+     +--------------------+--------------------+
+     |file:/Users/paula...|2025-05-15 11:33:...|
+     |file:/Users/paula...|2025-05-15 11:33:...|
+     +--------------------+--------------------+
+     >>> pdf_df.printSchema()
+     root
+      |-- path: string (nullable = true)
+      |-- modificationTime: timestamp (nullable = true)
+      |-- length: long (nullable = true)
+      |-- text: string (nullable = true)
+      |-- height_dimension: integer (nullable = true)
+      |-- width_dimension: integer (nullable = true)
+      |-- content: binary (nullable = true)
+      |-- exception: string (nullable = true)
+      |-- pagenum: integer (nullable = true)
+     """
+     pageNumCol = Param(Params._dummy(), "pageNumCol",
+                        "Page number output column name.",
+                        typeConverter=TypeConverters.toString)
+
+     partitionNum = Param(Params._dummy(), "partitionNum",
+                          "Number of partitions.",
+                          typeConverter=TypeConverters.toInt)
+
+     storeSplittedPdf = Param(Params._dummy(), "storeSplittedPdf",
+                              "Force to store splitted pdf.",
+                              typeConverter=TypeConverters.toBoolean)
+
+     splitPage = Param(Params._dummy(), "splitPage",
+                       "Param for enable/disable splitting document per page",
+                       typeConverter=TypeConverters.toBoolean)
+
+     textStripper = Param(Params._dummy(), "textStripper",
+                          "Text stripper type used for output layout and formatting",
+                          typeConverter=TypeConverters.toString)
+
+     sort = Param(Params._dummy(), "sort",
+                  "Param for enable/disable sort lines",
+                  typeConverter=TypeConverters.toBoolean)
+
+     onlyPageNum = Param(Params._dummy(), "onlyPageNum",
+                         "Force to extract only number of pages",
+                         typeConverter=TypeConverters.toBoolean)
+
+     extractCoordinates = Param(Params._dummy(), "extractCoordinates",
+                                "Force extract coordinates of text.",
+                                typeConverter=TypeConverters.toBoolean)
+
+     normalizeLigatures = Param(Params._dummy(), "normalizeLigatures",
+                                "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
+                                typeConverter=TypeConverters.toBoolean)
+
+     @keyword_only
+     def __init__(self):
+         """
+         __init__(self)
+         """
+         super(PdfToText, self).__init__()
+         self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)
+
+     def setInputCol(self, value):
+         """
+         Sets the value of :py:attr:`inputCol`.
+         """
+         return self._set(inputCol=value)
+
+     def setOutputCol(self, value):
+         """
+         Sets the value of :py:attr:`outputCol`.
+         """
+         return self._set(outputCol=value)
+
+     def setPageNumCol(self, value):
+         """
+         Sets the value of :py:attr:`pageNumCol`.
+         """
+         return self._set(pageNumCol=value)
+
+     def setPartitionNum(self, value):
+         """
+         Sets the value of :py:attr:`partitionNum`.
+         """
+         return self._set(partitionNum=value)
+
+     def setStoreSplittedPdf(self, value):
+         """
+         Sets the value of :py:attr:`storeSplittedPdf`.
+         """
+         return self._set(storeSplittedPdf=value)
+
+     def setSplitPage(self, value):
+         """
+         Sets the value of :py:attr:`splitPage`.
+         """
+         return self._set(splitPage=value)
+
+     def setOnlyPageNum(self, value):
+         """
+         Sets the value of :py:attr:`onlyPageNum`.
+         """
+         return self._set(onlyPageNum=value)
+
+     def setTextStripper(self, value):
+         """
+         Sets the value of :py:attr:`textStripper`.
+         """
+         if isinstance(value, TextStripperType):
+             value = value.value
+         if value not in [i.value for i in TextStripperType]:
+             type_value = type(value)
+             raise ValueError(f"Param textStripper must be a 'TextStripperType' enum but got {type_value}.")
+         return self._set(textStripper=str(value))
+
+     def setSort(self, value):
+         """
+         Sets the value of :py:attr:`sort`.
+         """
+         return self._set(sort=value)
+
+     def setExtractCoordinates(self, value):
+         """
+         Sets the value of :py:attr:`extractCoordinates`.
+         """
+         return self._set(extractCoordinates=value)
+
+     def setNormalizeLigatures(self, value):
+         """
+         Sets the value of :py:attr:`normalizeLigatures`.
+         """
+         return self._set(normalizeLigatures=value)
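
To tie the last two hunks together (not part of the diff), a minimal sketch of PdfToText combined with the TextStripperType enum added in sparknlp/reader/enums.py, assuming an active SparkSession `spark`; the input directory is a placeholder and the column names follow the binaryFile schema shown in the docstring above:

from pyspark.ml import Pipeline
from sparknlp.reader.enums import TextStripperType
from sparknlp.reader.pdf_to_text import PdfToText

# "binaryFile" produces path/content columns; replace the path with a real directory.
pdf_df = spark.read.format("binaryFile").load("/path/to/pdfs")

# Layout-preserving extraction, one row per page with its page number.
pdf_to_text = PdfToText() \
    .setInputCol("content") \
    .setOutputCol("text") \
    .setSplitPage(True) \
    .setPageNumCol("pagenum") \
    .setTextStripper(TextStripperType.PDF_LAYOUT_TEXT_STRIPPER)

result = Pipeline(stages=[pdf_to_text]).fit(pdf_df).transform(pdf_df)
result.select("path", "pagenum", "text").show(truncate=80)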