spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
@@ -0,0 +1,461 @@
1
+ # Copyright 2017-2024 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from sparknlp.internal import ExtendedJavaWrapper
15
+
16
+
17
+ class SparkNLPReader(ExtendedJavaWrapper):
18
+ """Instantiates class to read documents in various formats.
19
+
20
+ Parameters
21
+ ----------
22
+ spark : SparkSession
23
+ Spark session
24
+ params : dict, optional
25
+ Dictionary with custom configuration parameters
26
+
27
+ Notes
28
+ -----
29
+ This class can read HTML, email, PDF, MS Word, Excel, PowerPoint, and text files.
30
+
31
+ Examples
32
+ --------
33
+ >>> from sparknlp.reader import SparkNLPReader
34
+ >>> reader = SparkNLPReader(spark)
35
+
36
+ Reading HTML
37
+
38
+ >>> html_df = reader.html("https://www.wikipedia.org")
39
+ >>> # Or with shorthand
40
+ >>> import sparknlp
41
+ >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
42
+
43
+ Reading PDF
44
+
45
+ >>> pdf_df = reader.pdf("home/user/pdfs-directory")
46
+ >>> # Or with shorthand
47
+ >>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
48
+
49
+ Reading Email
50
+
51
+ >>> email_df = reader.email("home/user/emails-directory")
52
+ >>> # Or with shorthand
53
+ >>> email_df = sparknlp.read().email("home/user/emails-directory")
54
+ """
55
+
56
+ def __init__(self, spark, params=None, headers=None):
57
+ if params is None:
58
+ params = {}
59
+ super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params, headers)
60
+ self.spark = spark
61
+
62
+ def html(self, htmlPath):
63
+ """Reads HTML files or URLs and returns a Spark DataFrame.
64
+
65
+ Parameters
66
+ ----------
67
+ htmlPath : str or list of str
68
+ Path(s) to HTML file(s) or a list of URLs.
69
+
70
+ Returns
71
+ -------
72
+ pyspark.sql.DataFrame
73
+ A DataFrame containing the parsed HTML content.
74
+
75
+ Examples
76
+ --------
77
+ >>> from sparknlp.reader import SparkNLPReader
78
+ >>> html_df = SparkNLPReader().html("https://www.wikipedia.org")
79
+
80
+ You can also use SparkNLP to simplify the process:
81
+
82
+ >>> import sparknlp
83
+ >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
84
+ >>> html_df.show(truncate=False)
85
+
86
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
87
+ |url |html |
88
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
89
+ |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
90
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
91
+ >>> html_df.printSchema()
92
+ root
93
+ |-- url: string (nullable = true)
94
+ |-- html: array (nullable = true)
95
+ | |-- element: struct (containsNull = true)
96
+ | | |-- elementType: string (nullable = true)
97
+ | | |-- content: string (nullable = true)
98
+ | | |-- metadata: map (nullable = true)
99
+ | | | |-- key: string
100
+ | | | |-- value: string (valueContainsNull = true)
101
+ """
102
+ if not isinstance(htmlPath, (str, list)) or (isinstance(htmlPath, list) and not all(isinstance(item, str) for item in htmlPath)):
103
+ raise TypeError("htmlPath must be a string or a list of strings")
104
+ jdf = self._java_obj.html(htmlPath)
105
+ dataframe = self.getDataFrame(self.spark, jdf)
106
+ return dataframe
107
+
108
+ def email(self, filePath):
109
+ """Reads email files and returns a Spark DataFrame.
110
+
111
+ Parameters
112
+ ----------
113
+ filePath : str
114
+ Path to an email file or a directory containing emails.
115
+
116
+ Returns
117
+ -------
118
+ pyspark.sql.DataFrame
119
+ A DataFrame containing parsed email data.
120
+
121
+ Examples
122
+ --------
123
+ >>> from sparknlp.reader import SparkNLPReader
124
+ >>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")
125
+
126
+ You can also use SparkNLP to simplify the process:
127
+
128
+ >>> import sparknlp
129
+ >>> email_df = sparknlp.read().email("home/user/emails-directory")
130
+ >>> email_df.show()
131
+ +---------------------------------------------------+
132
+ |email |
133
+ +---------------------------------------------------+
134
+ |[{Title, Email Text Attachments, {sent_to -> Danilo|
135
+ +---------------------------------------------------+
136
+ >>> email_df.printSchema()
137
+ root
138
+ |-- path: string (nullable = true)
139
+ |-- content: array (nullable = true)
140
+ |-- email: array (nullable = true)
141
+ | |-- element: struct (containsNull = true)
142
+ | | |-- elementType: string (nullable = true)
143
+ | | |-- content: string (nullable = true)
144
+ | | |-- metadata: map (nullable = true)
145
+ | | | |-- key: string
146
+ | | | |-- value: string (valueContainsNull = true)
147
+
148
+ """
149
+ if not isinstance(filePath, str):
150
+ raise TypeError("filePath must be a string")
151
+ jdf = self._java_obj.email(filePath)
152
+ dataframe = self.getDataFrame(self.spark, jdf)
153
+ return dataframe
154
+
155
+ def doc(self, docPath):
156
+ """Reads word document files and returns a Spark DataFrame.
157
+
158
+ Parameters
159
+ ----------
160
+ docPath : str
161
+ Path to a Word document file.
162
+
163
+ Returns
164
+ -------
165
+ pyspark.sql.DataFrame
166
+ A DataFrame containing parsed document content.
167
+
168
+ Examples
169
+ --------
170
+ >>> from sparknlp.reader import SparkNLPReader
171
+ >>> doc_df = SparkNLPReader().doc(spark, "home/user/word-directory")
172
+
173
+ You can also use SparkNLP to simplify the process:
174
+
175
+ >>> import sparknlp
176
+ >>> doc_df = sparknlp.read().doc("home/user/word-directory")
177
+ >>> doc_df.show()
178
+ +-------------------------------------------------+
179
+ |doc                                              |
180
+ +-------------------------------------------------+
181
+ |[{Table, Header Col 1, {}}, {Table, Header Col 2,|
182
+ +-------------------------------------------------+
183
+
184
+ >>> doc_df.printSchema()
185
+ root
186
+ |-- path: string (nullable = true)
187
+ |-- content: array (nullable = true)
188
+ |-- doc: array (nullable = true)
189
+ | |-- element: struct (containsNull = true)
190
+ | | |-- elementType: string (nullable = true)
191
+ | | |-- content: string (nullable = true)
192
+ | | |-- metadata: map (nullable = true)
193
+ | | | |-- key: string
194
+ | | | |-- value: string (valueContainsNull = true)
195
+
196
+ """
197
+ if not isinstance(docPath, str):
198
+ raise TypeError("docPath must be a string")
199
+ jdf = self._java_obj.doc(docPath)
200
+ dataframe = self.getDataFrame(self.spark, jdf)
201
+ return dataframe
202
+
203
+ def pdf(self, pdfPath):
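+ """Reads PDF files and returns a Spark DataFrame.
+
+ Parameters
+ ----------
+ pdfPath : str
+ Path to a PDF file or a directory containing PDF files.
+
+ Returns
+ -------
+ pyspark.sql.DataFrame
+ A DataFrame containing parsed PDF content.
+ """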
204
+ if not isinstance(pdfPath, str):
205
+ raise TypeError("docPath must be a string")
206
+ jdf = self._java_obj.pdf(pdfPath)
207
+ dataframe = self.getDataFrame(self.spark, jdf)
208
+ return dataframe
209
+
210
+ def xls(self, docPath):
211
+ """Reads excel document files and returns a Spark DataFrame.
212
+
213
+ Parameters
214
+ ----------
215
+ docPath : str
216
+ Path to an Excel file.
217
+
218
+ Returns
219
+ -------
220
+ pyspark.sql.DataFrame
221
+ A DataFrame containing parsed document content.
222
+
223
+ Examples
224
+ --------
225
+ >>> from sparknlp.reader import SparkNLPReader
226
+ >>> xlsDf = SparkNLPReader().xls(spark, "home/user/excel-directory")
227
+
228
+ You can also use SparkNLP to simplify the process:
229
+
230
+ >>> import sparknlp
231
+ >>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
232
+ >>> xlsDf.show()
233
+ +--------------------------------------------+
234
+ |xls |
235
+ +--------------------------------------------+
236
+ |[{Title, Financial performance, {SheetNam}}]|
237
+ +--------------------------------------------+
238
+
239
+ >>> xlsDf.printSchema()
240
+ root
241
+ |-- path: string (nullable = true)
242
+ |-- content: binary (nullable = true)
243
+ |-- xls: array (nullable = true)
244
+ | |-- element: struct (containsNull = true)
245
+ | | |-- elementType: string (nullable = true)
246
+ | | |-- content: string (nullable = true)
247
+ | | |-- metadata: map (nullable = true)
248
+ | | | |-- key: string
249
+ | | | |-- value: string (valueContainsNull = true)
250
+ """
251
+ if not isinstance(docPath, str):
252
+ raise TypeError("docPath must be a string")
253
+ jdf = self._java_obj.xls(docPath)
254
+ dataframe = self.getDataFrame(self.spark, jdf)
255
+ return dataframe
256
+
257
+ def ppt(self, docPath):
258
+ """
259
+ Reads PowerPoint files and returns a Spark DataFrame.
260
+
261
+ Parameters
262
+ ----------
263
+ docPath : str
264
+ Path to a PowerPoint file.
265
+
266
+ Returns
267
+ -------
268
+ pyspark.sql.DataFrame
269
+ A DataFrame containing parsed document content.
270
+
271
+ Examples
272
+ --------
273
+ >>> from sparknlp.reader import SparkNLPReader
274
+ >>> pptDf = SparkNLPReader().ppt(spark, "home/user/powerpoint-directory")
275
+
276
+ You can also use SparkNLP to simplify the process:
277
+
278
+ >>> import sparknlp
279
+ >>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
280
+ >>> pptDf.show(truncate=False)
281
+ +-------------------------------------+
282
+ |ppt |
283
+ +-------------------------------------+
284
+ |[{Title, Adding a Bullet Slide, {}},]|
285
+ +-------------------------------------+
286
+ """
287
+ if not isinstance(docPath, str):
288
+ raise TypeError("docPath must be a string")
289
+ jdf = self._java_obj.ppt(docPath)
290
+ dataframe = self.getDataFrame(self.spark, jdf)
291
+ return dataframe
292
+
293
+ def txt(self, docPath):
294
+ """Reads TXT files and returns a Spark DataFrame.
295
+
296
+ Parameters
297
+ ----------
298
+ docPath : str
299
+ Path to a TXT file or a directory containing TXT files.
300
+
301
+ Returns
302
+ -------
303
+ pyspark.sql.DataFrame
304
+ A DataFrame containing parsed document content.
305
+
306
+ Examples
307
+ --------
308
+ >>> from sparknlp.reader import SparkNLPReader
309
+ >>> txtDf = SparkNLPReader().txt(spark, "home/user/txt/files")
310
+
311
+ You can also use SparkNLP to simplify the process:
312
+
313
+ >>> import sparknlp
314
+ >>> txtDf = sparknlp.read().txt("home/user/txt/files")
315
+ >>> txtDf.show(truncate=False)
316
+ +-----------------------------------------------+
317
+ |txt |
318
+ +-----------------------------------------------+
319
+ |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}]|
320
+ +-----------------------------------------------+
321
+ """
322
+ if not isinstance(docPath, str):
323
+ raise TypeError("docPath must be a string")
324
+ jdf = self._java_obj.txt(docPath)
325
+ return self.getDataFrame(self.spark, jdf)
326
+
327
+ def xml(self, docPath):
328
+ """Reads XML files and returns a Spark DataFrame.
329
+
330
+ Parameters
331
+ ----------
332
+ docPath : str
333
+ Path to an XML file or a directory containing XML files.
334
+
335
+ Returns
336
+ -------
337
+ pyspark.sql.DataFrame
338
+ A DataFrame containing parsed XML content.
339
+
340
+ Examples
341
+ --------
342
+ >>> from sparknlp.reader import SparkNLPReader
343
+ >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
344
+
345
+ You can also use SparkNLP to simplify the process:
346
+
347
+ >>> import sparknlp
348
+ >>> xml_df = sparknlp.read().xml("home/user/xml-directory")
349
+ >>> xml_df.show(truncate=False)
350
+ +-----------------------------------------------------------+
351
+ |xml |
352
+ +-----------------------------------------------------------+
353
+ |[{Title, John Smith, {elementId -> ..., tag -> title}}] |
354
+ +-----------------------------------------------------------+
355
+
356
+ >>> xml_df.printSchema()
357
+ root
358
+ |-- path: string (nullable = true)
359
+ |-- xml: array (nullable = true)
360
+ | |-- element: struct (containsNull = true)
361
+ | | |-- elementType: string (nullable = true)
362
+ | | |-- content: string (nullable = true)
363
+ | | |-- metadata: map (nullable = true)
364
+ | | | |-- key: string
365
+ | | | |-- value: string (valueContainsNull = true)
366
+ """
367
+ if not isinstance(docPath, str):
368
+ raise TypeError("docPath must be a string")
369
+ jdf = self._java_obj.xml(docPath)
370
+ return self.getDataFrame(self.spark, jdf)
371
+
372
+
373
+ def md(self, filePath):
374
+ """Reads Markdown files and returns a Spark DataFrame.
375
+
376
+ Parameters
377
+ ----------
378
+ filePath : str
379
+ Path to a Markdown file or a directory containing Markdown files.
380
+
381
+ Returns
382
+ -------
383
+ pyspark.sql.DataFrame
384
+ A DataFrame containing parsed Markdown content.
385
+
386
+ Examples
387
+ --------
388
+ >>> from sparknlp.reader import SparkNLPReader
389
+ >>> md_df = SparkNLPReader(spark).md("home/user/markdown-directory")
390
+
391
+ You can also use SparkNLP to simplify the process:
392
+
393
+ >>> import sparknlp
394
+ >>> md_df = sparknlp.read().md("home/user/markdown-directory")
395
+ >>> md_df.show(truncate=False)
396
+ +-----------------------------------------------------------+
397
+ |md |
398
+ +-----------------------------------------------------------+
399
+ |[{Title, Sample Markdown Document, {elementId -> ..., tag -> title}}]|
400
+ +-----------------------------------------------------------+
401
+
402
+ >>> md_df.printSchema()
403
+ root
404
+ |-- path: string (nullable = true)
405
+ |-- md: array (nullable = true)
406
+ | |-- element: struct (containsNull = true)
407
+ | | |-- elementType: string (nullable = true)
408
+ | | |-- content: string (nullable = true)
409
+ | | |-- metadata: map (nullable = true)
410
+ | | | |-- key: string
411
+ | | | |-- value: string (valueContainsNull = true)
412
+ """
413
+ if not isinstance(filePath, str):
414
+ raise TypeError("filePath must be a string")
415
+ jdf = self._java_obj.md(filePath)
416
+ return self.getDataFrame(self.spark, jdf)
417
+
418
+ def csv(self, csvPath):
419
+ """Reads CSV files and returns a Spark DataFrame.
420
+
421
+ Parameters
422
+ ----------
423
+ csvPath : str
424
+ Path to a CSV file or a directory containing CSV files.
425
+
426
+ Returns
427
+ -------
428
+ pyspark.sql.DataFrame
429
+ A DataFrame containing parsed CSV content.
430
+
431
+ Examples
432
+ --------
433
+ >>> from sparknlp.reader import SparkNLPReader
434
+ >>> csv_df = SparkNLPReader(spark).csv("home/user/csv-directory")
435
+
436
+ You can also use SparkNLP to simplify the process:
437
+
438
+ >>> import sparknlp
439
+ >>> csv_df = sparknlp.read().csv("home/user/csv-directory")
440
+ >>> csv_df.show(truncate=False)
441
+ +-----------------------------------------------------------------------------------------------------------------------------------------+
442
+ |csv |
443
+ +-----------------------------------------------------------------------------------------------------------------------------------------+
444
+ |[{NarrativeText, Alice 100 Bob 95, {}}, {Table, <table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>, {}}] |
445
+ +-----------------------------------------------------------------------------------------------------------------------------------------+
446
+
447
+ >>> csv_df.printSchema()
448
+ root
449
+ |-- path: string (nullable = true)
450
+ |-- csv: array (nullable = true)
451
+ | |-- element: struct (containsNull = true)
452
+ | | |-- elementType: string (nullable = true)
453
+ | | |-- content: string (nullable = true)
454
+ | | |-- metadata: map (nullable = true)
455
+ | | | |-- key: string
456
+ | | | |-- value: string (valueContainsNull = true)
457
+ """
458
+ if not isinstance(csvPath, str):
459
+ raise TypeError("docPath must be a string")
460
+ jdf = self._java_obj.csv(csvPath)
461
+ return self.getDataFrame(self.spark, jdf)
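Taken together, the new reader exposes one method per format on a single object. A minimal usage sketch, assuming a session started with sparknlp.start(); the directory paths are illustrative placeholders:

>>> import sparknlp
>>> spark = sparknlp.start()
>>> reader = sparknlp.read()
>>> # Each call returns a DataFrame whose struct-array column is named after the format
>>> html_df = reader.html("https://www.wikipedia.org")
>>> email_df = reader.email("home/user/emails-directory")
>>> pdf_df = reader.pdf("home/user/pdfs-directory")
>>> html_df.printSchema()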
@@ -17,3 +17,4 @@ from sparknlp.training.conll import *
17
17
  from sparknlp.training.conllu import *
18
18
  from sparknlp.training.pos import *
19
19
  from sparknlp.training.pub_tator import *
20
+ from sparknlp.training.spacy_to_annotation import *
@@ -65,6 +65,8 @@ class CoNLL(ExtendedJavaWrapper):
65
65
  Whether to explode sentences to separate rows, by default True
66
66
  delimiter: str, optional
67
67
  Delimiter used to separate columns inside CoNLL file
68
+ includeDocId: bool, optional
69
+ Whether to try to parse the document id from the third item in the -DOCSTART- line ("X" is used if none is found)
68
70
 
69
71
  Examples
70
72
  --------
@@ -92,10 +94,12 @@ class CoNLL(ExtendedJavaWrapper):
92
94
  posCol='pos',
93
95
  conllLabelIndex=3,
94
96
  conllPosIndex=1,
97
+ conllDocIdCol="doc_id",
95
98
  textCol='text',
96
99
  labelCol='label',
97
100
  explodeSentences=True,
98
- delimiter=' '
101
+ delimiter=' ',
102
+ includeDocId=False
99
103
  ):
100
104
  super(CoNLL, self).__init__("com.johnsnowlabs.nlp.training.CoNLL",
101
105
  documentCol,
@@ -104,10 +108,12 @@ class CoNLL(ExtendedJavaWrapper):
104
108
  posCol,
105
109
  conllLabelIndex,
106
110
  conllPosIndex,
111
+ conllDocIdCol,
107
112
  textCol,
108
113
  labelCol,
109
114
  explodeSentences,
110
- delimiter)
115
+ delimiter,
116
+ includeDocId)
111
117
 
112
118
  def readDataset(self, spark, path, read_as=ReadAs.TEXT, partitions=8, storage_level=pyspark.StorageLevel.DISK_ONLY):
113
119
  # ToDo Replace with std pyspark
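The new includeDocId and conllDocIdCol options carry a document id parsed from -DOCSTART- lines into the output. A minimal sketch, assuming a CoNLL-formatted file on disk (the path is a placeholder) and that the id is exposed under the column named by conllDocIdCol:

>>> from sparknlp.training import CoNLL
>>> conll = CoNLL(includeDocId=True, conllDocIdCol="doc_id")
>>> training_data = conll.readDataset(spark, "home/user/conll/eng.train")
>>> training_data.select("doc_id", "text").show(3, truncate=False)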
@@ -0,0 +1,57 @@
1
+ # Copyright 2017-2023 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from pyspark.sql import SparkSession
15
+
16
+ from sparknlp.internal import ExtendedJavaWrapper
17
+
18
+
19
+ class SpacyToAnnotation(ExtendedJavaWrapper):
20
+
21
+ """Helper class to load a list of tokens/sentences as JSON to Annotation.
22
+
23
+ The JSON will be in this format:
24
+ [
25
+ {
26
+ "tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?", "I", "'m", "fine", "thanks", "."],
27
+ "token_spaces": [true, false, true, true, true, true, false, true, false, true, true, false, false],
28
+ "sentence_ends": [2, 7, 12]
29
+ }
30
+ ]
31
+
32
+ Examples
33
+ --------
34
+ >>> from sparknlp.training import SpacyToAnnotation
35
+ >>> result = SpacyToAnnotation().readDataset(spark, "src/test/resources/spacy-to-annotation/multi_doc_tokens.json")
36
+ >>> result.show(False)
37
+ +-------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
38
+ |document |sentence |token |
39
+ +-------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
40
+ |[{document, 0, 55, John went to the store last night. He bought some bread., {}, []}]|[{document, 0, 33, John went to the store last night., {sentence -> 0}, []}, {document, 35, 55, He bought some bread., {sentence -> 1}, []}] |[{token, 0, 3, John, {sentence -> 0}, []}, {token, 5, 8, went, {sentence -> 0}, []}, {token, 10, 11, to, {sentence -> 0}, []}, {token, 13, 15, the, {sentence -> 0}, []}, {token, 17, 21, store, {sentence -> 0}, []}, {token, 23, 26, last, {sentence -> 0}, []}, {token, 28, 32, night, {sentence -> 0}, []}, {token, 33, 33, ., {sentence -> 0}, []}, {token, 35, 36, He, {sentence -> 1}, []}, {token, 38, 43, bought, {sentence -> 1}, []}, {token, 45, 48, some, {sentence -> 1}, []}, {token, 50, 54, bread, {sentence -> 1}, []}, {token, 55, 55, ., {sentence -> 1}, []}]|
41
+ |[{document, 0, 47, Hello world! How are you today? I'm fine thanks., {}, []}] |[{document, 0, 11, Hello world!, {sentence -> 0}, []}, {document, 13, 30, How are you today?, {sentence -> 1}, []}, {document, 32, 47, I'm fine thanks., {sentence -> 2}, []}]|[{token, 0, 4, Hello, {sentence -> 0}, []}, {token, 6, 10, world, {sentence -> 0}, []}, {token, 11, 11, !, {sentence -> 0}, []}, {token, 13, 15, How, {sentence -> 1}, []}, {token, 17, 19, are, {sentence -> 1}, []}, {token, 21, 23, you, {sentence -> 1}, []}, {token, 25, 29, today, {sentence -> 1}, []}, {token, 30, 30, ?, {sentence -> 1}, []}, {token, 32, 32, I, {sentence -> 2}, []}, {token, 33, 34, 'm, {sentence -> 2}, []}, {token, 36, 39, fine, {sentence -> 2}, []}, {token, 41, 46, thanks, {sentence -> 2}, []}, {token, 47, 47, ., {sentence -> 2}, []}] |
42
+ +-------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
43
+
44
+ """
45
+
46
+ def __init__(self):
47
+ super(SpacyToAnnotation, self).__init__("com.johnsnowlabs.nlp.training.SpacyToAnnotation")
48
+
49
+ def readJsonFile(self, spark, jsonFilePath, params=None):
50
+ if params is None:
51
+ params = {}
52
+
53
+ jSession = spark._jsparkSession
54
+
55
+ jdf = self._java_obj.readJsonFileJava(jSession, jsonFilePath, params)
56
+ annotation_dataset = self.getDataFrame(spark, jdf)
57
+ return annotation_dataset
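To complement the docstring above, a sketch of writing the expected JSON by hand and loading it with readJsonFile; the output path is a placeholder:

>>> import json
>>> from sparknlp.training import SpacyToAnnotation
>>> docs = [{
...     "tokens": ["Hello", "world", "!"],
...     "token_spaces": [True, False, False],
...     "sentence_ends": [2]
... }]
>>> with open("/tmp/spacy_tokens.json", "w") as f:
...     json.dump(docs, f)
>>> annotated_df = SpacyToAnnotation().readJsonFile(spark, "/tmp/spacy_tokens.json")
>>> annotated_df.select("document", "sentence", "token").show(truncate=80)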
sparknlp/util.py CHANGED
@@ -15,6 +15,9 @@
15
15
 
16
16
 
17
17
  import sparknlp.internal as _internal
18
+ import numpy as np
19
+ from pyspark.sql import Row
20
+ from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BinaryType
18
21
 
19
22
 
20
23
  def get_config_path():
@@ -33,3 +36,26 @@ class CoNLLGenerator:
33
36
  _internal._CoNLLGeneratorExportFromTargetAndPipeline(*args).apply()
34
37
  else:
35
38
  raise NotImplementedError(f"No exportConllFiles alternative takes {num_args} parameters")
39
+
40
+
41
+ class EmbeddingsDataFrameUtils:
42
+ """
43
+ Utility for creating DataFrames compatible with multimodal embedding models (e.g., E5VEmbeddings) for text-only scenarios.
44
+ Provides:
45
+ - imageSchema: the expected schema for Spark image DataFrames
46
+ - emptyImageRow: a dummy image row for text-only embedding
47
+ """
48
+ imageSchema = StructType([
49
+ StructField(
50
+ "image",
51
+ StructType([
52
+ StructField("origin", StringType(), True),
53
+ StructField("height", IntegerType(), True),
54
+ StructField("width", IntegerType(), True),
55
+ StructField("nChannels", IntegerType(), True),
56
+ StructField("mode", IntegerType(), True),
57
+ StructField("data", BinaryType(), True),
58
+ ]),
59
+ )
60
+ ])
61
+ emptyImageRow = Row(Row("", 0, 0, 0, 0, bytes()))
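A minimal sketch of the text-only workflow this utility targets: build a one-row image DataFrame from the dummy row and schema, then attach the text to embed as an ordinary column. The prompt string and the "text" column name are illustrative, and wiring the result into E5VEmbeddings is omitted:

>>> from pyspark.sql.functions import lit
>>> from sparknlp.util import EmbeddingsDataFrameUtils
>>> text_only_df = spark.createDataFrame(
...     [EmbeddingsDataFrameUtils.emptyImageRow],
...     EmbeddingsDataFrameUtils.imageSchema)
>>> text_only_df = text_only_df.withColumn("text", lit("A cat sitting in a sunny window."))
>>> text_only_df.printSchema()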