spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
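Note on upgrading: this diff spans two major releases (4.2.6 → 6.2.1). The sketch below is not part of the diff; it assumes the package is installed from PyPI under the name spark-nlp and uses the existing sparknlp.start() and sparknlp.version() entry points to confirm which version is active.

    # Not part of the diff: a minimal upgrade check, assuming installation from PyPI,
    # e.g. `pip install --upgrade spark-nlp==6.2.1`.
    import sparknlp

    spark = sparknlp.start()       # starts a Spark session with the matching Spark NLP jar
    print(sparknlp.version())      # should report 6.2.1 after upgrading the wheel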
sparknlp/annotator/document_character_text_splitter.py
@@ -0,0 +1,228 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the DocumentNormalizer"""
+ from sparknlp.common import *
+
+
+ class DocumentCharacterTextSplitter(AnnotatorModel):
+ """Annotator which splits large documents into chunks of roughly given size.
+
+ DocumentCharacterTextSplitter takes a list of separators. It takes the separators in order and
+ splits subtexts if they are over the chunk length, considering optional overlap of the chunks.
+
+ For example, given chunk size 20 and overlap 5:
+
+ .. code-block:: python
+
+ "He was, I take it, the most perfect reasoning and observing machine that the world has seen."
+
+ ["He was, I take it,", "it, the most", "most perfect", "reasoning and", "and observing", "machine that the", "the world has seen."]
+
+
+ Additionally, you can set
+
+ - custom patterns with setSplitPatterns
+ - whether patterns should be interpreted as regex with setPatternsAreRegex
+ - whether to keep the separators with setKeepSeparators
+ - whether to trim whitespaces with setTrimWhitespace
+ - whether to explode the splits to individual rows with setExplodeSplits
+
+ For extended examples of usage, see the
+ `DocumentCharacterTextSplitterTest <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala>`__.
+
+ ====================== ======================
+ Input Annotation types Output Annotation type
+ ====================== ======================
+ ``DOCUMENT`` ``DOCUMENT``
+ ====================== ======================
+
+ Parameters
+ ----------
+
+ chunkSize
+ Size of each chunk of text.
+ chunkOverlap
+ Length of the overlap between text chunks , by default `0`.
+ splitPatterns
+ Patterns to separate the text by in decreasing priority , by default `["\\n\\n", "\\n", " ", ""]`.
+ patternsAreRegex
+ Whether to interpret the split patterns as regular expressions , by default `False`.
+ keepSeparators
+ Whether to keep the separators in the final result , by default `True`.
+ explodeSplits
+ Whether to explode split chunks to separate rows , by default `False`.
+ trimWhitespace
+ Whether to trim whitespaces of extracted chunks , by default `True`.
+
+ Examples
+ --------
+ >>> import sparknlp
+ >>> from sparknlp.base import *
+ >>> from sparknlp.annotator import *
+ >>> from pyspark.ml import Pipeline
+ >>> textDF = spark.read.text(
+ ... "sherlockholmes.txt",
+ ... wholetext=True
+ ... ).toDF("text")
+ >>> documentAssembler = DocumentAssembler().setInputCol("text")
+ >>> textSplitter = DocumentCharacterTextSplitter() \\
+ ... .setInputCols(["document"]) \\
+ ... .setOutputCol("splits") \\
+ ... .setChunkSize(20000) \\
+ ... .setChunkOverlap(200) \\
+ ... .setExplodeSplits(True)
+ >>> pipeline = Pipeline().setStages([documentAssembler, textSplitter])
+ >>> result = pipeline.fit(textDF).transform(textDF)
+ >>> result.selectExpr(
+ ... "splits.result",
+ ... "splits[0].begin",
+ ... "splits[0].end",
+ ... "splits[0].end - splits[0].begin as length") \\
+ ... .show(8, truncate = 80)
+ +--------------------------------------------------------------------------------+---------------+-------------+------+
+ | result|splits[0].begin|splits[0].end|length|
+ +--------------------------------------------------------------------------------+---------------+-------------+------+
+ |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 19994| 19994|
+ |["And Mademoiselle's address?" he asked.\\n\\n"Is Briony Lodge, Serpentine Aven...| 19798| 39395| 19597|
+ |["How did that help you?"\\n\\n"It was all-important. When a woman thinks that ...| 39371| 59242| 19871|
+ |["'But,' said I, 'there would be millions of red-headed men who\\nwould apply....| 59166| 77833| 18667|
+ |[My friend was an enthusiastic musician, being himself not only a\\nvery capab...| 77835| 97769| 19934|
+ |["And yet I am not convinced of it," I answered. "The cases which\\ncome to li...| 97771| 117248| 19477|
+ |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\\nfeather of...| 117250| 137242| 19992|
+ |["That sounds a little paradoxical."\\n\\n"But it is profoundly True. Singulari...| 137244| 157171| 19927|
+ +--------------------------------------------------------------------------------+---------------+-------------+------+
+
+ """
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+ outputAnnotatorType = AnnotatorType.DOCUMENT
+
+ chunkSize = Param(Params._dummy(),
+ "chunkSize",
+ "Size of each chunk of text",
+ typeConverter=TypeConverters.toInt)
+ chunkOverlap = Param(Params._dummy(),
+ "chunkOverlap",
+ "Length of the overlap between text chunks",
+ typeConverter=TypeConverters.toInt)
+ splitPatterns = Param(Params._dummy(),
+ "splitPatterns",
+ "Patterns to separate the text by in decreasing priority",
+ typeConverter=TypeConverters.toListString)
+ patternsAreRegex = Param(Params._dummy(),
+ "patternsAreRegex",
+ "Whether to interpret the split patterns as regular expressions",
+ typeConverter=TypeConverters.toBoolean)
+ keepSeparators = Param(Params._dummy(),
+ "keepSeparators",
+ "Whether to keep the separators in the final result",
+ typeConverter=TypeConverters.toBoolean)
+ explodeSplits = Param(Params._dummy(),
+ "explodeSplits",
+ "Whether to explode split chunks to separate rows",
+ typeConverter=TypeConverters.toBoolean)
+ trimWhitespace = Param(Params._dummy(),
+ "trimWhitespace",
+ "Whether to trim whitespaces of extracted chunks",
+ typeConverter=TypeConverters.toBoolean)
+
+ @keyword_only
+ def __init__(self):
+ super(DocumentCharacterTextSplitter, self).__init__(
+ classname="com.johnsnowlabs.nlp.annotators.DocumentCharacterTextSplitter")
+ self._setDefault(
+ chunkOverlap=0,
+ explodeSplits=False,
+ keepSeparators=True,
+ patternsAreRegex=False,
+ splitPatterns=["\n\n", "\n", " ", ""],
+ trimWhitespace=True
+ )
+
+ def setChunkSize(self, value):
+ """Sets size of each chunk of text.
+
+ Parameters
+ ----------
+ value : int
+ Size of each chunk of text
+ """
+ if value < 1:
+ raise ValueError("Chunk size should be larger than 0.")
+ return self._set(chunkSize=value)
+
+ def setChunkOverlap(self, value):
+ """Sets length of the overlap between text chunks , by default `0`.
+
+ Parameters
+ ----------
+ value : int
+ Length of the overlap between text chunks
+ """
+ if value > self.getOrDefault(self.chunkSize):
+ raise ValueError("Chunk overlap can't be larger than chunk size.")
+ return self._set(chunkOverlap=value)
+
+ def setSplitPatterns(self, value):
+ """Sets patterns to separate the text by in decreasing priority , by default `["\n\n", "\n", " ", ""]`.
+
+ Parameters
+ ----------
+ value : List[str]
+ Patterns to separate the text by in decreasing priority
+ """
+ if len(value) == 0:
+ raise ValueError("Patterns are empty")
+
+ return self._set(splitPatterns=value)
+
+ def setPatternsAreRegex(self, value):
+ """Sets whether to interpret the split patterns as regular expressions , by default `False`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to interpret the split patterns as regular expressions
+ """
+ return self._set(patternsAreRegex=value)
+
+ def setKeepSeparators(self, value):
+ """Sets whether to keep the separators in the final result , by default `True`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to keep the separators in the final result
+ """
+ return self._set(keepSeparators=value)
+
+ def setExplodeSplits(self, value):
+ """Sets whether to explode split chunks to separate rows , by default `False`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to explode split chunks to separate rows
+ """
+ return self._set(explodeSplits=value)
+
+ def setTrimWhitespace(self, value):
+ """Sets whether to trim whitespaces of extracted chunks , by default `True`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to trim whitespaces of extracted chunks
+ """
+ return self._set(trimWhitespace=value)
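The hunk above adds DocumentCharacterTextSplitter. Below is a minimal usage sketch (not part of the diff), mirroring the docstring example but with smaller chunks; note from the setters above that the value passed to setChunkOverlap must not exceed the chunk size, otherwise a ValueError is raised. The column names are illustrative.

    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import *

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

    # setChunkSize comes first; setChunkOverlap validates against it and must stay smaller.
    textSplitter = (
        DocumentCharacterTextSplitter()
        .setInputCols(["document"])
        .setOutputCol("splits")
        .setChunkSize(1000)
        .setChunkOverlap(100)
        .setExplodeSplits(True)
    )

    pipeline = Pipeline().setStages([documentAssembler, textSplitter])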
sparknlp/annotator/document_normalizer.py
@@ -23,7 +23,8 @@ class DocumentNormalizer(AnnotatorModel):
  patterns. Can apply not wanted character removal with a specific policy.
  Can apply lower case normalization.

- For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
+ For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb
+ >`__.

  ====================== ======================
  Input Annotation types Output Annotation type
@@ -121,6 +122,21 @@ class DocumentNormalizer(AnnotatorModel):
  "file encoding to apply on normalized documents",
  typeConverter=TypeConverters.toString)

+ presetPattern = Param(
+ Params._dummy(),
+ "presetPattern",
+ "Selects a single text cleaning function from the functional presets (e.g., 'CLEAN_BULLETS', 'CLEAN_DASHES', etc.).",
+ typeConverter=TypeConverters.toString
+ )
+
+ autoMode = Param(
+ Params._dummy(),
+ "autoMode",
+ "Enables a predefined cleaning mode combining multiple text cleaner functions (e.g., 'light_clean', 'document_clean', 'html_clean', 'full_auto').",
+ typeConverter=TypeConverters.toString
+ )
+
+
  @keyword_only
  def __init__(self):
  super(DocumentNormalizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.DocumentNormalizer")
@@ -197,3 +213,23 @@ class DocumentNormalizer(AnnotatorModel):
  """
  return self._set(encoding=value)

+ def setPresetPattern(self, value):
+ """Sets a single text cleaning preset pattern.
+
+ Parameters
+ ----------
+ value : str
+ Preset cleaning pattern name, e.g., 'CLEAN_BULLETS', 'CLEAN_DASHES'.
+ """
+ return self._set(presetPattern=value)
+
+
+ def setAutoMode(self, value):
+ """Sets an automatic text cleaning mode using predefined groups of cleaning functions.
+
+ Parameters
+ ----------
+ value : str
+ Auto cleaning mode, e.g., 'light_clean', 'document_clean', 'social_clean', 'html_clean', 'full_auto'.
+ """
+ return self._set(autoMode=value)
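The hunks above add presetPattern and autoMode to DocumentNormalizer. The sketch below (not part of the diff) shows how the two new setters might be used; the preset and mode names are taken from the parameter descriptions above, and whether the two settings can be combined in a single stage is not shown in the diff, so each normalizer here uses only one of them.

    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import DocumentNormalizer

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

    # Single functional preset (names per the presetPattern description above).
    presetNormalizer = (
        DocumentNormalizer()
        .setInputCols(["document"])
        .setOutputCol("normalized")
        .setPresetPattern("CLEAN_BULLETS")
    )

    # Predefined combination of cleaners (names per the autoMode description above).
    autoNormalizer = (
        DocumentNormalizer()
        .setInputCols(["document"])
        .setOutputCol("normalized")
        .setAutoMode("document_clean")
    )

    pipeline = Pipeline().setStages([documentAssembler, autoNormalizer])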
sparknlp/annotator/document_token_splitter.py
@@ -0,0 +1,175 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the DocumentNormalizer"""
+ from sparknlp.common import *
+
+
+ class DocumentTokenSplitter(AnnotatorModel):
+ """Annotator that splits large documents into smaller documents based on the number of tokens in
+ the text.
+
+ Currently, DocumentTokenSplitter splits the text by whitespaces to create the tokens. The
+ number of these tokens will then be used as a measure of the text length. In the future, other
+ tokenization techniques will be supported.
+
+ For example, given 3 tokens and overlap 1:
+
+ .. code-block:: python
+
+ He was, I take it, the most perfect reasoning and observing machine that the world has seen.
+
+ ["He was, I", "I take it,", "it, the most", "most perfect reasoning", "reasoning and observing", "observing machine that", "that the world", "world has seen."]
+
+
+ Additionally, you can set
+
+ - whether to trim whitespaces with setTrimWhitespace
+ - whether to explode the splits to individual rows with setExplodeSplits
+
+ For extended examples of usage, see the
+ `DocumentTokenSplitterTest <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala>`__.
+
+ ====================== ======================
+ Input Annotation types Output Annotation type
+ ====================== ======================
+ ``DOCUMENT`` ``DOCUMENT``
+ ====================== ======================
+
+ Parameters
+ ----------
+
+ numTokens
+ Limit of the number of tokens in a text
+ tokenOverlap
+ Length of the token overlap between text chunks, by default `0`.
+ explodeSplits
+ Whether to explode split chunks to separate rows, by default `False`.
+ trimWhitespace
+ Whether to trim whitespaces of extracted chunks, by default `True`.
+
+ Examples
+ --------
+ >>> import sparknlp
+ >>> from sparknlp.base import *
+ >>> from sparknlp.annotator import *
+ >>> from pyspark.ml import Pipeline
+ >>> textDF = spark.read.text(
+ ... "sherlockholmes.txt",
+ ... wholetext=True
+ ... ).toDF("text")
+ >>> documentAssembler = DocumentAssembler().setInputCol("text")
+ >>> textSplitter = DocumentTokenSplitter() \\
+ ... .setInputCols(["document"]) \\
+ ... .setOutputCol("splits") \\
+ ... .setNumTokens(512) \\
+ ... .setTokenOverlap(10) \\
+ ... .setExplodeSplits(True)
+ >>> pipeline = Pipeline().setStages([documentAssembler, textSplitter])
+ >>> result = pipeline.fit(textDF).transform(textDF)
+ >>> result.selectExpr(
+ ... "splits.result as result",
+ ... "splits[0].begin as begin",
+ ... "splits[0].end as end",
+ ... "splits[0].end - splits[0].begin as length",
+ ... "splits[0].metadata.numTokens as tokens") \\
+ ... .show(8, truncate = 80)
+ +--------------------------------------------------------------------------------+-----+-----+------+------+
+ | result|begin| end|length|tokens|
+ +--------------------------------------------------------------------------------+-----+-----+------+------+
+ |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 3018| 3018| 512|
+ |[study of crime, and occupied his\\nimmense faculties and extraordinary powers...| 2950| 5707| 2757| 512|
+ |[but as I have changed my clothes I can't imagine how you\\ndeduce it. As to M...| 5659| 8483| 2824| 512|
+ |[quarters received. Be in your chamber then at that hour, and do\\nnot take it...| 8427|11241| 2814| 512|
+ |[a pity\\nto miss it."\\n\\n"But your client--"\\n\\n"Never mind him. I may want y...|11188|13970| 2782| 512|
+ |[person who employs me wishes his agent to be unknown to\\nyou, and I may conf...|13918|16898| 2980| 512|
+ |[letters back."\\n\\n"Precisely so. But how--"\\n\\n"Was there a secret marriage?...|16836|19744| 2908| 512|
+ |[seven hundred in\\nnotes," he said.\\n\\nHolmes scribbled a receipt upon a shee...|19683|22551| 2868| 512|
+ +--------------------------------------------------------------------------------+-----+-----+------+------+
+
+ """
+
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+ outputAnnotatorType = AnnotatorType.DOCUMENT
+
+ numTokens = Param(Params._dummy(),
+ "numTokens",
+ "Limit of the number of tokens in a text",
+ typeConverter=TypeConverters.toInt)
+ tokenOverlap = Param(Params._dummy(),
+ "tokenOverlap",
+ "Length of the token overlap between text chunks",
+ typeConverter=TypeConverters.toInt)
+ explodeSplits = Param(Params._dummy(),
+ "explodeSplits",
+ "Whether to explode split chunks to separate rows",
+ typeConverter=TypeConverters.toBoolean)
+ trimWhitespace = Param(Params._dummy(),
+ "trimWhitespace",
+ "Whether to trim whitespaces of extracted chunks",
+ typeConverter=TypeConverters.toBoolean)
+
+ @keyword_only
+ def __init__(self):
+ super(DocumentTokenSplitter, self).__init__(
+ classname="com.johnsnowlabs.nlp.annotators.DocumentTokenSplitter")
+ self._setDefault(
+ tokenOverlap=0,
+ explodeSplits=False,
+ trimWhitespace=True
+ )
+
+ def setNumTokens(self, value):
+ """Sets the limit of the number of tokens in a text
+
+ Parameters
+ ----------
+ value : int
+ Number of tokens in a text
+ """
+ if value < 1:
+ raise ValueError("Number of tokens should be larger than 0.")
+ return self._set(numTokens=value)
+
+ def setTokenOverlap(self, value):
+ """Length of the token overlap between text chunks, by default `0`.
+
+ Parameters
+ ----------
+ value : int
+ Length of the token overlap between text chunks
+ """
+ if value > self.getOrDefault(self.numTokens):
+ raise ValueError("Token overlap can't be larger than number of tokens.")
+ return self._set(tokenOverlap=value)
+
+ def setExplodeSplits(self, value):
+ """Sets whether to explode split chunks to separate rows, by default `False`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to explode split chunks to separate rows
+ """
+ return self._set(explodeSplits=value)
+
+ def setTrimWhitespace(self, value):
+ """Sets whether to trim whitespaces of extracted chunks, by default `True`.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to trim whitespaces of extracted chunks
+ """
+ return self._set(trimWhitespace=value)
sparknlp/annotator/document_token_splitter_test.py
@@ -0,0 +1,85 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import unittest
+
+ import pytest
+
+ from sparknlp.annotator import *
+ from sparknlp.base import *
+ from test.util import SparkSessionForTest
+
+
+ @pytest.mark.fast
+ class DocumentTokenSplitterTestSpec(unittest.TestCase):
+ def setUp(self):
+ self.data = SparkSessionForTest.spark.createDataFrame(
+ [
+ [
+ (
+ "All emotions, and that\none particularly, were abhorrent to his cold, precise"
+ " but\nadmirably balanced mind.\n\nHe was, I take it, the most perfect\nreasoning"
+ " and observing machine that the world has seen."
+ )
+ ]
+ ]
+ ).toDF("text")
+
+ def test_run(self):
+ df = self.data
+
+ document_assembler = (
+ DocumentAssembler().setInputCol("text").setOutputCol("document")
+ )
+
+ document_token_splitter = (
+ DocumentTokenSplitter()
+ .setInputCols("document")
+ .setOutputCol("splits")
+ .setNumTokens(3)
+ .setTokenOverlap(1)
+ .setExplodeSplits(True)
+ .setTrimWhitespace(True)
+ )
+
+ pipeline = Pipeline().setStages([document_assembler, document_token_splitter])
+
+ pipeline_df = pipeline.fit(df).transform(df)
+
+ results = pipeline_df.select("splits").collect()
+
+ splits = [
+ row["splits"][0].result.replace("\n\n", " ").replace("\n", " ")
+ for row in results
+ ]
+
+ expected = [
+ "All emotions, and",
+ "and that one",
+ "one particularly, were",
+ "were abhorrent to",
+ "to his cold,",
+ "cold, precise but",
+ "but admirably balanced",
+ "balanced mind. He",
+ "He was, I",
+ "I take it,",
+ "it, the most",
+ "most perfect reasoning",
+ "reasoning and observing",
+ "observing machine that",
+ "that the world",
+ "world has seen.",
+ ]
+
+ assert splits == expected
sparknlp/annotator/embeddings/__init__.py
@@ -22,7 +22,11 @@ from sparknlp.annotator.embeddings.deberta_embeddings import *
  from sparknlp.annotator.embeddings.distil_bert_embeddings import *
  from sparknlp.annotator.embeddings.doc2vec import *
  from sparknlp.annotator.embeddings.elmo_embeddings import *
+ from sparknlp.annotator.embeddings.e5_embeddings import *
+ from sparknlp.annotator.embeddings.instructor_embeddings import *
  from sparknlp.annotator.embeddings.longformer_embeddings import *
+ from sparknlp.annotator.embeddings.minilm_embeddings import *
+ from sparknlp.annotator.embeddings.mpnet_embeddings import *
  from sparknlp.annotator.embeddings.roberta_embeddings import *
  from sparknlp.annotator.embeddings.roberta_sentence_embeddings import *
  from sparknlp.annotator.embeddings.sentence_embeddings import *
@@ -32,3 +36,10 @@ from sparknlp.annotator.embeddings.word_embeddings import *
  from sparknlp.annotator.embeddings.xlm_roberta_embeddings import *
  from sparknlp.annotator.embeddings.xlm_roberta_sentence_embeddings import *
  from sparknlp.annotator.embeddings.xlnet_embeddings import *
+ from sparknlp.annotator.embeddings.bge_embeddings import *
+ from sparknlp.annotator.embeddings.uae_embeddings import *
+ from sparknlp.annotator.embeddings.mxbai_embeddings import *
+ from sparknlp.annotator.embeddings.snowflake_embeddings import *
+ from sparknlp.annotator.embeddings.nomic_embeddings import *
+ from sparknlp.annotator.embeddings.auto_gguf_embeddings import *
+ from sparknlp.annotator.embeddings.e5v_embeddings import *
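With the wildcard imports above, the newer embedding annotators (E5, Instructor, MiniLM, MPNet, BGE, UAE, MxBAI, Snowflake, Nomic, AutoGGUF, E5V) resolve directly from sparknlp.annotator. A minimal sketch (not part of the diff), assuming E5Embeddings follows the usual pretrained() pattern of the other annotators; the model name "e5_small" is a placeholder and should be checked against the Models Hub.

    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import E5Embeddings

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

    # "e5_small" is a placeholder model name; the annotator produces sentence
    # embeddings for each input document.
    embeddings = (
        E5Embeddings.pretrained("e5_small", "en")
        .setInputCols(["document"])
        .setOutputCol("e5_embeddings")
    )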
sparknlp/annotator/embeddings/albert_embeddings.py
@@ -21,7 +21,8 @@ class AlbertEmbeddings(AnnotatorModel,
  HasCaseSensitiveProperties,
  HasStorageRef,
  HasBatchedAnnotate,
- HasEngine):
+ HasEngine,
+ HasMaxSentenceLengthLimit):
  """ALBERT: A Lite Bert For Self-Supervised Learning Of Language
  Representations - Google Research, Toyota Technological Institute at Chicago

@@ -53,8 +54,8 @@ class AlbertEmbeddings(AnnotatorModel,

  The default model is ``"albert_base_uncased"``, if no name is provided.

- For extended examples of usage, see the `Spark NLP Workshop
- <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_albert.ipynb>`__.
+ For extended examples of usage, see the `Examples
+ <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_albert.ipynb>`__.
  To see which models are compatible and how to import them see
  `Import Transformers into Spark NLP 🚀
  <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
@@ -163,11 +164,6 @@ class AlbertEmbeddings(AnnotatorModel,
  "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
  TypeConverters.toListInt)

- maxSentenceLength = Param(Params._dummy(),
- "maxSentenceLength",
- "Max sentence length to process",
- typeConverter=TypeConverters.toInt)
-
  def setConfigProtoBytes(self, b):
  """Sets configProto from tensorflow, serialized into byte array.

@@ -178,16 +174,6 @@ class AlbertEmbeddings(AnnotatorModel,
  """
  return self._set(configProtoBytes=b)

- def setMaxSentenceLength(self, value):
- """Sets max sentence length to process.
-
- Parameters
- ----------
- value : int
- Max sentence length to process
- """
- return self._set(maxSentenceLength=value)
-
  @keyword_only
  def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.AlbertEmbeddings", java_model=None):
  super(AlbertEmbeddings, self).__init__(
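The removals above do not drop functionality: AlbertEmbeddings now mixes in HasMaxSentenceLengthLimit (see the first hunk of this file), which is expected to supply the maxSentenceLength parameter and its setter with an enforced upper bound. A sketch (not part of the diff) of the unchanged call site, using the default model name quoted in the docstring above:

    from sparknlp.annotator import AlbertEmbeddings

    # setMaxSentenceLength is now inherited from the HasMaxSentenceLengthLimit mixin
    # rather than being defined on AlbertEmbeddings itself; the call looks the same.
    embeddings = (
        AlbertEmbeddings.pretrained("albert_base_uncased", "en")
        .setInputCols(["document", "token"])
        .setOutputCol("embeddings")
        .setMaxSentenceLength(512)
    )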