spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +81 -28
- sparknlp/annotation.py +3 -2
- sparknlp/annotator/__init__.py +6 -0
- sparknlp/annotator/audio/__init__.py +2 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/{base → annotator}/chunk2_doc.py +4 -7
- sparknlp/annotator/chunker.py +1 -2
- sparknlp/annotator/classifier_dl/__init__.py +17 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/spanbert_coref.py +4 -18
- sparknlp/annotator/cv/__init__.py +15 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/dependency_parser.py +2 -3
- sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +37 -1
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +11 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
- sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
- sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
- sparknlp/annotator/embeddings/doc2vec.py +7 -1
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
- sparknlp/annotator/embeddings/word2vec.py +7 -1
- sparknlp/annotator/embeddings/word_embeddings.py +4 -5
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
- sparknlp/annotator/er/entity_ruler.py +37 -23
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
- sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
- sparknlp/annotator/lemmatizer.py +3 -4
- sparknlp/annotator/matcher/date_matcher.py +35 -3
- sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
- sparknlp/annotator/matcher/regex_matcher.py +3 -3
- sparknlp/annotator/matcher/text_matcher.py +2 -3
- sparknlp/annotator/n_gram_generator.py +1 -2
- sparknlp/annotator/ner/__init__.py +3 -1
- sparknlp/annotator/ner/ner_converter.py +18 -0
- sparknlp/annotator/ner/ner_crf.py +4 -5
- sparknlp/annotator/ner/ner_dl.py +10 -5
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +2 -2
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +2 -2
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/pos/perceptron.py +6 -7
- sparknlp/annotator/sentence/sentence_detector.py +2 -2
- sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
- sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
- sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
- sparknlp/annotator/seq2seq/__init__.py +17 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
- sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
- sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
- sparknlp/annotator/stemmer.py +2 -3
- sparknlp/annotator/stop_words_cleaner.py +3 -4
- sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
- sparknlp/annotator/token/__init__.py +0 -1
- sparknlp/annotator/token/recursive_tokenizer.py +2 -3
- sparknlp/annotator/token/tokenizer.py +2 -3
- sparknlp/annotator/ws/word_segmenter.py +35 -10
- sparknlp/base/__init__.py +2 -3
- sparknlp/base/doc2_chunk.py +0 -3
- sparknlp/base/document_assembler.py +5 -5
- sparknlp/base/embeddings_finisher.py +14 -2
- sparknlp/base/finisher.py +15 -4
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/image_assembler.py +69 -0
- sparknlp/base/light_pipeline.py +53 -21
- sparknlp/base/multi_document_assembler.py +9 -13
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/token_assembler.py +1 -2
- sparknlp/common/__init__.py +2 -0
- sparknlp/common/annotator_type.py +1 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +914 -9
- sparknlp/internal/__init__.py +841 -116
- sparknlp/internal/annotator_java_ml.py +1 -1
- sparknlp/internal/annotator_transformer.py +3 -0
- sparknlp/logging/comet.py +2 -2
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/pretrained_pipeline.py +1 -1
- sparknlp/pretrained/resource_downloader.py +126 -2
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +1 -0
- sparknlp/training/conll.py +8 -2
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/util.py +26 -0
- spark_nlp-4.2.6.dist-info/METADATA +0 -1256
- spark_nlp-4.2.6.dist-info/RECORD +0 -196
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
- /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/annotator/spell_check/norvig_sweeting.py CHANGED
@@ -29,9 +29,6 @@ class NorvigSweetingApproach(AnnotatorApproach):
 
     For instantiated/pretrained models, see :class:`.NorvigSweetingModel`.
 
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.
-
     ====================== ======================
     Input Annotation types Output Annotation type
     ====================== ======================
@@ -270,11 +267,11 @@ class NorvigSweetingModel(AnnotatorModel):
 
     The default model is ``"spellcheck_norvig"``, if no name is provided. For
     available pretrained models please see the `Models Hub
-    <https://
+    <https://sparknlp.org/models?task=Spell+Check>`__.
 
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
sparknlp/annotator/spell_check/symmetric_delete.py CHANGED
@@ -212,7 +212,7 @@ class SymmetricDeleteModel(AnnotatorModel):
 
     The default model is ``"spellcheck_sd"``, if no name is provided. For
     available pretrained models please see the `Models Hub
-    <https://
+    <https://sparknlp.org/models?task=Spell+Check>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
sparknlp/annotator/stemmer.py CHANGED
@@ -19,8 +19,8 @@ class Stemmer(AnnotatorModel):
     """Returns hard-stems out of words with the objective of retrieving the
     meaningful part of the word.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -77,4 +77,3 @@ class Stemmer(AnnotatorModel):
         self._setDefault(
             language="english"
         )
-
sparknlp/annotator/stop_words_cleaner.py CHANGED
@@ -34,10 +34,10 @@ class StopWordsCleaner(AnnotatorModel):
     This will load the default pretrained model ``"stopwords_en"``.
 
     For available pretrained models please see the `Models Hub
-    <https://
+    <https://sparknlp.org/models?task=Stop+Words+Removal>`__.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -188,4 +188,3 @@ class StopWordsCleaner(AnnotatorModel):
         """
         from sparknlp.pretrained import ResourceDownloader
         return ResourceDownloader.downloadModel(StopWordsCleaner, name, lang, remote_loc)
-
sparknlp/annotator/token/__init__.py CHANGED
@@ -16,5 +16,4 @@
 from sparknlp.annotator.token.chunk_tokenizer import *
 from sparknlp.annotator.token.recursive_tokenizer import *
 from sparknlp.annotator.token.regex_tokenizer import *
-from sparknlp.annotator.token.token2_chunk import *
 from sparknlp.annotator.token.tokenizer import *
sparknlp/annotator/token/recursive_tokenizer.py CHANGED
@@ -28,8 +28,8 @@ class RecursiveTokenizer(AnnotatorApproach):
     - ``infixes``: Strings that will be split when found at the middle of token.
     - ``whitelist``: Whitelist of strings not to split
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -203,4 +203,3 @@ class RecursiveTokenizerModel(AnnotatorModel):
             classname=classname,
             java_model=java_model
         )
-
sparknlp/annotator/token/tokenizer.py CHANGED
@@ -27,8 +27,8 @@ class Tokenizer(AnnotatorApproach):
     Identifies tokens with tokenization open standards. A few rules will help
     customizing it if defaults do not fit user needs.
 
-    For extended examples of usage see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -559,4 +559,3 @@ class TokenizerModel(AnnotatorModel):
         """
         from sparknlp.pretrained import ResourceDownloader
         return ResourceDownloader.downloadModel(TokenizerModel, name, lang, remote_loc)
-
sparknlp/annotator/ws/word_segmenter.py CHANGED
@@ -20,11 +20,27 @@ class WordSegmenterApproach(AnnotatorApproach):
     """Trains a WordSegmenter which tokenizes non-english or non-whitespace
     separated texts.
 
-    Many languages are not whitespace separated and their sentences are a
-
-
-
-
+    Many languages are not whitespace separated and their sentences are a concatenation
+    of many symbols, like Korean, Japanese or Chinese. Without understanding the
+    language, splitting the words into their corresponding tokens is impossible. The
+    WordSegmenter is trained to understand these languages and split them into
+    semantically correct parts.
+
+    This annotator is based on the paper Chinese Word Segmentation as Character Tagging
+    [1]. Word segmentation is treated as a tagging problem. Each character is tagged
+    as one of four different labels: LL (left boundary), RR (right boundary), MM (middle)
+    and LR (word by itself). The label depends on the position of the word in the
+    sentence. LL tagged words will combine with the word on the right. Likewise, RR
+    tagged words combine with words on the left. MM tagged words are treated as the
+    middle of the word and combine with either side. LR tagged words are words by
+    themselves.
+
+    Example (from [1], Example 3(a) (raw), 3(b) (tagged), 3(c) (translation)):
+    - 上海 计划 到 本 世纪 末 实现 人均 国内 生产 总值 五千 美元
+    - 上/LL 海/RR 计/LL 划/RR 到/LR 本/LR 世/LL 纪/RR 末/LR 实/LL 现/RR 人/LL 均/RR
+      国/LL 内/RR 生/LL 产/RR 总/LL 值/RR 五/LL 千/RR 美/LL 元/RR
+    - Shanghai plans to reach the goal of 5,000 dollars in per capita GDP by the end
+      of the century.
 
     For instantiated/pretrained models, see :class:`.WordSegmenterModel`.
 
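The LL/RR/MM/LR scheme described in the new docstring is easy to sanity-check outside Spark NLP. Below is a minimal, hypothetical Python sketch (not part of the package; the helper name `decode_tags` is made up for illustration) of how such character tags combine into words:

```python
def decode_tags(chars, tags):
    """Combine characters into words according to LL/RR/MM/LR tags:
    LL opens a word, MM continues it, RR closes it, LR stands alone."""
    words, current = [], ""
    for ch, tag in zip(chars, tags):
        if tag == "LR":        # a word by itself
            if current:        # flush any unterminated word first
                words.append(current)
                current = ""
            words.append(ch)
        elif tag == "RR":      # right boundary closes the current word
            words.append(current + ch)
            current = ""
        else:                  # LL or MM extends the current word
            current += ch
    if current:
        words.append(current)
    return words

# First tokens of the docstring example: 上海 (Shanghai), 计划 (plans), 到 (to)
print(decode_tags(list("上海计划到"), ["LL", "RR", "LL", "RR", "LR"]))
# -> ['上海', '计划', '到']
```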
@@ -37,8 +53,17 @@ class WordSegmenterApproach(AnnotatorApproach):
     The helper class :class:`.POS` might be useful to read training data into
     data frames.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/chinese/word_segmentation>`__.
+
+    References
+    ----------
+
+    `[1] <https://aclanthology.org/O03-4002.pdf>`__ Xue, Nianwen. “Chinese Word
+    Segmentation as Character Tagging.” International Journal of Computational
+    Linguistics & Chinese Language Processing, Volume 8, Number 1, February 2003:
+    Special Issue on Word Formation and Chinese Language Processing, 2003, pp. 29-48.
+    ACLWeb, https://aclanthology.org/O03-4002.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -282,10 +307,10 @@ class WordSegmenterModel(AnnotatorModel):
 
     The default model is ``"wordseg_pku"``, default language is ``"zh"``, if no
     values are provided. For available pretrained models please see the `Models
-    Hub <https://
+    Hub <https://sparknlp.org/models?task=Word+Segmentation>`__.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/jupyter/annotation/chinese/word_segmentation/words_segmenter_demo.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
sparknlp/base/__init__.py CHANGED
@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module of base Spark NLP annotators."""
-
-from sparknlp.base.chunk2_doc import *
 from sparknlp.base.doc2_chunk import *
 from sparknlp.base.document_assembler import *
 from sparknlp.base.multi_document_assembler import *
 from sparknlp.base.embeddings_finisher import *
 from sparknlp.base.finisher import *
+from sparknlp.base.gguf_ranking_finisher import *
 from sparknlp.base.graph_finisher import *
 from sparknlp.base.has_recursive_fit import *
 from sparknlp.base.has_recursive_transform import *
@@ -28,4 +27,4 @@ from sparknlp.base.token_assembler import *
 from sparknlp.base.image_assembler import *
 from sparknlp.base.audio_assembler import *
 from sparknlp.base.table_assembler import *
-
+from sparknlp.base.prompt_assembler import *
sparknlp/base/doc2_chunk.py CHANGED
@@ -29,9 +29,6 @@ class Doc2Chunk(AnnotatorTransformer, AnnotatorProperties):
     ``StringType`` or ``ArrayType[StringType]`` (using setIsArray). Useful for
     annotators that require a CHUNK type input.
 
-    For more extended examples on document pre-processing see the
-    `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
-
     ====================== ======================
     Input Annotation types Output Annotation type
     ====================== ======================
sparknlp/base/document_assembler.py CHANGED
@@ -24,13 +24,13 @@ class DocumentAssembler(AnnotatorTransformer):
     """Prepares data into a format that is processable by Spark NLP.
 
     This is the entry point for every Spark NLP pipeline. The
-    `DocumentAssembler`
-
-
-
+    `DocumentAssembler` reads ``String`` columns. Additionally,
+    :meth:`.setCleanupMode` can be used to pre-process the
+    text (Default: ``disabled``). For possible options please refer to the
+    parameters section.
 
     For more extended examples on document pre-processing see the
-    `
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
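As a usage note for the docstring above, a minimal entry-point sketch using the ``setCleanupMode`` option it mentions might look like this (``shrink``, one of the documented modes, collapses repeated whitespace; the sample data is made up):

```python
import sparknlp
from sparknlp.base import DocumentAssembler

spark = sparknlp.start()

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document") \
    .setCleanupMode("shrink")  # collapse repeated whitespace and newlines

data = spark.createDataFrame([["Spark  NLP   is  an  open-source  library."]]).toDF("text")
documentAssembler.transform(data).select("document.result").show(truncate=False)
```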
sparknlp/base/embeddings_finisher.py CHANGED
@@ -34,7 +34,8 @@ class EmbeddingsFinisher(AnnotatorTransformer):
     require a ``featureCol``.
 
     For more extended examples see the
-    `
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb
+    >`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -127,7 +128,8 @@ class EmbeddingsFinisher(AnnotatorTransformer):
         super(EmbeddingsFinisher, self).__init__(classname="com.johnsnowlabs.nlp.EmbeddingsFinisher")
         self._setDefault(
             cleanAnnotations=False,
-            outputAsVector=False
+            outputAsVector=False,
+            outputCols=[]
         )
 
     @keyword_only
@@ -187,3 +189,13 @@ class EmbeddingsFinisher(AnnotatorTransformer):
 
         return self._set(outputAsVector=value)
 
+    def getInputCols(self):
+        """Gets input columns name of annotations."""
+        return self.getOrDefault(self.inputCols)
+
+    def getOutputCols(self):
+        """Gets output columns name of annotations."""
+        if len(self.getOrDefault(self.outputCols)) == 0:
+            return ["finished_" + input_col for input_col in self.getInputCols()]
+        else:
+            return self.getOrDefault(self.outputCols)
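The new ``outputCols=[]`` default together with ``getOutputCols`` means output names are derived from the inputs when none are set. A small sketch of that behavior (column names here are arbitrary):

```python
import sparknlp
from sparknlp.base import EmbeddingsFinisher

spark = sparknlp.start()

finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputAsVector(True)

# No outputCols set: names fall back to "finished_" + each input column
print(finisher.getOutputCols())  # ['finished_sentence_embeddings']

# Explicit outputCols take precedence over the derived defaults
finisher.setOutputCols(["embeddings_vector"])
print(finisher.getOutputCols())  # ['embeddings_vector']
```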
sparknlp/base/finisher.py CHANGED
@@ -25,7 +25,8 @@ class Finisher(AnnotatorTransformer):
     outputs annotation(s) values into ``String``.
 
     For more extended examples on document pre-processing see the
-    `
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb
+    >`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -97,7 +98,6 @@ class Finisher(AnnotatorTransformer):
     includeMetadata = Param(Params._dummy(), "includeMetadata", "annotation metadata format", typeConverter=TypeConverters.toBoolean)
     outputAsArray = Param(Params._dummy(), "outputAsArray", "finisher generates an Array with the results instead of string", typeConverter=TypeConverters.toBoolean)
     parseEmbeddingsVectors = Param(Params._dummy(), "parseEmbeddingsVectors", "whether to include embeddings vectors in the process", typeConverter=TypeConverters.toBoolean)
-
     name = "Finisher"
 
     @keyword_only
@@ -109,7 +109,8 @@ class Finisher(AnnotatorTransformer):
             outputAsArray=True,
             parseEmbeddingsVectors=False,
             valueSplitSymbol="#",
-            annotationSplitSymbol="@"
+            annotationSplitSymbol="@",
+            outputCols=[]
         )
 
     @keyword_only
@@ -122,7 +123,7 @@ class Finisher(AnnotatorTransformer):
 
         Parameters
         ----------
-        *value : str
+        *value : List[str]
             Input columns for the annotator
         """
         if len(value) == 1 and type(value[0]) == list:
@@ -204,3 +205,13 @@ class Finisher(AnnotatorTransformer):
         """
         return self._set(parseEmbeddingsVectors=value)
 
+    def getInputCols(self):
+        """Gets input columns name of annotations."""
+        return self.getOrDefault(self.inputCols)
+
+    def getOutputCols(self):
+        """Gets output columns name of annotations."""
+        if len(self.getOrDefault(self.outputCols)) == 0:
+            return ["finished_" + input_col for input_col in self.getInputCols()]
+        else:
+            return self.getOrDefault(self.outputCols)
sparknlp/base/gguf_ranking_finisher.py ADDED
@@ -0,0 +1,234 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the GGUFRankingFinisher."""
+
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+from sparknlp.internal import AnnotatorTransformer
+
+
+class GGUFRankingFinisher(AnnotatorTransformer):
+    """Finisher for AutoGGUFReranker outputs that provides ranking capabilities
+    including top-k selection, sorting by relevance score, and score normalization.
+
+    This finisher processes the output of AutoGGUFReranker, which contains documents with
+    relevance scores in their metadata. It provides several options for post-processing:
+
+    - Top-k selection: Select only the top k documents by relevance score
+    - Score thresholding: Filter documents by minimum relevance score
+    - Min-max scaling: Normalize relevance scores to 0-1 range
+    - Sorting: Sort documents by relevance score in descending order
+    - Ranking: Add rank information to document metadata
+
+    The finisher preserves the document annotation structure while adding ranking information
+    to the metadata and optionally filtering/sorting the documents.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/finisher/gguf_ranking_finisher_example.py>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    inputCols
+        Name of input annotation columns containing reranked documents
+    outputCol
+        Name of output annotation column containing ranked documents, by default "ranked_documents"
+    topK
+        Maximum number of top documents to return based on relevance score (-1 for no limit), by default -1
+    minRelevanceScore
+        Minimum relevance score threshold for filtering documents, by default Double.MinValue
+    minMaxScaling
+        Whether to apply min-max scaling to normalize relevance scores to 0-1 range, by default False
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> reranker = AutoGGUFReranker.pretrained() \\
+    ...     .setInputCols("document") \\
+    ...     .setOutputCol("reranked_documents") \\
+    ...     .setQuery("A man is eating pasta.")
+    >>> finisher = GGUFRankingFinisher() \\
+    ...     .setInputCols("reranked_documents") \\
+    ...     .setOutputCol("ranked_documents") \\
+    ...     .setTopK(3) \\
+    ...     .setMinMaxScaling(True)
+    >>> pipeline = Pipeline().setStages([documentAssembler, reranker, finisher])
+    >>> data = spark.createDataFrame([
+    ...     ("A man is eating food.",),
+    ...     ("A man is eating a piece of bread.",),
+    ...     ("The girl is carrying a baby.",),
+    ...     ("A man is riding a horse.",)
+    ... ], ["text"])
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("ranked_documents").show(truncate=False)
+    # Documents will be sorted by relevance with rank information in metadata
+    """
+
+    name = "GGUFRankingFinisher"
+
+    inputCols = Param(Params._dummy(),
+                      "inputCols",
+                      "Name of input annotation columns containing reranked documents",
+                      typeConverter=TypeConverters.toListString)
+
+    outputCol = Param(Params._dummy(),
+                      "outputCol",
+                      "Name of output annotation column containing ranked documents",
+                      typeConverter=TypeConverters.toListString)
+
+    topK = Param(Params._dummy(),
+                 "topK",
+                 "Maximum number of top documents to return based on relevance score (-1 for no limit)",
+                 typeConverter=TypeConverters.toInt)
+
+    minRelevanceScore = Param(Params._dummy(),
+                              "minRelevanceScore",
+                              "Minimum relevance score threshold for filtering documents",
+                              typeConverter=TypeConverters.toFloat)
+
+    minMaxScaling = Param(Params._dummy(),
+                          "minMaxScaling",
+                          "Whether to apply min-max scaling to normalize relevance scores to 0-1 range",
+                          typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self):
+        super(GGUFRankingFinisher, self).__init__(
+            classname="com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher")
+        self._setDefault(
+            topK=-1,
+            minRelevanceScore=float('-inf'),  # Equivalent to Double.MinValue
+            minMaxScaling=False,
+            outputCol=["ranked_documents"]
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setInputCols(self, *value):
+        """Sets input annotation column names.
+
+        Parameters
+        ----------
+        value : List[str]
+            Input annotation column names containing reranked documents
+        """
+        if len(value) == 1 and isinstance(value[0], list):
+            return self._set(inputCols=value[0])
+        else:
+            return self._set(inputCols=list(value))
+
+    def getInputCols(self):
+        """Gets input annotation column names.
+
+        Returns
+        -------
+        List[str]
+            Input annotation column names
+        """
+        return self.getOrDefault(self.inputCols)
+
+    def setOutputCol(self, value):
+        """Sets output annotation column name.
+
+        Parameters
+        ----------
+        value : str
+            Output annotation column name
+        """
+        return self._set(outputCol=[value])
+
+    def getOutputCol(self):
+        """Gets output annotation column name.
+
+        Returns
+        -------
+        str
+            Output annotation column name
+        """
+        output_cols = self.getOrDefault(self.outputCol)
+        return output_cols[0] if output_cols else "ranked_documents"
+
+    def setTopK(self, value):
+        """Sets maximum number of top documents to return.
+
+        Parameters
+        ----------
+        value : int
+            Maximum number of top documents to return (-1 for no limit)
+        """
+        return self._set(topK=value)
+
+    def getTopK(self):
+        """Gets maximum number of top documents to return.
+
+        Returns
+        -------
+        int
+            Maximum number of top documents to return
+        """
+        return self.getOrDefault(self.topK)
+
+    def setMinRelevanceScore(self, value):
+        """Sets minimum relevance score threshold.
+
+        Parameters
+        ----------
+        value : float
+            Minimum relevance score threshold
+        """
+        return self._set(minRelevanceScore=value)
+
+    def getMinRelevanceScore(self):
+        """Gets minimum relevance score threshold.
+
+        Returns
+        -------
+        float
+            Minimum relevance score threshold
+        """
+        return self.getOrDefault(self.minRelevanceScore)
+
+    def setMinMaxScaling(self, value):
+        """Sets whether to apply min-max scaling.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to apply min-max scaling to normalize scores
+        """
+        return self._set(minMaxScaling=value)
+
+    def getMinMaxScaling(self):
+        """Gets whether to apply min-max scaling.
+
+        Returns
+        -------
+        bool
+            Whether min-max scaling is enabled
+        """
+        return self.getOrDefault(self.minMaxScaling)
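A note on the ``minMaxScaling`` option in the new file above: min-max normalization maps each relevance score into the 0-1 range via ``(s - min) / (max - min)``. A plain-Python sketch of that arithmetic (the tie case where all scores are equal is handled here by returning 1.0, which is an assumption, not necessarily the finisher's exact behavior):

```python
def min_max_scale(scores):
    """Normalize scores to [0, 1]: (s - min) / (max - min)."""
    lo, hi = min(scores), max(scores)
    if hi == lo:  # all scores equal; avoid division by zero (assumed behavior)
        return [1.0 for _ in scores]
    return [(s - lo) / (hi - lo) for s in scores]

print(min_max_scale([-2.1, 0.4, 3.7]))  # [0.0, 0.431..., 1.0]
```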
sparknlp/base/image_assembler.py CHANGED
@@ -15,6 +15,8 @@
 
 from pyspark import keyword_only
 from pyspark.ml.param import TypeConverters, Params, Param
+from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql.functions import regexp_replace, col
 
 from sparknlp.common import AnnotatorType
 from sparknlp.internal import AnnotatorTransformer
@@ -65,6 +67,7 @@ class ImageAssembler(AnnotatorTransformer):
     outputAnnotatorType = AnnotatorType.IMAGE
 
     inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
+    textCol = Param(Params._dummy(), "textCol", "text column name", typeConverter=TypeConverters.toString)
     outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
     name = 'ImageAssembler'
 
@@ -101,3 +104,69 @@ class ImageAssembler(AnnotatorTransformer):
     def getOutputCol(self):
         """Gets output column name of annotations."""
         return self.getOrDefault(self.outputCol)
+
+    def setTextCol(self, value):
+        """Sets an optional text column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of an optional input text column
+        """
+        return self._set(inputCol=value)
+
+    @classmethod
+    def loadImagesAsBytes(cls, spark: SparkSession, path: str):
+        """
+        Loads images from a given path and returns them as raw bytes, instead of the default
+        OpenCV-compatible format. Supported image types include JPEG, PNG, GIF, and BMP.
+
+        Multimodal inference with llama.cpp requires raw bytes as input.
+
+        Parameters
+        ----------
+        spark : SparkSession
+            The active SparkSession.
+        path : str
+            The path to the images. Supported image types are JPEG, PNG, GIF, and BMP.
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame containing the images as raw bytes along with their metadata.
+        """
+
+        # Replace the path separator in the `origin` field and `path` column, so that they match
+        def replace_path(column_name: str):
+            return regexp_replace(col(column_name), ":///", ":/")
+
+        # Load the images as metadata with the default Spark image format
+        data = (
+            spark.read.format("image")
+            .option("dropInvalid", True)
+            .load(path)
+            .withColumn(
+                "image", col("image").withField("origin", replace_path("image.origin"))
+            )
+        )
+
+        # Load the images as raw binary files
+        image_bytes = (
+            spark.read.format("binaryFile")
+            .option("pathGlobFilter", "*.{jpeg,jpg,png,gif,bmp,JPEG,JPG,PNG,GIF,BMP}")
+            .option("dropInvalid", True)
+            .load(path)
+            .withColumn("path", replace_path("path"))
+        )
+
+        # Join the two datasets on the file path
+        df_joined = data.join(
+            image_bytes, data["image.origin"] == image_bytes["path"], "inner"
+        )
+
+        # Replace the `data` field of the `image` column with raw bytes
+        df_image_replaced = df_joined.withColumn(
+            "image", df_joined["image"].withField("data", df_joined["content"])
+        )
+
+        return df_image_replaced
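A possible way to combine the new ``loadImagesAsBytes`` helper with the assembler itself (a sketch; the folder path is a placeholder):

```python
import sparknlp
from sparknlp.base import ImageAssembler

spark = sparknlp.start()

# Load JPEG/PNG/GIF/BMP files as raw bytes plus Spark image metadata
image_df = ImageAssembler.loadImagesAsBytes(spark, "file:///tmp/images")

imageAssembler = ImageAssembler() \
    .setInputCol("image") \
    .setOutputCol("image_assembler")

imageAssembler.transform(image_df).select("image_assembler.origin").show(truncate=False)
```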