spark-nlp 6.0.5__py2.py3-none-any.whl → 6.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spark-nlp might be problematic. Click here for more details.
- {spark_nlp-6.0.5.dist-info → spark_nlp-6.1.1.dist-info}/METADATA +12 -11
- {spark_nlp-6.0.5.dist-info → spark_nlp-6.1.1.dist-info}/RECORD +15 -12
- sparknlp/__init__.py +1 -1
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +4 -12
- sparknlp/annotator/seq2seq/__init__.py +1 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +10 -7
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +3 -3
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/common/properties.py +114 -85
- sparknlp/internal/__init__.py +9 -0
- sparknlp/reader/reader2doc.py +210 -0
- sparknlp/reader/reader2table.py +163 -0
- sparknlp/reader/sparknlp_reader.py +45 -0
- {spark_nlp-6.0.5.dist-info → spark_nlp-6.1.1.dist-info}/WHEEL +0 -0
- {spark_nlp-6.0.5.dist-info → spark_nlp-6.1.1.dist-info}/top_level.txt +0 -0
sparknlp/common/properties.py
CHANGED
|
@@ -628,7 +628,6 @@ class HasGeneratorProperties:
|
|
|
628
628
|
"The number of sequences to return from the beam search.",
|
|
629
629
|
typeConverter=TypeConverters.toInt)
|
|
630
630
|
|
|
631
|
-
|
|
632
631
|
def setTask(self, value):
|
|
633
632
|
"""Sets the transformer's task, e.g. ``summarize:``.
|
|
634
633
|
|
|
@@ -639,7 +638,6 @@ class HasGeneratorProperties:
|
|
|
639
638
|
"""
|
|
640
639
|
return self._set(task=value)
|
|
641
640
|
|
|
642
|
-
|
|
643
641
|
def setMinOutputLength(self, value):
|
|
644
642
|
"""Sets minimum length of the sequence to be generated.
|
|
645
643
|
|
|
@@ -650,7 +648,6 @@ class HasGeneratorProperties:
|
|
|
650
648
|
"""
|
|
651
649
|
return self._set(minOutputLength=value)
|
|
652
650
|
|
|
653
|
-
|
|
654
651
|
def setMaxOutputLength(self, value):
|
|
655
652
|
"""Sets maximum length of output text.
|
|
656
653
|
|
|
@@ -661,7 +658,6 @@ class HasGeneratorProperties:
|
|
|
661
658
|
"""
|
|
662
659
|
return self._set(maxOutputLength=value)
|
|
663
660
|
|
|
664
|
-
|
|
665
661
|
def setDoSample(self, value):
|
|
666
662
|
"""Sets whether or not to use sampling, use greedy decoding otherwise.
|
|
667
663
|
|
|
@@ -672,7 +668,6 @@ class HasGeneratorProperties:
|
|
|
672
668
|
"""
|
|
673
669
|
return self._set(doSample=value)
|
|
674
670
|
|
|
675
|
-
|
|
676
671
|
def setTemperature(self, value):
|
|
677
672
|
"""Sets the value used to modulate the next token probabilities.
|
|
678
673
|
|
|
@@ -683,7 +678,6 @@ class HasGeneratorProperties:
|
|
|
683
678
|
"""
|
|
684
679
|
return self._set(temperature=value)
|
|
685
680
|
|
|
686
|
-
|
|
687
681
|
def setTopK(self, value):
|
|
688
682
|
"""Sets the number of highest probability vocabulary tokens to keep for
|
|
689
683
|
top-k-filtering.
|
|
@@ -695,7 +689,6 @@ class HasGeneratorProperties:
|
|
|
695
689
|
"""
|
|
696
690
|
return self._set(topK=value)
|
|
697
691
|
|
|
698
|
-
|
|
699
692
|
def setTopP(self, value):
|
|
700
693
|
"""Sets the top cumulative probability for vocabulary tokens.
|
|
701
694
|
|
|
@@ -709,7 +702,6 @@ class HasGeneratorProperties:
|
|
|
709
702
|
"""
|
|
710
703
|
return self._set(topP=value)
|
|
711
704
|
|
|
712
|
-
|
|
713
705
|
def setRepetitionPenalty(self, value):
|
|
714
706
|
"""Sets the parameter for repetition penalty. 1.0 means no penalty.
|
|
715
707
|
|
|
@@ -725,7 +717,6 @@ class HasGeneratorProperties:
|
|
|
725
717
|
"""
|
|
726
718
|
return self._set(repetitionPenalty=value)
|
|
727
719
|
|
|
728
|
-
|
|
729
720
|
def setNoRepeatNgramSize(self, value):
|
|
730
721
|
"""Sets size of n-grams that can only occur once.
|
|
731
722
|
|
|
@@ -738,7 +729,6 @@ class HasGeneratorProperties:
|
|
|
738
729
|
"""
|
|
739
730
|
return self._set(noRepeatNgramSize=value)
|
|
740
731
|
|
|
741
|
-
|
|
742
732
|
def setBeamSize(self, value):
|
|
743
733
|
"""Sets the beam size for beam search.
|
|
744
734
|
|
|
@@ -749,7 +739,6 @@ class HasGeneratorProperties:
|
|
|
749
739
|
"""
|
|
750
740
|
return self._set(beamSize=value)
|
|
751
741
|
|
|
752
|
-
|
|
753
742
|
def setNReturnSequences(self, value):
|
|
754
743
|
"""Sets the number of sequences to return from the beam search.
|
|
755
744
|
|
|
@@ -765,14 +754,14 @@ class HasLlamaCppProperties:
|
|
|
765
754
|
# -------- MODEL PARAMETERS --------
|
|
766
755
|
nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation",
|
|
767
756
|
typeConverter=TypeConverters.toInt)
|
|
768
|
-
nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
|
|
769
|
-
|
|
757
|
+
# nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
|
|
758
|
+
# typeConverter=TypeConverters.toInt)
|
|
770
759
|
nThreadsBatch = Param(Params._dummy(), "nThreadsBatch",
|
|
771
760
|
"Set the number of threads to use during batch and prompt processing",
|
|
772
761
|
typeConverter=TypeConverters.toInt)
|
|
773
|
-
nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
|
|
774
|
-
|
|
775
|
-
|
|
762
|
+
# nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
|
|
763
|
+
# "Set the number of threads to use during batch and prompt processing",
|
|
764
|
+
# typeConverter=TypeConverters.toInt)
|
|
776
765
|
nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt)
|
|
777
766
|
nBatch = Param(Params._dummy(), "nBatch",
|
|
778
767
|
"Set the logical batch size for prompt processing (must be >=32 to use BLAS)",
|
|
@@ -782,12 +771,12 @@ class HasLlamaCppProperties:
|
|
|
782
771
|
typeConverter=TypeConverters.toInt)
|
|
783
772
|
nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding",
|
|
784
773
|
typeConverter=TypeConverters.toInt)
|
|
785
|
-
nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
|
|
786
|
-
|
|
787
|
-
nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
|
|
788
|
-
|
|
789
|
-
pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
|
|
790
|
-
|
|
774
|
+
# nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
|
|
775
|
+
# typeConverter=TypeConverters.toInt)
|
|
776
|
+
# nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
|
|
777
|
+
# typeConverter=TypeConverters.toInt)
|
|
778
|
+
# pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
|
|
779
|
+
# typeConverter=TypeConverters.toFloat)
|
|
791
780
|
nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)",
|
|
792
781
|
typeConverter=TypeConverters.toInt)
|
|
793
782
|
nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft",
|
|
@@ -802,10 +791,10 @@ class HasLlamaCppProperties:
|
|
|
802
791
|
typeConverter=TypeConverters.toString)
|
|
803
792
|
mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.",
|
|
804
793
|
typeConverter=TypeConverters.toInt)
|
|
805
|
-
tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
|
|
806
|
-
|
|
807
|
-
grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
|
|
808
|
-
grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
|
|
794
|
+
# tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
|
|
795
|
+
# typeConverter=TypeConverters.toListFloat)
|
|
796
|
+
# grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
|
|
797
|
+
# grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
|
|
809
798
|
ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling",
|
|
810
799
|
typeConverter=TypeConverters.toFloat)
|
|
811
800
|
ropeFreqScale = Param(Params._dummy(), "ropeFreqScale",
|
|
@@ -837,7 +826,7 @@ class HasLlamaCppProperties:
|
|
|
837
826
|
typeConverter=TypeConverters.toString)
|
|
838
827
|
# Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
|
|
839
828
|
#
|
|
840
|
-
# -
|
|
829
|
+
# - NONE: Don't use any scaling
|
|
841
830
|
# - LINEAR: Linear scaling
|
|
842
831
|
# - YARN: YaRN RoPE scaling
|
|
843
832
|
ropeScalingType = Param(Params._dummy(), "ropeScalingType",
|
|
@@ -845,29 +834,30 @@ class HasLlamaCppProperties:
|
|
|
845
834
|
typeConverter=TypeConverters.toString)
|
|
846
835
|
# Set the pooling type for embeddings, use model default if unspecified
|
|
847
836
|
#
|
|
848
|
-
# -
|
|
849
|
-
# -
|
|
850
|
-
# -
|
|
837
|
+
# - MEAN: Mean Pooling
|
|
838
|
+
# - CLS: CLS Pooling
|
|
839
|
+
# - LAST: Last token pooling
|
|
840
|
+
# - RANK: For reranked models
|
|
851
841
|
poolingType = Param(Params._dummy(), "poolingType",
|
|
852
842
|
"Set the pooling type for embeddings, use model default if unspecified",
|
|
853
843
|
typeConverter=TypeConverters.toString)
|
|
854
844
|
modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding",
|
|
855
845
|
typeConverter=TypeConverters.toString)
|
|
856
846
|
modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString)
|
|
857
|
-
lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
|
|
861
|
-
|
|
862
|
-
|
|
847
|
+
# lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
|
|
848
|
+
# "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
|
|
849
|
+
# typeConverter=TypeConverters.toString)
|
|
850
|
+
# lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
|
|
851
|
+
# "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
|
|
852
|
+
# typeConverter=TypeConverters.toString)
|
|
863
853
|
# loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters")
|
|
864
854
|
embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support",
|
|
865
855
|
typeConverter=TypeConverters.toBoolean)
|
|
866
856
|
flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention",
|
|
867
857
|
typeConverter=TypeConverters.toBoolean)
|
|
868
|
-
inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
|
|
869
|
-
|
|
870
|
-
|
|
858
|
+
# inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
|
|
859
|
+
# "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
|
|
860
|
+
# typeConverter=TypeConverters.toBoolean)
|
|
871
861
|
useMmap = Param(Params._dummy(), "useMmap",
|
|
872
862
|
"Whether to use memory-map model (faster load but may increase pageouts if not using mlock)",
|
|
873
863
|
typeConverter=TypeConverters.toBoolean)
|
|
@@ -880,6 +870,10 @@ class HasLlamaCppProperties:
|
|
|
880
870
|
typeConverter=TypeConverters.toString)
|
|
881
871
|
chatTemplate = Param(Params._dummy(), "chatTemplate", "The chat template to use",
|
|
882
872
|
typeConverter=TypeConverters.toString)
|
|
873
|
+
logVerbosity = Param(Params._dummy(), "logVerbosity", "Set the log verbosity level",
|
|
874
|
+
typeConverter=TypeConverters.toInt)
|
|
875
|
+
disableLog = Param(Params._dummy(), "disableLog", "Whether to disable logging",
|
|
876
|
+
typeConverter=TypeConverters.toBoolean)
|
|
883
877
|
|
|
884
878
|
# -------- INFERENCE PARAMETERS --------
|
|
885
879
|
inputPrefix = Param(Params._dummy(), "inputPrefix", "Set the prompt to start generation with",
|
|
@@ -948,17 +942,17 @@ class HasLlamaCppProperties:
|
|
|
948
942
|
"""Set the number of threads to use during generation"""
|
|
949
943
|
return self._set(nThreads=nThreads)
|
|
950
944
|
|
|
951
|
-
def setNThreadsDraft(self, nThreadsDraft: int):
|
|
952
|
-
|
|
953
|
-
|
|
945
|
+
# def setNThreadsDraft(self, nThreadsDraft: int):
|
|
946
|
+
# """Set the number of threads to use during draft generation"""
|
|
947
|
+
# return self._set(nThreadsDraft=nThreadsDraft)
|
|
954
948
|
|
|
955
949
|
def setNThreadsBatch(self, nThreadsBatch: int):
|
|
956
950
|
"""Set the number of threads to use during batch and prompt processing"""
|
|
957
951
|
return self._set(nThreadsBatch=nThreadsBatch)
|
|
958
952
|
|
|
959
|
-
def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
|
|
960
|
-
|
|
961
|
-
|
|
953
|
+
# def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
|
|
954
|
+
# """Set the number of threads to use during batch and prompt processing"""
|
|
955
|
+
# return self._set(nThreadsBatchDraft=nThreadsBatchDraft)
|
|
962
956
|
|
|
963
957
|
def setNCtx(self, nCtx: int):
|
|
964
958
|
"""Set the size of the prompt context"""
|
|
@@ -976,17 +970,17 @@ class HasLlamaCppProperties:
|
|
|
976
970
|
"""Set the number of tokens to draft for speculative decoding"""
|
|
977
971
|
return self._set(nDraft=nDraft)
|
|
978
972
|
|
|
979
|
-
def setNChunks(self, nChunks: int):
|
|
980
|
-
|
|
981
|
-
|
|
973
|
+
# def setNChunks(self, nChunks: int):
|
|
974
|
+
# """Set the maximal number of chunks to process"""
|
|
975
|
+
# return self._set(nChunks=nChunks)
|
|
982
976
|
|
|
983
|
-
def setNSequences(self, nSequences: int):
|
|
984
|
-
|
|
985
|
-
|
|
977
|
+
# def setNSequences(self, nSequences: int):
|
|
978
|
+
# """Set the number of sequences to decode"""
|
|
979
|
+
# return self._set(nSequences=nSequences)
|
|
986
980
|
|
|
987
|
-
def setPSplit(self, pSplit: float):
|
|
988
|
-
|
|
989
|
-
|
|
981
|
+
# def setPSplit(self, pSplit: float):
|
|
982
|
+
# """Set the speculative decoding split probability"""
|
|
983
|
+
# return self._set(pSplit=pSplit)
|
|
990
984
|
|
|
991
985
|
def setNGpuLayers(self, nGpuLayers: int):
|
|
992
986
|
"""Set the number of layers to store in VRAM (-1 - use default)"""
|
|
@@ -1004,17 +998,17 @@ class HasLlamaCppProperties:
|
|
|
1004
998
|
"""Set the main GPU that is used for scratch and small tensors."""
|
|
1005
999
|
return self._set(mainGpu=mainGpu)
|
|
1006
1000
|
|
|
1007
|
-
def setTensorSplit(self, tensorSplit: List[float]):
|
|
1008
|
-
|
|
1009
|
-
|
|
1001
|
+
# def setTensorSplit(self, tensorSplit: List[float]):
|
|
1002
|
+
# """Set how split tensors should be distributed across GPUs"""
|
|
1003
|
+
# return self._set(tensorSplit=tensorSplit)
|
|
1010
1004
|
|
|
1011
|
-
def setGrpAttnN(self, grpAttnN: int):
|
|
1012
|
-
|
|
1013
|
-
|
|
1005
|
+
# def setGrpAttnN(self, grpAttnN: int):
|
|
1006
|
+
# """Set the group-attention factor"""
|
|
1007
|
+
# return self._set(grpAttnN=grpAttnN)
|
|
1014
1008
|
|
|
1015
|
-
def setGrpAttnW(self, grpAttnW: int):
|
|
1016
|
-
|
|
1017
|
-
|
|
1009
|
+
# def setGrpAttnW(self, grpAttnW: int):
|
|
1010
|
+
# """Set the group-attention width"""
|
|
1011
|
+
# return self._set(grpAttnW=grpAttnW)
|
|
1018
1012
|
|
|
1019
1013
|
def setRopeFreqBase(self, ropeFreqBase: float):
|
|
1020
1014
|
"""Set the RoPE base frequency, used by NTK-aware scaling"""
|
|
@@ -1049,7 +1043,16 @@ class HasLlamaCppProperties:
|
|
|
1049
1043
|
return self._set(defragmentationThreshold=defragmentationThreshold)
|
|
1050
1044
|
|
|
1051
1045
|
def setNumaStrategy(self, numaStrategy: str):
|
|
1052
|
-
"""Set optimization strategies that help on some NUMA systems (if available)
|
|
1046
|
+
"""Set optimization strategies that help on some NUMA systems (if available)
|
|
1047
|
+
|
|
1048
|
+
Possible values:
|
|
1049
|
+
|
|
1050
|
+
- DISABLED: No NUMA optimizations
|
|
1051
|
+
- DISTRIBUTE: spread execution evenly over all nodes
|
|
1052
|
+
- ISOLATE: only spawn threads on CPUs on the node that execution started on
|
|
1053
|
+
- NUMA_CTL: use the CPU map provided by numactl
|
|
1054
|
+
- MIRROR: Mirrors the model across NUMA nodes
|
|
1055
|
+
"""
|
|
1053
1056
|
numaUpper = numaStrategy.upper()
|
|
1054
1057
|
numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"]
|
|
1055
1058
|
if numaUpper not in numaStrategies:
|
|
@@ -1060,13 +1063,35 @@ class HasLlamaCppProperties:
|
|
|
1060
1063
|
return self._set(numaStrategy=numaStrategy)
|
|
1061
1064
|
|
|
1062
1065
|
def setRopeScalingType(self, ropeScalingType: str):
|
|
1063
|
-
"""Set the RoPE frequency scaling method, defaults to linear unless specified by the model
|
|
1064
|
-
|
|
1066
|
+
"""Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
|
|
1067
|
+
|
|
1068
|
+
Possible values:
|
|
1069
|
+
|
|
1070
|
+
- NONE: Don't use any scaling
|
|
1071
|
+
- LINEAR: Linear scaling
|
|
1072
|
+
- YARN: YaRN RoPE scaling
|
|
1073
|
+
"""
|
|
1074
|
+
ropeScalingTypeUpper = ropeScalingType.upper()
|
|
1075
|
+
ropeScalingTypes = ["NONE", "LINEAR", "YARN"]
|
|
1076
|
+
if ropeScalingTypeUpper not in ropeScalingTypes:
|
|
1077
|
+
raise ValueError(
|
|
1078
|
+
f"Invalid RoPE scaling type: {ropeScalingType}. "
|
|
1079
|
+
+ f"Valid values are: {ropeScalingTypes}"
|
|
1080
|
+
)
|
|
1081
|
+
return self._set(ropeScalingType=ropeScalingTypeUpper)
|
|
1065
1082
|
|
|
1066
1083
|
def setPoolingType(self, poolingType: str):
|
|
1067
|
-
"""Set the pooling type for embeddings, use model default if unspecified
|
|
1084
|
+
"""Set the pooling type for embeddings, use model default if unspecified
|
|
1085
|
+
|
|
1086
|
+
Possible values:
|
|
1087
|
+
|
|
1088
|
+
- MEAN: Mean Pooling
|
|
1089
|
+
- CLS: CLS Pooling
|
|
1090
|
+
- LAST: Last token pooling
|
|
1091
|
+
- RANK: For reranked models
|
|
1092
|
+
"""
|
|
1068
1093
|
poolingTypeUpper = poolingType.upper()
|
|
1069
|
-
poolingTypes = ["NONE", "MEAN", "CLS", "LAST"]
|
|
1094
|
+
poolingTypes = ["NONE", "MEAN", "CLS", "LAST", "RANK"]
|
|
1070
1095
|
if poolingTypeUpper not in poolingTypes:
|
|
1071
1096
|
raise ValueError(
|
|
1072
1097
|
f"Invalid pooling type: {poolingType}. "
|
|
@@ -1082,25 +1107,21 @@ class HasLlamaCppProperties:
|
|
|
1082
1107
|
"""Set a model alias"""
|
|
1083
1108
|
return self._set(modelAlias=modelAlias)
|
|
1084
1109
|
|
|
1085
|
-
def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
|
|
1086
|
-
|
|
1087
|
-
|
|
1110
|
+
# def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
|
|
1111
|
+
# """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
|
|
1112
|
+
# return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
|
|
1088
1113
|
|
|
1089
|
-
def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
def setEmbedding(self, embedding: bool):
|
|
1094
|
-
"""Whether to load model with embedding support"""
|
|
1095
|
-
return self._set(embedding=embedding)
|
|
1114
|
+
# def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
|
|
1115
|
+
# """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
|
|
1116
|
+
# return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
|
|
1096
1117
|
|
|
1097
1118
|
def setFlashAttention(self, flashAttention: bool):
|
|
1098
1119
|
"""Whether to enable Flash Attention"""
|
|
1099
1120
|
return self._set(flashAttention=flashAttention)
|
|
1100
1121
|
|
|
1101
|
-
def setInputPrefixBos(self, inputPrefixBos: bool):
|
|
1102
|
-
|
|
1103
|
-
|
|
1122
|
+
# def setInputPrefixBos(self, inputPrefixBos: bool):
|
|
1123
|
+
# """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
|
|
1124
|
+
# return self._set(inputPrefixBos=inputPrefixBos)
|
|
1104
1125
|
|
|
1105
1126
|
def setUseMmap(self, useMmap: bool):
|
|
1106
1127
|
"""Whether to use memory-map model (faster load but may increase pageouts if not using mlock)"""
|
|
@@ -1246,11 +1267,19 @@ class HasLlamaCppProperties:
|
|
|
1246
1267
|
def setUseChatTemplate(self, useChatTemplate: bool):
|
|
1247
1268
|
"""Set whether generate should apply a chat template"""
|
|
1248
1269
|
return self._set(useChatTemplate=useChatTemplate)
|
|
1249
|
-
|
|
1270
|
+
|
|
1250
1271
|
def setNParallel(self, nParallel: int):
|
|
1251
1272
|
"""Sets the number of parallel processes for decoding. This is an alias for `setBatchSize`."""
|
|
1252
1273
|
return self.setBatchSize(nParallel)
|
|
1253
1274
|
|
|
1275
|
+
def setLogVerbosity(self, logVerbosity: int):
|
|
1276
|
+
"""Set the log verbosity level"""
|
|
1277
|
+
return self._set(logVerbosity=logVerbosity)
|
|
1278
|
+
|
|
1279
|
+
def setDisableLog(self, disableLog: bool):
|
|
1280
|
+
"""Whether to disable logging"""
|
|
1281
|
+
return self._set(disableLog=disableLog)
|
|
1282
|
+
|
|
1254
1283
|
# -------- JAVA SETTERS --------
|
|
1255
1284
|
def setTokenIdBias(self, tokenIdBias: Dict[int, float]):
|
|
1256
1285
|
"""Set token id bias"""
|
|
@@ -1260,9 +1289,9 @@ class HasLlamaCppProperties:
|
|
|
1260
1289
|
"""Set token id bias"""
|
|
1261
1290
|
return self._call_java("setTokenBias", tokenBias)
|
|
1262
1291
|
|
|
1263
|
-
def setLoraAdapters(self, loraAdapters: Dict[str, float]):
|
|
1264
|
-
|
|
1265
|
-
|
|
1292
|
+
# def setLoraAdapters(self, loraAdapters: Dict[str, float]):
|
|
1293
|
+
# """Set LoRA adapters with their scaling factors"""
|
|
1294
|
+
# return self._call_java("setLoraAdapters", loraAdapters)
|
|
1266
1295
|
|
|
1267
1296
|
def getMetadata(self):
|
|
1268
1297
|
"""Gets the metadata of the model"""
|
sparknlp/internal/__init__.py
CHANGED
|
@@ -1182,4 +1182,13 @@ class _E5VEmbeddingsLoader(ExtendedJavaWrapper):
|
|
|
1182
1182
|
path,
|
|
1183
1183
|
jspark,
|
|
1184
1184
|
use_openvino
|
|
1185
|
+
)
|
|
1186
|
+
|
|
1187
|
+
class _Phi4Loader(ExtendedJavaWrapper):
|
|
1188
|
+
def __init__(self, path, jspark, use_openvino=False):
|
|
1189
|
+
super(_Phi4Loader, self).__init__(
|
|
1190
|
+
"com.johnsnowlabs.nlp.annotators.seq2seq.Phi4Transformer.loadSavedModel",
|
|
1191
|
+
path,
|
|
1192
|
+
jspark,
|
|
1193
|
+
use_openvino,
|
|
1185
1194
|
)
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# Copyright 2017-2025 John Snow Labs
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from pyspark import keyword_only
|
|
15
|
+
from pyspark.ml.param import TypeConverters, Params, Param
|
|
16
|
+
|
|
17
|
+
from sparknlp.common import AnnotatorType
|
|
18
|
+
from sparknlp.internal import AnnotatorTransformer
|
|
19
|
+
from sparknlp.partition.partition_properties import *
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Reader2Doc(
|
|
23
|
+
AnnotatorTransformer,
|
|
24
|
+
HasEmailReaderProperties,
|
|
25
|
+
HasExcelReaderProperties,
|
|
26
|
+
HasHTMLReaderProperties,
|
|
27
|
+
HasPowerPointProperties,
|
|
28
|
+
HasTextReaderProperties
|
|
29
|
+
):
|
|
30
|
+
"""
|
|
31
|
+
The Reader2Doc annotator lets you read files directly within existing
|
|
32
|
+
Spark NLP workflows, enabling seamless reuse of your pipelines.
|
|
33
|
+
|
|
34
|
+
Reader2Doc can be used for extracting structured content from various document types
|
|
35
|
+
using Spark NLP readers. It supports reading from many file types and returns parsed
|
|
36
|
+
output as a structured Spark DataFrame.
|
|
37
|
+
|
|
38
|
+
Supported formats include:
|
|
39
|
+
|
|
40
|
+
- Plain text
|
|
41
|
+
- HTML
|
|
42
|
+
- Word (.doc/.docx)
|
|
43
|
+
- Excel (.xls/.xlsx)
|
|
44
|
+
- PowerPoint (.ppt/.pptx)
|
|
45
|
+
- Email files (.eml, .msg)
|
|
46
|
+
- PDFs
|
|
47
|
+
|
|
48
|
+
Examples
|
|
49
|
+
--------
|
|
50
|
+
>>> from sparknlp.reader.reader2doc import Reader2Doc
|
|
51
|
+
>>> from sparknlp.base import DocumentAssembler
|
|
52
|
+
>>> from pyspark.ml import Pipeline
|
|
53
|
+
>>> # Initialize Reader2Doc for PDF files
|
|
54
|
+
>>> reader2doc = Reader2Doc() \\
|
|
55
|
+
... .setContentType("application/pdf") \\
|
|
56
|
+
... .setContentPath(f"{pdf_directory}/")
|
|
57
|
+
>>> # Build the pipeline with the Reader2Doc stage
|
|
58
|
+
>>> pipeline = Pipeline(stages=[reader2doc])
|
|
59
|
+
>>> # Fit the pipeline to an empty DataFrame
|
|
60
|
+
>>> pipeline_model = pipeline.fit(empty_data_set)
|
|
61
|
+
>>> result_df = pipeline_model.transform(empty_data_set)
|
|
62
|
+
>>> # Show the resulting DataFrame
|
|
63
|
+
>>> result_df.show()
|
|
64
|
+
+------------------------------------------------------------------------------------------------------------------------------------+
|
|
65
|
+
|document |
|
|
66
|
+
+------------------------------------------------------------------------------------------------------------------------------------+
|
|
67
|
+
|[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] |
|
|
68
|
+
|[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
|
|
69
|
+
|[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
|
|
70
|
+
+------------------------------------------------------------------------------------------------------------------------------------+
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
name = "Reader2Doc"
|
|
74
|
+
outputAnnotatorType = AnnotatorType.DOCUMENT
|
|
75
|
+
|
|
76
|
+
contentPath = Param(
|
|
77
|
+
Params._dummy(),
|
|
78
|
+
"contentPath",
|
|
79
|
+
"contentPath path to files to read",
|
|
80
|
+
typeConverter=TypeConverters.toString
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
outputCol = Param(
|
|
84
|
+
Params._dummy(),
|
|
85
|
+
"outputCol",
|
|
86
|
+
"output column name",
|
|
87
|
+
typeConverter=TypeConverters.toString
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
contentType = Param(
|
|
91
|
+
Params._dummy(),
|
|
92
|
+
"contentType",
|
|
93
|
+
"Set the content type to load following MIME specification",
|
|
94
|
+
typeConverter=TypeConverters.toString
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
explodeDocs = Param(
|
|
98
|
+
Params._dummy(),
|
|
99
|
+
"explodeDocs",
|
|
100
|
+
"whether to explode the documents into separate rows",
|
|
101
|
+
typeConverter=TypeConverters.toBoolean
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
flattenOutput = Param(
|
|
105
|
+
Params._dummy(),
|
|
106
|
+
"flattenOutput",
|
|
107
|
+
"If true, output is flattened to plain text with minimal metadata",
|
|
108
|
+
typeConverter=TypeConverters.toBoolean
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
titleThreshold = Param(
|
|
112
|
+
Params._dummy(),
|
|
113
|
+
"titleThreshold",
|
|
114
|
+
"Minimum font size threshold for title detection in PDF docs",
|
|
115
|
+
typeConverter=TypeConverters.toFloat
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
outputFormat = Param(
|
|
119
|
+
Params._dummy(),
|
|
120
|
+
"outputFormat",
|
|
121
|
+
"Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
|
|
122
|
+
typeConverter=TypeConverters.toString
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
@keyword_only
|
|
126
|
+
def __init__(self):
|
|
127
|
+
super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
|
|
128
|
+
self._setDefault(
|
|
129
|
+
outputCol="document",
|
|
130
|
+
explodeDocs=False,
|
|
131
|
+
contentType="",
|
|
132
|
+
flattenOutput=False,
|
|
133
|
+
titleThreshold=18
|
|
134
|
+
)
|
|
135
|
+
@keyword_only
|
|
136
|
+
def setParams(self):
|
|
137
|
+
kwargs = self._input_kwargs
|
|
138
|
+
return self._set(**kwargs)
|
|
139
|
+
|
|
140
|
+
def setContentPath(self, value):
|
|
141
|
+
"""Sets content path.
|
|
142
|
+
|
|
143
|
+
Parameters
|
|
144
|
+
----------
|
|
145
|
+
value : str
|
|
146
|
+
contentPath path to files to read
|
|
147
|
+
"""
|
|
148
|
+
return self._set(contentPath=value)
|
|
149
|
+
|
|
150
|
+
def setContentType(self, value):
|
|
151
|
+
"""
|
|
152
|
+
Set the content type to load following MIME specification
|
|
153
|
+
|
|
154
|
+
Parameters
|
|
155
|
+
----------
|
|
156
|
+
value : str
|
|
157
|
+
content type to load following MIME specification
|
|
158
|
+
"""
|
|
159
|
+
return self._set(contentType=value)
|
|
160
|
+
|
|
161
|
+
def setExplodeDocs(self, value):
|
|
162
|
+
"""Sets whether to explode the documents into separate rows.
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
Parameters
|
|
166
|
+
----------
|
|
167
|
+
value : boolean
|
|
168
|
+
Whether to explode the documents into separate rows
|
|
169
|
+
"""
|
|
170
|
+
return self._set(explodeDocs=value)
|
|
171
|
+
|
|
172
|
+
def setOutputCol(self, value):
|
|
173
|
+
"""Sets output column name.
|
|
174
|
+
|
|
175
|
+
Parameters
|
|
176
|
+
----------
|
|
177
|
+
value : str
|
|
178
|
+
Name of the Output Column
|
|
179
|
+
"""
|
|
180
|
+
return self._set(outputCol=value)
|
|
181
|
+
|
|
182
|
+
def setFlattenOutput(self, value):
|
|
183
|
+
"""Sets whether to flatten the output to plain text with minimal metadata.
|
|
184
|
+
|
|
185
|
+
Parameters
|
|
186
|
+
----------
|
|
187
|
+
value : bool
|
|
188
|
+
If true, output is flattened to plain text with minimal metadata
|
|
189
|
+
"""
|
|
190
|
+
return self._set(flattenOutput=value)
|
|
191
|
+
|
|
192
|
+
def setTitleThreshold(self, value):
|
|
193
|
+
"""Sets the minimum font size threshold for title detection in PDF documents.
|
|
194
|
+
|
|
195
|
+
Parameters
|
|
196
|
+
----------
|
|
197
|
+
value : float
|
|
198
|
+
Minimum font size threshold for title detection in PDF docs
|
|
199
|
+
"""
|
|
200
|
+
return self._set(titleThreshold=value)
|
|
201
|
+
|
|
202
|
+
def setOutputFormat(self, value):
|
|
203
|
+
"""Sets the output format for the table content.
|
|
204
|
+
|
|
205
|
+
Parameters
|
|
206
|
+
----------
|
|
207
|
+
value : str
|
|
208
|
+
Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
|
|
209
|
+
"""
|
|
210
|
+
return self._set(outputFormat=value)
|