spark-nlp 6.0.5__py2.py3-none-any.whl → 6.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spark-nlp might be problematic. Click here for more details.

@@ -628,7 +628,6 @@ class HasGeneratorProperties:
628
628
  "The number of sequences to return from the beam search.",
629
629
  typeConverter=TypeConverters.toInt)
630
630
 
631
-
632
631
  def setTask(self, value):
633
632
  """Sets the transformer's task, e.g. ``summarize:``.
634
633
 
@@ -639,7 +638,6 @@ class HasGeneratorProperties:
639
638
  """
640
639
  return self._set(task=value)
641
640
 
642
-
643
641
  def setMinOutputLength(self, value):
644
642
  """Sets minimum length of the sequence to be generated.
645
643
 
@@ -650,7 +648,6 @@ class HasGeneratorProperties:
650
648
  """
651
649
  return self._set(minOutputLength=value)
652
650
 
653
-
654
651
  def setMaxOutputLength(self, value):
655
652
  """Sets maximum length of output text.
656
653
 
@@ -661,7 +658,6 @@ class HasGeneratorProperties:
661
658
  """
662
659
  return self._set(maxOutputLength=value)
663
660
 
664
-
665
661
  def setDoSample(self, value):
666
662
  """Sets whether or not to use sampling, use greedy decoding otherwise.
667
663
 
@@ -672,7 +668,6 @@ class HasGeneratorProperties:
672
668
  """
673
669
  return self._set(doSample=value)
674
670
 
675
-
676
671
  def setTemperature(self, value):
677
672
  """Sets the value used to modulate the next token probabilities.
678
673
 
@@ -683,7 +678,6 @@ class HasGeneratorProperties:
683
678
  """
684
679
  return self._set(temperature=value)
685
680
 
686
-
687
681
  def setTopK(self, value):
688
682
  """Sets the number of highest probability vocabulary tokens to keep for
689
683
  top-k-filtering.
@@ -695,7 +689,6 @@ class HasGeneratorProperties:
695
689
  """
696
690
  return self._set(topK=value)
697
691
 
698
-
699
692
  def setTopP(self, value):
700
693
  """Sets the top cumulative probability for vocabulary tokens.
701
694
 
@@ -709,7 +702,6 @@ class HasGeneratorProperties:
709
702
  """
710
703
  return self._set(topP=value)
711
704
 
712
-
713
705
  def setRepetitionPenalty(self, value):
714
706
  """Sets the parameter for repetition penalty. 1.0 means no penalty.
715
707
 
@@ -725,7 +717,6 @@ class HasGeneratorProperties:
725
717
  """
726
718
  return self._set(repetitionPenalty=value)
727
719
 
728
-
729
720
  def setNoRepeatNgramSize(self, value):
730
721
  """Sets size of n-grams that can only occur once.
731
722
 
@@ -738,7 +729,6 @@ class HasGeneratorProperties:
738
729
  """
739
730
  return self._set(noRepeatNgramSize=value)
740
731
 
741
-
742
732
  def setBeamSize(self, value):
743
733
  """Sets the beam size for beam search.
744
734
 
@@ -749,7 +739,6 @@ class HasGeneratorProperties:
749
739
  """
750
740
  return self._set(beamSize=value)
751
741
 
752
-
753
742
  def setNReturnSequences(self, value):
754
743
  """Sets the number of sequences to return from the beam search.
755
744
 
@@ -765,14 +754,14 @@ class HasLlamaCppProperties:
765
754
  # -------- MODEL PARAMETERS --------
766
755
  nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation",
767
756
  typeConverter=TypeConverters.toInt)
768
- nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
769
- typeConverter=TypeConverters.toInt)
757
+ # nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
758
+ # typeConverter=TypeConverters.toInt)
770
759
  nThreadsBatch = Param(Params._dummy(), "nThreadsBatch",
771
760
  "Set the number of threads to use during batch and prompt processing",
772
761
  typeConverter=TypeConverters.toInt)
773
- nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
774
- "Set the number of threads to use during batch and prompt processing",
775
- typeConverter=TypeConverters.toInt)
762
+ # nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
763
+ # "Set the number of threads to use during batch and prompt processing",
764
+ # typeConverter=TypeConverters.toInt)
776
765
  nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt)
777
766
  nBatch = Param(Params._dummy(), "nBatch",
778
767
  "Set the logical batch size for prompt processing (must be >=32 to use BLAS)",
@@ -782,12 +771,12 @@ class HasLlamaCppProperties:
782
771
  typeConverter=TypeConverters.toInt)
783
772
  nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding",
784
773
  typeConverter=TypeConverters.toInt)
785
- nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
786
- typeConverter=TypeConverters.toInt)
787
- nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
788
- typeConverter=TypeConverters.toInt)
789
- pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
790
- typeConverter=TypeConverters.toFloat)
774
+ # nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
775
+ # typeConverter=TypeConverters.toInt)
776
+ # nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
777
+ # typeConverter=TypeConverters.toInt)
778
+ # pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
779
+ # typeConverter=TypeConverters.toFloat)
791
780
  nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)",
792
781
  typeConverter=TypeConverters.toInt)
793
782
  nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft",
@@ -802,10 +791,10 @@ class HasLlamaCppProperties:
802
791
  typeConverter=TypeConverters.toString)
803
792
  mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.",
804
793
  typeConverter=TypeConverters.toInt)
805
- tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
806
- typeConverter=TypeConverters.toListFloat)
807
- grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
808
- grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
794
+ # tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
795
+ # typeConverter=TypeConverters.toListFloat)
796
+ # grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
797
+ # grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
809
798
  ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling",
810
799
  typeConverter=TypeConverters.toFloat)
811
800
  ropeFreqScale = Param(Params._dummy(), "ropeFreqScale",
@@ -837,7 +826,7 @@ class HasLlamaCppProperties:
837
826
  typeConverter=TypeConverters.toString)
838
827
  # Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
839
828
  #
840
- # - UNSPECIFIED: Don't use any scaling
829
+ # - NONE: Don't use any scaling
841
830
  # - LINEAR: Linear scaling
842
831
  # - YARN: YaRN RoPE scaling
843
832
  ropeScalingType = Param(Params._dummy(), "ropeScalingType",
@@ -845,29 +834,30 @@ class HasLlamaCppProperties:
845
834
  typeConverter=TypeConverters.toString)
846
835
  # Set the pooling type for embeddings, use model default if unspecified
847
836
  #
848
- # - 0 NONE: Don't use any pooling
849
- # - 1 MEAN: Mean Pooling
850
- # - 2 CLS: CLS Pooling
837
+ # - MEAN: Mean Pooling
838
+ # - CLS: CLS Pooling
839
+ # - LAST: Last token pooling
840
+ # - RANK: For reranked models
851
841
  poolingType = Param(Params._dummy(), "poolingType",
852
842
  "Set the pooling type for embeddings, use model default if unspecified",
853
843
  typeConverter=TypeConverters.toString)
854
844
  modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding",
855
845
  typeConverter=TypeConverters.toString)
856
846
  modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString)
857
- lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
858
- "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
859
- typeConverter=TypeConverters.toString)
860
- lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
861
- "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
862
- typeConverter=TypeConverters.toString)
847
+ # lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
848
+ # "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
849
+ # typeConverter=TypeConverters.toString)
850
+ # lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
851
+ # "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
852
+ # typeConverter=TypeConverters.toString)
863
853
  # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters")
864
854
  embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support",
865
855
  typeConverter=TypeConverters.toBoolean)
866
856
  flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention",
867
857
  typeConverter=TypeConverters.toBoolean)
868
- inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
869
- "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
870
- typeConverter=TypeConverters.toBoolean)
858
+ # inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
859
+ # "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
860
+ # typeConverter=TypeConverters.toBoolean)
871
861
  useMmap = Param(Params._dummy(), "useMmap",
872
862
  "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)",
873
863
  typeConverter=TypeConverters.toBoolean)
@@ -880,6 +870,10 @@ class HasLlamaCppProperties:
880
870
  typeConverter=TypeConverters.toString)
881
871
  chatTemplate = Param(Params._dummy(), "chatTemplate", "The chat template to use",
882
872
  typeConverter=TypeConverters.toString)
873
+ logVerbosity = Param(Params._dummy(), "logVerbosity", "Set the log verbosity level",
874
+ typeConverter=TypeConverters.toInt)
875
+ disableLog = Param(Params._dummy(), "disableLog", "Whether to disable logging",
876
+ typeConverter=TypeConverters.toBoolean)
883
877
 
884
878
  # -------- INFERENCE PARAMETERS --------
885
879
  inputPrefix = Param(Params._dummy(), "inputPrefix", "Set the prompt to start generation with",
@@ -948,17 +942,17 @@ class HasLlamaCppProperties:
948
942
  """Set the number of threads to use during generation"""
949
943
  return self._set(nThreads=nThreads)
950
944
 
951
- def setNThreadsDraft(self, nThreadsDraft: int):
952
- """Set the number of threads to use during draft generation"""
953
- return self._set(nThreadsDraft=nThreadsDraft)
945
+ # def setNThreadsDraft(self, nThreadsDraft: int):
946
+ # """Set the number of threads to use during draft generation"""
947
+ # return self._set(nThreadsDraft=nThreadsDraft)
954
948
 
955
949
  def setNThreadsBatch(self, nThreadsBatch: int):
956
950
  """Set the number of threads to use during batch and prompt processing"""
957
951
  return self._set(nThreadsBatch=nThreadsBatch)
958
952
 
959
- def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
960
- """Set the number of threads to use during batch and prompt processing"""
961
- return self._set(nThreadsBatchDraft=nThreadsBatchDraft)
953
+ # def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
954
+ # """Set the number of threads to use during batch and prompt processing"""
955
+ # return self._set(nThreadsBatchDraft=nThreadsBatchDraft)
962
956
 
963
957
  def setNCtx(self, nCtx: int):
964
958
  """Set the size of the prompt context"""
@@ -976,17 +970,17 @@ class HasLlamaCppProperties:
976
970
  """Set the number of tokens to draft for speculative decoding"""
977
971
  return self._set(nDraft=nDraft)
978
972
 
979
- def setNChunks(self, nChunks: int):
980
- """Set the maximal number of chunks to process"""
981
- return self._set(nChunks=nChunks)
973
+ # def setNChunks(self, nChunks: int):
974
+ # """Set the maximal number of chunks to process"""
975
+ # return self._set(nChunks=nChunks)
982
976
 
983
- def setNSequences(self, nSequences: int):
984
- """Set the number of sequences to decode"""
985
- return self._set(nSequences=nSequences)
977
+ # def setNSequences(self, nSequences: int):
978
+ # """Set the number of sequences to decode"""
979
+ # return self._set(nSequences=nSequences)
986
980
 
987
- def setPSplit(self, pSplit: float):
988
- """Set the speculative decoding split probability"""
989
- return self._set(pSplit=pSplit)
981
+ # def setPSplit(self, pSplit: float):
982
+ # """Set the speculative decoding split probability"""
983
+ # return self._set(pSplit=pSplit)
990
984
 
991
985
  def setNGpuLayers(self, nGpuLayers: int):
992
986
  """Set the number of layers to store in VRAM (-1 - use default)"""
@@ -1004,17 +998,17 @@ class HasLlamaCppProperties:
1004
998
  """Set the main GPU that is used for scratch and small tensors."""
1005
999
  return self._set(mainGpu=mainGpu)
1006
1000
 
1007
- def setTensorSplit(self, tensorSplit: List[float]):
1008
- """Set how split tensors should be distributed across GPUs"""
1009
- return self._set(tensorSplit=tensorSplit)
1001
+ # def setTensorSplit(self, tensorSplit: List[float]):
1002
+ # """Set how split tensors should be distributed across GPUs"""
1003
+ # return self._set(tensorSplit=tensorSplit)
1010
1004
 
1011
- def setGrpAttnN(self, grpAttnN: int):
1012
- """Set the group-attention factor"""
1013
- return self._set(grpAttnN=grpAttnN)
1005
+ # def setGrpAttnN(self, grpAttnN: int):
1006
+ # """Set the group-attention factor"""
1007
+ # return self._set(grpAttnN=grpAttnN)
1014
1008
 
1015
- def setGrpAttnW(self, grpAttnW: int):
1016
- """Set the group-attention width"""
1017
- return self._set(grpAttnW=grpAttnW)
1009
+ # def setGrpAttnW(self, grpAttnW: int):
1010
+ # """Set the group-attention width"""
1011
+ # return self._set(grpAttnW=grpAttnW)
1018
1012
 
1019
1013
  def setRopeFreqBase(self, ropeFreqBase: float):
1020
1014
  """Set the RoPE base frequency, used by NTK-aware scaling"""
@@ -1049,7 +1043,16 @@ class HasLlamaCppProperties:
1049
1043
  return self._set(defragmentationThreshold=defragmentationThreshold)
1050
1044
 
1051
1045
  def setNumaStrategy(self, numaStrategy: str):
1052
- """Set optimization strategies that help on some NUMA systems (if available)"""
1046
+ """Set optimization strategies that help on some NUMA systems (if available)
1047
+
1048
+ Possible values:
1049
+
1050
+ - DISABLED: No NUMA optimizations
1051
+ - DISTRIBUTE: spread execution evenly over all nodes
1052
+ - ISOLATE: only spawn threads on CPUs on the node that execution started on
1053
+ - NUMA_CTL: use the CPU map provided by numactl
1054
+ - MIRROR: Mirrors the model across NUMA nodes
1055
+ """
1053
1056
  numaUpper = numaStrategy.upper()
1054
1057
  numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"]
1055
1058
  if numaUpper not in numaStrategies:
@@ -1060,13 +1063,35 @@ class HasLlamaCppProperties:
1060
1063
  return self._set(numaStrategy=numaStrategy)
1061
1064
 
1062
1065
  def setRopeScalingType(self, ropeScalingType: str):
1063
- """Set the RoPE frequency scaling method, defaults to linear unless specified by the model"""
1064
- return self._set(ropeScalingType=ropeScalingType)
1066
+ """Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
1067
+
1068
+ Possible values:
1069
+
1070
+ - NONE: Don't use any scaling
1071
+ - LINEAR: Linear scaling
1072
+ - YARN: YaRN RoPE scaling
1073
+ """
1074
+ ropeScalingTypeUpper = ropeScalingType.upper()
1075
+ ropeScalingTypes = ["NONE", "LINEAR", "YARN"]
1076
+ if ropeScalingTypeUpper not in ropeScalingTypes:
1077
+ raise ValueError(
1078
+ f"Invalid RoPE scaling type: {ropeScalingType}. "
1079
+ + f"Valid values are: {ropeScalingTypes}"
1080
+ )
1081
+ return self._set(ropeScalingType=ropeScalingTypeUpper)
1065
1082
 
1066
1083
  def setPoolingType(self, poolingType: str):
1067
- """Set the pooling type for embeddings, use model default if unspecified"""
1084
+ """Set the pooling type for embeddings, use model default if unspecified
1085
+
1086
+ Possible values:
1087
+
1088
+ - MEAN: Mean Pooling
1089
+ - CLS: CLS Pooling
1090
+ - LAST: Last token pooling
1091
+ - RANK: For reranked models
1092
+ """
1068
1093
  poolingTypeUpper = poolingType.upper()
1069
- poolingTypes = ["NONE", "MEAN", "CLS", "LAST"]
1094
+ poolingTypes = ["NONE", "MEAN", "CLS", "LAST", "RANK"]
1070
1095
  if poolingTypeUpper not in poolingTypes:
1071
1096
  raise ValueError(
1072
1097
  f"Invalid pooling type: {poolingType}. "
@@ -1082,25 +1107,21 @@ class HasLlamaCppProperties:
1082
1107
  """Set a model alias"""
1083
1108
  return self._set(modelAlias=modelAlias)
1084
1109
 
1085
- def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
1086
- """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
1087
- return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
1110
+ # def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
1111
+ # """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
1112
+ # return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
1088
1113
 
1089
- def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
1090
- """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
1091
- return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
1092
-
1093
- def setEmbedding(self, embedding: bool):
1094
- """Whether to load model with embedding support"""
1095
- return self._set(embedding=embedding)
1114
+ # def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
1115
+ # """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
1116
+ # return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
1096
1117
 
1097
1118
  def setFlashAttention(self, flashAttention: bool):
1098
1119
  """Whether to enable Flash Attention"""
1099
1120
  return self._set(flashAttention=flashAttention)
1100
1121
 
1101
- def setInputPrefixBos(self, inputPrefixBos: bool):
1102
- """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
1103
- return self._set(inputPrefixBos=inputPrefixBos)
1122
+ # def setInputPrefixBos(self, inputPrefixBos: bool):
1123
+ # """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
1124
+ # return self._set(inputPrefixBos=inputPrefixBos)
1104
1125
 
1105
1126
  def setUseMmap(self, useMmap: bool):
1106
1127
  """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)"""
@@ -1246,11 +1267,19 @@ class HasLlamaCppProperties:
1246
1267
  def setUseChatTemplate(self, useChatTemplate: bool):
1247
1268
  """Set whether generate should apply a chat template"""
1248
1269
  return self._set(useChatTemplate=useChatTemplate)
1249
-
1270
+
1250
1271
  def setNParallel(self, nParallel: int):
1251
1272
  """Sets the number of parallel processes for decoding. This is an alias for `setBatchSize`."""
1252
1273
  return self.setBatchSize(nParallel)
1253
1274
 
1275
+ def setLogVerbosity(self, logVerbosity: int):
1276
+ """Set the log verbosity level"""
1277
+ return self._set(logVerbosity=logVerbosity)
1278
+
1279
+ def setDisableLog(self, disableLog: bool):
1280
+ """Whether to disable logging"""
1281
+ return self._set(disableLog=disableLog)
1282
+
1254
1283
  # -------- JAVA SETTERS --------
1255
1284
  def setTokenIdBias(self, tokenIdBias: Dict[int, float]):
1256
1285
  """Set token id bias"""
@@ -1260,9 +1289,9 @@ class HasLlamaCppProperties:
1260
1289
  """Set token bias"""
1261
1290
  return self._call_java("setTokenBias", tokenBias)
1262
1291
 
1263
- def setLoraAdapters(self, loraAdapters: Dict[str, float]):
1264
- """Set LoRA adapters with their scaling factors"""
1265
- return self._call_java("setLoraAdapters", loraAdapters)
1292
+ # def setLoraAdapters(self, loraAdapters: Dict[str, float]):
1293
+ # """Set LoRA adapters with their scaling factors"""
1294
+ # return self._call_java("setLoraAdapters", loraAdapters)
1266
1295
 
1267
1296
  def getMetadata(self):
1268
1297
  """Gets the metadata of the model"""
@@ -1182,4 +1182,13 @@ class _E5VEmbeddingsLoader(ExtendedJavaWrapper):
1182
1182
  path,
1183
1183
  jspark,
1184
1184
  use_openvino
1185
+ )
1186
+
1187
+ class _Phi4Loader(ExtendedJavaWrapper):
1188
+ def __init__(self, path, jspark, use_openvino=False):
1189
+ super(_Phi4Loader, self).__init__(
1190
+ "com.johnsnowlabs.nlp.annotators.seq2seq.Phi4Transformer.loadSavedModel",
1191
+ path,
1192
+ jspark,
1193
+ use_openvino,
1185
1194
  )
@@ -0,0 +1,210 @@
1
+ # Copyright 2017-2025 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from pyspark import keyword_only
15
+ from pyspark.ml.param import TypeConverters, Params, Param
16
+
17
+ from sparknlp.common import AnnotatorType
18
+ from sparknlp.internal import AnnotatorTransformer
19
+ from sparknlp.partition.partition_properties import *
20
+
21
+
22
+ class Reader2Doc(
23
+ AnnotatorTransformer,
24
+ HasEmailReaderProperties,
25
+ HasExcelReaderProperties,
26
+ HasHTMLReaderProperties,
27
+ HasPowerPointProperties,
28
+ HasTextReaderProperties
29
+ ):
30
+ """
31
+ The Reader2Doc annotator allows you to read files more smoothly within existing
32
+ Spark NLP workflows, enabling seamless reuse of your pipelines.
33
+
34
+ Reader2Doc can be used for extracting structured content from various document types
35
+ using Spark NLP readers. It supports reading from many file types and returns parsed
36
+ output as a structured Spark DataFrame.
37
+
38
+ Supported formats include:
39
+
40
+ - Plain text
41
+ - HTML
42
+ - Word (.doc/.docx)
43
+ - Excel (.xls/.xlsx)
44
+ - PowerPoint (.ppt/.pptx)
45
+ - Email files (.eml, .msg)
46
+ - PDFs
47
+
48
+ Examples
49
+ --------
50
+ >>> from johnsnowlabs.reader import Reader2Doc
51
+ >>> from johnsnowlabs.nlp.base import DocumentAssembler
52
+ >>> from pyspark.ml import Pipeline
53
+ >>> # Initialize Reader2Doc for PDF files
54
+ >>> reader2doc = Reader2Doc() \\
55
+ ... .setContentType("application/pdf") \\
56
+ ... .setContentPath(f"{pdf_directory}/")
57
+ >>> # Build the pipeline with the Reader2Doc stage
58
+ >>> pipeline = Pipeline(stages=[reader2doc])
59
+ >>> # Fit the pipeline to an empty DataFrame
60
+ >>> pipeline_model = pipeline.fit(empty_data_set)
61
+ >>> result_df = pipeline_model.transform(empty_data_set)
62
+ >>> # Show the resulting DataFrame
63
+ >>> result_df.show()
64
+ +------------------------------------------------------------------------------------------------------------------------------------+
65
+ |document |
66
+ +------------------------------------------------------------------------------------------------------------------------------------+
67
+ |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] |
68
+ |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
69
+ |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
70
+ +------------------------------------------------------------------------------------------------------------------------------------+
71
+ """
72
+
73
+ name = "Reader2Doc"
74
+ outputAnnotatorType = AnnotatorType.DOCUMENT
75
+
76
+ contentPath = Param(
77
+ Params._dummy(),
78
+ "contentPath",
79
+ "contentPath path to files to read",
80
+ typeConverter=TypeConverters.toString
81
+ )
82
+
83
+ outputCol = Param(
84
+ Params._dummy(),
85
+ "outputCol",
86
+ "output column name",
87
+ typeConverter=TypeConverters.toString
88
+ )
89
+
90
+ contentType = Param(
91
+ Params._dummy(),
92
+ "contentType",
93
+ "Set the content type to load following MIME specification",
94
+ typeConverter=TypeConverters.toString
95
+ )
96
+
97
+ explodeDocs = Param(
98
+ Params._dummy(),
99
+ "explodeDocs",
100
+ "whether to explode the documents into separate rows",
101
+ typeConverter=TypeConverters.toBoolean
102
+ )
103
+
104
+ flattenOutput = Param(
105
+ Params._dummy(),
106
+ "flattenOutput",
107
+ "If true, output is flattened to plain text with minimal metadata",
108
+ typeConverter=TypeConverters.toBoolean
109
+ )
110
+
111
+ titleThreshold = Param(
112
+ Params._dummy(),
113
+ "titleThreshold",
114
+ "Minimum font size threshold for title detection in PDF docs",
115
+ typeConverter=TypeConverters.toFloat
116
+ )
117
+
118
+ outputFormat = Param(
119
+ Params._dummy(),
120
+ "outputFormat",
121
+ "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
122
+ typeConverter=TypeConverters.toString
123
+ )
124
+
125
+ @keyword_only
126
+ def __init__(self):
127
+ super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
128
+ self._setDefault(
129
+ outputCol="document",
130
+ explodeDocs=False,
131
+ contentType="",
132
+ flattenOutput=False,
133
+ titleThreshold=18
134
+ )
135
+ @keyword_only
136
+ def setParams(self):
137
+ kwargs = self._input_kwargs
138
+ return self._set(**kwargs)
139
+
140
+ def setContentPath(self, value):
141
+ """Sets content path.
142
+
143
+ Parameters
144
+ ----------
145
+ value : str
146
+ Path to the files to read
147
+ """
148
+ return self._set(contentPath=value)
149
+
150
+ def setContentType(self, value):
151
+ """
152
+ Set the content type to load following MIME specification
153
+
154
+ Parameters
155
+ ----------
156
+ value : str
157
+ content type to load following MIME specification
158
+ """
159
+ return self._set(contentType=value)
160
+
161
+ def setExplodeDocs(self, value):
162
+ """Sets whether to explode the documents into separate rows.
163
+
164
+
165
+ Parameters
166
+ ----------
167
+ value : bool
168
+ Whether to explode the documents into separate rows
169
+ """
170
+ return self._set(explodeDocs=value)
171
+
172
+ def setOutputCol(self, value):
173
+ """Sets output column name.
174
+
175
+ Parameters
176
+ ----------
177
+ value : str
178
+ Name of the Output Column
179
+ """
180
+ return self._set(outputCol=value)
181
+
182
+ def setFlattenOutput(self, value):
183
+ """Sets whether to flatten the output to plain text with minimal metadata.
184
+
185
+ Parameters
186
+ ----------
187
+ value : bool
188
+ If true, output is flattened to plain text with minimal metadata
189
+ """
190
+ return self._set(flattenOutput=value)
191
+
192
+ def setTitleThreshold(self, value):
193
+ """Sets the minimum font size threshold for title detection in PDF documents.
194
+
195
+ Parameters
196
+ ----------
197
+ value : float
198
+ Minimum font size threshold for title detection in PDF docs
199
+ """
200
+ return self._set(titleThreshold=value)
201
+
202
+ def setOutputFormat(self, value):
203
+ """Sets the output format for the table content.
204
+
205
+ Parameters
206
+ ----------
207
+ value : str
208
+ Output format for the table content. Options are 'json-table' (default), 'plain-text' or 'html-table'.
209
+ """
210
+ return self._set(outputFormat=value)