spark-nlp 6.1.3rc1__py2.py3-none-any.whl → 6.1.4__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -21,9 +21,10 @@ from sparknlp.partition.partition_properties import *
 
 class Reader2Doc(
     AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
-    HasHTMLReaderProperties,
     HasPowerPointProperties,
     HasTextReaderProperties
 ):
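The base-class reshuffle above changes Python's method resolution order: HasReaderProperties and HasHTMLReaderProperties now sit ahead of the other reader mixins, so their attributes and defaults win any name collisions. A generic illustration of how mixin order drives attribute lookup (toy classes, not spark-nlp code):

    class HasReaderProperties: pass
    class HasHTMLReaderProperties: pass
    class HasEmailReaderProperties: pass

    class Reader2Doc(HasReaderProperties, HasHTMLReaderProperties, HasEmailReaderProperties):
        pass

    # C3 linearization: mixins listed first are consulted first during lookup.
    print([c.__name__ for c in Reader2Doc.__mro__])
    # ['Reader2Doc', 'HasReaderProperties', 'HasHTMLReaderProperties',
    #  'HasEmailReaderProperties', 'object']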
@@ -73,33 +74,6 @@ class Reader2Doc(
     name = "Reader2Doc"
     outputAnnotatorType = AnnotatorType.DOCUMENT
 
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
 
     flattenOutput = Param(
         Params._dummy(),
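The four Params removed here (contentPath, outputCol, contentType, explodeDocs) vanish from all three Reader2* classes in this release just as HasReaderProperties joins their bases, which suggests the declarations were consolidated into that shared mixin rather than dropped. A minimal sketch of the assumed mixin shape (the real definition lives in sparknlp.partition.partition_properties and may differ):

    from pyspark.ml.param import Param, Params, TypeConverters

    class HasReaderProperties(Params):
        # Assumed consolidation of the Params previously duplicated across
        # Reader2Doc, Reader2Table, and Reader2Image; strings copied from the diff.
        contentPath = Param(Params._dummy(), "contentPath",
                            "contentPath path to files to read",
                            typeConverter=TypeConverters.toString)
        contentType = Param(Params._dummy(), "contentType",
                            "Set the content type to load following MIME specification",
                            typeConverter=TypeConverters.toString)
        explodeDocs = Param(Params._dummy(), "explodeDocs",
                            "whether to explode the documents into separate rows",
                            typeConverter=TypeConverters.toBoolean)

        def setContentPath(self, value):
            # Setter shape mirrors the removed per-class setters.
            return self._set(contentPath=value)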
@@ -115,11 +89,18 @@ class Reader2Doc(
         typeConverter=TypeConverters.toFloat
     )
 
-    outputFormat = Param(
+    outputAsDocument = Param(
+        Params._dummy(),
+        "outputAsDocument",
+        "Whether to return all sentences joined into a single document",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    excludeNonText = Param(
         Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
+        "excludeNonText",
+        "Whether to exclude non-text content from the output. Default is False.",
+        typeConverter=TypeConverters.toBoolean
     )
 
     @keyword_only
@@ -137,52 +118,11 @@ class Reader2Doc(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
 
     def setFlattenOutput(self, value):
         """Sets whether to flatten the output to plain text with minimal metadata.
 
-        Parameters
+        ParametersF
         ----------
         value : bool
             If true, output is flattened to plain text with minimal metadata
@@ -199,12 +139,22 @@ class Reader2Doc(
         """
         return self._set(titleThreshold=value)
 
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
+    def setOutputAsDocument(self, value):
+        """Sets whether to return all sentences joined into a single document.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to return all sentences joined into a single document
+        """
+        return self._set(outputAsDocument=value)
+
+    def setExcludeNonText(self, value):
+        """Sets whether to exclude non-text content from the output.
 
         Parameters
         ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
+        value : bool
+            Whether to exclude non-text content from the output. Default is False.
         """
-        return self._set(outputFormat=value)
+        return self._set(excludeNonText=value)
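Net effect for Reader2Doc in 6.1.4: outputFormat is gone, replaced by the booleans outputAsDocument and excludeNonText, while the path, column, and content-type parameters are now inherited. A hedged usage sketch (the import path and file glob are placeholders; setContentPath and setOutputCol are assumed to come from the new mixins):

    from sparknlp.reader import Reader2Doc  # import path assumed

    reader2doc = (
        Reader2Doc()
        .setContentPath("/tmp/docs/*.html")  # placeholder glob
        .setOutputCol("document")
        .setOutputAsDocument(True)   # join all sentences into one document
        .setExcludeNonText(True)     # drop non-text elements from the output
    )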
@@ -20,46 +20,84 @@ from sparknlp.partition.partition_properties import *
 
 class Reader2Image(
     AnnotatorTransformer,
-    HasHTMLReaderProperties
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasPdfProperties
 ):
+    """
+    The Reader2Image annotator allows you to read files with images more smoothly within existing
+    Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Image can be used for
+    extracting structured image content from various document types using Spark NLP readers. It supports
+    reading from many file types and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include HTML and Markdown.
+
+    == Example ==
+    This example demonstrates how to load HTML files with images and process them into a structured
+    Spark DataFrame using Reader2Image.
+
+    Expected output:
+    +-------------------+--------------------+
+    |           fileName|               image|
+    +-------------------+--------------------+
+    |example-images.html|[{image, example-...|
+    |example-images.html|[{image, example-...|
+    +-------------------+--------------------+
+
+    Schema:
+    root
+     |-- fileName: string (nullable = true)
+     |-- image: array (nullable = false)
+     |    |-- element: struct (containsNull = true)
+     |    |    |-- annotatorType: string (nullable = true)
+     |    |    |-- origin: string (nullable = true)
+     |    |    |-- height: integer (nullable = false)
+     |    |    |-- width: integer (nullable = false)
+     |    |    |-- nChannels: integer (nullable = false)
+     |    |    |-- mode: integer (nullable = false)
+     |    |    |-- result: binary (nullable = true)
+     |    |    |-- metadata: map (nullable = true)
+     |    |    |    |-- key: string
+     |    |    |    |-- value: string (valueContainsNull = true)
+     |    |    |-- text: string (nullable = true)
+    """
+
     name = "Reader2Image"
     outputAnnotatorType = AnnotatorType.IMAGE
 
-    contentPath = Param(
+    userMessage = Param(
         Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
+        "userMessage",
+        "Custom user message.",
         typeConverter=TypeConverters.toString
    )
 
-    outputCol = Param(
+    promptTemplate = Param(
         Params._dummy(),
-        "outputCol",
-        "output column name",
+        "promptTemplate",
+        "Format of the output prompt.",
         typeConverter=TypeConverters.toString
     )
 
-    contentType = Param(
+    customPromptTemplate = Param(
         Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
+        "customPromptTemplate",
+        "Custom prompt template for image models.",
         typeConverter=TypeConverters.toString
     )
 
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
-
     @keyword_only
     def __init__(self):
         super(Reader2Image, self).__init__(classname="com.johnsnowlabs.reader.Reader2Image")
         self._setDefault(
-            outputCol="document",
+            contentType="",
+            outputFormat="image",
             explodeDocs=True,
-            contentType=""
+            userMessage="Describe this image",
+            promptTemplate="qwen2vl-chat",
+            readAsImage=True,
+            customPromptTemplate="",
+            ignoreExceptions=True
         )
 
     @keyword_only
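The new constructor defaults configure Reader2Image for vision-language prompting out of the box (a qwen2vl-chat template and a generic "Describe this image" user message). Assuming these behave like ordinary PySpark Params, they can be inspected and overridden after construction:

    reader2image = Reader2Image()

    # getOrDefault resolves a param by name and falls back to the __init__ defaults.
    print(reader2image.getOrDefault("promptTemplate"))  # expected: "qwen2vl-chat"
    print(reader2image.getOrDefault("userMessage"))     # expected: "Describe this image"

    # Override the prompt for a different downstream model.
    reader2image = reader2image.setUserMessage("List every object in this image")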
@@ -67,44 +105,32 @@ class Reader2Image(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
+    def setUserMessage(self, value: str):
+        """Sets custom user message.
 
         Parameters
         ----------
         value : str
-            contentPath path to files to read
+            Custom user message to include.
         """
-        return self._set(contentPath=value)
+        return self._set(userMessage=value)
 
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
+    def setPromptTemplate(self, value: str):
+        """Sets format of the output prompt.
 
         Parameters
         ----------
         value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
+            Prompt template format.
         """
-        return self._set(explodeDocs=value)
+        return self._set(promptTemplate=value)
 
-    def setOutputCol(self, value):
-        """Sets output column name.
+    def setCustomPromptTemplate(self, value: str):
+        """Sets custom prompt template for image models.
 
         Parameters
         ----------
         value : str
-            Name of the Output Column
+            Custom prompt template string.
         """
-        return self._set(outputCol=value)
+        return self._set(customPromptTemplate=value)
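Putting the Reader2Image changes together, a hedged end-to-end sketch in the spirit of the docstring's example (the directory is a placeholder, setContentPath is assumed to be inherited from HasReaderProperties, and `spark` is an active SparkSession):

    from pyspark.ml import Pipeline
    from sparknlp.reader import Reader2Image  # import path assumed

    reader2image = (
        Reader2Image()
        .setContentPath("/tmp/html-with-images")  # placeholder directory
        .setOutputCol("image")
    )

    # Content is read from contentPath, so an empty DataFrame drives the pipeline.
    empty_df = spark.createDataFrame([], "string").toDF("text")
    result = Pipeline(stages=[reader2image]).fit(empty_df).transform(empty_df)
    result.select("image").show(truncate=False)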
@@ -13,14 +13,15 @@
 # limitations under the License.
 
 from pyspark import keyword_only
-from pyspark.ml.param import TypeConverters, Params, Param
 
 from sparknlp.common import AnnotatorType
 from sparknlp.internal import AnnotatorTransformer
 from sparknlp.partition.partition_properties import *
 
+
 class Reader2Table(
     AnnotatorTransformer,
+    HasReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
     HasHTMLReaderProperties,
@@ -31,34 +32,6 @@ class Reader2Table(
 
     outputAnnotatorType = AnnotatorType.DOCUMENT
 
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
-
     flattenOutput = Param(
         Params._dummy(),
         "flattenOutput",
@@ -73,13 +46,6 @@ class Reader2Table(
         typeConverter=TypeConverters.toFloat
     )
 
-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
-
     @keyword_only
     def __init__(self):
         super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
@@ -90,48 +56,6 @@ class Reader2Table(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
-
 
     def setFlattenOutput(self, value):
         """Sets whether to flatten the output to plain text with minimal metadata.
@@ -151,13 +75,3 @@ class Reader2Table(
             Minimum font size threshold for title detection in PDF docs
         """
         return self._set(titleThreshold=value)
-
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
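With outputFormat removed from Reader2Table as well, the class keeps only flattenOutput and titleThreshold of its own, with everything else inherited from the mixins. A hedged sketch (the import path and file glob are placeholders/assumptions):

    from sparknlp.reader import Reader2Table  # import path assumed

    reader2table = (
        Reader2Table()
        .setContentPath("/tmp/reports/*.xlsx")  # placeholder glob
        .setOutputCol("table")
        .setFlattenOutput(True)    # plain text with minimal metadata
        .setTitleThreshold(18.0)   # min font size for PDF title detection
    )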
@@ -1,225 +0,0 @@
-# Copyright 2017-2023 John Snow Labs
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Contains classes for RoBertaForZeroShotClassification."""
-
-from sparknlp.common import *
-
-
-class RoBertaForZeroShotClassification(AnnotatorModel,
-                                       HasCaseSensitiveProperties,
-                                       HasBatchedAnnotate,
-                                       HasClassifierActivationProperties,
-                                       HasCandidateLabelsProperties,
-                                       HasEngine):
-    """RoBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language
-    inference) tasks. Equivalent of `RoBertaForSequenceClassification` models, but these models don't require a hardcoded
-    number of potential classes, they can be chosen at runtime. It usually means it's slower but it is much more
-    flexible.
-
-    Note that the model will loop through all provided labels. So the more labels you have, the
-    longer this process will take.
-
-    Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
-    pair and passed to the pretrained model.
-
-    Pretrained models can be loaded with :meth:`.pretrained` of the companion
-    object:
-
-    >>> sequenceClassifier = RoBertaForZeroShotClassification.pretrained() \\
-    ...     .setInputCols(["token", "document"]) \\
-    ...     .setOutputCol("label")
-
-    The default model is ``"roberta_base_zero_shot_classifier_nli"``, if no name is
-    provided.
-
-    For available pretrained models please see the `Models Hub
-    <https://sparknlp.org/models?task=Text+Classification>`__.
-
-    To see which models are compatible and how to import them see
-    `Import Transformers into Spark NLP 🚀
-    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
-
-    ====================== ======================
-    Input Annotation types Output Annotation type
-    ====================== ======================
-    ``DOCUMENT, TOKEN``    ``CATEGORY``
-    ====================== ======================
-
-    Parameters
-    ----------
-    batchSize
-        Batch size. Large values allows faster processing but requires more
-        memory, by default 8
-    caseSensitive
-        Whether to ignore case in tokens for embeddings matching, by default
-        True
-    configProtoBytes
-        ConfigProto from tensorflow, serialized into byte array.
-    maxSentenceLength
-        Max sentence length to process, by default 128
-    coalesceSentences
-        Instead of 1 class per sentence (if inputCols is `sentence`) output 1
-        class per document by averaging probabilities in all sentences, by
-        default False
-    activation
-        Whether to calculate logits via Softmax or Sigmoid, by default
-        `"softmax"`.
-
-    Examples
-    --------
-    >>> import sparknlp
-    >>> from sparknlp.base import *
-    >>> from sparknlp.annotator import *
-    >>> from pyspark.ml import Pipeline
-    >>> documentAssembler = DocumentAssembler() \\
-    ...     .setInputCol("text") \\
-    ...     .setOutputCol("document")
-    >>> tokenizer = Tokenizer() \\
-    ...     .setInputCols(["document"]) \\
-    ...     .setOutputCol("token")
-    >>> sequenceClassifier = RoBertaForZeroShotClassification.pretrained() \\
-    ...     .setInputCols(["token", "document"]) \\
-    ...     .setOutputCol("label") \\
-    ...     .setCaseSensitive(True)
-    >>> pipeline = Pipeline().setStages([
-    ...     documentAssembler,
-    ...     tokenizer,
-    ...     sequenceClassifier
-    ... ])
-    >>> data = spark.createDataFrame([["I loved this movie when I was a child.", "It was pretty boring."]]).toDF("text")
-    >>> result = pipeline.fit(data).transform(data)
-    >>> result.select("label.result").show(truncate=False)
-    +------+
-    |result|
-    +------+
-    |[pos] |
-    |[neg] |
-    +------+
-    """
-    name = "RoBertaForZeroShotClassification"
-
-    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
-
-    outputAnnotatorType = AnnotatorType.CATEGORY
-
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
-    configProtoBytes = Param(Params._dummy(),
-                             "configProtoBytes",
-                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
-                             TypeConverters.toListInt)
-
-    coalesceSentences = Param(Params._dummy(), "coalesceSentences",
-                              "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
-                              TypeConverters.toBoolean)
-
-    def getClasses(self):
-        """
-        Returns labels used to train this model
-        """
-        return self._call_java("getClasses")
-
-    def setConfigProtoBytes(self, b):
-        """Sets configProto from tensorflow, serialized into byte array.
-
-        Parameters
-        ----------
-        b : List[int]
-            ConfigProto from tensorflow, serialized into byte array
-        """
-        return self._set(configProtoBytes=b)
-
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process, by default 128.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
-    def setCoalesceSentences(self, value):
-        """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging
-        probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as RoBerta
-        (512 tokens), this parameter helps to feed all the sentences into the model and averaging all the probabilities
-        for the entire document instead of probabilities per sentence. (Default: true)
-
-        Parameters
-        ----------
-        value : bool
-            If the output of all sentences will be averaged to one output
-        """
-        return self._set(coalesceSentences=value)
-
-    @keyword_only
-    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForZeroShotClassification",
-                 java_model=None):
-        super(RoBertaForZeroShotClassification, self).__init__(
-            classname=classname,
-            java_model=java_model
-        )
-        self._setDefault(
-            batchSize=8,
-            maxSentenceLength=128,
-            caseSensitive=True,
-            coalesceSentences=False,
-            activation="softmax"
-        )
-
-    @staticmethod
-    def loadSavedModel(folder, spark_session):
-        """Loads a locally saved model.
-
-        Parameters
-        ----------
-        folder : str
-            Folder of the saved model
-        spark_session : pyspark.sql.SparkSession
-            The current SparkSession
-
-        Returns
-        -------
-        RoBertaForZeroShotClassification
-            The restored model
-        """
-        from sparknlp.internal import _RoBertaForZeroShotClassification
-        jModel = _RoBertaForZeroShotClassification(folder, spark_session._jsparkSession)._java_obj
-        return RoBertaForZeroShotClassification(java_model=jModel)
-
-    @staticmethod
-    def pretrained(name="roberta_base_zero_shot_classifier_nli", lang="en", remote_loc=None):
-        """Downloads and loads a pretrained model.
-
-        Parameters
-        ----------
-        name : str, optional
-            Name of the pretrained model, by default
-            "roberta_base_zero_shot_classifier_nli"
-        lang : str, optional
-            Language of the pretrained model, by default "en"
-        remote_loc : str, optional
-            Optional remote address of the resource, by default None. Will use
-            Spark NLPs repositories otherwise.
-
-        Returns
-        -------
-        RoBertaForZeroShotClassification
-            The restored model
-        """
-        from sparknlp.pretrained import ResourceDownloader
-        return ResourceDownloader.downloadModel(RoBertaForZeroShotClassification, name, lang, remote_loc)
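Because this hunk removes the whole module, code importing RoBertaForZeroShotClassification should guard against the class being absent in 6.1.4 unless it has simply moved (this diff alone does not show a new location). A defensive sketch:

    try:
        from sparknlp.annotator import RoBertaForZeroShotClassification
    except ImportError:
        RoBertaForZeroShotClassification = None  # not shipped in this build

    if RoBertaForZeroShotClassification is None:
        print("RoBertaForZeroShotClassification unavailable; pin spark-nlp<6.1.4 "
              "or switch to another zero-shot classification annotator.")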