spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.3__py2.py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/METADATA +13 -6
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/RECORD +39 -32
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +4 -2
- sparknlp/annotator/cv/__init__.py +2 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
- sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
- sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
- sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
- sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
- sparknlp/annotator/date2_chunk.py +1 -1
- sparknlp/annotator/document_character_text_splitter.py +8 -8
- sparknlp/annotator/document_token_splitter.py +7 -7
- sparknlp/annotator/embeddings/__init__.py +1 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
- sparknlp/annotator/openai/openai_completion.py +3 -4
- sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
- sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
- sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
- sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
- sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
- sparknlp/base/prompt_assembler.py +1 -1
- sparknlp/common/properties.py +7 -7
- sparknlp/internal/__init__.py +27 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +319 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/reader/pdf_to_text.py +50 -4
- sparknlp/reader/sparknlp_reader.py +101 -52
- sparknlp/training/spacy_to_annotation.py +7 -7
- sparknlp/util.py +26 -0
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/top_level.txt +0 -0
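
The headline change in this release is the new `sparknlp.partition` package: a unified `Partition` entry point, reusable reader-property mixins, and a `PartitionTransformer` for pipelines. A minimal sketch of the entry point, based on the docstrings added below; it assumes the package's new `__init__.py` re-exports `Partition` (otherwise import it from `sparknlp.partition.partition`) and that the input directory exists:

from sparknlp.partition import Partition  # assumption: re-exported by the new package __init__

# Partition() starts or reuses a Spark NLP session internally via sparknlp.start();
# content_type overrides automatic file-type detection (see the docstring below).
partition_df = Partition(content_type="text/plain").partition("./txt-files")  # placeholder path
partition_df.show()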

sparknlp/partition/partition.py (new file)

@@ -0,0 +1,244 @@

# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the Partition annotator for reading and processing various document types."""
import sparknlp
from sparknlp.internal import ExtendedJavaWrapper


class Partition(ExtendedJavaWrapper):
    """
    A unified interface for extracting structured content from various document types
    using Spark NLP readers.

    This class supports reading from files, URLs, in-memory strings, or byte arrays,
    and returns the parsed output as a structured Spark DataFrame.

    Supported formats include:

    - Plain text
    - HTML
    - Word (.doc/.docx)
    - Excel (.xls/.xlsx)
    - PowerPoint (.ppt/.pptx)
    - Email files (.eml, .msg)
    - PDFs

    Parameters
    ----------
    **kwargs
        Configuration parameters, including:

        - content_type : str
            Override automatic file type detection.
        - store_content : bool
            Include the raw file content in the output DataFrame.
        - timeout : int
            Timeout for fetching HTML content.
        - title_font_size : int
            Font size used to identify titles.
        - include_page_breaks : bool
            Tag content with page-break metadata.
        - group_broken_paragraphs : bool
            Merge broken lines into full paragraphs.
        - title_length_size : int
            Maximum character length for a text block to qualify as a title.
        - paragraph_split : str
            Regex used to detect paragraph boundaries.
        - short_line_word_threshold : int
            Maximum number of words for a line to be considered short.
        - threshold : float
            Ratio of empty lines used to switch the grouping strategy.
        - max_line_count : int
            Maximum number of lines evaluated in paragraph analysis.
        - include_slide_notes : bool
            Include speaker notes in the output.
        - infer_table_structure : bool
            Generate HTML table structure.
        - append_cells : bool
            Merge Excel rows into one content block.
        - cell_separator : str
            String used to join cell values in a row.
        - add_attachment_content : bool
            Include the text of plain-text attachments.
        - headers : dict
            Request headers to send when reading from URLs.

    Examples
    --------
    Reading Text Files

    >>> txt_directory = "/content/txtfiles/reader/txt"
    >>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
    >>> partition_df.show()
    +--------------------+--------------------+
    |                path|                 txt|
    +--------------------+--------------------+
    |file:/content/txt...|[{Title, BIG DATA...|
    +--------------------+--------------------+

    Reading Email Files

    >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
    >>> partition_df.show()
    +--------------------+--------------------+
    |                path|               email|
    +--------------------+--------------------+
    |file:/content/ema...|[{Title, Test Sev...|
    +--------------------+--------------------+

    Reading Webpages

    >>> partition_df = Partition().partition(
    ...     "https://www.wikipedia.com",
    ...     headers={"Accept-Language": "es-ES"}
    ... )
    >>> partition_df.show()
    +--------------------+--------------------+
    |                 url|                html|
    +--------------------+--------------------+
    |https://www.wikip...|[{Title, Wikipedi...|
    +--------------------+--------------------+

    For more examples, refer to:
    `examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
    """

    def __init__(self, **kwargs):
        self.spark = sparknlp.start()
        params = {}
        for key, value in kwargs.items():
            try:
                params[key] = str(value)
            except Exception as e:
                raise ValueError(
                    f"Invalid value for key '{key}': cannot cast {type(value)} to string. "
                    f"Original error: {e}"
                )

        super(Partition, self).__init__("com.johnsnowlabs.partition.Partition", params)

    def partition(self, path, headers=None):
        """
        Reads and parses content from a URL, file, or directory path.

        Parameters
        ----------
        path : str
            Path to a file or directory. URLs and DFS paths are supported.
        headers : dict, optional
            Headers for URL requests.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with the parsed content.
        """
        if headers is None:
            headers = {}
        jdf = self._java_obj.partition(path, headers)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe

    def partition_urls(self, path, headers=None):
        """
        Reads and parses content from multiple URLs.

        Parameters
        ----------
        path : list[str]
            List of URLs.
        headers : dict, optional
            Request headers for the URLs.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with the parsed URL content.

        Examples
        --------
        >>> urls_df = Partition().partition_urls([
        ...     "https://www.wikipedia.org", "https://example.com/"
        ... ])
        >>> urls_df.show()
        +--------------------+--------------------+
        |                 url|                html|
        +--------------------+--------------------+
        |https://www.wikip...|[{Title, Wikipedi...|
        |https://example.com/|[{Title, Example ...|
        +--------------------+--------------------+

        >>> urls_df.printSchema()
        root
         |-- url: string (nullable = true)
         |-- html: array (nullable = true)
         |    |-- element: struct (containsNull = true)
         |    |    |-- elementType: string (nullable = true)
         |    |    |-- content: string (nullable = true)
         |    |    |-- metadata: map (nullable = true)
         |    |    |    |-- key: string
         |    |    |    |-- value: string (valueContainsNull = true)
        """
        if headers is None:
            headers = {}
        jdf = self._java_obj.partitionUrlsJava(path, headers)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe

    def partition_text(self, text):
        """
        Parses content from a raw text string.

        Parameters
        ----------
        text : str
            Raw text input.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with the parsed text.

        Examples
        --------
        >>> raw_text = (
        ...     "The big brown fox\\n"
        ...     "was walking down the lane.\\n"
        ...     "\\n"
        ...     "At the end of the lane,\\n"
        ...     "the fox met a bear."
        ... )
        >>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
        >>> text_df.show()
        +--------------------------------------+
        |txt                                   |
        +--------------------------------------+
        |[{NarrativeText, The big brown fox was|
        +--------------------------------------+
        >>> text_df.printSchema()
        root
         |-- txt: array (nullable = true)
         |    |-- element: struct (containsNull = true)
         |    |    |-- elementType: string (nullable = true)
         |    |    |-- content: string (nullable = true)
         |    |    |-- metadata: map (nullable = true)
         |    |    |    |-- key: string
         |    |    |    |-- value: string (valueContainsNull = true)
        """
        jdf = self._java_obj.partitionText(text)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe
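
The keyword parameters above are plain Python values that `__init__` stringifies before handing them to the JVM-side `com.johnsnowlabs.partition.Partition`, so booleans and integers can be passed naturally. A hedged sketch combining several of the documented options for spreadsheet input; the exact `content_type` value for Excel is an assumption (the docstring only shows "text/plain"), and the input directory is a placeholder:

from sparknlp.partition import Partition

xls_df = Partition(
    content_type="application/vnd.ms-excel",  # assumed MIME-style value, mirroring the "text/plain" example
    store_content=True,    # also keep the raw file content in the output DataFrame
    append_cells=False,    # one element per row instead of a single merged block
    cell_separator=";",    # join cell values within a row using ";"
).partition("./excel-files")  # placeholder directory
xls_df.show()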
sparknlp/partition/partition_properties.py (new file)

@@ -0,0 +1,319 @@

# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for partition properties used in reading various document types."""
from typing import Dict

from pyspark.ml.param import TypeConverters, Params, Param


class HasEmailReaderProperties(Params):

    addAttachmentContent = Param(
        Params._dummy(),
        "addAttachmentContent",
        "Whether to extract and include the textual content of plain-text attachments in the output",
        typeConverter=TypeConverters.toBoolean
    )

    def setAddAttachmentContent(self, value):
        """
        Sets whether to extract and include the textual content of plain-text attachments in the output.

        Parameters
        ----------
        value : bool
            Whether to include text from plain-text attachments.
        """
        return self._set(addAttachmentContent=value)

    def getAddAttachmentContent(self):
        """
        Gets whether to extract and include the textual content of plain-text attachments in the output.

        Returns
        -------
        bool
            Whether to include text from plain-text attachments.
        """
        return self.getOrDefault(self.addAttachmentContent)


class HasExcelReaderProperties(Params):

    cellSeparator = Param(
        Params._dummy(),
        "cellSeparator",
        "String used to join cell values in a row when assembling textual output.",
        typeConverter=TypeConverters.toString
    )

    def setCellSeparator(self, value):
        """
        Sets the string used to join cell values in a row when assembling textual output.

        Parameters
        ----------
        value : str
            Delimiter used to concatenate cell values.
        """
        return self._set(cellSeparator=value)

    def getCellSeparator(self):
        """
        Gets the string used to join cell values in a row when assembling textual output.

        Returns
        -------
        str
            Delimiter used to concatenate cell values.
        """
        return self.getOrDefault(self.cellSeparator)

    appendCells = Param(
        Params._dummy(),
        "appendCells",
        "Whether to append all rows into a single content block instead of creating separate elements per row.",
        typeConverter=TypeConverters.toBoolean
    )

    def setAppendCells(self, value):
        """
        Sets whether to append all rows into a single content block.

        Parameters
        ----------
        value : bool
            True to merge rows into one block, False for individual elements.
        """
        return self._set(appendCells=value)

    def getAppendCells(self):
        """
        Gets whether to append all rows into a single content block.

        Returns
        -------
        bool
            True to merge rows into one block, False for individual elements.
        """
        return self.getOrDefault(self.appendCells)


class HasHTMLReaderProperties(Params):

    timeout = Param(
        Params._dummy(),
        "timeout",
        "Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs.",
        typeConverter=TypeConverters.toInt
    )

    def setTimeout(self, value):
        """
        Sets the timeout (in seconds) for reading remote HTML resources.

        Parameters
        ----------
        value : int
            Timeout in seconds for remote content retrieval.
        """
        return self._set(timeout=value)

    def getTimeout(self):
        """
        Gets the timeout value for reading remote HTML resources.

        Returns
        -------
        int
            Timeout in seconds.
        """
        return self.getOrDefault(self.timeout)

    def setHeaders(self, headers: Dict[str, str]):
        self._call_java("setHeadersPython", headers)
        return self


class HasPowerPointProperties(Params):

    includeSlideNotes = Param(
        Params._dummy(),
        "includeSlideNotes",
        "Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIncludeSlideNotes(self, value):
        """
        Sets whether to extract speaker notes from slides.

        Parameters
        ----------
        value : bool
            If True, notes are included as narrative text elements.
        """
        return self._set(includeSlideNotes=value)

    def getIncludeSlideNotes(self):
        """
        Gets whether to extract speaker notes from slides.

        Returns
        -------
        bool
            True if notes are included as narrative text elements.
        """
        return self.getOrDefault(self.includeSlideNotes)


class HasTextReaderProperties(Params):

    titleLengthSize = Param(
        Params._dummy(),
        "titleLengthSize",
        "Maximum character length used to determine if a text block qualifies as a title during parsing.",
        typeConverter=TypeConverters.toInt
    )

    def setTitleLengthSize(self, value):
        return self._set(titleLengthSize=value)

    def getTitleLengthSize(self):
        return self.getOrDefault(self.titleLengthSize)

    groupBrokenParagraphs = Param(
        Params._dummy(),
        "groupBrokenParagraphs",
        "Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
        typeConverter=TypeConverters.toBoolean
    )

    def setGroupBrokenParagraphs(self, value):
        return self._set(groupBrokenParagraphs=value)

    def getGroupBrokenParagraphs(self):
        return self.getOrDefault(self.groupBrokenParagraphs)

    paragraphSplit = Param(
        Params._dummy(),
        "paragraphSplit",
        "Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
        typeConverter=TypeConverters.toString
    )

    def setParagraphSplit(self, value):
        return self._set(paragraphSplit=value)

    def getParagraphSplit(self):
        return self.getOrDefault(self.paragraphSplit)

    shortLineWordThreshold = Param(
        Params._dummy(),
        "shortLineWordThreshold",
        "Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
        typeConverter=TypeConverters.toInt
    )

    def setShortLineWordThreshold(self, value):
        return self._set(shortLineWordThreshold=value)

    def getShortLineWordThreshold(self):
        return self.getOrDefault(self.shortLineWordThreshold)

    maxLineCount = Param(
        Params._dummy(),
        "maxLineCount",
        "Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
        typeConverter=TypeConverters.toInt
    )

    def setMaxLineCount(self, value):
        return self._set(maxLineCount=value)

    def getMaxLineCount(self):
        return self.getOrDefault(self.maxLineCount)

    threshold = Param(
        Params._dummy(),
        "threshold",
        "Threshold ratio of empty lines used to decide between new line-based or broken-paragraph grouping.",
        typeConverter=TypeConverters.toFloat
    )

    def setThreshold(self, value):
        return self._set(threshold=value)

    def getThreshold(self):
        return self.getOrDefault(self.threshold)


class HasChunkerProperties(Params):

    chunkingStrategy = Param(
        Params._dummy(),
        "chunkingStrategy",
        "Set the chunking strategy",
        typeConverter=TypeConverters.toString
    )

    def setChunkingStrategy(self, value):
        return self._set(chunkingStrategy=value)

    maxCharacters = Param(
        Params._dummy(),
        "maxCharacters",
        "Set the maximum number of characters",
        typeConverter=TypeConverters.toInt
    )

    def setMaxCharacters(self, value):
        return self._set(maxCharacters=value)

    newAfterNChars = Param(
        Params._dummy(),
        "newAfterNChars",
        "Insert a new chunk after N characters",
        typeConverter=TypeConverters.toInt
    )

    def setNewAfterNChars(self, value):
        return self._set(newAfterNChars=value)

    overlap = Param(
        Params._dummy(),
        "overlap",
        "Set the number of overlapping characters between chunks",
        typeConverter=TypeConverters.toInt
    )

    def setOverlap(self, value):
        return self._set(overlap=value)

    combineTextUnderNChars = Param(
        Params._dummy(),
        "combineTextUnderNChars",
        "Threshold to merge adjacent small sections",
        typeConverter=TypeConverters.toInt
    )

    def setCombineTextUnderNChars(self, value):
        return self._set(combineTextUnderNChars=value)

    overlapAll = Param(
        Params._dummy(),
        "overlapAll",
        "Apply overlap context between all sections, not just split chunks",
        typeConverter=TypeConverters.toBoolean
    )

    def setOverlapAll(self, value):
        return self._set(overlapAll=value)
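
These mixins carry only Param definitions plus their setters/getters; the new PartitionTransformer (also added in this diff) is their intended consumer. Below is a standalone toy sketch of the get/set mechanics, assuming the module path `sparknlp.partition.partition_properties` matches the file path above; the holder class and the "basic" strategy value are hypothetical, not part of the library:

from sparknlp.partition.partition_properties import (
    HasChunkerProperties,
    HasTextReaderProperties,
)

class ToyReaderSettings(HasTextReaderProperties, HasChunkerProperties):
    """Hypothetical holder class; exists only to exercise the mixins."""
    def __init__(self):
        super(ToyReaderSettings, self).__init__()

settings = ToyReaderSettings()
settings.setGroupBrokenParagraphs(True)  # TypeConverters.toBoolean validates the value
settings.setChunkingStrategy("basic")    # hypothetical strategy name
settings.setMaxCharacters(500)
print(settings.getGroupBrokenParagraphs())  # -> True

Note that `HasHTMLReaderProperties.setHeaders` differs from the pure-Param setters: it forwards to the JVM via `_call_java("setHeadersPython", ...)`, so it only works on classes backed by a Java object, not on a plain Params subclass like the toy above.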