paddleocr-haystack 0.1.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ # Changelog
2
+
3
+ ## [integrations/paddleocr-v1.0.0] - 2026-01-12
4
+
5
+ ### 🧹 Chores
6
+
7
+ - Make fmt command more forgiving (#2671)
8
+ - [**breaking**] Paddleocr - drop Python 3.9 and use X|Y typing (#2714)
9
+
10
+ ### 🌀 Miscellaneous
11
+
12
+ - Feat: Add 'Client-Platform' header for server processing for PaddleOCR (#2657)
13
+
14
+ ## [integrations/paddleocr-v0.1.0] - 2025-12-10
15
+
16
+ ### 🚀 Features
17
+
18
+ - Add PaddleOCR-VL document converter (#2567)
19
+
20
+ <!-- generated by git-cliff -->
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paddleocr-haystack
3
- Version: 0.1.0
3
+ Version: 1.1.0
4
4
  Summary: An integration of PaddleOCR with Haystack
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/paddleocr#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -10,16 +10,15 @@ License-Expression: Apache-2.0
10
10
  License-File: LICENSE.txt
11
11
  Classifier: Development Status :: 4 - Beta
12
12
  Classifier: Programming Language :: Python
13
- Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Programming Language :: Python :: 3.12
17
16
  Classifier: Programming Language :: Python :: Implementation :: CPython
18
17
  Classifier: Programming Language :: Python :: Implementation :: PyPy
19
- Requires-Python: >=3.9
20
- Requires-Dist: haystack-ai>=2.19.0
21
- Requires-Dist: paddleocr>=3.3.2
22
- Requires-Dist: paddlex[serving]>=3.3.10
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: haystack-ai>=2.22.0
20
+ Requires-Dist: paddleocr>=3.4.0
21
+ Requires-Dist: paddlex[serving]>=3.4.0
23
22
  Requires-Dist: requests>=2.25.0
24
23
  Description-Content-Type: text/markdown
25
24
 
@@ -7,7 +7,7 @@ name = "paddleocr-haystack"
7
7
  dynamic = ["version"]
8
8
  description = 'An integration of PaddleOCR with Haystack'
9
9
  readme = "README.md"
10
- requires-python = ">=3.9"
10
+ requires-python = ">=3.10"
11
11
  license = "Apache-2.0"
12
12
  keywords = []
13
13
  authors = [
@@ -16,7 +16,6 @@ authors = [
16
16
  classifiers = [
17
17
  "Development Status :: 4 - Beta",
18
18
  "Programming Language :: Python",
19
- "Programming Language :: Python :: 3.9",
20
19
  "Programming Language :: Python :: 3.10",
21
20
  "Programming Language :: Python :: 3.11",
22
21
  "Programming Language :: Python :: 3.12",
@@ -24,9 +23,9 @@ classifiers = [
24
23
  "Programming Language :: Python :: Implementation :: PyPy",
25
24
  ]
26
25
  dependencies = [
27
- "haystack-ai>=2.19.0",
28
- "paddleocr>=3.3.2",
29
- "paddlex[serving]>=3.3.10",
26
+ "haystack-ai>=2.22.0",
27
+ "paddleocr>=3.4.0",
28
+ "paddlex[serving]>=3.4.0",
30
29
  "requests>=2.25.0",
31
30
  ]
32
31
 
@@ -52,7 +51,7 @@ dependencies = ["haystack-pydoc-tools", "ruff"]
52
51
 
53
52
  [tool.hatch.envs.default.scripts]
54
53
  docs = ["pydoc-markdown pydoc/config_docusaurus.yml"]
55
- fmt = "ruff check --fix {args} && ruff format {args}"
54
+ fmt = "ruff check --fix {args}; ruff format {args}"
56
55
  fmt-check = "ruff check {args} && ruff format --check {args}"
57
56
 
58
57
  [tool.hatch.envs.test]
@@ -80,7 +79,6 @@ check_untyped_defs = true
80
79
  disallow_incomplete_defs = true
81
80
 
82
81
  [tool.ruff]
83
- target-version = "py39"
84
82
  line-length = 120
85
83
 
86
84
  [tool.ruff.lint]
@@ -127,10 +125,6 @@ ignore = [
127
125
  "B008",
128
126
  "S101",
129
127
  ]
130
- unfixable = [
131
- # Don't touch unused imports
132
- "F401",
133
- ]
134
128
 
135
129
  [tool.ruff.lint.isort]
136
130
  known-first-party = ["haystack_integrations"]
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
  import base64
5
5
  from pathlib import Path
6
- from typing import Any, Literal, Optional, Union
6
+ from typing import Any, Literal
7
7
 
8
8
  import requests
9
9
  from haystack import Document, component, default_from_dict, default_to_dict, logging
@@ -24,7 +24,7 @@ from paddlex.inference.serving.schemas.shared.ocr import FileType # type: ignor
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
26
 
27
- FileTypeInput = Union[Literal["pdf", "image"], None]
27
+ FileTypeInput = Literal["pdf", "image"] | None
28
28
 
29
29
  # Supported image file extensions
30
30
  _IMAGE_EXTENSIONS = {
@@ -41,9 +41,9 @@ _PDF_EXTENSIONS = {".pdf"}
41
41
 
42
42
 
43
43
  def _infer_file_type_from_source(
44
- source: Union[str, Path, ByteStream],
45
- mime_type: Optional[str] = None,
46
- ) -> Optional[FileType]:
44
+ source: str | Path | ByteStream,
45
+ mime_type: str | None = None,
46
+ ) -> FileType | None:
47
47
  """
48
48
  Infer file type from file extension or MIME type.
49
49
 
@@ -56,7 +56,7 @@ def _infer_file_type_from_source(
56
56
  determined.
57
57
  """
58
58
  # Try to get extension from file path
59
- file_path: Optional[str] = None
59
+ file_path: str | None = None
60
60
 
61
61
  # Check if source is a file path
62
62
  if isinstance(source, (str, Path)):
@@ -86,7 +86,7 @@ def _infer_file_type_from_source(
86
86
  return None
87
87
 
88
88
 
89
- def _normalize_file_type(file_type: Optional[FileTypeInput]) -> Optional[FileType]:
89
+ def _normalize_file_type(file_type: FileTypeInput) -> FileType | None:
90
90
  """
91
91
  Normalize file type input to the numeric format expected by the API.
92
92
 
@@ -145,36 +145,45 @@ class PaddleOCRVLDocumentConverter:
145
145
  *,
146
146
  api_url: str,
147
147
  access_token: Secret = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN"),
148
- file_type: Optional[FileTypeInput] = None,
149
- use_doc_orientation_classify: Optional[bool] = None,
150
- use_doc_unwarping: Optional[bool] = None,
151
- use_layout_detection: Optional[bool] = None,
152
- use_chart_recognition: Optional[bool] = None,
153
- layout_threshold: Optional[Union[float, dict]] = None,
154
- layout_nms: Optional[bool] = None,
155
- layout_unclip_ratio: Optional[Union[float, tuple[float, float], dict]] = None,
156
- layout_merge_bboxes_mode: Optional[Union[str, dict]] = None,
157
- prompt_label: Optional[str] = None,
158
- format_block_content: Optional[bool] = None,
159
- repetition_penalty: Optional[float] = None,
160
- temperature: Optional[float] = None,
161
- top_p: Optional[float] = None,
162
- min_pixels: Optional[int] = None,
163
- max_pixels: Optional[int] = None,
164
- prettify_markdown: Optional[bool] = None,
165
- show_formula_number: Optional[bool] = None,
166
- visualize: Optional[bool] = None,
167
- additional_params: Optional[dict[str, Any]] = None,
148
+ file_type: FileTypeInput = None,
149
+ use_doc_orientation_classify: bool | None = False,
150
+ use_doc_unwarping: bool | None = False,
151
+ use_layout_detection: bool | None = None,
152
+ use_chart_recognition: bool | None = None,
153
+ use_seal_recognition: bool | None = None,
154
+ use_ocr_for_image_block: bool | None = None,
155
+ layout_threshold: float | dict | None = None,
156
+ layout_nms: bool | None = None,
157
+ layout_unclip_ratio: float | tuple[float, float] | dict | None = None,
158
+ layout_merge_bboxes_mode: str | dict | None = None,
159
+ layout_shape_mode: str | None = None,
160
+ prompt_label: str | None = None,
161
+ format_block_content: bool | None = None,
162
+ repetition_penalty: float | None = None,
163
+ temperature: float | None = None,
164
+ top_p: float | None = None,
165
+ min_pixels: int | None = None,
166
+ max_pixels: int | None = None,
167
+ max_new_tokens: int | None = None,
168
+ merge_layout_blocks: bool | None = None,
169
+ markdown_ignore_labels: list[str] | None = None,
170
+ vlm_extra_args: dict | None = None,
171
+ prettify_markdown: bool | None = None,
172
+ show_formula_number: bool | None = None,
173
+ restructure_pages: bool | None = None,
174
+ merge_tables: bool | None = None,
175
+ relevel_titles: bool | None = None,
176
+ visualize: bool | None = None,
177
+ additional_params: dict[str, Any] | None = None,
168
178
  ):
169
179
  """
170
180
  Create a `PaddleOCRVLDocumentConverter` component.
171
181
 
172
182
  :param api_url:
173
183
  API URL. To obtain the API URL, visit the [PaddleOCR official
174
- website](https://aistudio.baidu.com/paddleocr/task), click the
175
- **API** button in the upper-left corner, choose the example code
176
- for **Large Model document parsing(PaddleOCR-VL)**, and copy the
177
- `API_URL`.
184
+ website](https://aistudio.baidu.com/paddleocr), click the
185
+ **API** button, choose the example code for PaddleOCR-VL, and copy
186
+ the `API_URL`.
178
187
  :param access_token:
179
188
  AI Studio access token. You can obtain it from [this
180
189
  page](https://aistudio.baidu.com/account/accessToken).
@@ -193,6 +202,10 @@ class PaddleOCRVLDocumentConverter:
193
202
  Whether to enable the layout detection function.
194
203
  :param use_chart_recognition:
195
204
  Whether to enable the chart recognition function.
205
+ :param use_seal_recognition:
206
+ Whether to enable the seal recognition function.
207
+ :param use_ocr_for_image_block:
208
+ Whether to recognize text in image blocks.
196
209
  :param layout_threshold:
197
210
  Layout detection threshold. Can be a float or a dict with
198
211
  page-specific thresholds.
@@ -204,9 +217,11 @@ class PaddleOCRVLDocumentConverter:
204
217
  dict with page-specific values.
205
218
  :param layout_merge_bboxes_mode:
206
219
  Layout merge bounding boxes mode. Can be a string or a dict.
220
+ :param layout_shape_mode:
221
+ Layout shape mode.
207
222
  :param prompt_label:
208
223
  Prompt type for the VLM. Possible values are "ocr", "formula",
209
- "table", and "chart".
224
+ "table", "chart", "seal", and "spotting".
210
225
  :param format_block_content:
211
226
  Whether to format block content.
212
227
  :param repetition_penalty:
@@ -219,10 +234,25 @@ class PaddleOCRVLDocumentConverter:
219
234
  Minimum number of pixels allowed during VLM preprocessing.
220
235
  :param max_pixels:
221
236
  Maximum number of pixels allowed during VLM preprocessing.
237
+ :param max_new_tokens:
238
+ Maximum number of tokens generated by the VLM.
239
+ :param merge_layout_blocks:
240
+ Whether to merge the layout detection boxes for cross-column or
241
+ staggered top and bottom columns.
242
+ :param markdown_ignore_labels:
243
+ Layout labels that need to be ignored in Markdown.
244
+ :param vlm_extra_args:
245
+ Additional configuration parameters for the VLM.
222
246
  :param prettify_markdown:
223
247
  Whether to prettify the output Markdown text.
224
248
  :param show_formula_number:
225
249
  Whether to include formula numbers in the output markdown text.
250
+ :param restructure_pages:
251
+ Whether to restructure results across multiple pages.
252
+ :param merge_tables:
253
+ Whether to merge tables across pages.
254
+ :param relevel_titles:
255
+ Whether to relevel titles.
226
256
  :param visualize:
227
257
  Whether to return visualization results.
228
258
  :param additional_params:
@@ -235,10 +265,13 @@ class PaddleOCRVLDocumentConverter:
235
265
  self.use_doc_unwarping = use_doc_unwarping
236
266
  self.use_layout_detection = use_layout_detection
237
267
  self.use_chart_recognition = use_chart_recognition
268
+ self.use_seal_recognition = use_seal_recognition
269
+ self.use_ocr_for_image_block = use_ocr_for_image_block
238
270
  self.layout_threshold = layout_threshold
239
271
  self.layout_nms = layout_nms
240
272
  self.layout_unclip_ratio = layout_unclip_ratio
241
273
  self.layout_merge_bboxes_mode = layout_merge_bboxes_mode
274
+ self.layout_shape_mode = layout_shape_mode
242
275
  self.prompt_label = prompt_label
243
276
  self.format_block_content = format_block_content
244
277
  self.repetition_penalty = repetition_penalty
@@ -246,8 +279,15 @@ class PaddleOCRVLDocumentConverter:
246
279
  self.top_p = top_p
247
280
  self.min_pixels = min_pixels
248
281
  self.max_pixels = max_pixels
282
+ self.max_new_tokens = max_new_tokens
283
+ self.merge_layout_blocks = merge_layout_blocks
284
+ self.markdown_ignore_labels = markdown_ignore_labels
285
+ self.vlm_extra_args = vlm_extra_args
249
286
  self.prettify_markdown = prettify_markdown
250
287
  self.show_formula_number = show_formula_number
288
+ self.restructure_pages = restructure_pages
289
+ self.merge_tables = merge_tables
290
+ self.relevel_titles = relevel_titles
251
291
  self.visualize = visualize
252
292
  self.additional_params = additional_params
253
293
 
@@ -267,10 +307,13 @@ class PaddleOCRVLDocumentConverter:
267
307
  use_doc_unwarping=self.use_doc_unwarping,
268
308
  use_layout_detection=self.use_layout_detection,
269
309
  use_chart_recognition=self.use_chart_recognition,
310
+ use_seal_recognition=self.use_seal_recognition,
311
+ use_ocr_for_image_block=self.use_ocr_for_image_block,
270
312
  layout_threshold=self.layout_threshold,
271
313
  layout_nms=self.layout_nms,
272
314
  layout_unclip_ratio=self.layout_unclip_ratio,
273
315
  layout_merge_bboxes_mode=self.layout_merge_bboxes_mode,
316
+ layout_shape_mode=self.layout_shape_mode,
274
317
  prompt_label=self.prompt_label,
275
318
  format_block_content=self.format_block_content,
276
319
  repetition_penalty=self.repetition_penalty,
@@ -278,8 +321,15 @@ class PaddleOCRVLDocumentConverter:
278
321
  top_p=self.top_p,
279
322
  min_pixels=self.min_pixels,
280
323
  max_pixels=self.max_pixels,
324
+ max_new_tokens=self.max_new_tokens,
325
+ merge_layout_blocks=self.merge_layout_blocks,
326
+ markdown_ignore_labels=self.markdown_ignore_labels,
327
+ vlm_extra_args=self.vlm_extra_args,
281
328
  prettify_markdown=self.prettify_markdown,
282
329
  show_formula_number=self.show_formula_number,
330
+ restructure_pages=self.restructure_pages,
331
+ merge_tables=self.merge_tables,
332
+ relevel_titles=self.relevel_titles,
283
333
  visualize=self.visualize,
284
334
  additional_params=self.additional_params,
285
335
  )
@@ -331,6 +381,10 @@ class PaddleOCRVLDocumentConverter:
331
381
  request_data["useLayoutDetection"] = self.use_layout_detection
332
382
  if self.use_chart_recognition is not None:
333
383
  request_data["useChartRecognition"] = self.use_chart_recognition
384
+ if self.use_seal_recognition is not None:
385
+ request_data["useSealRecognition"] = self.use_seal_recognition
386
+ if self.use_ocr_for_image_block is not None:
387
+ request_data["useOcrForImageBlock"] = self.use_ocr_for_image_block
334
388
  if self.layout_threshold is not None:
335
389
  request_data["layoutThreshold"] = self.layout_threshold
336
390
  if self.layout_nms is not None:
@@ -339,6 +393,8 @@ class PaddleOCRVLDocumentConverter:
339
393
  request_data["layoutUnclipRatio"] = self.layout_unclip_ratio
340
394
  if self.layout_merge_bboxes_mode is not None:
341
395
  request_data["layoutMergeBboxesMode"] = self.layout_merge_bboxes_mode
396
+ if self.layout_shape_mode is not None:
397
+ request_data["layoutShapeMode"] = self.layout_shape_mode
342
398
  if self.prompt_label is not None:
343
399
  request_data["promptLabel"] = self.prompt_label
344
400
  if self.format_block_content is not None:
@@ -353,10 +409,24 @@ class PaddleOCRVLDocumentConverter:
353
409
  request_data["minPixels"] = self.min_pixels
354
410
  if self.max_pixels is not None:
355
411
  request_data["maxPixels"] = self.max_pixels
412
+ if self.max_new_tokens is not None:
413
+ request_data["maxNewTokens"] = self.max_new_tokens
414
+ if self.merge_layout_blocks is not None:
415
+ request_data["mergeLayoutBlocks"] = self.merge_layout_blocks
416
+ if self.markdown_ignore_labels is not None:
417
+ request_data["markdownIgnoreLabels"] = self.markdown_ignore_labels
418
+ if self.vlm_extra_args is not None:
419
+ request_data["vlmExtraArgs"] = self.vlm_extra_args
356
420
  if self.prettify_markdown is not None:
357
421
  request_data["prettifyMarkdown"] = self.prettify_markdown
358
422
  if self.show_formula_number is not None:
359
423
  request_data["showFormulaNumber"] = self.show_formula_number
424
+ if self.restructure_pages is not None:
425
+ request_data["restructurePages"] = self.restructure_pages
426
+ if self.merge_tables is not None:
427
+ request_data["mergeTables"] = self.merge_tables
428
+ if self.relevel_titles is not None:
429
+ request_data["relevelTitles"] = self.relevel_titles
360
430
  if self.visualize is not None:
361
431
  request_data["visualize"] = self.visualize
362
432
  if self.additional_params is not None:
@@ -372,7 +442,7 @@ class PaddleOCRVLDocumentConverter:
372
442
 
373
443
  # Prepare headers with authentication
374
444
  access_token_value = self.access_token.resolve_value() if self.access_token else None
375
- headers = {"Content-Type": "application/json"}
445
+ headers = {"Content-Type": "application/json", "Client-Platform": "haystack"}
376
446
  if access_token_value:
377
447
  headers["Authorization"] = f"token {access_token_value}"
378
448
 
@@ -421,8 +491,8 @@ class PaddleOCRVLDocumentConverter:
421
491
  @component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]])
422
492
  def run(
423
493
  self,
424
- sources: list[Union[str, Path, ByteStream]],
425
- meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None,
494
+ sources: list[str | Path | ByteStream],
495
+ meta: dict[str, Any] | list[dict[str, Any]] | None = None,
426
496
  ) -> dict[str, Any]:
427
497
  """
428
498
  Convert image or PDF files to Documents.
@@ -448,7 +518,7 @@ class PaddleOCRVLDocumentConverter:
448
518
 
449
519
  meta_list = normalize_metadata(meta, sources_count=len(sources))
450
520
 
451
- for source, metadata in zip(sources, meta_list):
521
+ for source, metadata in zip(sources, meta_list, strict=True):
452
522
  try:
453
523
  bytestream = get_bytestream_from_source(source)
454
524
  except Exception as e:
@@ -42,8 +42,8 @@ class TestPaddleOCRVLDocumentConverter:
42
42
  assert converter.access_token == Secret.from_env_var("AISTUDIO_ACCESS_TOKEN")
43
43
  assert converter.api_url == "http://test-api-url.com"
44
44
  assert converter.file_type is None
45
- assert converter.use_doc_orientation_classify is None
46
- assert converter.use_doc_unwarping is None
45
+ assert converter.use_doc_orientation_classify is False
46
+ assert converter.use_doc_unwarping is False
47
47
  assert converter.use_layout_detection is None
48
48
  assert converter.use_chart_recognition is None
49
49
  assert converter.layout_threshold is None
@@ -126,14 +126,17 @@ class TestPaddleOCRVLDocumentConverter:
126
126
  "type": "env_var",
127
127
  },
128
128
  "file_type": None,
129
- "use_doc_orientation_classify": None,
130
- "use_doc_unwarping": None,
129
+ "use_doc_orientation_classify": False,
130
+ "use_doc_unwarping": False,
131
131
  "use_layout_detection": None,
132
132
  "use_chart_recognition": None,
133
+ "use_seal_recognition": None,
134
+ "use_ocr_for_image_block": None,
133
135
  "layout_threshold": None,
134
136
  "layout_nms": None,
135
137
  "layout_unclip_ratio": None,
136
138
  "layout_merge_bboxes_mode": None,
139
+ "layout_shape_mode": None,
137
140
  "prompt_label": None,
138
141
  "format_block_content": None,
139
142
  "repetition_penalty": None,
@@ -141,8 +144,15 @@ class TestPaddleOCRVLDocumentConverter:
141
144
  "top_p": None,
142
145
  "min_pixels": None,
143
146
  "max_pixels": None,
147
+ "max_new_tokens": None,
148
+ "merge_layout_blocks": None,
149
+ "markdown_ignore_labels": None,
150
+ "vlm_extra_args": None,
144
151
  "prettify_markdown": None,
145
152
  "show_formula_number": None,
153
+ "restructure_pages": None,
154
+ "merge_tables": None,
155
+ "relevel_titles": None,
146
156
  "visualize": None,
147
157
  "additional_params": None,
148
158
  },
@@ -158,10 +168,13 @@ class TestPaddleOCRVLDocumentConverter:
158
168
  use_doc_unwarping=False,
159
169
  use_layout_detection=True,
160
170
  use_chart_recognition=False,
171
+ use_seal_recognition=None,
172
+ use_ocr_for_image_block=None,
161
173
  layout_threshold=0.7,
162
174
  layout_nms=False,
163
175
  layout_unclip_ratio=2.0,
164
176
  layout_merge_bboxes_mode="separate",
177
+ layout_shape_mode=None,
165
178
  prompt_label="formula",
166
179
  format_block_content=False,
167
180
  repetition_penalty=1.2,
@@ -169,8 +182,15 @@ class TestPaddleOCRVLDocumentConverter:
169
182
  top_p=0.95,
170
183
  min_pixels=200,
171
184
  max_pixels=2000,
185
+ max_new_tokens=None,
186
+ merge_layout_blocks=None,
187
+ markdown_ignore_labels=None,
188
+ vlm_extra_args=None,
172
189
  prettify_markdown=True,
173
190
  show_formula_number=True,
191
+ restructure_pages=None,
192
+ merge_tables=None,
193
+ relevel_titles=None,
174
194
  visualize=False,
175
195
  additional_params={},
176
196
  )
@@ -190,10 +210,13 @@ class TestPaddleOCRVLDocumentConverter:
190
210
  "use_doc_unwarping": False,
191
211
  "use_layout_detection": True,
192
212
  "use_chart_recognition": False,
213
+ "use_seal_recognition": None,
214
+ "use_ocr_for_image_block": None,
193
215
  "layout_threshold": 0.7,
194
216
  "layout_nms": False,
195
217
  "layout_unclip_ratio": 2.0,
196
218
  "layout_merge_bboxes_mode": "separate",
219
+ "layout_shape_mode": None,
197
220
  "prompt_label": "formula",
198
221
  "format_block_content": False,
199
222
  "repetition_penalty": 1.2,
@@ -201,8 +224,15 @@ class TestPaddleOCRVLDocumentConverter:
201
224
  "top_p": 0.95,
202
225
  "min_pixels": 200,
203
226
  "max_pixels": 2000,
227
+ "max_new_tokens": None,
228
+ "merge_layout_blocks": None,
229
+ "markdown_ignore_labels": None,
230
+ "vlm_extra_args": None,
204
231
  "prettify_markdown": True,
205
232
  "show_formula_number": True,
233
+ "restructure_pages": None,
234
+ "merge_tables": None,
235
+ "relevel_titles": None,
206
236
  "visualize": False,
207
237
  "additional_params": {},
208
238
  },