docling 2.17.0__py3-none-any.whl → 2.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,21 +2,28 @@ import logging
2
2
  import re
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Set, Union
5
+ from typing import Any, Optional, Union
6
6
 
7
- import docx
8
7
  from docling_core.types.doc import (
9
8
  DocItemLabel,
10
9
  DoclingDocument,
11
10
  DocumentOrigin,
12
11
  GroupLabel,
13
12
  ImageRef,
13
+ NodeItem,
14
14
  TableCell,
15
15
  TableData,
16
16
  )
17
+ from docx import Document
18
+ from docx.document import Document as DocxDocument
19
+ from docx.oxml.table import CT_Tc
20
+ from docx.oxml.xmlchemy import BaseOxmlElement
21
+ from docx.table import Table, _Cell
22
+ from docx.text.paragraph import Paragraph
17
23
  from lxml import etree
18
24
  from lxml.etree import XPath
19
25
  from PIL import Image, UnidentifiedImageError
26
+ from typing_extensions import override
20
27
 
21
28
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
22
29
  from docling.datamodel.base_models import InputFormat
@@ -26,7 +33,10 @@ _log = logging.getLogger(__name__)
26
33
 
27
34
 
28
35
  class MsWordDocumentBackend(DeclarativeDocumentBackend):
29
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
36
+ @override
37
+ def __init__(
38
+ self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
39
+ ) -> None:
30
40
  super().__init__(in_doc, path_or_stream)
31
41
  self.XML_KEY = (
32
42
  "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
@@ -36,19 +46,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
36
46
  }
37
47
  # self.initialise(path_or_stream)
38
48
  # Word file:
39
- self.path_or_stream = path_or_stream
40
- self.valid = False
49
+ self.path_or_stream: Union[BytesIO, Path] = path_or_stream
50
+ self.valid: bool = False
41
51
  # Initialise the parents for the hierarchy
42
- self.max_levels = 10
43
- self.level_at_new_list = None
44
- self.parents = {} # type: ignore
52
+ self.max_levels: int = 10
53
+ self.level_at_new_list: Optional[int] = None
54
+ self.parents: dict[int, Optional[NodeItem]] = {}
45
55
  for i in range(-1, self.max_levels):
46
56
  self.parents[i] = None
47
57
 
48
58
  self.level = 0
49
59
  self.listIter = 0
50
60
 
51
- self.history = {
61
+ self.history: dict[str, Any] = {
52
62
  "names": [None],
53
63
  "levels": [None],
54
64
  "numids": [None],
@@ -58,9 +68,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
58
68
  self.docx_obj = None
59
69
  try:
60
70
  if isinstance(self.path_or_stream, BytesIO):
61
- self.docx_obj = docx.Document(self.path_or_stream)
71
+ self.docx_obj = Document(self.path_or_stream)
62
72
  elif isinstance(self.path_or_stream, Path):
63
- self.docx_obj = docx.Document(str(self.path_or_stream))
73
+ self.docx_obj = Document(str(self.path_or_stream))
64
74
 
65
75
  self.valid = True
66
76
  except Exception as e:
@@ -68,13 +78,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
68
78
  f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
69
79
  ) from e
70
80
 
81
+ @override
71
82
  def is_valid(self) -> bool:
72
83
  return self.valid
73
84
 
74
85
  @classmethod
86
+ @override
75
87
  def supports_pagination(cls) -> bool:
76
88
  return False
77
89
 
90
+ @override
78
91
  def unload(self):
79
92
  if isinstance(self.path_or_stream, BytesIO):
80
93
  self.path_or_stream.close()
@@ -82,11 +95,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
82
95
  self.path_or_stream = None
83
96
 
84
97
  @classmethod
85
- def supported_formats(cls) -> Set[InputFormat]:
98
+ @override
99
+ def supported_formats(cls) -> set[InputFormat]:
86
100
  return {InputFormat.DOCX}
87
101
 
102
+ @override
88
103
  def convert(self) -> DoclingDocument:
89
- # Parses the DOCX into a structured document model.
104
+ """Parses the DOCX into a structured document model.
105
+
106
+ Returns:
107
+ The parsed document.
108
+ """
90
109
 
91
110
  origin = DocumentOrigin(
92
111
  filename=self.file.name or "file",
@@ -104,23 +123,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
104
123
  f"Cannot convert doc with {self.document_hash} because the backend failed to init."
105
124
  )
106
125
 
107
- def update_history(self, name, level, numid, ilevel):
126
+ def update_history(
127
+ self,
128
+ name: str,
129
+ level: Optional[int],
130
+ numid: Optional[int],
131
+ ilevel: Optional[int],
132
+ ):
108
133
  self.history["names"].append(name)
109
134
  self.history["levels"].append(level)
110
135
 
111
136
  self.history["numids"].append(numid)
112
137
  self.history["indents"].append(ilevel)
113
138
 
114
- def prev_name(self):
139
+ def prev_name(self) -> Optional[str]:
115
140
  return self.history["names"][-1]
116
141
 
117
- def prev_level(self):
142
+ def prev_level(self) -> Optional[int]:
118
143
  return self.history["levels"][-1]
119
144
 
120
- def prev_numid(self):
145
+ def prev_numid(self) -> Optional[int]:
121
146
  return self.history["numids"][-1]
122
147
 
123
- def prev_indent(self):
148
+ def prev_indent(self) -> Optional[int]:
124
149
  return self.history["indents"][-1]
125
150
 
126
151
  def get_level(self) -> int:
@@ -130,13 +155,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
130
155
  return k
131
156
  return 0
132
157
 
133
- def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
158
+ def walk_linear(
159
+ self,
160
+ body: BaseOxmlElement,
161
+ docx_obj: DocxDocument,
162
+ doc: DoclingDocument,
163
+ ) -> DoclingDocument:
134
164
  for element in body:
135
165
  tag_name = etree.QName(element).localname
136
166
  # Check for Inline Images (blip elements)
137
167
  namespaces = {
138
168
  "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
139
169
  "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
170
+ "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
140
171
  }
141
172
  xpath_expr = XPath(".//a:blip", namespaces=namespaces)
142
173
  drawing_blip = xpath_expr(element)
@@ -149,7 +180,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
149
180
  _log.debug("could not parse a table, broken docx table")
150
181
 
151
182
  elif drawing_blip:
152
- self.handle_pictures(element, docx_obj, drawing_blip, doc)
183
+ self.handle_pictures(docx_obj, drawing_blip, doc)
184
+ # Check for the sdt containers, like table of contents
185
+ elif tag_name in ["sdt"]:
186
+ sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
187
+ if sdt_content is not None:
188
+ # Iterate paragraphs, runs, or text inside <w:sdtContent>.
189
+ paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
190
+ for p in paragraphs:
191
+ self.handle_text_elements(p, docx_obj, doc)
153
192
  # Check for Text
154
193
  elif tag_name in ["p"]:
155
194
  # "tcPr", "sectPr"
@@ -158,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
158
197
  _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
159
198
  return doc
160
199
 
161
- def str_to_int(self, s, default=0):
200
+ def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
162
201
  if s is None:
163
202
  return None
164
203
  try:
@@ -166,7 +205,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
166
205
  except ValueError:
167
206
  return default
168
207
 
169
- def split_text_and_number(self, input_string):
208
+ def split_text_and_number(self, input_string: str) -> list[str]:
170
209
  match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
171
210
  if match:
172
211
  parts = list(filter(None, match.groups()))
@@ -174,7 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
174
213
  else:
175
214
  return [input_string]
176
215
 
177
- def get_numId_and_ilvl(self, paragraph):
216
+ def get_numId_and_ilvl(
217
+ self, paragraph: Paragraph
218
+ ) -> tuple[Optional[int], Optional[int]]:
178
219
  # Access the XML element of the paragraph
179
220
  numPr = paragraph._element.find(
180
221
  ".//w:numPr", namespaces=paragraph._element.nsmap
@@ -187,13 +228,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
187
228
  numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
188
229
  ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
189
230
 
190
- return self.str_to_int(numId, default=None), self.str_to_int(
191
- ilvl, default=None
192
- )
231
+ return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
193
232
 
194
233
  return None, None # If the paragraph is not part of a list
195
234
 
196
- def get_label_and_level(self, paragraph):
235
+ def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
197
236
  if paragraph.style is None:
198
237
  return "Normal", None
199
238
  label = paragraph.style.style_id
@@ -203,26 +242,31 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
203
242
  parts = label.split(":")
204
243
 
205
244
  if len(parts) == 2:
206
- return parts[0], int(parts[1])
245
+ return parts[0], self.str_to_int(parts[1], None)
207
246
 
208
247
  parts = self.split_text_and_number(label)
209
248
 
210
249
  if "Heading" in label and len(parts) == 2:
211
250
  parts.sort()
212
- label_str = ""
213
- label_level = 0
251
+ label_str: str = ""
252
+ label_level: Optional[int] = 0
214
253
  if parts[0] == "Heading":
215
254
  label_str = parts[0]
216
- label_level = self.str_to_int(parts[1], default=None)
255
+ label_level = self.str_to_int(parts[1], None)
217
256
  if parts[1] == "Heading":
218
257
  label_str = parts[1]
219
- label_level = self.str_to_int(parts[0], default=None)
258
+ label_level = self.str_to_int(parts[0], None)
220
259
  return label_str, label_level
221
260
  else:
222
261
  return label, None
223
262
 
224
- def handle_text_elements(self, element, docx_obj, doc):
225
- paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
263
+ def handle_text_elements(
264
+ self,
265
+ element: BaseOxmlElement,
266
+ docx_obj: DocxDocument,
267
+ doc: DoclingDocument,
268
+ ) -> None:
269
+ paragraph = Paragraph(element, docx_obj)
226
270
 
227
271
  if paragraph.text is None:
228
272
  return
@@ -240,13 +284,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
240
284
  numid = None
241
285
 
242
286
  # Handle lists
243
- if numid is not None and ilevel is not None:
287
+ if (
288
+ numid is not None
289
+ and ilevel is not None
290
+ and p_style_id not in ["Title", "Heading"]
291
+ ):
244
292
  self.add_listitem(
245
- element,
246
- docx_obj,
247
293
  doc,
248
- p_style_id,
249
- p_level,
250
294
  numid,
251
295
  ilevel,
252
296
  text,
@@ -254,20 +298,30 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
254
298
  )
255
299
  self.update_history(p_style_id, p_level, numid, ilevel)
256
300
  return
257
- elif numid is None and self.prev_numid() is not None: # Close list
258
- for key, val in self.parents.items():
259
- if key >= self.level_at_new_list:
301
+ elif (
302
+ numid is None
303
+ and self.prev_numid() is not None
304
+ and p_style_id not in ["Title", "Heading"]
305
+ ): # Close list
306
+ if self.level_at_new_list:
307
+ for key in range(len(self.parents)):
308
+ if key >= self.level_at_new_list:
309
+ self.parents[key] = None
310
+ self.level = self.level_at_new_list - 1
311
+ self.level_at_new_list = None
312
+ else:
313
+ for key in range(len(self.parents)):
260
314
  self.parents[key] = None
261
- self.level = self.level_at_new_list - 1
262
- self.level_at_new_list = None
315
+ self.level = 0
316
+
263
317
  if p_style_id in ["Title"]:
264
- for key, val in self.parents.items():
318
+ for key in range(len(self.parents)):
265
319
  self.parents[key] = None
266
320
  self.parents[0] = doc.add_text(
267
321
  parent=None, label=DocItemLabel.TITLE, text=text
268
322
  )
269
323
  elif "Heading" in p_style_id:
270
- self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
324
+ self.add_header(doc, p_level, text)
271
325
 
272
326
  elif p_style_id in [
273
327
  "Paragraph",
@@ -295,7 +349,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
295
349
  self.update_history(p_style_id, p_level, numid, ilevel)
296
350
  return
297
351
 
298
- def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
352
+ def add_header(
353
+ self, doc: DoclingDocument, curr_level: Optional[int], text: str
354
+ ) -> None:
299
355
  level = self.get_level()
300
356
  if isinstance(curr_level, int):
301
357
  if curr_level > level:
@@ -308,7 +364,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
308
364
  )
309
365
  elif curr_level < level:
310
366
  # remove the tail
311
- for key, val in self.parents.items():
367
+ for key in range(len(self.parents)):
312
368
  if key >= curr_level:
313
369
  self.parents[key] = None
314
370
 
@@ -327,22 +383,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
327
383
 
328
384
  def add_listitem(
329
385
  self,
330
- element,
331
- docx_obj,
332
- doc,
333
- p_style_id,
334
- p_level,
335
- numid,
336
- ilevel,
386
+ doc: DoclingDocument,
387
+ numid: int,
388
+ ilevel: int,
337
389
  text: str,
338
- is_numbered=False,
339
- ):
340
- # is_numbered = is_numbered
390
+ is_numbered: bool = False,
391
+ ) -> None:
341
392
  enum_marker = ""
342
393
 
343
394
  level = self.get_level()
395
+ prev_indent = self.prev_indent()
344
396
  if self.prev_numid() is None: # Open new list
345
- self.level_at_new_list = level # type: ignore
397
+ self.level_at_new_list = level
346
398
 
347
399
  self.parents[level] = doc.add_group(
348
400
  label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
@@ -361,10 +413,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
361
413
  )
362
414
 
363
415
  elif (
364
- self.prev_numid() == numid and self.prev_indent() < ilevel
416
+ self.prev_numid() == numid
417
+ and self.level_at_new_list is not None
418
+ and prev_indent is not None
419
+ and prev_indent < ilevel
365
420
  ): # Open indented list
366
421
  for i in range(
367
- self.level_at_new_list + self.prev_indent() + 1,
422
+ self.level_at_new_list + prev_indent + 1,
368
423
  self.level_at_new_list + ilevel + 1,
369
424
  ):
370
425
  # Determine if this is an unordered list or an ordered list.
@@ -393,7 +448,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
393
448
  text=text,
394
449
  )
395
450
 
396
- elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list
451
+ elif (
452
+ self.prev_numid() == numid
453
+ and self.level_at_new_list is not None
454
+ and prev_indent is not None
455
+ and ilevel < prev_indent
456
+ ): # Close list
397
457
  for k, v in self.parents.items():
398
458
  if k > self.level_at_new_list + ilevel:
399
459
  self.parents[k] = None
@@ -411,7 +471,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
411
471
  )
412
472
  self.listIter = 0
413
473
 
414
- elif self.prev_numid() == numid or self.prev_indent() == ilevel:
474
+ elif self.prev_numid() == numid or prev_indent == ilevel:
415
475
  # TODO: Set marker and enumerated arguments if this is an enumeration element.
416
476
  self.listIter += 1
417
477
  if is_numbered:
@@ -425,31 +485,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
425
485
  )
426
486
  return
427
487
 
428
- def handle_tables(self, element, docx_obj, doc):
429
-
430
- # Function to check if a cell has a colspan (gridSpan)
431
- def get_colspan(cell):
432
- grid_span = cell._element.xpath("@w:gridSpan")
433
- if grid_span:
434
- return int(grid_span[0]) # Return the number of columns spanned
435
- return 1 # Default is 1 (no colspan)
436
-
437
- # Function to check if a cell has a rowspan (vMerge)
438
- def get_rowspan(cell):
439
- v_merge = cell._element.xpath("@w:vMerge")
440
- if v_merge:
441
- return v_merge[
442
- 0
443
- ] # 'restart' indicates the beginning of a rowspan, others are continuation
444
- return 1
445
-
446
- table = docx.table.Table(element, docx_obj)
447
-
488
+ def handle_tables(
489
+ self,
490
+ element: BaseOxmlElement,
491
+ docx_obj: DocxDocument,
492
+ doc: DoclingDocument,
493
+ ) -> None:
494
+ table: Table = Table(element, docx_obj)
448
495
  num_rows = len(table.rows)
449
- num_cols = 0
450
- for row in table.rows:
451
- # Calculate the max number of columns
452
- num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
496
+ num_cols = len(table.columns)
497
+ _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
453
498
 
454
499
  if num_rows == 1 and num_cols == 1:
455
500
  cell_element = table.rows[0].cells[0]
@@ -458,59 +503,56 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
458
503
  self.walk_linear(cell_element._element, docx_obj, doc)
459
504
  return
460
505
 
461
- # Initialize the table grid
462
- table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
463
-
464
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
465
-
506
+ data = TableData(num_rows=num_rows, num_cols=num_cols)
507
+ cell_set: set[CT_Tc] = set()
466
508
  for row_idx, row in enumerate(table.rows):
509
+ _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
467
510
  col_idx = 0
468
- for c, cell in enumerate(row.cells):
469
- row_span = get_rowspan(cell)
470
- col_span = get_colspan(cell)
471
-
472
- cell_text = cell.text
473
- # In case cell doesn't return text via docx library:
474
- if len(cell_text) == 0:
475
- cell_xml = cell._element
476
-
477
- texts = [""]
478
- for elem in cell_xml.iter():
479
- if elem.tag.endswith("t"): # <w:t> tags that contain text
480
- if elem.text:
481
- texts.append(elem.text)
482
- # Join the collected text
483
- cell_text = " ".join(texts).strip()
484
-
485
- # Find the next available column in the grid
486
- while table_grid[row_idx][col_idx] is not None:
487
- col_idx += 1
488
-
489
- # Fill the grid with the cell value, considering rowspan and colspan
490
- for i in range(row_span if row_span == "restart" else 1):
491
- for j in range(col_span):
492
- table_grid[row_idx + i][col_idx + j] = ""
493
-
494
- cell = TableCell(
495
- text=cell_text,
496
- row_span=row_span,
497
- col_span=col_span,
498
- start_row_offset_idx=row_idx,
499
- end_row_offset_idx=row_idx + row_span,
511
+ while col_idx < num_cols:
512
+ cell: _Cell = row.cells[col_idx]
513
+ _log.debug(
514
+ f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
515
+ )
516
+ if cell is None or cell._tc in cell_set:
517
+ _log.debug(f" skipped since repeated content")
518
+ col_idx += cell.grid_span
519
+ continue
520
+ else:
521
+ cell_set.add(cell._tc)
522
+
523
+ spanned_idx = row_idx
524
+ spanned_tc: Optional[CT_Tc] = cell._tc
525
+ while spanned_tc == cell._tc:
526
+ spanned_idx += 1
527
+ spanned_tc = (
528
+ table.rows[spanned_idx].cells[col_idx]._tc
529
+ if spanned_idx < num_rows
530
+ else None
531
+ )
532
+ _log.debug(f" spanned before row {spanned_idx}")
533
+
534
+ table_cell = TableCell(
535
+ text=cell.text,
536
+ row_span=spanned_idx - row_idx,
537
+ col_span=cell.grid_span,
538
+ start_row_offset_idx=row.grid_cols_before + row_idx,
539
+ end_row_offset_idx=row.grid_cols_before + spanned_idx,
500
540
  start_col_offset_idx=col_idx,
501
- end_col_offset_idx=col_idx + col_span,
541
+ end_col_offset_idx=col_idx + cell.grid_span,
502
542
  col_header=False,
503
543
  row_header=False,
504
544
  )
505
-
506
- data.table_cells.append(cell)
545
+ data.table_cells.append(table_cell)
546
+ col_idx += cell.grid_span
507
547
 
508
548
  level = self.get_level()
509
549
  doc.add_table(data=data, parent=self.parents[level - 1])
510
550
  return
511
551
 
512
- def handle_pictures(self, element, docx_obj, drawing_blip, doc):
513
- def get_docx_image(element, drawing_blip):
552
+ def handle_pictures(
553
+ self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
554
+ ) -> None:
555
+ def get_docx_image(drawing_blip):
514
556
  rId = drawing_blip[0].get(
515
557
  "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
516
558
  )
@@ -520,11 +562,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
520
562
  image_data = image_part.blob # Get the binary image data
521
563
  return image_data
522
564
 
523
- image_data = get_docx_image(element, drawing_blip)
524
- image_bytes = BytesIO(image_data)
525
565
  level = self.get_level()
526
566
  # Open the BytesIO object with PIL to create an Image
527
567
  try:
568
+ image_data = get_docx_image(drawing_blip)
569
+ image_bytes = BytesIO(image_data)
528
570
  pil_image = Image.open(image_bytes)
529
571
  doc.add_picture(
530
572
  parent=self.parents[level - 1],
docling/cli/main.py CHANGED
@@ -219,6 +219,13 @@ def convert(
219
219
  bool,
220
220
  typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
221
221
  ] = False,
222
+ enrich_picture_classes: Annotated[
223
+ bool,
224
+ typer.Option(
225
+ ...,
226
+ help="Enable the picture classification enrichment model in the pipeline.",
227
+ ),
228
+ ] = False,
222
229
  artifacts_path: Annotated[
223
230
  Optional[Path],
224
231
  typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -375,6 +382,7 @@ def convert(
375
382
  do_table_structure=True,
376
383
  do_code_enrichment=enrich_code,
377
384
  do_formula_enrichment=enrich_formula,
385
+ do_picture_classification=enrich_picture_classes,
378
386
  document_timeout=document_timeout,
379
387
  )
380
388
  pipeline_options.table_structure_options.do_cell_matching = (