docling 2.8.3__tar.gz → 2.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {docling-2.8.3 → docling-2.10.0}/PKG-INFO +7 -7
  2. {docling-2.8.3 → docling-2.10.0}/README.md +2 -2
  3. {docling-2.8.3 → docling-2.10.0}/docling/backend/docling_parse_backend.py +1 -1
  4. {docling-2.8.3 → docling-2.10.0}/docling/backend/docling_parse_v2_backend.py +7 -5
  5. {docling-2.8.3 → docling-2.10.0}/docling/backend/msword_backend.py +43 -27
  6. docling-2.10.0/docling/chunking/__init__.py +12 -0
  7. {docling-2.8.3 → docling-2.10.0}/docling/cli/main.py +83 -28
  8. {docling-2.8.3 → docling-2.10.0}/docling/datamodel/base_models.py +1 -0
  9. docling-2.10.0/docling/datamodel/document.py +309 -0
  10. {docling-2.8.3 → docling-2.10.0}/docling/datamodel/pipeline_options.py +5 -1
  11. {docling-2.8.3 → docling-2.10.0}/docling/document_converter.py +5 -5
  12. {docling-2.8.3 → docling-2.10.0}/docling/models/ds_glm_model.py +2 -2
  13. {docling-2.8.3 → docling-2.10.0}/docling/pipeline/standard_pdf_pipeline.py +2 -0
  14. docling-2.10.0/docling/py.typed +1 -0
  15. docling-2.10.0/docling/utils/glm_utils.py +336 -0
  16. {docling-2.8.3 → docling-2.10.0}/pyproject.toml +5 -5
  17. docling-2.8.3/docling/datamodel/document.py +0 -560
  18. {docling-2.8.3 → docling-2.10.0}/LICENSE +0 -0
  19. {docling-2.8.3 → docling-2.10.0}/docling/__init__.py +0 -0
  20. {docling-2.8.3 → docling-2.10.0}/docling/backend/__init__.py +0 -0
  21. {docling-2.8.3 → docling-2.10.0}/docling/backend/abstract_backend.py +0 -0
  22. {docling-2.8.3 → docling-2.10.0}/docling/backend/asciidoc_backend.py +0 -0
  23. {docling-2.8.3 → docling-2.10.0}/docling/backend/html_backend.py +0 -0
  24. {docling-2.8.3 → docling-2.10.0}/docling/backend/md_backend.py +0 -0
  25. {docling-2.8.3 → docling-2.10.0}/docling/backend/msexcel_backend.py +0 -0
  26. {docling-2.8.3 → docling-2.10.0}/docling/backend/mspowerpoint_backend.py +0 -0
  27. {docling-2.8.3 → docling-2.10.0}/docling/backend/pdf_backend.py +0 -0
  28. {docling-2.8.3 → docling-2.10.0}/docling/backend/pypdfium2_backend.py +0 -0
  29. {docling-2.8.3 → docling-2.10.0}/docling/cli/__init__.py +0 -0
  30. {docling-2.8.3 → docling-2.10.0}/docling/datamodel/__init__.py +0 -0
  31. {docling-2.8.3 → docling-2.10.0}/docling/datamodel/settings.py +0 -0
  32. {docling-2.8.3 → docling-2.10.0}/docling/exceptions.py +0 -0
  33. {docling-2.8.3 → docling-2.10.0}/docling/models/__init__.py +0 -0
  34. {docling-2.8.3 → docling-2.10.0}/docling/models/base_model.py +0 -0
  35. {docling-2.8.3 → docling-2.10.0}/docling/models/base_ocr_model.py +0 -0
  36. {docling-2.8.3 → docling-2.10.0}/docling/models/easyocr_model.py +0 -0
  37. {docling-2.8.3 → docling-2.10.0}/docling/models/layout_model.py +0 -0
  38. {docling-2.8.3 → docling-2.10.0}/docling/models/ocr_mac_model.py +0 -0
  39. {docling-2.8.3 → docling-2.10.0}/docling/models/page_assemble_model.py +0 -0
  40. {docling-2.8.3 → docling-2.10.0}/docling/models/page_preprocessing_model.py +0 -0
  41. {docling-2.8.3 → docling-2.10.0}/docling/models/rapid_ocr_model.py +0 -0
  42. {docling-2.8.3 → docling-2.10.0}/docling/models/table_structure_model.py +0 -0
  43. {docling-2.8.3 → docling-2.10.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  44. {docling-2.8.3 → docling-2.10.0}/docling/models/tesseract_ocr_model.py +0 -0
  45. {docling-2.8.3 → docling-2.10.0}/docling/pipeline/__init__.py +0 -0
  46. {docling-2.8.3 → docling-2.10.0}/docling/pipeline/base_pipeline.py +0 -0
  47. {docling-2.8.3 → docling-2.10.0}/docling/pipeline/simple_pipeline.py +0 -0
  48. {docling-2.8.3 → docling-2.10.0}/docling/utils/__init__.py +0 -0
  49. {docling-2.8.3 → docling-2.10.0}/docling/utils/export.py +0 -0
  50. {docling-2.8.3 → docling-2.10.0}/docling/utils/layout_utils.py +0 -0
  51. {docling-2.8.3 → docling-2.10.0}/docling/utils/profiling.py +0 -0
  52. {docling-2.8.3 → docling-2.10.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.8.3
3
+ Version: 2.10.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
25
25
  Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
- Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
29
- Requires-Dist: docling-core (>=2.6.1,<3.0.0)
28
+ Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
30
30
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
31
- Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
31
+ Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
34
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,7 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
39
39
  Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
40
40
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
41
41
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
42
- Requires-Dist: pydantic (>=2.0.0,<2.10)
42
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
43
43
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
44
44
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
45
45
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
59
59
  </a>
60
60
  </p>
61
61
 
62
- # 🦆 Docling
62
+ # Docling
63
63
 
64
64
  <p align="center">
65
65
  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -81,7 +81,7 @@ Docling parses documents and exports them to the desired format with ease and sp
81
81
 
82
82
  ## Features
83
83
 
84
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
84
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
85
85
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
86
86
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
87
87
  * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
@@ -4,7 +4,7 @@
4
4
  </a>
5
5
  </p>
6
6
 
7
- # 🦆 Docling
7
+ # Docling
8
8
 
9
9
  <p align="center">
10
10
  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -26,7 +26,7 @@ Docling parses documents and exports them to the desired format with ease and sp
26
26
 
27
27
  ## Features
28
28
 
29
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
29
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
30
30
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
31
31
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
32
32
  * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
- from docling_parse.docling_parse import pdf_parser_v1
9
+ from docling_parse.pdf_parsers import pdf_parser_v1
10
10
  from PIL import Image, ImageDraw
11
11
  from pypdfium2 import PdfPage
12
12
 
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_parse.docling_parse import pdf_parser_v2
9
+ from docling_parse.pdf_parsers import pdf_parser_v2
10
10
  from PIL import Image, ImageDraw
11
11
  from pypdfium2 import PdfPage
12
12
 
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
210
210
  self.parser = pdf_parser_v2("fatal")
211
211
 
212
212
  success = False
213
- if isinstance(path_or_stream, BytesIO):
213
+ if isinstance(self.path_or_stream, BytesIO):
214
214
  success = self.parser.load_document_from_bytesio(
215
- self.document_hash, path_or_stream
215
+ self.document_hash, self.path_or_stream
216
+ )
217
+ elif isinstance(self.path_or_stream, Path):
218
+ success = self.parser.load_document(
219
+ self.document_hash, str(self.path_or_stream)
216
220
  )
217
- elif isinstance(path_or_stream, Path):
218
- success = self.parser.load_document(self.document_hash, str(path_or_stream))
219
221
 
220
222
  if not success:
221
223
  raise RuntimeError(
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import re
2
3
  from io import BytesIO
3
4
  from pathlib import Path
4
5
  from typing import Set, Union
@@ -133,7 +134,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
133
134
  def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
134
135
  for element in body:
135
136
  tag_name = etree.QName(element).localname
136
-
137
137
  # Check for Inline Images (blip elements)
138
138
  namespaces = {
139
139
  "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@@ -153,6 +153,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
153
153
  self.handle_pictures(element, docx_obj, drawing_blip, doc)
154
154
  # Check for Text
155
155
  elif tag_name in ["p"]:
156
+ # "tcPr", "sectPr"
156
157
  self.handle_text_elements(element, docx_obj, doc)
157
158
  else:
158
159
  _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
166
167
  except ValueError:
167
168
  return default
168
169
 
170
+ def split_text_and_number(self, input_string):
171
+ match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
172
+ if match:
173
+ parts = list(filter(None, match.groups()))
174
+ return parts
175
+ else:
176
+ return [input_string]
177
+
169
178
  def get_numId_and_ilvl(self, paragraph):
170
179
  # Access the XML element of the paragraph
171
180
  numPr = paragraph._element.find(
@@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
188
197
  def get_label_and_level(self, paragraph):
189
198
  if paragraph.style is None:
190
199
  return "Normal", None
191
- label = paragraph.style.name
200
+ label = paragraph.style.style_id
192
201
  if label is None:
193
202
  return "Normal", None
194
203
  if ":" in label:
@@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
197
206
  if len(parts) == 2:
198
207
  return parts[0], int(parts[1])
199
208
 
200
- parts = label.split(" ")
209
+ parts = self.split_text_and_number(label)
201
210
 
202
211
  if "Heading" in label and len(parts) == 2:
203
212
  parts.sort()
@@ -219,14 +228,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
219
228
  if paragraph.text is None:
220
229
  return
221
230
  text = paragraph.text.strip()
222
- # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
223
231
 
224
232
  # Common styles for bullet and numbered lists.
225
233
  # "List Bullet", "List Number", "List Paragraph"
226
234
  # Identify whether list is a numbered list or not
227
235
  # is_numbered = "List Bullet" not in paragraph.style.name
228
236
  is_numbered = False
229
- p_style_name, p_level = self.get_label_and_level(paragraph)
237
+ p_style_id, p_level = self.get_label_and_level(paragraph)
230
238
  numid, ilevel = self.get_numId_and_ilvl(paragraph)
231
239
 
232
240
  if numid == 0:
@@ -238,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
238
246
  element,
239
247
  docx_obj,
240
248
  doc,
241
- p_style_name,
249
+ p_style_id,
242
250
  p_level,
243
251
  numid,
244
252
  ilevel,
245
253
  text,
246
254
  is_numbered,
247
255
  )
248
- self.update_history(p_style_name, p_level, numid, ilevel)
256
+ self.update_history(p_style_id, p_level, numid, ilevel)
249
257
  return
250
258
  elif numid is None and self.prev_numid() is not None: # Close list
251
259
  for key, val in self.parents.items():
@@ -253,23 +261,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
253
261
  self.parents[key] = None
254
262
  self.level = self.level_at_new_list - 1
255
263
  self.level_at_new_list = None
256
- if p_style_name in ["Title"]:
264
+ if p_style_id in ["Title"]:
257
265
  for key, val in self.parents.items():
258
266
  self.parents[key] = None
259
267
  self.parents[0] = doc.add_text(
260
268
  parent=None, label=DocItemLabel.TITLE, text=text
261
269
  )
262
- elif "Heading" in p_style_name:
263
- self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
270
+ elif "Heading" in p_style_id:
271
+ self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
264
272
 
265
- elif p_style_name in [
273
+ elif p_style_id in [
266
274
  "Paragraph",
267
275
  "Normal",
268
276
  "Subtitle",
269
277
  "Author",
270
- "Default Text",
271
- "List Paragraph",
272
- "List Bullet",
278
+ "DefaultText",
279
+ "ListParagraph",
280
+ "ListBullet",
273
281
  "Quote",
274
282
  ]:
275
283
  level = self.get_level()
@@ -285,15 +293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
285
293
  label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
286
294
  )
287
295
 
288
- self.update_history(p_style_name, p_level, numid, ilevel)
296
+ self.update_history(p_style_id, p_level, numid, ilevel)
289
297
  return
290
298
 
291
299
  def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
292
300
  level = self.get_level()
293
301
  if isinstance(curr_level, int):
294
-
295
302
  if curr_level > level:
296
-
297
303
  # add invisible group
298
304
  for i in range(level, curr_level):
299
305
  self.parents[i] = doc.add_group(
@@ -301,9 +307,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
301
307
  label=GroupLabel.SECTION,
302
308
  name=f"header-{i}",
303
309
  )
304
-
305
310
  elif curr_level < level:
306
-
307
311
  # remove the tail
308
312
  for key, val in self.parents.items():
309
313
  if key >= curr_level:
@@ -314,7 +318,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
314
318
  text=text,
315
319
  level=curr_level,
316
320
  )
317
-
318
321
  else:
319
322
  self.parents[self.level] = doc.add_heading(
320
323
  parent=self.parents[self.level - 1],
@@ -328,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
328
331
  element,
329
332
  docx_obj,
330
333
  doc,
331
- p_style_name,
334
+ p_style_id,
332
335
  p_level,
333
336
  numid,
334
337
  ilevel,
@@ -346,7 +349,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
346
349
  label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
347
350
  )
348
351
 
349
- # TODO: Set marker and enumerated arguments if this is an enumeration element.
352
+ # Set marker and enumerated arguments if this is an enumeration element.
350
353
  self.listIter += 1
351
354
  if is_numbered:
352
355
  enum_marker = str(self.listIter) + "."
@@ -365,8 +368,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
365
368
  self.level_at_new_list + self.prev_indent() + 1,
366
369
  self.level_at_new_list + ilevel + 1,
367
370
  ):
368
- # TODO: determine if this is an unordered list or an ordered list.
369
- # Set GroupLabel.ORDERED_LIST when it fits.
371
+ # Determine if this is an unordered list or an ordered list.
372
+ # Set GroupLabel.ORDERED_LIST when it fits.
370
373
  self.listIter = 0
371
374
  if is_numbered:
372
375
  self.parents[i] = doc.add_group(
@@ -467,6 +470,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
467
470
  row_span = get_rowspan(cell)
468
471
  col_span = get_colspan(cell)
469
472
 
473
+ cell_text = cell.text
474
+ # In case cell doesn't return text via docx library:
475
+ if len(cell_text) == 0:
476
+ cell_xml = cell._element
477
+
478
+ texts = [""]
479
+ for elem in cell_xml.iter():
480
+ if elem.tag.endswith("t"): # <w:t> tags that contain text
481
+ if elem.text:
482
+ texts.append(elem.text)
483
+ # Join the collected text
484
+ cell_text = " ".join(texts).strip()
485
+
470
486
  # Find the next available column in the grid
471
487
  while table_grid[row_idx][col_idx] is not None:
472
488
  col_idx += 1
@@ -477,15 +493,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
477
493
  table_grid[row_idx + i][col_idx + j] = ""
478
494
 
479
495
  cell = TableCell(
480
- text=cell.text,
496
+ text=cell_text,
481
497
  row_span=row_span,
482
498
  col_span=col_span,
483
499
  start_row_offset_idx=row_idx,
484
500
  end_row_offset_idx=row_idx + row_span,
485
501
  start_col_offset_idx=col_idx,
486
502
  end_col_offset_idx=col_idx + col_span,
487
- col_header=False, # col_header,
488
- row_header=False, # ((not col_header) and html_cell.name=='th')
503
+ col_header=False,
504
+ row_header=False,
489
505
  )
490
506
 
491
507
  data.table_cells.append(cell)
@@ -0,0 +1,12 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
7
+ from docling_core.transforms.chunker.hierarchical_chunker import (
8
+ DocChunk,
9
+ DocMeta,
10
+ HierarchicalChunker,
11
+ )
12
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
@@ -10,7 +10,9 @@ from pathlib import Path
10
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
11
11
 
12
12
  import typer
13
+ from docling_core.types.doc import ImageRefMode
13
14
  from docling_core.utils.file import resolve_source_to_path
15
+ from pydantic import TypeAdapter, ValidationError
14
16
 
15
17
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
16
18
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -86,9 +88,11 @@ def export_documents(
86
88
  conv_results: Iterable[ConversionResult],
87
89
  output_dir: Path,
88
90
  export_json: bool,
91
+ export_html: bool,
89
92
  export_md: bool,
90
93
  export_txt: bool,
91
94
  export_doctags: bool,
95
+ image_export_mode: ImageRefMode,
92
96
  ):
93
97
 
94
98
  success_count = 0
@@ -99,33 +103,45 @@ def export_documents(
99
103
  success_count += 1
100
104
  doc_filename = conv_res.input.file.stem
101
105
 
102
- # Export Deep Search document JSON format:
106
+ # Export JSON format:
103
107
  if export_json:
104
108
  fname = output_dir / f"{doc_filename}.json"
105
- with fname.open("w", encoding="utf8") as fp:
106
- _log.info(f"writing JSON output to {fname}")
107
- fp.write(json.dumps(conv_res.document.export_to_dict()))
109
+ _log.info(f"writing JSON output to {fname}")
110
+ conv_res.document.save_as_json(
111
+ filename=fname, image_mode=image_export_mode
112
+ )
113
+
114
+ # Export HTML format:
115
+ if export_html:
116
+ fname = output_dir / f"{doc_filename}.html"
117
+ _log.info(f"writing HTML output to {fname}")
118
+ conv_res.document.save_as_html(
119
+ filename=fname, image_mode=image_export_mode
120
+ )
108
121
 
109
122
  # Export Text format:
110
123
  if export_txt:
111
124
  fname = output_dir / f"{doc_filename}.txt"
112
- with fname.open("w", encoding="utf8") as fp:
113
- _log.info(f"writing Text output to {fname}")
114
- fp.write(conv_res.document.export_to_markdown(strict_text=True))
125
+ _log.info(f"writing TXT output to {fname}")
126
+ conv_res.document.save_as_markdown(
127
+ filename=fname,
128
+ strict_text=True,
129
+ image_mode=ImageRefMode.PLACEHOLDER,
130
+ )
115
131
 
116
132
  # Export Markdown format:
117
133
  if export_md:
118
134
  fname = output_dir / f"{doc_filename}.md"
119
- with fname.open("w", encoding="utf8") as fp:
120
- _log.info(f"writing Markdown output to {fname}")
121
- fp.write(conv_res.document.export_to_markdown())
135
+ _log.info(f"writing Markdown output to {fname}")
136
+ conv_res.document.save_as_markdown(
137
+ filename=fname, image_mode=image_export_mode
138
+ )
122
139
 
123
140
  # Export Document Tags format:
124
141
  if export_doctags:
125
142
  fname = output_dir / f"{doc_filename}.doctags"
126
- with fname.open("w", encoding="utf8") as fp:
127
- _log.info(f"writing Doc Tags output to {fname}")
128
- fp.write(conv_res.document.export_to_document_tokens())
143
+ _log.info(f"writing Doc Tags output to {fname}")
144
+ conv_res.document.save_as_document_tokens(filename=fname)
129
145
 
130
146
  else:
131
147
  _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -160,6 +176,13 @@ def convert(
160
176
  to_formats: List[OutputFormat] = typer.Option(
161
177
  None, "--to", help="Specify output formats. Defaults to Markdown."
162
178
  ),
179
+ image_export_mode: Annotated[
180
+ ImageRefMode,
181
+ typer.Option(
182
+ ...,
183
+ help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
184
+ ),
185
+ ] = ImageRefMode.EMBEDDED,
163
186
  ocr: Annotated[
164
187
  bool,
165
188
  typer.Option(
@@ -185,7 +208,7 @@ def convert(
185
208
  ] = None,
186
209
  pdf_backend: Annotated[
187
210
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
188
- ] = PdfBackend.DLPARSE_V1,
211
+ ] = PdfBackend.DLPARSE_V2,
189
212
  table_mode: Annotated[
190
213
  TableFormerMode,
191
214
  typer.Option(..., help="The mode to use in the table structure model."),
@@ -260,24 +283,45 @@ def convert(
260
283
  with tempfile.TemporaryDirectory() as tempdir:
261
284
  input_doc_paths: List[Path] = []
262
285
  for src in input_sources:
263
- source = resolve_source_to_path(source=src, workdir=Path(tempdir))
264
- if not source.exists():
286
+ try:
287
+ # check if we can fetch some remote url
288
+ source = resolve_source_to_path(source=src, workdir=Path(tempdir))
289
+ input_doc_paths.append(source)
290
+ except FileNotFoundError:
265
291
  err_console.print(
266
- f"[red]Error: The input file {source} does not exist.[/red]"
292
+ f"[red]Error: The input file {src} does not exist.[/red]"
267
293
  )
268
294
  raise typer.Abort()
269
- elif source.is_dir():
270
- for fmt in from_formats:
271
- for ext in FormatToExtensions[fmt]:
272
- input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
273
- input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
274
- else:
275
- input_doc_paths.append(source)
295
+ except IsADirectoryError:
296
+ # if the input matches to a file or a folder
297
+ try:
298
+ local_path = TypeAdapter(Path).validate_python(src)
299
+ if local_path.exists() and local_path.is_dir():
300
+ for fmt in from_formats:
301
+ for ext in FormatToExtensions[fmt]:
302
+ input_doc_paths.extend(
303
+ list(local_path.glob(f"**/*.{ext}"))
304
+ )
305
+ input_doc_paths.extend(
306
+ list(local_path.glob(f"**/*.{ext.upper()}"))
307
+ )
308
+ elif local_path.exists():
309
+ input_doc_paths.append(local_path)
310
+ else:
311
+ err_console.print(
312
+ f"[red]Error: The input file {src} does not exist.[/red]"
313
+ )
314
+ raise typer.Abort()
315
+ except Exception as err:
316
+ err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
317
+ _log.info(err) # will print more details if verbose is activated
318
+ raise typer.Abort()
276
319
 
277
320
  if to_formats is None:
278
321
  to_formats = [OutputFormat.MARKDOWN]
279
322
 
280
323
  export_json = OutputFormat.JSON in to_formats
324
+ export_html = OutputFormat.HTML in to_formats
281
325
  export_md = OutputFormat.MARKDOWN in to_formats
282
326
  export_txt = OutputFormat.TEXT in to_formats
283
327
  export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -309,6 +353,13 @@ def convert(
309
353
  )
310
354
  pipeline_options.table_structure_options.mode = table_mode
311
355
 
356
+ if image_export_mode != ImageRefMode.PLACEHOLDER:
357
+ pipeline_options.generate_page_images = True
358
+ pipeline_options.generate_picture_images = (
359
+ True # FIXME: to be deprecated in version 3
360
+ )
361
+ pipeline_options.images_scale = 2
362
+
312
363
  if artifacts_path is not None:
313
364
  pipeline_options.artifacts_path = artifacts_path
314
365
 
@@ -321,11 +372,13 @@ def convert(
321
372
  else:
322
373
  raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
323
374
 
375
+ pdf_format_option = PdfFormatOption(
376
+ pipeline_options=pipeline_options,
377
+ backend=backend, # pdf_backend
378
+ )
324
379
  format_options: Dict[InputFormat, FormatOption] = {
325
- InputFormat.PDF: PdfFormatOption(
326
- pipeline_options=pipeline_options,
327
- backend=backend, # pdf_backend
328
- )
380
+ InputFormat.PDF: pdf_format_option,
381
+ InputFormat.IMAGE: pdf_format_option,
329
382
  }
330
383
  doc_converter = DocumentConverter(
331
384
  allowed_formats=from_formats,
@@ -343,9 +396,11 @@ def convert(
343
396
  conv_results,
344
397
  output_dir=output,
345
398
  export_json=export_json,
399
+ export_html=export_html,
346
400
  export_md=export_md,
347
401
  export_txt=export_txt,
348
402
  export_doctags=export_doctags,
403
+ image_export_mode=image_export_mode,
349
404
  )
350
405
 
351
406
  end_time = time.time() - start_time
@@ -41,6 +41,7 @@ class InputFormat(str, Enum):
41
41
  class OutputFormat(str, Enum):
42
42
  MARKDOWN = "md"
43
43
  JSON = "json"
44
+ HTML = "html"
44
45
  TEXT = "text"
45
46
  DOCTAGS = "doctags"
46
47