docling 2.8.3__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
- from docling_parse.docling_parse import pdf_parser_v1
9
+ from docling_parse.pdf_parsers import pdf_parser_v1
10
10
  from PIL import Image, ImageDraw
11
11
  from pypdfium2 import PdfPage
12
12
 
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_parse.docling_parse import pdf_parser_v2
9
+ from docling_parse.pdf_parsers import pdf_parser_v2
10
10
  from PIL import Image, ImageDraw
11
11
  from pypdfium2 import PdfPage
12
12
 
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
210
210
  self.parser = pdf_parser_v2("fatal")
211
211
 
212
212
  success = False
213
- if isinstance(path_or_stream, BytesIO):
213
+ if isinstance(self.path_or_stream, BytesIO):
214
214
  success = self.parser.load_document_from_bytesio(
215
- self.document_hash, path_or_stream
215
+ self.document_hash, self.path_or_stream
216
+ )
217
+ elif isinstance(self.path_or_stream, Path):
218
+ success = self.parser.load_document(
219
+ self.document_hash, str(self.path_or_stream)
216
220
  )
217
- elif isinstance(path_or_stream, Path):
218
- success = self.parser.load_document(self.document_hash, str(path_or_stream))
219
221
 
220
222
  if not success:
221
223
  raise RuntimeError(
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import re
2
3
  from io import BytesIO
3
4
  from pathlib import Path
4
5
  from typing import Set, Union
@@ -133,7 +134,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
133
134
  def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
134
135
  for element in body:
135
136
  tag_name = etree.QName(element).localname
136
-
137
137
  # Check for Inline Images (blip elements)
138
138
  namespaces = {
139
139
  "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@@ -153,6 +153,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
153
153
  self.handle_pictures(element, docx_obj, drawing_blip, doc)
154
154
  # Check for Text
155
155
  elif tag_name in ["p"]:
156
+ # "tcPr", "sectPr"
156
157
  self.handle_text_elements(element, docx_obj, doc)
157
158
  else:
158
159
  _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
166
167
  except ValueError:
167
168
  return default
168
169
 
170
def split_text_and_number(self, input_string):
    """Split a string into its text part and its number part.

    The string is split only when it consists of exactly one run of
    non-digit characters and one run of digits, in either order, e.g.
    ``"Heading1"`` -> ``["Heading", "1"]`` and
    ``"1Heading"`` -> ``["1", "Heading"]``.

    Args:
        input_string: The string to split.

    Returns:
        A two-element list with the parts in their original order, or a
        one-element list holding ``input_string`` unchanged when the string
        does not have that shape.
    """
    # fullmatch anchors both alternatives at both ends, so a string such as
    # "1Heading2" is returned whole instead of silently losing its tail.
    match = re.fullmatch(r"(\D+)(\d+)|(\d+)(\D+)", input_string)
    if match is None:
        return [input_string]
    # Only the groups of the alternative that matched are non-None.
    return [group for group in match.groups() if group]
177
+
169
178
  def get_numId_and_ilvl(self, paragraph):
170
179
  # Access the XML element of the paragraph
171
180
  numPr = paragraph._element.find(
@@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
188
197
  def get_label_and_level(self, paragraph):
189
198
  if paragraph.style is None:
190
199
  return "Normal", None
191
- label = paragraph.style.name
200
+ label = paragraph.style.style_id
192
201
  if label is None:
193
202
  return "Normal", None
194
203
  if ":" in label:
@@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
197
206
  if len(parts) == 2:
198
207
  return parts[0], int(parts[1])
199
208
 
200
- parts = label.split(" ")
209
+ parts = self.split_text_and_number(label)
201
210
 
202
211
  if "Heading" in label and len(parts) == 2:
203
212
  parts.sort()
@@ -219,14 +228,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
219
228
  if paragraph.text is None:
220
229
  return
221
230
  text = paragraph.text.strip()
222
- # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
223
231
 
224
232
  # Common styles for bullet and numbered lists.
225
233
  # "List Bullet", "List Number", "List Paragraph"
226
234
  # Identify whether list is a numbered list or not
227
235
  # is_numbered = "List Bullet" not in paragraph.style.name
228
236
  is_numbered = False
229
- p_style_name, p_level = self.get_label_and_level(paragraph)
237
+ p_style_id, p_level = self.get_label_and_level(paragraph)
230
238
  numid, ilevel = self.get_numId_and_ilvl(paragraph)
231
239
 
232
240
  if numid == 0:
@@ -238,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
238
246
  element,
239
247
  docx_obj,
240
248
  doc,
241
- p_style_name,
249
+ p_style_id,
242
250
  p_level,
243
251
  numid,
244
252
  ilevel,
245
253
  text,
246
254
  is_numbered,
247
255
  )
248
- self.update_history(p_style_name, p_level, numid, ilevel)
256
+ self.update_history(p_style_id, p_level, numid, ilevel)
249
257
  return
250
258
  elif numid is None and self.prev_numid() is not None: # Close list
251
259
  for key, val in self.parents.items():
@@ -253,23 +261,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
253
261
  self.parents[key] = None
254
262
  self.level = self.level_at_new_list - 1
255
263
  self.level_at_new_list = None
256
- if p_style_name in ["Title"]:
264
+ if p_style_id in ["Title"]:
257
265
  for key, val in self.parents.items():
258
266
  self.parents[key] = None
259
267
  self.parents[0] = doc.add_text(
260
268
  parent=None, label=DocItemLabel.TITLE, text=text
261
269
  )
262
- elif "Heading" in p_style_name:
263
- self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
270
+ elif "Heading" in p_style_id:
271
+ self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
264
272
 
265
- elif p_style_name in [
273
+ elif p_style_id in [
266
274
  "Paragraph",
267
275
  "Normal",
268
276
  "Subtitle",
269
277
  "Author",
270
- "Default Text",
271
- "List Paragraph",
272
- "List Bullet",
278
+ "DefaultText",
279
+ "ListParagraph",
280
+ "ListBullet",
273
281
  "Quote",
274
282
  ]:
275
283
  level = self.get_level()
@@ -285,15 +293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
285
293
  label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
286
294
  )
287
295
 
288
- self.update_history(p_style_name, p_level, numid, ilevel)
296
+ self.update_history(p_style_id, p_level, numid, ilevel)
289
297
  return
290
298
 
291
299
  def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
292
300
  level = self.get_level()
293
301
  if isinstance(curr_level, int):
294
-
295
302
  if curr_level > level:
296
-
297
303
  # add invisible group
298
304
  for i in range(level, curr_level):
299
305
  self.parents[i] = doc.add_group(
@@ -301,9 +307,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
301
307
  label=GroupLabel.SECTION,
302
308
  name=f"header-{i}",
303
309
  )
304
-
305
310
  elif curr_level < level:
306
-
307
311
  # remove the tail
308
312
  for key, val in self.parents.items():
309
313
  if key >= curr_level:
@@ -314,7 +318,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
314
318
  text=text,
315
319
  level=curr_level,
316
320
  )
317
-
318
321
  else:
319
322
  self.parents[self.level] = doc.add_heading(
320
323
  parent=self.parents[self.level - 1],
@@ -328,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
328
331
  element,
329
332
  docx_obj,
330
333
  doc,
331
- p_style_name,
334
+ p_style_id,
332
335
  p_level,
333
336
  numid,
334
337
  ilevel,
@@ -346,7 +349,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
346
349
  label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
347
350
  )
348
351
 
349
- # TODO: Set marker and enumerated arguments if this is an enumeration element.
352
+ # Set marker and enumerated arguments if this is an enumeration element.
350
353
  self.listIter += 1
351
354
  if is_numbered:
352
355
  enum_marker = str(self.listIter) + "."
@@ -365,8 +368,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
365
368
  self.level_at_new_list + self.prev_indent() + 1,
366
369
  self.level_at_new_list + ilevel + 1,
367
370
  ):
368
- # TODO: determine if this is an unordered list or an ordered list.
369
- # Set GroupLabel.ORDERED_LIST when it fits.
371
+ # Determine if this is an unordered list or an ordered list.
372
+ # Set GroupLabel.ORDERED_LIST when it fits.
370
373
  self.listIter = 0
371
374
  if is_numbered:
372
375
  self.parents[i] = doc.add_group(
@@ -467,6 +470,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
467
470
  row_span = get_rowspan(cell)
468
471
  col_span = get_colspan(cell)
469
472
 
473
+ cell_text = cell.text
474
+ # In case cell doesn't return text via docx library:
475
+ if len(cell_text) == 0:
476
+ cell_xml = cell._element
477
+
478
+ texts = [""]
479
+ for elem in cell_xml.iter():
480
+ if elem.tag.endswith("t"): # <w:t> tags that contain text
481
+ if elem.text:
482
+ texts.append(elem.text)
483
+ # Join the collected text
484
+ cell_text = " ".join(texts).strip()
485
+
470
486
  # Find the next available column in the grid
471
487
  while table_grid[row_idx][col_idx] is not None:
472
488
  col_idx += 1
@@ -477,15 +493,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
477
493
  table_grid[row_idx + i][col_idx + j] = ""
478
494
 
479
495
  cell = TableCell(
480
- text=cell.text,
496
+ text=cell_text,
481
497
  row_span=row_span,
482
498
  col_span=col_span,
483
499
  start_row_offset_idx=row_idx,
484
500
  end_row_offset_idx=row_idx + row_span,
485
501
  start_col_offset_idx=col_idx,
486
502
  end_col_offset_idx=col_idx + col_span,
487
- col_header=False, # col_header,
488
- row_header=False, # ((not col_header) and html_cell.name=='th')
503
+ col_header=False,
504
+ row_header=False,
489
505
  )
490
506
 
491
507
  data.table_cells.append(cell)
@@ -0,0 +1,12 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
7
+ from docling_core.transforms.chunker.hierarchical_chunker import (
8
+ DocChunk,
9
+ DocMeta,
10
+ HierarchicalChunker,
11
+ )
12
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
docling/cli/main.py CHANGED
@@ -10,7 +10,9 @@ from pathlib import Path
10
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
11
11
 
12
12
  import typer
13
+ from docling_core.types.doc import ImageRefMode
13
14
  from docling_core.utils.file import resolve_source_to_path
15
+ from pydantic import TypeAdapter, ValidationError
14
16
 
15
17
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
16
18
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -86,9 +88,11 @@ def export_documents(
86
88
  conv_results: Iterable[ConversionResult],
87
89
  output_dir: Path,
88
90
  export_json: bool,
91
+ export_html: bool,
89
92
  export_md: bool,
90
93
  export_txt: bool,
91
94
  export_doctags: bool,
95
+ image_export_mode: ImageRefMode,
92
96
  ):
93
97
 
94
98
  success_count = 0
@@ -99,33 +103,45 @@ def export_documents(
99
103
  success_count += 1
100
104
  doc_filename = conv_res.input.file.stem
101
105
 
102
- # Export Deep Search document JSON format:
106
+ # Export JSON format:
103
107
  if export_json:
104
108
  fname = output_dir / f"{doc_filename}.json"
105
- with fname.open("w", encoding="utf8") as fp:
106
- _log.info(f"writing JSON output to {fname}")
107
- fp.write(json.dumps(conv_res.document.export_to_dict()))
109
+ _log.info(f"writing JSON output to {fname}")
110
+ conv_res.document.save_as_json(
111
+ filename=fname, image_mode=image_export_mode
112
+ )
113
+
114
+ # Export HTML format:
115
+ if export_html:
116
+ fname = output_dir / f"{doc_filename}.html"
117
+ _log.info(f"writing HTML output to {fname}")
118
+ conv_res.document.save_as_html(
119
+ filename=fname, image_mode=image_export_mode
120
+ )
108
121
 
109
122
  # Export Text format:
110
123
  if export_txt:
111
124
  fname = output_dir / f"{doc_filename}.txt"
112
- with fname.open("w", encoding="utf8") as fp:
113
- _log.info(f"writing Text output to {fname}")
114
- fp.write(conv_res.document.export_to_markdown(strict_text=True))
125
+ _log.info(f"writing TXT output to {fname}")
126
+ conv_res.document.save_as_markdown(
127
+ filename=fname,
128
+ strict_text=True,
129
+ image_mode=ImageRefMode.PLACEHOLDER,
130
+ )
115
131
 
116
132
  # Export Markdown format:
117
133
  if export_md:
118
134
  fname = output_dir / f"{doc_filename}.md"
119
- with fname.open("w", encoding="utf8") as fp:
120
- _log.info(f"writing Markdown output to {fname}")
121
- fp.write(conv_res.document.export_to_markdown())
135
+ _log.info(f"writing Markdown output to {fname}")
136
+ conv_res.document.save_as_markdown(
137
+ filename=fname, image_mode=image_export_mode
138
+ )
122
139
 
123
140
  # Export Document Tags format:
124
141
  if export_doctags:
125
142
  fname = output_dir / f"{doc_filename}.doctags"
126
- with fname.open("w", encoding="utf8") as fp:
127
- _log.info(f"writing Doc Tags output to {fname}")
128
- fp.write(conv_res.document.export_to_document_tokens())
143
+ _log.info(f"writing Doc Tags output to {fname}")
144
+ conv_res.document.save_as_document_tokens(filename=fname)
129
145
 
130
146
  else:
131
147
  _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -160,6 +176,13 @@ def convert(
160
176
  to_formats: List[OutputFormat] = typer.Option(
161
177
  None, "--to", help="Specify output formats. Defaults to Markdown."
162
178
  ),
179
+ image_export_mode: Annotated[
180
+ ImageRefMode,
181
+ typer.Option(
182
+ ...,
183
+ help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
184
+ ),
185
+ ] = ImageRefMode.EMBEDDED,
163
186
  ocr: Annotated[
164
187
  bool,
165
188
  typer.Option(
@@ -185,7 +208,7 @@ def convert(
185
208
  ] = None,
186
209
  pdf_backend: Annotated[
187
210
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
188
- ] = PdfBackend.DLPARSE_V1,
211
+ ] = PdfBackend.DLPARSE_V2,
189
212
  table_mode: Annotated[
190
213
  TableFormerMode,
191
214
  typer.Option(..., help="The mode to use in the table structure model."),
@@ -260,24 +283,45 @@ def convert(
260
283
  with tempfile.TemporaryDirectory() as tempdir:
261
284
  input_doc_paths: List[Path] = []
262
285
  for src in input_sources:
263
- source = resolve_source_to_path(source=src, workdir=Path(tempdir))
264
- if not source.exists():
286
+ try:
287
+ # check if we can fetch some remote url
288
+ source = resolve_source_to_path(source=src, workdir=Path(tempdir))
289
+ input_doc_paths.append(source)
290
+ except FileNotFoundError:
265
291
  err_console.print(
266
- f"[red]Error: The input file {source} does not exist.[/red]"
292
+ f"[red]Error: The input file {src} does not exist.[/red]"
267
293
  )
268
294
  raise typer.Abort()
269
- elif source.is_dir():
270
- for fmt in from_formats:
271
- for ext in FormatToExtensions[fmt]:
272
- input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
273
- input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
274
- else:
275
- input_doc_paths.append(source)
295
+ except IsADirectoryError:
296
+ # if the input matches to a file or a folder
297
+ try:
298
+ local_path = TypeAdapter(Path).validate_python(src)
299
+ if local_path.exists() and local_path.is_dir():
300
+ for fmt in from_formats:
301
+ for ext in FormatToExtensions[fmt]:
302
+ input_doc_paths.extend(
303
+ list(local_path.glob(f"**/*.{ext}"))
304
+ )
305
+ input_doc_paths.extend(
306
+ list(local_path.glob(f"**/*.{ext.upper()}"))
307
+ )
308
+ elif local_path.exists():
309
+ input_doc_paths.append(local_path)
310
+ else:
311
+ err_console.print(
312
+ f"[red]Error: The input file {src} does not exist.[/red]"
313
+ )
314
+ raise typer.Abort()
315
+ except Exception as err:
316
+ err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
317
+ _log.info(err) # will print more details if verbose is activated
318
+ raise typer.Abort()
276
319
 
277
320
  if to_formats is None:
278
321
  to_formats = [OutputFormat.MARKDOWN]
279
322
 
280
323
  export_json = OutputFormat.JSON in to_formats
324
+ export_html = OutputFormat.HTML in to_formats
281
325
  export_md = OutputFormat.MARKDOWN in to_formats
282
326
  export_txt = OutputFormat.TEXT in to_formats
283
327
  export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -309,6 +353,13 @@ def convert(
309
353
  )
310
354
  pipeline_options.table_structure_options.mode = table_mode
311
355
 
356
+ if image_export_mode != ImageRefMode.PLACEHOLDER:
357
+ pipeline_options.generate_page_images = True
358
+ pipeline_options.generate_picture_images = (
359
+ True # FIXME: to be deprecated in version 3
360
+ )
361
+ pipeline_options.images_scale = 2
362
+
312
363
  if artifacts_path is not None:
313
364
  pipeline_options.artifacts_path = artifacts_path
314
365
 
@@ -321,11 +372,13 @@ def convert(
321
372
  else:
322
373
  raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
323
374
 
375
+ pdf_format_option = PdfFormatOption(
376
+ pipeline_options=pipeline_options,
377
+ backend=backend, # pdf_backend
378
+ )
324
379
  format_options: Dict[InputFormat, FormatOption] = {
325
- InputFormat.PDF: PdfFormatOption(
326
- pipeline_options=pipeline_options,
327
- backend=backend, # pdf_backend
328
- )
380
+ InputFormat.PDF: pdf_format_option,
381
+ InputFormat.IMAGE: pdf_format_option,
329
382
  }
330
383
  doc_converter = DocumentConverter(
331
384
  allowed_formats=from_formats,
@@ -343,9 +396,11 @@ def convert(
343
396
  conv_results,
344
397
  output_dir=output,
345
398
  export_json=export_json,
399
+ export_html=export_html,
346
400
  export_md=export_md,
347
401
  export_txt=export_txt,
348
402
  export_doctags=export_doctags,
403
+ image_export_mode=image_export_mode,
349
404
  )
350
405
 
351
406
  end_time = time.time() - start_time
@@ -41,6 +41,7 @@ class InputFormat(str, Enum):
41
41
  class OutputFormat(str, Enum):
42
42
  MARKDOWN = "md"
43
43
  JSON = "json"
44
+ HTML = "html"
44
45
  TEXT = "text"
45
46
  DOCTAGS = "doctags"
46
47
 
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
33
33
  from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
34
  from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
35
35
  from docling_core.utils.file import resolve_source_to_stream
36
+ from docling_core.utils.legacy import docling_document_to_legacy
36
37
  from pydantic import BaseModel
37
38
  from typing_extensions import deprecated
38
39
 
@@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
189
190
  @property
190
191
  @deprecated("Use document instead.")
191
192
  def legacy_document(self):
192
- reverse_label_mapping = {
193
- DocItemLabel.CAPTION.value: "Caption",
194
- DocItemLabel.FOOTNOTE.value: "Footnote",
195
- DocItemLabel.FORMULA.value: "Formula",
196
- DocItemLabel.LIST_ITEM.value: "List-item",
197
- DocItemLabel.PAGE_FOOTER.value: "Page-footer",
198
- DocItemLabel.PAGE_HEADER.value: "Page-header",
199
- DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
200
- DocItemLabel.SECTION_HEADER.value: "Section-header",
201
- DocItemLabel.TABLE.value: "Table",
202
- DocItemLabel.TEXT.value: "Text",
203
- DocItemLabel.TITLE.value: "Title",
204
- DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
205
- DocItemLabel.CODE.value: "Code",
206
- DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
207
- DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
208
- DocItemLabel.FORM.value: "Form",
209
- DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
210
- DocItemLabel.PARAGRAPH.value: "paragraph",
211
- }
212
-
213
- title = ""
214
- desc = DsDocumentDescription(logs=[])
215
-
216
- page_hashes = [
217
- PageReference(
218
- hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
219
- page=p.page_no,
220
- model="default",
221
- )
222
- for p in self.document.pages.values()
223
- ]
224
-
225
- file_info = DsFileInfoObject(
226
- filename=self.input.file.name,
227
- document_hash=self.input.document_hash,
228
- num_pages=self.input.page_count,
229
- page_hashes=page_hashes,
230
- )
231
-
232
- main_text = []
233
- tables = []
234
- figures = []
235
- equations = []
236
- footnotes = []
237
- page_headers = []
238
- page_footers = []
239
-
240
- embedded_captions = set()
241
- for ix, (item, level) in enumerate(
242
- self.document.iterate_items(self.document.body)
243
- ):
244
-
245
- if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
246
- caption = item.caption_text(self.document)
247
- if caption:
248
- embedded_captions.add(caption)
249
-
250
- for item, level in self.document.iterate_items():
251
- if isinstance(item, DocItem):
252
- item_type = item.label
253
-
254
- if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
255
-
256
- if isinstance(item, ListItem) and item.marker:
257
- text = f"{item.marker} {item.text}"
258
- else:
259
- text = item.text
260
-
261
- # Can be empty.
262
- prov = [
263
- Prov(
264
- bbox=p.bbox.as_tuple(),
265
- page=p.page_no,
266
- span=[0, len(item.text)],
267
- )
268
- for p in item.prov
269
- ]
270
- main_text.append(
271
- BaseText(
272
- text=text,
273
- obj_type=layout_label_to_ds_type.get(item.label),
274
- name=reverse_label_mapping[item.label],
275
- prov=prov,
276
- )
277
- )
278
-
279
- # skip captions of they are embedded in the actual
280
- # floating object
281
- if item_type == DocItemLabel.CAPTION and text in embedded_captions:
282
- continue
283
-
284
- elif isinstance(item, TableItem) and item.data:
285
- index = len(tables)
286
- ref_str = f"#/tables/{index}"
287
- main_text.append(
288
- Ref(
289
- name=reverse_label_mapping[item.label],
290
- obj_type=layout_label_to_ds_type.get(item.label),
291
- ref=ref_str,
292
- ),
293
- )
294
-
295
- # Initialise empty table data grid (only empty cells)
296
- table_data = [
297
- [
298
- TableCell(
299
- text="",
300
- # bbox=[0,0,0,0],
301
- spans=[[i, j]],
302
- obj_type="body",
303
- )
304
- for j in range(item.data.num_cols)
305
- ]
306
- for i in range(item.data.num_rows)
307
- ]
308
-
309
- # Overwrite cells in table data for which there is actual cell content.
310
- for cell in item.data.table_cells:
311
- for i in range(
312
- min(cell.start_row_offset_idx, item.data.num_rows),
313
- min(cell.end_row_offset_idx, item.data.num_rows),
314
- ):
315
- for j in range(
316
- min(cell.start_col_offset_idx, item.data.num_cols),
317
- min(cell.end_col_offset_idx, item.data.num_cols),
318
- ):
319
- celltype = "body"
320
- if cell.column_header:
321
- celltype = "col_header"
322
- elif cell.row_header:
323
- celltype = "row_header"
324
- elif cell.row_section:
325
- celltype = "row_section"
326
-
327
- def make_spans(cell):
328
- for rspan in range(
329
- min(
330
- cell.start_row_offset_idx,
331
- item.data.num_rows,
332
- ),
333
- min(
334
- cell.end_row_offset_idx, item.data.num_rows
335
- ),
336
- ):
337
- for cspan in range(
338
- min(
339
- cell.start_col_offset_idx,
340
- item.data.num_cols,
341
- ),
342
- min(
343
- cell.end_col_offset_idx,
344
- item.data.num_cols,
345
- ),
346
- ):
347
- yield [rspan, cspan]
348
-
349
- spans = list(make_spans(cell))
350
- table_data[i][j] = GlmTableCell(
351
- text=cell.text,
352
- bbox=(
353
- cell.bbox.as_tuple()
354
- if cell.bbox is not None
355
- else None
356
- ), # check if this is bottom-left
357
- spans=spans,
358
- obj_type=celltype,
359
- col=j,
360
- row=i,
361
- row_header=cell.row_header,
362
- row_section=cell.row_section,
363
- col_header=cell.column_header,
364
- row_span=[
365
- cell.start_row_offset_idx,
366
- cell.end_row_offset_idx,
367
- ],
368
- col_span=[
369
- cell.start_col_offset_idx,
370
- cell.end_col_offset_idx,
371
- ],
372
- )
373
-
374
- # Compute the caption
375
- caption = item.caption_text(self.document)
376
-
377
- tables.append(
378
- DsSchemaTable(
379
- text=caption,
380
- num_cols=item.data.num_cols,
381
- num_rows=item.data.num_rows,
382
- obj_type=layout_label_to_ds_type.get(item.label),
383
- data=table_data,
384
- prov=[
385
- Prov(
386
- bbox=p.bbox.as_tuple(),
387
- page=p.page_no,
388
- span=[0, 0],
389
- )
390
- for p in item.prov
391
- ],
392
- )
393
- )
394
-
395
- elif isinstance(item, PictureItem):
396
- index = len(figures)
397
- ref_str = f"#/figures/{index}"
398
- main_text.append(
399
- Ref(
400
- name=reverse_label_mapping[item.label],
401
- obj_type=layout_label_to_ds_type.get(item.label),
402
- ref=ref_str,
403
- ),
404
- )
405
-
406
- # Compute the caption
407
- caption = item.caption_text(self.document)
408
-
409
- figures.append(
410
- Figure(
411
- prov=[
412
- Prov(
413
- bbox=p.bbox.as_tuple(),
414
- page=p.page_no,
415
- span=[0, len(caption)],
416
- )
417
- for p in item.prov
418
- ],
419
- obj_type=layout_label_to_ds_type.get(item.label),
420
- text=caption,
421
- # data=[[]],
422
- )
423
- )
424
-
425
- page_dimensions = [
426
- PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
427
- for p in self.document.pages.values()
428
- ]
429
-
430
- ds_doc = DsDocument(
431
- name=title,
432
- description=desc,
433
- file_info=file_info,
434
- main_text=main_text,
435
- equations=equations,
436
- footnotes=footnotes,
437
- page_headers=page_headers,
438
- page_footers=page_footers,
439
- tables=tables,
440
- figures=figures,
441
- page_dimensions=page_dimensions,
442
- )
443
-
444
- return ds_doc
193
+ return docling_document_to_legacy(self.document)
445
194
 
446
195
 
447
196
  class _DummyBackend(AbstractDocumentBackend):
@@ -143,7 +143,11 @@ class PdfPipelineOptions(PipelineOptions):
143
143
 
144
144
  table_structure_options: TableStructureOptions = TableStructureOptions()
145
145
  ocr_options: Union[
146
- EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
146
+ EasyOcrOptions,
147
+ TesseractCliOcrOptions,
148
+ TesseractOcrOptions,
149
+ OcrMacOptions,
150
+ RapidOcrOptions,
147
151
  ] = Field(EasyOcrOptions(), discriminator="kind")
148
152
 
149
153
  images_scale: float = 1.0
@@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
9
9
 
10
10
  from docling.backend.abstract_backend import AbstractDocumentBackend
11
11
  from docling.backend.asciidoc_backend import AsciiDocBackend
12
- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
12
+ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
13
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
14
  from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
84
84
 
85
85
  class PdfFormatOption(FormatOption):
86
86
  pipeline_cls: Type = StandardPdfPipeline
87
- backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
87
+ backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
88
88
 
89
89
 
90
90
  class ImageFormatOption(FormatOption):
91
91
  pipeline_cls: Type = StandardPdfPipeline
92
- backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
92
+ backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
93
93
 
94
94
 
95
95
  def _get_default_option(format: InputFormat) -> FormatOption:
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
113
113
  pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
114
114
  ),
115
115
  InputFormat.IMAGE: FormatOption(
116
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
116
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
117
117
  ),
118
118
  InputFormat.PDF: FormatOption(
119
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
119
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
120
120
  ),
121
121
  }
122
122
  if (options := format_to_default_options.get(format)) is not None:
@@ -4,7 +4,6 @@ from pathlib import Path
4
4
  from typing import List, Union
5
5
 
6
6
  from deepsearch_glm.nlp_utils import init_nlp_model
7
- from deepsearch_glm.utils.doc_utils import to_docling_document
8
7
  from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
9
8
  from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
10
9
  from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
@@ -29,6 +28,7 @@ from pydantic import BaseModel, ConfigDict
29
28
  from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
30
29
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
30
  from docling.datamodel.settings import settings
31
+ from docling.utils.glm_utils import to_docling_document
32
32
  from docling.utils.profiling import ProfilingScope, TimeRecorder
33
33
  from docling.utils.utils import create_hash
34
34
 
@@ -232,7 +232,7 @@ class GlmModel:
232
232
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
233
233
  with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
234
234
  ds_doc = self._to_legacy_document(conv_res)
235
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
235
+ ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
236
236
 
237
237
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
238
238
 
@@ -97,7 +97,9 @@ class StandardPdfPipeline(PaginatedPipeline):
97
97
  local_dir: Optional[Path] = None, force: bool = False
98
98
  ) -> Path:
99
99
  from huggingface_hub import snapshot_download
100
+ from huggingface_hub.utils import disable_progress_bars
100
101
 
102
+ disable_progress_bars()
101
103
  download_path = snapshot_download(
102
104
  repo_id="ds4sd/docling-models",
103
105
  force_download=force,
docling/py.typed ADDED
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,336 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ ProvenanceItem,
14
+ Size,
15
+ TableCell,
16
+ TableData,
17
+ )
18
+
19
+
20
def resolve_item(paths, obj):
    """Find an item in a nested document from a reference path.

    Args:
        paths: Sequence of path segments, typically the result of
            ``"#/a/0/b".split("/")``. A ``"#"`` segment is skipped wherever
            it appears.
        obj: Nested structure of dicts and lists/tuples to walk.

    Returns:
        The item the path points at, ``obj`` itself for an empty path, or
        ``None`` when any segment cannot be resolved.
    """
    if len(paths) == 0:
        return obj

    head, rest = paths[0], paths[1:]
    if head == "#":
        return resolve_item(rest, obj)

    # Numeric segments are list indices; everything else stays a plain key.
    try:
        key = int(head)
    except (TypeError, ValueError):
        key = head

    if isinstance(obj, dict):
        # Try the converted key first, then fall back to the raw segment so
        # that a dict keyed by numeric strings ("0", "1", ...) resolves
        # instead of raising KeyError.
        if key in obj:
            return resolve_item(rest, obj[key])
        if head in obj:
            return resolve_item(rest, obj[head])
        return None

    if isinstance(obj, (list, tuple)) and isinstance(key, int):
        # Negative indices are rejected: a reference must not wrap around
        # to the end of the list.
        if 0 <= key < len(obj):
            return resolve_item(rest, obj[key])

    return None
52
+
53
+
54
+ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
55
+ unique_objects = []
56
+ seen_spans = set()
57
+
58
+ for sublist in grid:
59
+ for obj in sublist:
60
+ # Convert the spans list to a tuple of tuples for hashing
61
+ spans_tuple = tuple(tuple(span) for span in obj["spans"])
62
+ if spans_tuple not in seen_spans:
63
+ seen_spans.add(spans_tuple)
64
+ unique_objects.append(obj)
65
+
66
+ return unique_objects
67
+
68
+
69
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
    """Convert a GLM document dictionary into a ``DoclingDocument``.

    Args:
        doc_glm: Dictionary read through the keys ``"file-info"``,
            ``"page-dimensions"``, ``"page-elements"`` and, when present,
            ``"properties"``. The ``iref`` of each page element is resolved
            against this same dictionary.
        update_name_label: When true, a ``"paragraph"`` element takes the
            label of its single matching ``"semantic"`` property row, provided
            that row's confidence is above 0.85.

    Returns:
        The populated ``DoclingDocument``.
    """
    file_info = doc_glm["file-info"]
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=file_info["filename"],
        binary_hash=file_info["document-hash"],
    )
    doc: DoclingDocument = DoclingDocument(
        name=Path(origin.filename).stem, origin=origin
    )

    for page_dim in doc_glm["page-dimensions"]:
        doc.add_page(
            page_no=int(page_dim["page"]),
            size=Size(width=page_dim["width"], height=page_dim["height"]),
        )

    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Group that consecutive list items are attached to. It is dropped by
    # unresolved references, figures, tables and non-list text elements.
    # NOTE(review): form / key_value_region elements leave it untouched, as
    # in the original code - confirm this is intended.
    current_list = None

    for pelem in doc_glm["page-elements"]:
        if "iref" not in pelem:
            continue

        iref = pelem["iref"]

        # Caption elements are emitted together with their figure or table.
        if re.match(r"#/(?:figures|tables)/\d+/captions/.+", iref):
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            print(f"warning: undefined {path}")
            continue

        ptype = pelem["type"]

        if ptype == "figure":
            current_list = None
            caption_text, caption_refs = _add_captions(doc, doc_glm, obj)

            pic = doc.add_picture(
                prov=_element_prov(pelem, (0, len(caption_text)))
            )
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            _, caption_refs = _add_captions(doc, doc_glm, obj)

            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=[
                    _to_table_cell(tbl_cell_glm)
                    for tbl_cell_glm in _flatten_table_grid(obj["data"])
                ],
            )

            tbl = doc.add_table(data=tbl_data, prov=_element_prov(pelem, (0, 0)))
            tbl.captions.extend(caption_refs)

        elif ptype in ["form", "key_value_region"]:
            label = DocItemLabel(ptype)
            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)

            _add_child_elements(container_el, doc, obj, pelem)

        elif "text" in obj:
            # The element covers only the [span_i, span_j) slice of the text.
            span_i = pelem["span"][0]
            span_j = pelem["span"][1]
            text = obj["text"][span_i:span_j]

            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and ptype == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = _element_prov(pelem, (0, len(text)))
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None

                doc.add_heading(text=text, prov=prov)
            else:
                current_list = None

                doc.add_text(label=label, text=text, prov=prov)

    return doc


def _element_prov(elem, charspan):
    """Build a provenance item from an element's page and bottom-left bbox."""
    return ProvenanceItem(
        page_no=elem["page"],
        charspan=charspan,
        bbox=BoundingBox.from_tuple(elem["bbox"], origin=CoordOrigin.BOTTOMLEFT),
    )


def _add_captions(doc, doc_glm, obj):
    """Add the captions of a figure/table object to ``doc`` as caption texts.

    Returns:
        A ``(text, refs)`` pair: the concatenation of every caption's full
        text, and the references of the caption items that were added.
    """
    caption_texts = []
    caption_refs = []

    for caption in obj["captions"]:
        caption_texts.append(caption["text"])

        for nprov in caption["prov"]:
            nelem = resolve_item(nprov["$ref"].split("/"), doc_glm)

            # Provenance entries that cannot be resolved are skipped silently.
            if nelem is None:
                continue

            span_i = nelem["span"][0]
            span_j = nelem["span"][1]

            caption_obj = doc.add_text(
                label=DocItemLabel.CAPTION,
                text=caption["text"][span_i:span_j],
                prov=_element_prov(nelem, tuple(nelem["span"])),
            )
            caption_refs.append(caption_obj.get_ref())

    return "".join(caption_texts), caption_refs


def _to_table_cell(tbl_cell_glm):
    """Convert one GLM table-cell dictionary into a ``TableCell``."""
    raw_bbox = tbl_cell_glm["bbox"]
    if raw_bbox is not None:
        bbox = BoundingBox.from_tuple(raw_bbox, origin=CoordOrigin.BOTTOMLEFT)
    else:
        bbox = None

    cell_type = tbl_cell_glm["type"]
    row_start = tbl_cell_glm["row-span"][0]
    row_end = tbl_cell_glm["row-span"][1]
    col_start = tbl_cell_glm["col-span"][0]
    col_end = tbl_cell_glm["col-span"][1]

    return TableCell(
        row_span=row_end - row_start,
        col_span=col_end - col_start,
        start_row_offset_idx=row_start,
        end_row_offset_idx=row_end,
        start_col_offset_idx=col_start,
        end_col_offset_idx=col_end,
        text=tbl_cell_glm["text"],
        bbox=bbox,
        column_header=cell_type == "col_header",
        row_header=cell_type == "row_header",
        row_section=cell_type == "row_section",
    )
305
+
306
+
307
def _add_child_elements(container_el, doc, obj, pelem):
    """Attach the children listed in ``obj["payload"]`` to ``container_el``.

    Nothing is added when ``obj`` has no payload. Each child is added to
    ``doc`` as a list item, a heading or a plain text item depending on its
    label, with ``container_el`` as parent.
    """
    payload = obj.get("payload")
    if payload is None:
        return

    for child in payload.get("children", []):
        c_label = DocItemLabel(child["label"])

        # Child boxes are converted to a bottom-left origin using the height
        # of the page the parent element is on.
        validated_bbox = BoundingBox.model_validate(child["bbox"])
        c_bbox = validated_bbox.to_bottom_left_origin(
            doc.pages[pelem["page"]].size.height
        )

        # Join the non-blank cell texts, replacing "\x02" with a hyphen.
        pieces = []
        for cell in child["cells"]:
            if len(cell["text"].strip()) > 0:
                pieces.append(cell["text"].replace("\x02", "-").strip())
        c_text = " ".join(pieces)

        c_prov = ProvenanceItem(
            page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
        )

        if c_label == DocItemLabel.LIST_ITEM:
            # TODO: Infer if this is a numbered or a bullet list item
            doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
            continue

        if c_label == DocItemLabel.SECTION_HEADER:
            doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
            continue

        doc.add_text(parent=container_el, label=c_label, text=c_text, prov=c_prov)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.8.3
3
+ Version: 2.10.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
25
25
  Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
- Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
29
- Requires-Dist: docling-core (>=2.6.1,<3.0.0)
28
+ Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
30
30
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
31
- Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
31
+ Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
34
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,7 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
39
39
  Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
40
40
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
41
41
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
42
- Requires-Dist: pydantic (>=2.0.0,<2.10)
42
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
43
43
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
44
44
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
45
45
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
59
59
  </a>
60
60
  </p>
61
61
 
62
- # 🦆 Docling
62
+ # Docling
63
63
 
64
64
  <p align="center">
65
65
  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -81,7 +81,7 @@ Docling parses documents and exports them to the desired format with ease and sp
81
81
 
82
82
  ## Features
83
83
 
84
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
84
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
85
85
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
86
86
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
87
87
  * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
@@ -2,28 +2,29 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
4
  docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
5
- docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJksWIYoXxZMjU,7633
6
- docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
5
+ docling/backend/docling_parse_backend.py,sha256=_jY5f5-KGI3hi5pcZAY6e7tPLocSi5JUWrxraDVszqI,7631
6
+ docling/backend/docling_parse_v2_backend.py,sha256=1TDUdMIp3fEjCWBNjusUHiCUmH1g6yZQ-b13scofP0Y,8637
7
7
  docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
8
8
  docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
9
  docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
10
10
  docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
11
- docling/backend/msword_backend.py,sha256=VFHPr-gCak7w3NJToc5Cs-JaTb4Vm3a1JnnRIfJO3TI,18526
11
+ docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
12
12
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
13
13
  docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
14
+ docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
14
15
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- docling/cli/main.py,sha256=R9ao2zCv1GZQIATOqg9b64O7AOUCWLwjJ-2FIpW8m0I,12236
16
+ docling/cli/main.py,sha256=bLk1RG0jwM4dn6G5qa5Q-S4_N3agKnoE28pTfbpV4-k,14713
16
17
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- docling/datamodel/base_models.py,sha256=mJ4h2haE0cOYz_eLd7QlRKU1y7u4yccMGk0tiZNICkQ,5542
18
- docling/datamodel/document.py,sha256=Y0NEFphwz44VxIaRaDRhtmw6rifzSC7MqyaDBzaR0lM,20902
19
- docling/datamodel/pipeline_options.py,sha256=K65nEZ52aRfF8hWIzl0zVvRQj-3XVwoBbxTacGS6jEg,4960
18
+ docling/datamodel/base_models.py,sha256=627IB8HZdXGmHNfsX4Qhf7kKSxx2btPjS7z8hitvhyE,5560
19
+ docling/datamodel/document.py,sha256=GNlTsgKgDqdqv2dfhpYmnqymxDQWWWC8HgE8uAta8V4,10265
20
+ docling/datamodel/pipeline_options.py,sha256=zQxLVioyBrldI4V9phQma1kTTgjmFQ6d3gVj2xq51gw,5010
20
21
  docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
21
- docling/document_converter.py,sha256=bsXGQCUrbL2LmaqaaEmlkfSANl2XwBBx8HDLwFrqhFY,11570
22
+ docling/document_converter.py,sha256=Iz5eerBWFPVJoXAMlXEivRQX2VLBiUkA07BL4NNbaEs,11583
22
23
  docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
23
24
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
25
  docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
25
26
  docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
26
- docling/models/ds_glm_model.py,sha256=hBRCx6oFGhxBbKEJlRSWVndDwFtB5IpeLOowFAVqFM0,12033
27
+ docling/models/ds_glm_model.py,sha256=3UpFu3Oavw9p0GItx2S9R7bPDdjY2NvpUQQDSVMctys,12045
27
28
  docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
28
29
  docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
29
30
  docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
@@ -36,14 +37,16 @@ docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUs
36
37
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
38
  docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
38
39
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
39
- docling/pipeline/standard_pdf_pipeline.py,sha256=7sbkh9EwXlhSfJSgf-WyjB5jdJ1El7Pn4siSssTJpq8,8789
40
+ docling/pipeline/standard_pdf_pipeline.py,sha256=B1q8xt3Dfecpi8s8DrcfPzdATh8TYgL43FDzzcS4vEA,8885
41
+ docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
40
42
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
43
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
44
+ docling/utils/glm_utils.py,sha256=H1O_tDiRksMgw45rY9LhK6GjcZSOq5IyoGurGjoo-Ac,11211
42
45
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
43
46
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
44
47
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
45
- docling-2.8.3.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
46
- docling-2.8.3.dist-info/METADATA,sha256=TKraAUApw0vLlToJ37cBQPNyJwoPmdWMIn73hYwq4Y8,7682
47
- docling-2.8.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
48
- docling-2.8.3.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
49
- docling-2.8.3.dist-info/RECORD,,
48
+ docling-2.10.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
49
+ docling-2.10.0.dist-info/METADATA,sha256=YVI-dBKxqAxrLATigzeXPZvwDZUhLSl_doltc-HenQ4,7731
50
+ docling-2.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
51
+ docling-2.10.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
52
+ docling-2.10.0.dist-info/RECORD,,