docling 2.36.0__tar.gz → 2.37.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.36.0 → docling-2.37.0}/PKG-INFO +2 -3
- {docling-2.36.0 → docling-2.37.0}/docling/backend/asciidoc_backend.py +39 -18
- {docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_backend.py +61 -59
- {docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_v2_backend.py +72 -62
- {docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_v4_backend.py +21 -19
- {docling-2.36.0 → docling-2.37.0}/docling/backend/mspowerpoint_backend.py +72 -113
- {docling-2.36.0 → docling-2.37.0}/docling/backend/msword_backend.py +28 -18
- {docling-2.36.0 → docling-2.37.0}/docling/backend/pypdfium2_backend.py +127 -53
- {docling-2.36.0 → docling-2.37.0}/docling/datamodel/base_models.py +10 -3
- {docling-2.36.0 → docling-2.37.0}/docling/datamodel/pipeline_options.py +3 -1
- {docling-2.36.0 → docling-2.37.0}/docling/datamodel/pipeline_options_vlm_model.py +2 -1
- {docling-2.36.0 → docling-2.37.0}/docling/models/base_ocr_model.py +33 -11
- {docling-2.36.0 → docling-2.37.0}/docling/models/easyocr_model.py +1 -1
- {docling-2.36.0 → docling-2.37.0}/docling/models/layout_model.py +2 -3
- {docling-2.36.0 → docling-2.37.0}/docling/models/ocr_mac_model.py +1 -1
- {docling-2.36.0 → docling-2.37.0}/docling/models/page_preprocessing_model.py +3 -6
- {docling-2.36.0 → docling-2.37.0}/docling/models/rapid_ocr_model.py +1 -1
- {docling-2.36.0 → docling-2.37.0}/docling/models/readingorder_model.py +2 -2
- {docling-2.36.0 → docling-2.37.0}/docling/models/tesseract_ocr_cli_model.py +4 -3
- {docling-2.36.0 → docling-2.37.0}/docling/models/tesseract_ocr_model.py +1 -1
- {docling-2.36.0 → docling-2.37.0}/docling/models/vlm_models_inline/hf_transformers_model.py +1 -0
- {docling-2.36.0 → docling-2.37.0}/docling/pipeline/standard_pdf_pipeline.py +0 -1
- {docling-2.36.0 → docling-2.37.0}/docling/utils/layout_postprocessor.py +11 -6
- {docling-2.36.0 → docling-2.37.0}/docling.egg-info/PKG-INFO +2 -3
- {docling-2.36.0 → docling-2.37.0}/docling.egg-info/requires.txt +1 -2
- {docling-2.36.0 → docling-2.37.0}/pyproject.toml +2 -3
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_asciidoc.py +23 -1
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_msexcel.py +11 -11
- {docling-2.36.0 → docling-2.37.0}/LICENSE +0 -0
- {docling-2.36.0 → docling-2.37.0}/README.md +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/html_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/md_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/chunking/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/cli/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/cli/main.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/cli/models.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/cli/tools.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/datamodel/document.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/datamodel/settings.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/document_converter.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/exceptions.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/base_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/py.typed +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/__init__.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/export.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/locks.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/orientation.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/profiling.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/utils.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling/utils/visualization.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.36.0 → docling-2.37.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.36.0 → docling-2.37.0}/setup.cfg +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_csv.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_html.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_jats.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_msword.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_backend_webp.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_cli.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_code_formula.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_e2e_ocr_conversion.py +2 -2
- {docling-2.36.0 → docling-2.37.0}/tests/test_input_doc.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_interfaces.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_invalid_input.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_options.py +0 -0
- {docling-2.36.0 → docling-2.37.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.36.0
|
3
|
+
Version: 2.37.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -37,7 +37,7 @@ Requires-Dist: requests<3.0.0,>=2.32.2
|
|
37
37
|
Requires-Dist: easyocr<2.0,>=1.7
|
38
38
|
Requires-Dist: certifi>=2024.7.4
|
39
39
|
Requires-Dist: rtree<2.0.0,>=1.3.0
|
40
|
-
Requires-Dist: typer<0.
|
40
|
+
Requires-Dist: typer<0.17.0,>=0.12.5
|
41
41
|
Requires-Dist: python-docx<2.0.0,>=1.1.2
|
42
42
|
Requires-Dist: python-pptx<2.0.0,>=1.0.2
|
43
43
|
Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
|
@@ -49,7 +49,6 @@ Requires-Dist: pillow<12.0.0,>=10.0.0
|
|
49
49
|
Requires-Dist: tqdm<5.0.0,>=4.65.0
|
50
50
|
Requires-Dist: pluggy<2.0.0,>=1.0.0
|
51
51
|
Requires-Dist: pylatexenc<3.0,>=2.10
|
52
|
-
Requires-Dist: click<8.2.0
|
53
52
|
Requires-Dist: scipy<2.0.0,>=1.6.0
|
54
53
|
Provides-Extra: tesserocr
|
55
54
|
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
@@ -2,7 +2,7 @@ import logging
|
|
2
2
|
import re
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import Set, Union
|
5
|
+
from typing import Final, Set, Union
|
6
6
|
|
7
7
|
from docling_core.types.doc import (
|
8
8
|
DocItemLabel,
|
@@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument
|
|
22
22
|
|
23
23
|
_log = logging.getLogger(__name__)
|
24
24
|
|
25
|
+
DEFAULT_IMAGE_WIDTH: Final = 128
|
26
|
+
DEFAULT_IMAGE_HEIGHT: Final = 128
|
27
|
+
|
25
28
|
|
26
29
|
class AsciiDocBackend(DeclarativeDocumentBackend):
|
27
30
|
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
@@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
200
203
|
|
201
204
|
item = self._parse_picture(line)
|
202
205
|
|
203
|
-
size
|
206
|
+
size: Size
|
204
207
|
if "width" in item and "height" in item:
|
205
208
|
size = Size(width=int(item["width"]), height=int(item["height"]))
|
209
|
+
else:
|
210
|
+
size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT)
|
206
211
|
|
207
212
|
uri = None
|
208
213
|
if (
|
@@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
264
269
|
|
265
270
|
return doc
|
266
271
|
|
267
|
-
|
272
|
+
@staticmethod
|
273
|
+
def _get_current_level(parents):
|
268
274
|
for k, v in parents.items():
|
269
275
|
if v is None and k > 0:
|
270
276
|
return k - 1
|
271
277
|
|
272
278
|
return 0
|
273
279
|
|
274
|
-
|
280
|
+
@staticmethod
|
281
|
+
def _get_current_parent(parents):
|
275
282
|
for k, v in parents.items():
|
276
283
|
if v is None and k > 0:
|
277
284
|
return parents[k - 1]
|
@@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
279
286
|
return None
|
280
287
|
|
281
288
|
# ========= Title
|
282
|
-
|
289
|
+
@staticmethod
|
290
|
+
def _is_title(line):
|
283
291
|
return re.match(r"^= ", line)
|
284
292
|
|
285
|
-
|
293
|
+
@staticmethod
|
294
|
+
def _parse_title(line):
|
286
295
|
return {"type": "title", "text": line[2:].strip(), "level": 0}
|
287
296
|
|
288
297
|
# ========= Section headers
|
289
|
-
|
298
|
+
@staticmethod
|
299
|
+
def _is_section_header(line):
|
290
300
|
return re.match(r"^==+\s+", line)
|
291
301
|
|
292
|
-
|
302
|
+
@staticmethod
|
303
|
+
def _parse_section_header(line):
|
293
304
|
match = re.match(r"^(=+)\s+(.*)", line)
|
294
305
|
|
295
306
|
marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
|
@@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
303
314
|
}
|
304
315
|
|
305
316
|
# ========= Lists
|
306
|
-
|
317
|
+
@staticmethod
|
318
|
+
def _is_list_item(line):
|
307
319
|
return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
|
308
320
|
|
309
|
-
|
321
|
+
@staticmethod
|
322
|
+
def _parse_list_item(line):
|
310
323
|
"""Extract the item marker (number or bullet symbol) and the text of the item."""
|
311
324
|
|
312
325
|
match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
|
@@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
342
355
|
}
|
343
356
|
|
344
357
|
# ========= Tables
|
345
|
-
|
358
|
+
@staticmethod
|
359
|
+
def _is_table_line(line):
|
346
360
|
return re.match(r"^\|.*\|", line)
|
347
361
|
|
348
|
-
|
362
|
+
@staticmethod
|
363
|
+
def _parse_table_line(line):
|
349
364
|
# Split table cells and trim extra spaces
|
350
365
|
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
351
366
|
|
352
|
-
|
367
|
+
@staticmethod
|
368
|
+
def _populate_table_as_grid(table_data):
|
353
369
|
num_rows = len(table_data)
|
354
370
|
|
355
371
|
# Adjust the table data into a grid format
|
@@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
380
396
|
return data
|
381
397
|
|
382
398
|
# ========= Pictures
|
383
|
-
|
399
|
+
@staticmethod
|
400
|
+
def _is_picture(line):
|
384
401
|
return re.match(r"^image::", line)
|
385
402
|
|
386
|
-
|
403
|
+
@staticmethod
|
404
|
+
def _parse_picture(line):
|
387
405
|
"""
|
388
406
|
Parse an image macro, extracting its path and attributes.
|
389
407
|
Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
|
@@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
406
424
|
return {"type": "picture", "uri": line}
|
407
425
|
|
408
426
|
# ========= Captions
|
409
|
-
|
427
|
+
@staticmethod
|
428
|
+
def _is_caption(line):
|
410
429
|
return re.match(r"^\.(.+)", line)
|
411
430
|
|
412
|
-
|
431
|
+
@staticmethod
|
432
|
+
def _parse_caption(line):
|
413
433
|
mtch = re.match(r"^\.(.+)", line)
|
414
434
|
if mtch:
|
415
435
|
text = mtch.group(1)
|
@@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
418
438
|
return {"type": "caption", "text": ""}
|
419
439
|
|
420
440
|
# ========= Plain text
|
421
|
-
|
441
|
+
@staticmethod
|
442
|
+
def _parse_text(line):
|
422
443
|
return {"type": "text", "text": line.strip()}
|
@@ -7,12 +7,17 @@ from typing import List, Optional, Union
|
|
7
7
|
|
8
8
|
import pypdfium2 as pdfium
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
10
|
-
from docling_core.types.doc.page import
|
10
|
+
from docling_core.types.doc.page import (
|
11
|
+
BoundingRectangle,
|
12
|
+
SegmentedPdfPage,
|
13
|
+
TextCell,
|
14
|
+
)
|
11
15
|
from docling_parse.pdf_parsers import pdf_parser_v1
|
12
16
|
from PIL import Image, ImageDraw
|
13
17
|
from pypdfium2 import PdfPage
|
14
18
|
|
15
19
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
20
|
+
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
16
21
|
from docling.datamodel.document import InputDocument
|
17
22
|
|
18
23
|
_log = logging.getLogger(__name__)
|
@@ -36,43 +41,8 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
36
41
|
def is_valid(self) -> bool:
|
37
42
|
return self.valid
|
38
43
|
|
39
|
-
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
40
|
-
|
41
|
-
return ""
|
42
|
-
# Find intersecting cells on the page
|
43
|
-
text_piece = ""
|
44
|
-
page_size = self.get_size()
|
45
|
-
parser_width = self._dpage["width"]
|
46
|
-
parser_height = self._dpage["height"]
|
47
|
-
|
48
|
-
scale = (
|
49
|
-
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
50
|
-
)
|
51
|
-
|
52
|
-
for i in range(len(self._dpage["cells"])):
|
53
|
-
rect = self._dpage["cells"][i]["box"]["device"]
|
54
|
-
x0, y0, x1, y1 = rect
|
55
|
-
cell_bbox = BoundingBox(
|
56
|
-
l=x0 * scale * page_size.width / parser_width,
|
57
|
-
b=y0 * scale * page_size.height / parser_height,
|
58
|
-
r=x1 * scale * page_size.width / parser_width,
|
59
|
-
t=y1 * scale * page_size.height / parser_height,
|
60
|
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
61
|
-
).to_top_left_origin(page_height=page_size.height * scale)
|
62
|
-
|
63
|
-
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
64
|
-
|
65
|
-
if overlap_frac > 0.5:
|
66
|
-
if len(text_piece) > 0:
|
67
|
-
text_piece += " "
|
68
|
-
text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
|
69
|
-
|
70
|
-
return text_piece
|
71
|
-
|
72
|
-
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
73
|
-
return None
|
74
|
-
|
75
|
-
def get_text_cells(self) -> Iterable[TextCell]:
|
44
|
+
def _compute_text_cells(self) -> List[TextCell]:
|
45
|
+
"""Compute text cells from docling-parse data."""
|
76
46
|
cells: List[TextCell] = []
|
77
47
|
cell_counter = 0
|
78
48
|
|
@@ -102,7 +72,6 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
102
72
|
from_ocr=False,
|
103
73
|
rect=BoundingRectangle.from_bounding_box(
|
104
74
|
BoundingBox(
|
105
|
-
# l=x0, b=y0, r=x1, t=y1,
|
106
75
|
l=x0 * page_size.width / parser_width,
|
107
76
|
b=y0 * page_size.height / parser_height,
|
108
77
|
r=x1 * page_size.width / parser_width,
|
@@ -115,30 +84,63 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
115
84
|
|
116
85
|
cell_counter += 1
|
117
86
|
|
118
|
-
|
119
|
-
image = (
|
120
|
-
self.get_page_image()
|
121
|
-
) # make new image to avoid drawing on the saved ones
|
122
|
-
draw = ImageDraw.Draw(image)
|
123
|
-
for c in cells:
|
124
|
-
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
125
|
-
cell_color = (
|
126
|
-
random.randint(30, 140),
|
127
|
-
random.randint(30, 140),
|
128
|
-
random.randint(30, 140),
|
129
|
-
)
|
130
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
131
|
-
image.show()
|
87
|
+
return cells
|
132
88
|
|
133
|
-
|
134
|
-
|
89
|
+
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
90
|
+
if not self.valid:
|
91
|
+
return ""
|
92
|
+
# Find intersecting cells on the page
|
93
|
+
text_piece = ""
|
94
|
+
page_size = self.get_size()
|
95
|
+
parser_width = self._dpage["width"]
|
96
|
+
parser_height = self._dpage["height"]
|
135
97
|
|
136
|
-
|
98
|
+
scale = (
|
99
|
+
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
100
|
+
)
|
137
101
|
|
138
|
-
|
139
|
-
|
102
|
+
for i in range(len(self._dpage["cells"])):
|
103
|
+
rect = self._dpage["cells"][i]["box"]["device"]
|
104
|
+
x0, y0, x1, y1 = rect
|
105
|
+
cell_bbox = BoundingBox(
|
106
|
+
l=x0 * scale * page_size.width / parser_width,
|
107
|
+
b=y0 * scale * page_size.height / parser_height,
|
108
|
+
r=x1 * scale * page_size.width / parser_width,
|
109
|
+
t=y1 * scale * page_size.height / parser_height,
|
110
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
111
|
+
).to_top_left_origin(page_height=page_size.height * scale)
|
140
112
|
|
141
|
-
|
113
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
114
|
+
|
115
|
+
if overlap_frac > 0.5:
|
116
|
+
if len(text_piece) > 0:
|
117
|
+
text_piece += " "
|
118
|
+
text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
|
119
|
+
|
120
|
+
return text_piece
|
121
|
+
|
122
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
123
|
+
if not self.valid:
|
124
|
+
return None
|
125
|
+
|
126
|
+
text_cells = self._compute_text_cells()
|
127
|
+
|
128
|
+
# Get the PDF page geometry from pypdfium2
|
129
|
+
dimension = get_pdf_page_geometry(self._ppage)
|
130
|
+
|
131
|
+
# Create SegmentedPdfPage
|
132
|
+
return SegmentedPdfPage(
|
133
|
+
dimension=dimension,
|
134
|
+
textline_cells=text_cells,
|
135
|
+
char_cells=[],
|
136
|
+
word_cells=[],
|
137
|
+
has_lines=len(text_cells) > 0,
|
138
|
+
has_words=False,
|
139
|
+
has_chars=False,
|
140
|
+
)
|
141
|
+
|
142
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
143
|
+
return self._compute_text_cells()
|
142
144
|
|
143
145
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
144
146
|
AREA_THRESHOLD = 0 # 32 * 32
|
@@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
|
|
7
7
|
|
8
8
|
import pypdfium2 as pdfium
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
10
|
-
from docling_core.types.doc.page import
|
10
|
+
from docling_core.types.doc.page import (
|
11
|
+
BoundingRectangle,
|
12
|
+
PdfPageBoundaryType,
|
13
|
+
PdfPageGeometry,
|
14
|
+
SegmentedPdfPage,
|
15
|
+
TextCell,
|
16
|
+
)
|
11
17
|
from docling_parse.pdf_parsers import pdf_parser_v2
|
12
18
|
from PIL import Image, ImageDraw
|
13
19
|
from pypdfium2 import PdfPage
|
14
20
|
|
15
21
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
22
|
+
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
16
23
|
from docling.datamodel.base_models import Size
|
17
24
|
from docling.utils.locks import pypdfium2_lock
|
18
25
|
|
@@ -40,50 +47,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
40
47
|
def is_valid(self) -> bool:
|
41
48
|
return self.valid
|
42
49
|
|
43
|
-
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
44
|
-
|
45
|
-
return ""
|
46
|
-
# Find intersecting cells on the page
|
47
|
-
text_piece = ""
|
48
|
-
page_size = self.get_size()
|
49
|
-
|
50
|
-
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
51
|
-
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
52
|
-
|
53
|
-
scale = (
|
54
|
-
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
55
|
-
)
|
56
|
-
|
57
|
-
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
58
|
-
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
59
|
-
|
60
|
-
for i, cell_data in enumerate(cells_data):
|
61
|
-
x0 = cell_data[cells_header.index("x0")]
|
62
|
-
y0 = cell_data[cells_header.index("y0")]
|
63
|
-
x1 = cell_data[cells_header.index("x1")]
|
64
|
-
y1 = cell_data[cells_header.index("y1")]
|
65
|
-
|
66
|
-
cell_bbox = BoundingBox(
|
67
|
-
l=x0 * scale * page_size.width / parser_width,
|
68
|
-
b=y0 * scale * page_size.height / parser_height,
|
69
|
-
r=x1 * scale * page_size.width / parser_width,
|
70
|
-
t=y1 * scale * page_size.height / parser_height,
|
71
|
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
72
|
-
).to_top_left_origin(page_height=page_size.height * scale)
|
73
|
-
|
74
|
-
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
75
|
-
|
76
|
-
if overlap_frac > 0.5:
|
77
|
-
if len(text_piece) > 0:
|
78
|
-
text_piece += " "
|
79
|
-
text_piece += cell_data[cells_header.index("text")]
|
80
|
-
|
81
|
-
return text_piece
|
82
|
-
|
83
|
-
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
84
|
-
return None
|
85
|
-
|
86
|
-
def get_text_cells(self) -> Iterable[TextCell]:
|
50
|
+
def _compute_text_cells(self) -> List[TextCell]:
|
51
|
+
"""Compute text cells from docling-parse v2 data."""
|
87
52
|
cells: List[TextCell] = []
|
88
53
|
cell_counter = 0
|
89
54
|
|
@@ -118,7 +83,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
118
83
|
from_ocr=False,
|
119
84
|
rect=BoundingRectangle.from_bounding_box(
|
120
85
|
BoundingBox(
|
121
|
-
# l=x0, b=y0, r=x1, t=y1,
|
122
86
|
l=x0 * page_size.width / parser_width,
|
123
87
|
b=y0 * page_size.height / parser_height,
|
124
88
|
r=x1 * page_size.width / parser_width,
|
@@ -130,24 +94,70 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
130
94
|
)
|
131
95
|
cell_counter += 1
|
132
96
|
|
133
|
-
|
134
|
-
image = (
|
135
|
-
self.get_page_image()
|
136
|
-
) # make new image to avoid drawing on the saved ones
|
137
|
-
draw = ImageDraw.Draw(image)
|
138
|
-
for c in cells:
|
139
|
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
140
|
-
cell_color = (
|
141
|
-
random.randint(30, 140),
|
142
|
-
random.randint(30, 140),
|
143
|
-
random.randint(30, 140),
|
144
|
-
)
|
145
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
146
|
-
image.show()
|
97
|
+
return cells
|
147
98
|
|
148
|
-
|
99
|
+
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
100
|
+
if not self.valid:
|
101
|
+
return ""
|
102
|
+
# Find intersecting cells on the page
|
103
|
+
text_piece = ""
|
104
|
+
page_size = self.get_size()
|
149
105
|
|
150
|
-
|
106
|
+
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
107
|
+
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
108
|
+
|
109
|
+
scale = (
|
110
|
+
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
111
|
+
)
|
112
|
+
|
113
|
+
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
114
|
+
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
115
|
+
|
116
|
+
for i, cell_data in enumerate(cells_data):
|
117
|
+
x0 = cell_data[cells_header.index("x0")]
|
118
|
+
y0 = cell_data[cells_header.index("y0")]
|
119
|
+
x1 = cell_data[cells_header.index("x1")]
|
120
|
+
y1 = cell_data[cells_header.index("y1")]
|
121
|
+
|
122
|
+
cell_bbox = BoundingBox(
|
123
|
+
l=x0 * scale * page_size.width / parser_width,
|
124
|
+
b=y0 * scale * page_size.height / parser_height,
|
125
|
+
r=x1 * scale * page_size.width / parser_width,
|
126
|
+
t=y1 * scale * page_size.height / parser_height,
|
127
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
128
|
+
).to_top_left_origin(page_height=page_size.height * scale)
|
129
|
+
|
130
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
131
|
+
|
132
|
+
if overlap_frac > 0.5:
|
133
|
+
if len(text_piece) > 0:
|
134
|
+
text_piece += " "
|
135
|
+
text_piece += cell_data[cells_header.index("text")]
|
136
|
+
|
137
|
+
return text_piece
|
138
|
+
|
139
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
140
|
+
if not self.valid:
|
141
|
+
return None
|
142
|
+
|
143
|
+
text_cells = self._compute_text_cells()
|
144
|
+
|
145
|
+
# Get the PDF page geometry from pypdfium2
|
146
|
+
dimension = get_pdf_page_geometry(self._ppage)
|
147
|
+
|
148
|
+
# Create SegmentedPdfPage
|
149
|
+
return SegmentedPdfPage(
|
150
|
+
dimension=dimension,
|
151
|
+
textline_cells=text_cells,
|
152
|
+
char_cells=[],
|
153
|
+
word_cells=[],
|
154
|
+
has_textlines=len(text_cells) > 0,
|
155
|
+
has_words=False,
|
156
|
+
has_chars=False,
|
157
|
+
)
|
158
|
+
|
159
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
160
|
+
return self._compute_text_cells()
|
151
161
|
|
152
162
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
153
163
|
AREA_THRESHOLD = 0 # 32 * 32
|
@@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
59
59
|
return self._dpage
|
60
60
|
|
61
61
|
def get_text_cells(self) -> Iterable[TextCell]:
|
62
|
-
page_size = self.get_size()
|
63
|
-
|
64
|
-
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
|
65
|
-
|
66
|
-
# for cell in self._dpage.textline_cells:
|
67
|
-
# rect = cell.rect
|
68
|
-
#
|
69
|
-
# assert (
|
70
|
-
# rect.to_bounding_box().l <= rect.to_bounding_box().r
|
71
|
-
# ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
|
72
|
-
# assert (
|
73
|
-
# rect.to_bounding_box().t <= rect.to_bounding_box().b
|
74
|
-
# ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
|
75
|
-
|
76
62
|
return self._dpage.textline_cells
|
77
63
|
|
78
64
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
@@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
171
157
|
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
172
158
|
) -> DoclingParseV4PageBackend:
|
173
159
|
with pypdfium2_lock:
|
160
|
+
seg_page = self.dp_doc.get_page(
|
161
|
+
page_no + 1,
|
162
|
+
create_words=create_words,
|
163
|
+
create_textlines=create_textlines,
|
164
|
+
)
|
165
|
+
|
166
|
+
# In Docling, all TextCell instances are expected with top-left origin.
|
167
|
+
[
|
168
|
+
tc.to_top_left_origin(seg_page.dimension.height)
|
169
|
+
for tc in seg_page.textline_cells
|
170
|
+
]
|
171
|
+
[
|
172
|
+
tc.to_top_left_origin(seg_page.dimension.height)
|
173
|
+
for tc in seg_page.char_cells
|
174
|
+
]
|
175
|
+
[
|
176
|
+
tc.to_top_left_origin(seg_page.dimension.height)
|
177
|
+
for tc in seg_page.word_cells
|
178
|
+
]
|
179
|
+
|
174
180
|
return DoclingParseV4PageBackend(
|
175
|
-
|
176
|
-
page_no + 1,
|
177
|
-
create_words=create_words,
|
178
|
-
create_textlines=create_textlines,
|
179
|
-
),
|
181
|
+
seg_page,
|
180
182
|
self._pdoc[page_no],
|
181
183
|
)
|
182
184
|
|