docling 2.41.0__tar.gz → 2.42.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.41.0 → docling-2.42.0}/PKG-INFO +2 -1
- {docling-2.41.0 → docling-2.42.0}/docling/backend/docx/latex/omml.py +9 -1
- {docling-2.41.0 → docling-2.42.0}/docling/backend/html_backend.py +25 -17
- {docling-2.41.0 → docling-2.42.0}/docling/backend/xml/jats_backend.py +12 -4
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/pipeline_options.py +4 -1
- {docling-2.41.0 → docling-2.42.0}/docling/document_converter.py +14 -11
- {docling-2.41.0 → docling-2.42.0}/docling/models/picture_description_vlm_model.py +2 -1
- {docling-2.41.0 → docling-2.42.0}/docling/utils/layout_postprocessor.py +3 -2
- {docling-2.41.0 → docling-2.42.0}/docling.egg-info/PKG-INFO +2 -1
- {docling-2.41.0 → docling-2.42.0}/docling.egg-info/requires.txt +1 -0
- {docling-2.41.0 → docling-2.42.0}/pyproject.toml +2 -1
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_jats.py +14 -14
- {docling-2.41.0 → docling-2.42.0}/LICENSE +0 -0
- {docling-2.41.0 → docling-2.42.0}/README.md +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/md_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/chunking/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/cli/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/cli/main.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/cli/models.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/cli/tools.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/document.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/settings.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/exceptions.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/base_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/layout_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/py.typed +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/__init__.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/export.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/locks.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/orientation.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/profiling.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/utils.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling/utils/visualization.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.41.0 → docling-2.42.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.41.0 → docling-2.42.0}/setup.cfg +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_csv.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_html.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_msword.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_backend_webp.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_cli.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_code_formula.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_input_doc.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_interfaces.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_invalid_input.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_options.py +0 -0
- {docling-2.41.0 → docling-2.42.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.41.0
|
3
|
+
Version: 2.42.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
|
|
50
50
|
Requires-Dist: pluggy<2.0.0,>=1.0.0
|
51
51
|
Requires-Dist: pylatexenc<3.0,>=2.10
|
52
52
|
Requires-Dist: scipy<2.0.0,>=1.6.0
|
53
|
+
Requires-Dist: accelerate<2,>=1.0.0
|
53
54
|
Provides-Extra: tesserocr
|
54
55
|
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
55
56
|
Provides-Extra: ocrmac
|
@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
|
|
260
260
|
the fraction object
|
261
261
|
"""
|
262
262
|
c_dict = self.process_children_dict(elm)
|
263
|
-
pr = c_dict
|
263
|
+
pr = c_dict.get("fPr")
|
264
|
+
if pr is None:
|
265
|
+
# Handle missing fPr element gracefully
|
266
|
+
_log.debug("Missing fPr element in fraction, using default formatting")
|
267
|
+
latex_s = F_DEFAULT
|
268
|
+
return latex_s.format(
|
269
|
+
num=c_dict.get("num"),
|
270
|
+
den=c_dict.get("den"),
|
271
|
+
)
|
264
272
|
latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
|
265
273
|
return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
|
266
274
|
|
@@ -379,6 +379,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
379
379
|
else:
|
380
380
|
_log.debug(f"list-item has no text: {element}")
|
381
381
|
|
382
|
+
@staticmethod
|
383
|
+
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
|
384
|
+
"""Extract colspan and rowspan values from a table cell tag.
|
385
|
+
|
386
|
+
This function retrieves the 'colspan' and 'rowspan' attributes from a given
|
387
|
+
table cell tag.
|
388
|
+
If the attribute does not exist or it is not numeric, it defaults to 1.
|
389
|
+
"""
|
390
|
+
raw_spans: tuple[str, str] = (
|
391
|
+
str(cell.get("colspan", "1")),
|
392
|
+
str(cell.get("rowspan", "1")),
|
393
|
+
)
|
394
|
+
int_spans: tuple[int, int] = (
|
395
|
+
int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
|
396
|
+
int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
|
397
|
+
)
|
398
|
+
|
399
|
+
return int_spans
|
400
|
+
|
382
401
|
@staticmethod
|
383
402
|
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
384
403
|
nested_tables = element.find("table")
|
@@ -398,10 +417,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
398
417
|
if not isinstance(row, Tag):
|
399
418
|
continue
|
400
419
|
cell_tag = cast(Tag, cell)
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
|
420
|
+
col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
|
421
|
+
col_count += col_span
|
422
|
+
if cell_tag.name == "td" or row_span == 1:
|
405
423
|
is_row_header = False
|
406
424
|
num_cols = max(num_cols, col_count)
|
407
425
|
if not is_row_header:
|
@@ -428,10 +446,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
428
446
|
row_header = True
|
429
447
|
for html_cell in cells:
|
430
448
|
if isinstance(html_cell, Tag):
|
449
|
+
_, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
431
450
|
if html_cell.name == "td":
|
432
451
|
col_header = False
|
433
452
|
row_header = False
|
434
|
-
elif
|
453
|
+
elif row_span == 1:
|
435
454
|
row_header = False
|
436
455
|
if not row_header:
|
437
456
|
row_idx += 1
|
@@ -456,18 +475,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
456
475
|
text = html_cell.text
|
457
476
|
|
458
477
|
# label = html_cell.name
|
459
|
-
|
460
|
-
col_span = (
|
461
|
-
int(col_val)
|
462
|
-
if isinstance(col_val, str) and col_val.isnumeric()
|
463
|
-
else 1
|
464
|
-
)
|
465
|
-
row_val = html_cell.get("rowspan", "1")
|
466
|
-
row_span = (
|
467
|
-
int(row_val)
|
468
|
-
if isinstance(row_val, str) and row_val.isnumeric()
|
469
|
-
else 1
|
470
|
-
)
|
478
|
+
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
471
479
|
if row_header:
|
472
480
|
row_span -= 1
|
473
481
|
while (
|
@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
93
93
|
|
94
94
|
# Initialize the root of the document hierarchy
|
95
95
|
self.root: Optional[NodeItem] = None
|
96
|
-
|
97
|
-
self.valid = False
|
96
|
+
self.hlevel: int = 0
|
97
|
+
self.valid: bool = False
|
98
98
|
try:
|
99
99
|
if isinstance(self.path_or_stream, BytesIO):
|
100
100
|
self.path_or_stream.seek(0)
|
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
147
147
|
binary_hash=self.document_hash,
|
148
148
|
)
|
149
149
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
150
|
+
self.hlevel = 0
|
150
151
|
|
151
152
|
# Get metadata XML components
|
152
153
|
xml_components: XMLComponents = self._parse_metadata()
|
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
304
305
|
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
|
305
306
|
if not text:
|
306
307
|
continue
|
307
|
-
parent = doc.add_heading(
|
308
|
+
parent = doc.add_heading(
|
309
|
+
parent=self.root, text=title, level=self.hlevel + 1
|
310
|
+
)
|
308
311
|
doc.add_text(
|
309
312
|
parent=parent,
|
310
313
|
text=text,
|
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
637
640
|
elif child.tag == "ack":
|
638
641
|
text = DEFAULT_HEADER_ACKNOWLEDGMENTS
|
639
642
|
if text:
|
640
|
-
|
643
|
+
self.hlevel += 1
|
644
|
+
new_parent = doc.add_heading(
|
645
|
+
text=text, parent=parent, level=self.hlevel
|
646
|
+
)
|
641
647
|
elif child.tag == "list":
|
642
648
|
new_parent = doc.add_group(
|
643
649
|
label=GroupLabel.LIST, name="list", parent=parent
|
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
694
700
|
new_text = self._walk_linear(doc, new_parent, child)
|
695
701
|
if not (node.getparent().tag == "p" and node.tag in flush_tags):
|
696
702
|
node_text += new_text
|
703
|
+
if child.tag in ("sec", "ack") and text:
|
704
|
+
self.hlevel -= 1
|
697
705
|
|
698
706
|
# pick up the tail text
|
699
707
|
node_text += child.tail.replace("\n", " ") if child.tail else ""
|
@@ -217,7 +217,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
|
|
217
217
|
|
218
218
|
# GraniteVision
|
219
219
|
granite_picture_description = PictureDescriptionVlmOptions(
|
220
|
-
repo_id="ibm-granite/granite-vision-3.
|
220
|
+
repo_id="ibm-granite/granite-vision-3.3-2b",
|
221
221
|
prompt="What is shown in this image?",
|
222
222
|
)
|
223
223
|
|
@@ -279,6 +279,9 @@ class LayoutOptions(BaseModel):
|
|
279
279
|
"""Options for layout processing."""
|
280
280
|
|
281
281
|
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
|
282
|
+
keep_empty_clusters: bool = (
|
283
|
+
False # Whether to keep clusters that contain no text cells
|
284
|
+
)
|
282
285
|
model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
|
283
286
|
|
284
287
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import hashlib
|
2
2
|
import logging
|
3
3
|
import sys
|
4
|
+
import threading
|
4
5
|
import time
|
5
6
|
from collections.abc import Iterable, Iterator
|
6
7
|
from functools import partial
|
@@ -49,6 +50,7 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|
49
50
|
from docling.utils.utils import chunkify
|
50
51
|
|
51
52
|
_log = logging.getLogger(__name__)
|
53
|
+
_PIPELINE_CACHE_LOCK = threading.Lock()
|
52
54
|
|
53
55
|
|
54
56
|
class FormatOption(BaseModel):
|
@@ -315,17 +317,18 @@ class DocumentConverter:
|
|
315
317
|
# Use a composite key to cache pipelines
|
316
318
|
cache_key = (pipeline_class, options_hash)
|
317
319
|
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
320
|
+
with _PIPELINE_CACHE_LOCK:
|
321
|
+
if cache_key not in self.initialized_pipelines:
|
322
|
+
_log.info(
|
323
|
+
f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
324
|
+
)
|
325
|
+
self.initialized_pipelines[cache_key] = pipeline_class(
|
326
|
+
pipeline_options=pipeline_options
|
327
|
+
)
|
328
|
+
else:
|
329
|
+
_log.debug(
|
330
|
+
f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
331
|
+
)
|
329
332
|
|
330
333
|
return self.initialized_pipelines[cache_key]
|
331
334
|
|
@@ -65,6 +65,7 @@ class PictureDescriptionVlmModel(
|
|
65
65
|
self.processor = AutoProcessor.from_pretrained(artifacts_path)
|
66
66
|
self.model = AutoModelForVision2Seq.from_pretrained(
|
67
67
|
artifacts_path,
|
68
|
+
device_map=self.device,
|
68
69
|
torch_dtype=torch.bfloat16,
|
69
70
|
_attn_implementation=(
|
70
71
|
"flash_attention_2"
|
@@ -72,7 +73,7 @@ class PictureDescriptionVlmModel(
|
|
72
73
|
and accelerator_options.cuda_use_flash_attention2
|
73
74
|
else "eager"
|
74
75
|
),
|
75
|
-
)
|
76
|
+
)
|
76
77
|
|
77
78
|
self.provenance = f"{self.options.repo_id}"
|
78
79
|
|
@@ -267,8 +267,9 @@ class LayoutPostprocessor:
|
|
267
267
|
# Initial cell assignment
|
268
268
|
clusters = self._assign_cells_to_clusters(clusters)
|
269
269
|
|
270
|
-
# Remove clusters with no cells
|
271
|
-
|
270
|
+
# Remove clusters with no cells (if keep_empty_clusters is False)
|
271
|
+
if not self.options.keep_empty_clusters:
|
272
|
+
clusters = [cluster for cluster in clusters if cluster.cells]
|
272
273
|
|
273
274
|
# Handle orphaned cells
|
274
275
|
unassigned = self._find_unassigned_cells(clusters)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.41.0
|
3
|
+
Version: 2.42.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
|
|
50
50
|
Requires-Dist: pluggy<2.0.0,>=1.0.0
|
51
51
|
Requires-Dist: pylatexenc<3.0,>=2.10
|
52
52
|
Requires-Dist: scipy<2.0.0,>=1.6.0
|
53
|
+
Requires-Dist: accelerate<2,>=1.0.0
|
53
54
|
Provides-Extra: tesserocr
|
54
55
|
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
55
56
|
Provides-Extra: ocrmac
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.41.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.42.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
license = "MIT"
|
6
6
|
keywords = [
|
@@ -70,6 +70,7 @@ dependencies = [
|
|
70
70
|
'scipy (>=1.6.0,<2.0.0)',
|
71
71
|
# 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
|
72
72
|
# 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
|
73
|
+
"accelerate>=1.0.0,<2",
|
73
74
|
]
|
74
75
|
|
75
76
|
[project.urls]
|
@@ -14,9 +14,9 @@ from .verify_utils import verify_document, verify_export
|
|
14
14
|
GENERATE = GEN_TEST_DATA
|
15
15
|
|
16
16
|
|
17
|
-
def
|
18
|
-
directory = Path(os.path.dirname(__file__) + "/data/
|
19
|
-
xml_files = sorted(directory.rglob("*.
|
17
|
+
def get_jats_paths():
|
18
|
+
directory = Path(os.path.dirname(__file__) + "/data/jats/")
|
19
|
+
xml_files = sorted(directory.rglob("*.nxml"))
|
20
20
|
return xml_files
|
21
21
|
|
22
22
|
|
@@ -25,20 +25,20 @@ def get_converter():
|
|
25
25
|
return converter
|
26
26
|
|
27
27
|
|
28
|
-
def test_e2e_pubmed_conversions(use_stream=False):
|
29
|
-
|
28
|
+
def test_e2e_jats_conversions(use_stream=False):
|
29
|
+
jats_paths = get_jats_paths()
|
30
30
|
converter = get_converter()
|
31
31
|
|
32
|
-
for
|
32
|
+
for jats_path in jats_paths:
|
33
33
|
gt_path = (
|
34
|
-
|
34
|
+
jats_path.parent.parent / "groundtruth" / "docling_v2" / jats_path.name
|
35
35
|
)
|
36
36
|
if use_stream:
|
37
|
-
buf = BytesIO(
|
38
|
-
stream = DocumentStream(name=
|
37
|
+
buf = BytesIO(jats_path.open("rb").read())
|
38
|
+
stream = DocumentStream(name=jats_path.name, stream=buf)
|
39
39
|
conv_result: ConversionResult = converter.convert(stream)
|
40
40
|
else:
|
41
|
-
conv_result: ConversionResult = converter.convert(
|
41
|
+
conv_result: ConversionResult = converter.convert(jats_path)
|
42
42
|
doc: DoclingDocument = conv_result.document
|
43
43
|
|
44
44
|
pred_md: str = doc.export_to_markdown()
|
@@ -54,9 +54,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
|
|
54
54
|
assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
|
55
55
|
|
56
56
|
|
57
|
-
def
|
58
|
-
|
57
|
+
def test_e2e_jats_conversions_stream():
|
58
|
+
test_e2e_jats_conversions(use_stream=True)
|
59
59
|
|
60
60
|
|
61
|
-
def
|
62
|
-
|
61
|
+
def test_e2e_jats_conversions_no_stream():
|
62
|
+
test_e2e_jats_conversions(use_stream=False)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|