docling 2.20.0__py3-none-any.whl → 2.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/csv_backend.py +125 -0
- docling/cli/main.py +7 -0
- docling/datamodel/base_models.py +3 -0
- docling/datamodel/document.py +33 -0
- docling/datamodel/pipeline_options.py +1 -0
- docling/datamodel/settings.py +2 -1
- docling/document_converter.py +9 -0
- docling/exceptions.py +4 -0
- docling/models/ds_glm_model.py +60 -2
- docling/models/picture_description_api_model.py +11 -4
- docling/models/tesseract_ocr_model.py +1 -2
- docling/pipeline/standard_pdf_pipeline.py +9 -0
- docling/utils/glm_utils.py +10 -0
- {docling-2.20.0.dist-info → docling-2.22.0.dist-info}/METADATA +3 -3
- {docling-2.20.0.dist-info → docling-2.22.0.dist-info}/RECORD +18 -17
- {docling-2.20.0.dist-info → docling-2.22.0.dist-info}/LICENSE +0 -0
- {docling-2.20.0.dist-info → docling-2.22.0.dist-info}/WHEEL +0 -0
- {docling-2.20.0.dist-info → docling-2.22.0.dist-info}/entry_points.txt +0 -0
docling/backend/csv_backend.py
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
import csv
|
2
|
+
import logging
|
3
|
+
import warnings
|
4
|
+
from io import BytesIO, StringIO
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Set, Union
|
7
|
+
|
8
|
+
from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData
|
9
|
+
|
10
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
11
|
+
from docling.datamodel.base_models import InputFormat
|
12
|
+
from docling.datamodel.document import InputDocument
|
13
|
+
|
14
|
+
_log = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class CsvDocumentBackend(DeclarativeDocumentBackend):
|
18
|
+
content: StringIO
|
19
|
+
|
20
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
21
|
+
super().__init__(in_doc, path_or_stream)
|
22
|
+
|
23
|
+
# Load content
|
24
|
+
try:
|
25
|
+
if isinstance(self.path_or_stream, BytesIO):
|
26
|
+
self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8"))
|
27
|
+
elif isinstance(self.path_or_stream, Path):
|
28
|
+
self.content = StringIO(self.path_or_stream.read_text("utf-8"))
|
29
|
+
self.valid = True
|
30
|
+
except Exception as e:
|
31
|
+
raise RuntimeError(
|
32
|
+
f"CsvDocumentBackend could not load document with hash {self.document_hash}"
|
33
|
+
) from e
|
34
|
+
return
|
35
|
+
|
36
|
+
def is_valid(self) -> bool:
|
37
|
+
return self.valid
|
38
|
+
|
39
|
+
@classmethod
|
40
|
+
def supports_pagination(cls) -> bool:
|
41
|
+
return False
|
42
|
+
|
43
|
+
def unload(self):
|
44
|
+
if isinstance(self.path_or_stream, BytesIO):
|
45
|
+
self.path_or_stream.close()
|
46
|
+
self.path_or_stream = None
|
47
|
+
|
48
|
+
@classmethod
|
49
|
+
def supported_formats(cls) -> Set[InputFormat]:
|
50
|
+
return {InputFormat.CSV}
|
51
|
+
|
52
|
+
def convert(self) -> DoclingDocument:
|
53
|
+
"""
|
54
|
+
Parses the CSV data into a structured document model.
|
55
|
+
"""
|
56
|
+
|
57
|
+
# Detect CSV dialect
|
58
|
+
head = self.content.readline()
|
59
|
+
dialect = csv.Sniffer().sniff(head, ",;\t|:")
|
60
|
+
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
|
61
|
+
if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
|
62
|
+
raise RuntimeError(
|
63
|
+
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
|
64
|
+
)
|
65
|
+
|
66
|
+
# Parce CSV
|
67
|
+
self.content.seek(0)
|
68
|
+
result = csv.reader(self.content, dialect=dialect, strict=True)
|
69
|
+
self.csv_data = list(result)
|
70
|
+
_log.info(f"Detected {len(self.csv_data)} lines")
|
71
|
+
|
72
|
+
# Ensure uniform column length
|
73
|
+
expected_length = len(self.csv_data[0])
|
74
|
+
is_uniform = all(len(row) == expected_length for row in self.csv_data)
|
75
|
+
if not is_uniform:
|
76
|
+
warnings.warn(
|
77
|
+
f"Inconsistent column lengths detected in CSV data. "
|
78
|
+
f"Expected {expected_length} columns, but found rows with varying lengths. "
|
79
|
+
f"Ensure all rows have the same number of columns."
|
80
|
+
)
|
81
|
+
|
82
|
+
# Parse the CSV into a structured document model
|
83
|
+
origin = DocumentOrigin(
|
84
|
+
filename=self.file.name or "file.csv",
|
85
|
+
mimetype="text/csv",
|
86
|
+
binary_hash=self.document_hash,
|
87
|
+
)
|
88
|
+
|
89
|
+
doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
|
90
|
+
|
91
|
+
if self.is_valid():
|
92
|
+
# Convert CSV data to table
|
93
|
+
if self.csv_data:
|
94
|
+
num_rows = len(self.csv_data)
|
95
|
+
num_cols = max(len(row) for row in self.csv_data)
|
96
|
+
|
97
|
+
table_data = TableData(
|
98
|
+
num_rows=num_rows,
|
99
|
+
num_cols=num_cols,
|
100
|
+
table_cells=[],
|
101
|
+
)
|
102
|
+
|
103
|
+
# Convert each cell to TableCell
|
104
|
+
for row_idx, row in enumerate(self.csv_data):
|
105
|
+
for col_idx, cell_value in enumerate(row):
|
106
|
+
cell = TableCell(
|
107
|
+
text=str(cell_value),
|
108
|
+
row_span=1, # CSV doesn't support merged cells
|
109
|
+
col_span=1,
|
110
|
+
start_row_offset_idx=row_idx,
|
111
|
+
end_row_offset_idx=row_idx + 1,
|
112
|
+
start_col_offset_idx=col_idx,
|
113
|
+
end_col_offset_idx=col_idx + 1,
|
114
|
+
col_header=row_idx == 0, # First row as header
|
115
|
+
row_header=False,
|
116
|
+
)
|
117
|
+
table_data.table_cells.append(cell)
|
118
|
+
|
119
|
+
doc.add_table(data=table_data)
|
120
|
+
else:
|
121
|
+
raise RuntimeError(
|
122
|
+
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
123
|
+
)
|
124
|
+
|
125
|
+
return doc
|
docling/cli/main.py
CHANGED
@@ -234,6 +234,12 @@ def convert(
|
|
234
234
|
Optional[Path],
|
235
235
|
typer.Option(..., help="If provided, the location of the model artifacts."),
|
236
236
|
] = None,
|
237
|
+
enable_remote_services: Annotated[
|
238
|
+
bool,
|
239
|
+
typer.Option(
|
240
|
+
..., help="Must be enabled when using models connecting to remote services."
|
241
|
+
),
|
242
|
+
] = False,
|
237
243
|
abort_on_error: Annotated[
|
238
244
|
bool,
|
239
245
|
typer.Option(
|
@@ -380,6 +386,7 @@ def convert(
|
|
380
386
|
|
381
387
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
382
388
|
pipeline_options = PdfPipelineOptions(
|
389
|
+
enable_remote_services=enable_remote_services,
|
383
390
|
accelerator_options=accelerator_options,
|
384
391
|
do_ocr=ocr,
|
385
392
|
ocr_options=ocr_options,
|
docling/datamodel/base_models.py
CHANGED
@@ -39,6 +39,7 @@ class InputFormat(str, Enum):
|
|
39
39
|
PDF = "pdf"
|
40
40
|
ASCIIDOC = "asciidoc"
|
41
41
|
MD = "md"
|
42
|
+
CSV = "csv"
|
42
43
|
XLSX = "xlsx"
|
43
44
|
XML_USPTO = "xml_uspto"
|
44
45
|
JSON_DOCLING = "json_docling"
|
@@ -61,6 +62,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
61
62
|
InputFormat.XML_PUBMED: ["xml", "nxml"],
|
62
63
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
63
64
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
65
|
+
InputFormat.CSV: ["csv"],
|
64
66
|
InputFormat.XLSX: ["xlsx"],
|
65
67
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
66
68
|
InputFormat.JSON_DOCLING: ["json"],
|
@@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
88
90
|
InputFormat.PDF: ["application/pdf"],
|
89
91
|
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
90
92
|
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
93
|
+
InputFormat.CSV: ["text/csv"],
|
91
94
|
InputFormat.XLSX: [
|
92
95
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
93
96
|
],
|
docling/datamodel/document.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
import csv
|
1
2
|
import logging
|
2
3
|
import re
|
3
4
|
from enum import Enum
|
@@ -296,6 +297,7 @@ class _DocumentConversionInput(BaseModel):
|
|
296
297
|
mime = _DocumentConversionInput._mime_from_extension(ext)
|
297
298
|
|
298
299
|
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
300
|
+
mime = mime or _DocumentConversionInput._detect_csv(content)
|
299
301
|
mime = mime or "text/plain"
|
300
302
|
formats = MimeTypeToFormat.get(mime, [])
|
301
303
|
if formats:
|
@@ -352,6 +354,8 @@ class _DocumentConversionInput(BaseModel):
|
|
352
354
|
mime = FormatToMimeType[InputFormat.HTML][0]
|
353
355
|
elif ext in FormatToExtensions[InputFormat.MD]:
|
354
356
|
mime = FormatToMimeType[InputFormat.MD][0]
|
357
|
+
elif ext in FormatToExtensions[InputFormat.CSV]:
|
358
|
+
mime = FormatToMimeType[InputFormat.CSV][0]
|
355
359
|
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
|
356
360
|
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
357
361
|
elif ext in FormatToExtensions[InputFormat.PDF]:
|
@@ -392,3 +396,32 @@ class _DocumentConversionInput(BaseModel):
|
|
392
396
|
return "application/xml"
|
393
397
|
|
394
398
|
return None
|
399
|
+
|
400
|
+
@staticmethod
|
401
|
+
def _detect_csv(
|
402
|
+
content: bytes,
|
403
|
+
) -> Optional[Literal["text/csv"]]:
|
404
|
+
"""Guess the mime type of a CSV file from its content.
|
405
|
+
|
406
|
+
Args:
|
407
|
+
content: A short piece of a document from its beginning.
|
408
|
+
|
409
|
+
Returns:
|
410
|
+
The mime type of a CSV file, or None if the content does
|
411
|
+
not match any of the format.
|
412
|
+
"""
|
413
|
+
content_str = content.decode("ascii", errors="ignore").strip()
|
414
|
+
|
415
|
+
# Ensure there's at least one newline (CSV is usually multi-line)
|
416
|
+
if "\n" not in content_str:
|
417
|
+
return None
|
418
|
+
|
419
|
+
# Use csv.Sniffer to detect CSV characteristics
|
420
|
+
try:
|
421
|
+
dialect = csv.Sniffer().sniff(content_str)
|
422
|
+
if dialect.delimiter in {",", ";", "\t", "|"}: # Common delimiters
|
423
|
+
return "text/csv"
|
424
|
+
except csv.Error:
|
425
|
+
return None
|
426
|
+
|
427
|
+
return None
|
docling/datamodel/settings.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
import sys
|
2
2
|
from pathlib import Path
|
3
|
-
from typing import Annotated, Tuple
|
3
|
+
from typing import Annotated, Optional, Tuple
|
4
4
|
|
5
5
|
from pydantic import BaseModel, PlainValidator
|
6
6
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
@@ -62,6 +62,7 @@ class AppSettings(BaseSettings):
|
|
62
62
|
debug: DebugSettings
|
63
63
|
|
64
64
|
cache_dir: Path = Path.home() / ".cache" / "docling"
|
65
|
+
artifacts_path: Optional[Path] = None
|
65
66
|
|
66
67
|
|
67
68
|
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
docling/document_converter.py
CHANGED
@@ -10,6 +10,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
|
10
10
|
|
11
11
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
12
12
|
from docling.backend.asciidoc_backend import AsciiDocBackend
|
13
|
+
from docling.backend.csv_backend import CsvDocumentBackend
|
13
14
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
14
15
|
from docling.backend.html_backend import HTMLDocumentBackend
|
15
16
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
@@ -61,6 +62,11 @@ class FormatOption(BaseModel):
|
|
61
62
|
return self
|
62
63
|
|
63
64
|
|
65
|
+
class CsvFormatOption(FormatOption):
|
66
|
+
pipeline_cls: Type = SimplePipeline
|
67
|
+
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
68
|
+
|
69
|
+
|
64
70
|
class ExcelFormatOption(FormatOption):
|
65
71
|
pipeline_cls: Type = SimplePipeline
|
66
72
|
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
@@ -113,6 +119,9 @@ class PdfFormatOption(FormatOption):
|
|
113
119
|
|
114
120
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
115
121
|
format_to_default_options = {
|
122
|
+
InputFormat.CSV: FormatOption(
|
123
|
+
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
|
124
|
+
),
|
116
125
|
InputFormat.XLSX: FormatOption(
|
117
126
|
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
118
127
|
),
|
docling/exceptions.py
CHANGED
docling/models/ds_glm_model.py
CHANGED
@@ -4,7 +4,12 @@ from pathlib import Path
|
|
4
4
|
from typing import List, Union
|
5
5
|
|
6
6
|
from deepsearch_glm.andromeda_nlp import nlp_model
|
7
|
-
from docling_core.types.doc import
|
7
|
+
from docling_core.types.doc import (
|
8
|
+
BoundingBox,
|
9
|
+
CoordOrigin,
|
10
|
+
DocItemLabel,
|
11
|
+
DoclingDocument,
|
12
|
+
)
|
8
13
|
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
9
14
|
from docling_core.types.legacy_doc.base import (
|
10
15
|
Figure,
|
@@ -71,12 +76,15 @@ class GlmModel:
|
|
71
76
|
)
|
72
77
|
|
73
78
|
main_text: List[Union[Ref, BaseText]] = []
|
79
|
+
page_headers: List[Union[Ref, BaseText]] = []
|
80
|
+
page_footers: List[Union[Ref, BaseText]] = []
|
81
|
+
|
74
82
|
tables: List[DsSchemaTable] = []
|
75
83
|
figures: List[Figure] = []
|
76
84
|
|
77
85
|
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
78
86
|
|
79
|
-
for element in conv_res.assembled.
|
87
|
+
for element in conv_res.assembled.body:
|
80
88
|
# Convert bboxes to lower-left origin.
|
81
89
|
target_bbox = DsBoundingBox(
|
82
90
|
element.cluster.bbox.to_bottom_left_origin(
|
@@ -238,6 +246,53 @@ class GlmModel:
|
|
238
246
|
)
|
239
247
|
)
|
240
248
|
|
249
|
+
# We can throw in headers and footers at the end of the legacy doc
|
250
|
+
# since the reading-order will re-sort it later.
|
251
|
+
for element in conv_res.assembled.headers:
|
252
|
+
# Convert bboxes to lower-left origin.
|
253
|
+
target_bbox = DsBoundingBox(
|
254
|
+
element.cluster.bbox.to_bottom_left_origin(
|
255
|
+
page_no_to_page[element.page_no].size.height
|
256
|
+
).as_tuple()
|
257
|
+
)
|
258
|
+
|
259
|
+
if isinstance(element, TextElement):
|
260
|
+
|
261
|
+
tel = BaseText(
|
262
|
+
text=element.text,
|
263
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
264
|
+
name=element.label,
|
265
|
+
prov=[
|
266
|
+
Prov(
|
267
|
+
bbox=target_bbox,
|
268
|
+
page=element.page_no + 1,
|
269
|
+
span=[0, len(element.text)],
|
270
|
+
)
|
271
|
+
],
|
272
|
+
)
|
273
|
+
if element.label == DocItemLabel.PAGE_HEADER:
|
274
|
+
index = len(page_headers)
|
275
|
+
ref_str = f"#/page-headers/{index}"
|
276
|
+
main_text.append(
|
277
|
+
Ref(
|
278
|
+
name=element.label,
|
279
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
280
|
+
ref=ref_str,
|
281
|
+
),
|
282
|
+
)
|
283
|
+
page_headers.append(tel)
|
284
|
+
elif element.label == DocItemLabel.PAGE_FOOTER:
|
285
|
+
index = len(page_footers)
|
286
|
+
ref_str = f"#/page-footers/{index}"
|
287
|
+
main_text.append(
|
288
|
+
Ref(
|
289
|
+
name=element.label,
|
290
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
291
|
+
ref=ref_str,
|
292
|
+
),
|
293
|
+
)
|
294
|
+
page_footers.append(tel)
|
295
|
+
|
241
296
|
page_dimensions = [
|
242
297
|
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
243
298
|
for p in conv_res.pages
|
@@ -252,6 +307,8 @@ class GlmModel:
|
|
252
307
|
tables=tables,
|
253
308
|
figures=figures,
|
254
309
|
page_dimensions=page_dimensions,
|
310
|
+
page_headers=page_headers,
|
311
|
+
page_footers=page_footers,
|
255
312
|
)
|
256
313
|
|
257
314
|
return ds_doc
|
@@ -264,6 +321,7 @@ class GlmModel:
|
|
264
321
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
265
322
|
|
266
323
|
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
324
|
+
1 == 1
|
267
325
|
|
268
326
|
# DEBUG code:
|
269
327
|
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
docling/models/picture_description_api_model.py
CHANGED
@@ -8,6 +8,7 @@ from PIL import Image
|
|
8
8
|
from pydantic import BaseModel, ConfigDict
|
9
9
|
|
10
10
|
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
|
11
|
+
from docling.exceptions import OperationNotAllowed
|
11
12
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
12
13
|
|
13
14
|
_log = logging.getLogger(__name__)
|
@@ -45,14 +46,20 @@ class ApiResponse(BaseModel):
|
|
45
46
|
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
46
47
|
# elements_batch_size = 4
|
47
48
|
|
48
|
-
def __init__(
|
49
|
+
def __init__(
|
50
|
+
self,
|
51
|
+
enabled: bool,
|
52
|
+
enable_remote_services: bool,
|
53
|
+
options: PictureDescriptionApiOptions,
|
54
|
+
):
|
49
55
|
super().__init__(enabled=enabled, options=options)
|
50
56
|
self.options: PictureDescriptionApiOptions
|
51
57
|
|
52
58
|
if self.enabled:
|
53
|
-
if
|
54
|
-
raise
|
55
|
-
"
|
59
|
+
if not enable_remote_services:
|
60
|
+
raise OperationNotAllowed(
|
61
|
+
"Connections to remote services is only allowed when set explicitly. "
|
62
|
+
"pipeline_options.enable_remote_services=True."
|
56
63
|
)
|
57
64
|
|
58
65
|
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
|
docling/models/tesseract_ocr_model.py
CHANGED
@@ -22,6 +22,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
22
22
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
23
|
self.reader = None
|
24
24
|
self.osd_reader = None
|
25
|
+
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
25
26
|
|
26
27
|
if self.enabled:
|
27
28
|
install_errmsg = (
|
@@ -57,8 +58,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|
57
58
|
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
58
59
|
lang = "+".join(self.options.lang)
|
59
60
|
|
60
|
-
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
61
|
-
|
62
61
|
if any([l.startswith("script/") for l in self._tesserocr_languages]):
|
63
62
|
self.script_prefix = "script/"
|
64
63
|
else:
|
docling/pipeline/standard_pdf_pipeline.py
CHANGED
@@ -61,6 +61,14 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
61
61
|
artifacts_path: Optional[Path] = None
|
62
62
|
if pipeline_options.artifacts_path is not None:
|
63
63
|
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
64
|
+
elif settings.artifacts_path is not None:
|
65
|
+
artifacts_path = Path(settings.artifacts_path).expanduser()
|
66
|
+
|
67
|
+
if artifacts_path is not None and not artifacts_path.is_dir():
|
68
|
+
raise RuntimeError(
|
69
|
+
f"The value of {artifacts_path=} is not valid. "
|
70
|
+
"When defined, it must point to a folder containing all models required by the pipeline."
|
71
|
+
)
|
64
72
|
|
65
73
|
self.keep_images = (
|
66
74
|
self.pipeline_options.generate_page_images
|
@@ -201,6 +209,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
201
209
|
):
|
202
210
|
return PictureDescriptionApiModel(
|
203
211
|
enabled=self.pipeline_options.do_picture_description,
|
212
|
+
enable_remote_services=self.pipeline_options.enable_remote_services,
|
204
213
|
options=self.pipeline_options.picture_description_options,
|
205
214
|
)
|
206
215
|
elif isinstance(
|
docling/utils/glm_utils.py
CHANGED
@@ -15,6 +15,7 @@ from docling_core.types.doc import (
|
|
15
15
|
TableCell,
|
16
16
|
TableData,
|
17
17
|
)
|
18
|
+
from docling_core.types.doc.document import ContentLayer
|
18
19
|
|
19
20
|
|
20
21
|
def resolve_item(paths, obj):
|
@@ -311,6 +312,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|
311
312
|
current_list = None
|
312
313
|
|
313
314
|
doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
|
315
|
+
elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
316
|
+
current_list = None
|
317
|
+
|
318
|
+
doc.add_text(
|
319
|
+
label=DocItemLabel(name_label),
|
320
|
+
text=text,
|
321
|
+
prov=prov,
|
322
|
+
content_layer=ContentLayer.FURNITURE,
|
323
|
+
)
|
314
324
|
else:
|
315
325
|
current_list = None
|
316
326
|
|
{docling-2.20.0.dist-info → docling-2.22.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.20.0
|
3
|
+
Version: 2.22.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -28,7 +28,7 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
|
29
29
|
Requires-Dist: certifi (>=2024.7.4)
|
30
30
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
31
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
31
|
+
Requires-Dist: docling-core[chunking] (>=2.18.0,<3.0.0)
|
32
32
|
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
33
33
|
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
34
34
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -41,7 +41,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
|
|
41
41
|
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
42
42
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
43
43
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
44
|
-
Requires-Dist: pillow (>=10.0.0,<
|
44
|
+
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
45
45
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
46
46
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
47
47
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
{docling-2.20.0.dist-info → docling-2.22.0.dist-info}/RECORD
CHANGED
@@ -2,6 +2,7 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
|
5
|
+
docling/backend/csv_backend.py,sha256=xuId4JGEXjoyPgO9Fy9hQ5C-ezXvJwv0TGB8fyFHgWM,4533
|
5
6
|
docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
|
6
7
|
docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
|
7
8
|
docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
|
@@ -18,51 +19,51 @@ docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-
|
|
18
19
|
docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
|
19
20
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
20
21
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
docling/cli/main.py,sha256=
|
22
|
+
docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
|
22
23
|
docling/cli/models.py,sha256=Z4IEuaXE9el5PuI6_6mR4D5Sn3y8WZzBtoIJPi6jL_s,3188
|
23
24
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
24
25
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
|
-
docling/datamodel/base_models.py,sha256=
|
26
|
-
docling/datamodel/document.py,sha256=
|
27
|
-
docling/datamodel/pipeline_options.py,sha256=
|
28
|
-
docling/datamodel/settings.py,sha256=
|
29
|
-
docling/document_converter.py,sha256=
|
30
|
-
docling/exceptions.py,sha256
|
26
|
+
docling/datamodel/base_models.py,sha256=_TPj-ADts3Qsc6vx1dpwZZnrOQCelqXOYIBCkK7A8FM,7107
|
27
|
+
docling/datamodel/document.py,sha256=Aeqpm7d_CCV_2mwMhvNGVeGPWtWN9DJ5WAE4sjqN-dw,14530
|
28
|
+
docling/datamodel/pipeline_options.py,sha256=pWCGtK0HEfltTR9Z14BYdS1-Zg6gZq9RlIHA014DpAk,9683
|
29
|
+
docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
|
30
|
+
docling/document_converter.py,sha256=DX_bMqYyVO6rQvpf2JEy95HDR1QXT51v3T3Xn40pwjE,13196
|
31
|
+
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
31
32
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
32
33
|
docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,2679
|
33
34
|
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
34
35
|
docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
|
35
36
|
docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
|
36
|
-
docling/models/ds_glm_model.py,sha256=
|
37
|
+
docling/models/ds_glm_model.py,sha256=1jLEM-B_oHFevKq23zDQpdifE3eJL7qiLr5YLpEf1kQ,15217
|
37
38
|
docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
|
38
39
|
docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
|
39
40
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
40
41
|
docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
|
41
42
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
42
|
-
docling/models/picture_description_api_model.py,sha256=
|
43
|
+
docling/models/picture_description_api_model.py,sha256=SKNoHpqzbfM8iO-DJJ4ccyNVqO0B2d9neLBnXqt50FY,3186
|
43
44
|
docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0ZxwO7yubhh2RkaC_8e8,1910
|
44
45
|
docling/models/picture_description_vlm_model.py,sha256=a2vYUdlcA0--_8neY0tTiU8reCf29NCbVMKwWdMy2QQ,3653
|
45
46
|
docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
|
46
47
|
docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
|
47
48
|
docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
|
48
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
49
|
+
docling/models/tesseract_ocr_model.py,sha256=ikGu6QNknLG64c9yYIb0Ix6MGhBzOoa1ODbNc8MT5r8,8508
|
49
50
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
51
|
docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
|
51
52
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
52
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
53
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=Zoe8GGPujha16_TGYBAxcPriEwgYPaJPkp3BwG5XowU,12862
|
53
54
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
54
55
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
55
56
|
docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
|
56
57
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
57
|
-
docling/utils/glm_utils.py,sha256=
|
58
|
+
docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
|
58
59
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
59
60
|
docling/utils/model_downloader.py,sha256=XK3ozGXyQcNPvrSsevTwR9VnY41JWovlsGk_ZBnu6FU,2787
|
60
61
|
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
61
62
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
62
63
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
63
64
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
64
|
-
docling-2.20.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
65
|
-
docling-2.
|
66
|
-
docling-2.20.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
67
|
-
docling-2.20.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
68
|
-
docling-2.20.0.dist-info/RECORD,,
|
65
|
+
docling-2.22.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
66
|
+
docling-2.22.0.dist-info/METADATA,sha256=eKFbLHbqOA9xMt4c0Pdqwh7tVBOXSqdSWh_MP4ztkeU,8720
|
67
|
+
docling-2.22.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
68
|
+
docling-2.22.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
69
|
+
docling-2.22.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|