docling-core 1.7.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +12 -8
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1288 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/METADATA +10 -10
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
- docling_core-2.0.0.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.2.dist-info/entry_points.txt +0 -5
- docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Models for the Docling Document data type."""
|
|
7
|
+
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from typing import Dict, Generic, Optional, Union
|
|
10
|
+
|
|
11
|
+
from pydantic import (
|
|
12
|
+
AnyHttpUrl,
|
|
13
|
+
BaseModel,
|
|
14
|
+
Field,
|
|
15
|
+
NonNegativeInt,
|
|
16
|
+
StrictStr,
|
|
17
|
+
model_validator,
|
|
18
|
+
)
|
|
19
|
+
from tabulate import tabulate
|
|
20
|
+
|
|
21
|
+
from docling_core.search.mapping import es_field
|
|
22
|
+
from docling_core.types.base import (
|
|
23
|
+
Acquisition,
|
|
24
|
+
CollectionDocumentInfo,
|
|
25
|
+
CollectionNameTypeT,
|
|
26
|
+
DescriptionAdvancedT,
|
|
27
|
+
DescriptionAnalyticsT,
|
|
28
|
+
FileInfoObject,
|
|
29
|
+
Identifier,
|
|
30
|
+
IdentifierTypeT,
|
|
31
|
+
LanguageT,
|
|
32
|
+
Log,
|
|
33
|
+
)
|
|
34
|
+
from docling_core.types.legacy_doc.base import (
|
|
35
|
+
BaseCell,
|
|
36
|
+
BaseText,
|
|
37
|
+
BitmapObject,
|
|
38
|
+
Figure,
|
|
39
|
+
PageDimensions,
|
|
40
|
+
PageReference,
|
|
41
|
+
Ref,
|
|
42
|
+
S3Data,
|
|
43
|
+
Table,
|
|
44
|
+
)
|
|
45
|
+
from docling_core.types.legacy_doc.tokens import DocumentToken
|
|
46
|
+
from docling_core.utils.alias import AliasModel
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class CCSFileInfoDescription(BaseModel, extra="forbid"):
    """Descriptive metadata attached to a file info object.

    Every field is optional; keys other than the ones declared here are
    rejected (``extra="forbid"``).
    """

    # Author names; each entry has to be a strict string.
    author: Optional[list[StrictStr]] = None
    keywords: Optional[str] = None
    subject: Optional[str] = None
    title: Optional[StrictStr] = None
    # Stored as a plain string; the value is meant to hold a datetime.
    creation_date: Optional[str] = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class CCSFileInfoObject(FileInfoObject, extra="forbid"):
    """File info object extended with page and collection details.

    Adds optional fields on top of ``FileInfoObject``; undeclared keys are
    rejected (``extra="forbid"``).
    """

    # Exposed under the alias "#-pages".
    num_pages: Optional[int] = Field(default=None, alias="#-pages")

    # Exposed under the alias "collection-name".
    collection_name: Optional[str] = Field(
        default=None,
        alias="collection-name",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # NOTE(review): es_field(suppress=True) presumably excludes this field
    # from the search mapping -- confirm in docling_core.search.mapping.
    description: Optional[CCSFileInfoDescription] = Field(
        default=None,
        json_schema_extra=es_field(suppress=True),
    )
    # Exposed under the alias "page-hashes".
    page_hashes: Optional[list[PageReference]] = Field(
        default=None,
        alias="page-hashes",
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class Affiliation(BaseModel, extra="forbid"):
    """Affiliation of an author or of a document.

    Only ``name`` is required; undeclared keys are rejected
    (``extra="forbid"``).
    """

    # Required. The es_field hints declare two sub-fields, "lower" and
    # "keyword", both of type keyword.
    name: str = Field(
        ...,
        json_schema_extra=es_field(
            fields={
                "lower": {
                    "normalizer": "lowercase_asciifolding",
                    "type": "keyword",
                    "ignore_above": 8191,
                },
                "keyword": {"type": "keyword", "ignore_above": 8191},
            },
        ),
    )
    id: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    source: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class Author(BaseModel, extra="forbid"):
    """Author of a document.

    Only ``name`` is required; undeclared keys are rejected
    (``extra="forbid"``).
    """

    # Required. Declared as type "text" with two keyword sub-fields,
    # "lower" and "keyword".
    name: str = Field(
        ...,
        json_schema_extra=es_field(
            type="text",
            fields={
                "lower": {
                    "normalizer": "lowercase_asciifolding",
                    "type": "keyword",
                    "ignore_above": 8191,
                },
                "keyword": {"type": "keyword", "ignore_above": 8191},
            },
        ),
    )
    id: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    source: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Affiliations of this particular author.
    affiliations: Optional[list[Affiliation]] = None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
    """Publication details of a journal or venue.

    Generic over the identifier type used in ``identifiers``. Only ``name``
    is required; undeclared keys are rejected (``extra="forbid"``).
    """

    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
        default=None,
        description="Unique identifiers of a publication venue.",
    )
    # Required: no default is given.
    name: StrictStr = Field(
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
        description="Name of the publication.",
    )
    alternate_names: Optional[list[StrictStr]] = Field(
        default=None,
        json_schema_extra=es_field(type="text"),
        title="Alternate Names",
        description="Other names or abbreviations of this publication.",
    )
    type: Optional[list[StrictStr]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
        description="Type of publication (journal article, conference, review,...).",
    )
    pages: Optional[StrictStr] = Field(
        default=None,
        json_schema_extra=es_field(type="text"),
        description="Page range in the publication.",
    )
    issue: Optional[StrictStr] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
        description="Publication issue (issue number).",
    )
    volume: Optional[StrictStr] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
        description="Publication volume.",
    )
    url: Optional[AnyHttpUrl] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
        description="URL on the publication site.",
    )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class DescriptionLicense(BaseModel, extra="forbid"):
    """License entry of a document description.

    Both fields are optional; undeclared keys are rejected
    (``extra="forbid"``).
    """

    code: Optional[StrictStr] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    text: Optional[StrictStr] = None
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class CCSDocumentDescription(
    AliasModel,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Description section of a document.

    Generic over the types of the ``advanced`` and ``analytics`` payloads,
    the identifier type, the language type and the collection-name type.
    ``logs`` is the only required field.
    """

    # --- bibliographic information -------------------------------------
    title: Optional[StrictStr] = None
    abstract: Optional[list[StrictStr]] = None
    authors: Optional[list[Author]] = None
    affiliations: Optional[list[Affiliation]] = None
    subjects: Optional[list[str]] = Field(
        default=None,
        json_schema_extra=es_field(
            fields={"keyword": {"ignore_above": 8191, "type": "keyword"}},
        ),
    )
    keywords: Optional[list[str]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    publication_date: Optional[datetime] = None
    languages: Optional[list[LanguageT]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Trailing underscore on the attribute; exposed under the alias "license".
    license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
    publishers: Optional[list[StrictStr]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    url_refs: Optional[list[str]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    references: Optional[list[Identifier[IdentifierTypeT]]] = None
    publication: Optional[list[Publication]] = Field(
        default=None,
        description="List of publication journals or venues.",
    )

    # --- citation statistics -------------------------------------------
    reference_count: Optional[NonNegativeInt] = Field(
        default=None,
        title="Reference Count",
        description="Total number of documents referenced by this document.",
        json_schema_extra=es_field(type="integer"),
    )
    citation_count: Optional[NonNegativeInt] = Field(
        default=None,
        title="Citation Count",
        description=(
            "Total number of citations that this document has received (number "
            "of documents in whose bibliography this document appears)."
        ),
        json_schema_extra=es_field(type="integer"),
    )
    citation_date: Optional[datetime] = Field(
        default=None,
        title="Citation Count Date",
        description="Last update date of the citation count.",
    )

    # --- custom payloads and provenance --------------------------------
    advanced: Optional[DescriptionAdvancedT] = None
    analytics: Optional[DescriptionAnalyticsT] = None
    # Required: the only field of this model without a default.
    logs: list[Log]
    collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
        default=None,
        description="The collection information of this document.",
    )
    acquisition: Optional[Acquisition] = Field(
        default=None,
        description=(
            "Information on how the document was obtained, for data governance"
            " purposes."
        ),
    )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
class MinimalDocument(
    AliasModel,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Minimal model for a document.

    Holds the name, description and file info (all required) together with
    the optional main text, figures and tables. The type parameters are
    forwarded to ``CCSDocumentDescription``.
    """

    # Required; exposed under the alias "_name".
    name: StrictStr = Field(alias="_name")
    # Exposed under the alias "type"; defaults to "document".
    obj_type: Optional[StrictStr] = Field(default="document", alias="type")
    description: CCSDocumentDescription[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ]
    # Required; exposed under the alias "file-info".
    file_info: FileInfoObject = Field(alias="file-info")
    # Exposed under the alias "main-text"; entries are either inline text
    # items or references (Ref) to other objects.
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None,
        alias="main-text",
    )
    figures: Optional[list[Figure]] = None
    tables: Optional[list[Table]] = None
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class CCSDocument(
    MinimalDocument,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Model for a CCS-generated document.

    Extends ``MinimalDocument`` with bitmaps, equations, footnotes, page
    dimensions, page headers/footers and S3 data. Raw dictionaries are
    normalized by ``from_dict`` before field validation.
    """

    # Exposed under the alias "type"; defaults to "pdf-document".
    obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
    bitmaps: Optional[list[BitmapObject]] = None
    equations: Optional[list[BaseCell]] = None
    footnotes: Optional[list[BaseText]] = None
    file_info: CCSFileInfoObject = Field(alias="file-info")
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None,
        alias="main-text",
    )
    page_dimensions: Optional[list[PageDimensions]] = Field(
        default=None, alias="page-dimensions"
    )
    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")

    @model_validator(mode="before")
    @classmethod
    def from_dict(cls, data):
        """Validate and fix the raw input data before field validation.

        The input dictionary is modified in place:

        - ``description.collection`` is created when missing or empty and its
          ``type`` defaults to ``"Document"``;
        - ``description.logs`` is set to an empty list when missing or empty;
        - a string ``description.abstract`` is wrapped into a list, any other
          non-list value is dropped;
        - a dict in ``description.affiliations`` / ``description.authors`` is
          wrapped into a list, any other non-list value is dropped;
        - a ``"__ref"`` key of a ``main-text`` item is renamed to ``"$ref"``.

        Args:
            data: The raw input; anything that is not a dict is returned
                unchanged.

        Returns:
            The (possibly modified) input data.
        """
        if not isinstance(data, dict):
            return data

        # A missing or non-dict description is left untouched so that the
        # regular field validation reports it, instead of failing here with
        # a raw KeyError / AttributeError.
        description = data.get("description")
        if isinstance(description, dict):
            # Plain assignment rather than ``setdefault``: a key that is
            # present but holds None (or another falsy value) must be
            # replaced as well.
            if not description.get("collection"):
                description["collection"] = {}
            description["collection"].setdefault("type", "Document")

            if not description.get("logs"):
                description["logs"] = []

            abstract = description.get("abstract")
            if abstract is not None and not isinstance(abstract, list):
                if isinstance(abstract, str):
                    description["abstract"] = [abstract]
                else:
                    description.pop("abstract")

            for key in ("affiliations", "authors"):
                value = description.get(key)
                if value is not None and not isinstance(value, list):
                    if isinstance(value, dict):
                        description[key] = [value]
                    else:
                        description.pop(key)

        for item in data.get("main-text") or []:
            # Only raw dict items can carry the "__ref" key; already built
            # model instances are passed through as they are.
            if isinstance(item, dict) and (ref := item.pop("__ref", None)):
                item["$ref"] = ref

        return data
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
class ExportedCCSDocument(
    MinimalDocument,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Document model for Docling.

    Extends ``MinimalDocument`` with bitmaps, equations, footnotes, page
    information, S3 data and identifiers, and provides exports to a
    dictionary, to Markdown and to the document-token format.
    """

    # Exposed under the alias "type"; defaults to "pdf-document".
    obj_type: Optional[StrictStr] = Field(
        "pdf-document",
        alias="type",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    bitmaps: Optional[list[BitmapObject]] = None
    equations: Optional[list[BaseCell]] = None
    footnotes: Optional[list[BaseText]] = None
    description: CCSDocumentDescription[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ]
    file_info: CCSFileInfoObject = Field(alias="file-info")
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None, alias="main-text"
    )
    page_dimensions: Optional[list[PageDimensions]] = Field(
        default=None, alias="page-dimensions"
    )
    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None

    @model_validator(mode="before")
    @classmethod
    def from_dict(cls, data):
        """Fix ref in main-text.

        Renames the ``"__ref"`` key of every raw ``main-text`` item to
        ``"$ref"``. The input dictionary is modified in place; anything that
        is not a dict is returned unchanged.
        """
        if not isinstance(data, dict):
            return data

        for item in data.get("main-text") or []:
            # Only raw dict items can carry the "__ref" key; already built
            # model instances are passed through as they are.
            if isinstance(item, dict) and (ref := item.pop("__ref", None)):
                item["$ref"] = ref

        return data

    def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
        """Return the resolved reference.

        Resolves the Ref object within the document. The third ``/``-separated
        part of ``item.ref`` is used as the index into the collection selected
        by ``item.obj_type`` (tables, figures, equations or footnotes).

        Returns:
            The referenced object, or None if it is not found: unsupported
            object type, empty collection, malformed ref path or index
            outside of the collection.
        """
        result: Optional[Union[BaseCell, BaseText]] = None

        # NOTE: currently only resolves refs explicitly, such that we can make
        # assumptions on ref parts
        try:
            index = int(item.ref.split("/")[2])
            if item.obj_type == "table" and self.tables:
                result = self.tables[index]
            elif item.obj_type == "figure" and self.figures:
                result = self.figures[index]
            elif item.obj_type == "equation" and self.equations:
                result = self.equations[index]
            elif item.obj_type == "footnote" and self.footnotes:
                result = self.footnotes[index]
        except (IndexError, ValueError):
            # Honor the documented contract: a dangling or malformed
            # reference is "not found" rather than an error that aborts the
            # whole export.
            result = None

        return result

    def _iter_main_text(self, start: int = 0, stop: Optional[int] = None):
        """Yield the main-text items of the slice [start, stop), refs resolved.

        References that cannot be resolved are skipped. Nothing is yielded
        when the document has no main text.
        """
        if self.main_text is None:
            return

        for orig_item in self.main_text[start:stop]:
            item = (
                self._resolve_ref(orig_item)
                if isinstance(orig_item, Ref)
                else orig_item
            )
            if item is not None:
                yield item

    def get_map_to_page_dimensions(self):
        """Get a map from page-index (start at 1) to page-dim [width, height]."""
        if self.page_dimensions is None:
            return {}

        return {dim.page: [dim.width, dim.height] for dim in self.page_dimensions}

    def export_to_dict(self) -> Dict:
        """Export the document as a JSON-compatible dictionary.

        Aliases are used as keys and fields that are None are left out.
        """
        return self.model_dump(mode="json", by_alias=True, exclude_none=True)

    def export_to_markdown(  # noqa: C901
        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: list[str] = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ],
        strict_text: bool = False,
        image_placeholder: str = "<!-- image -->",
    ) -> str:
        r"""Serialize to Markdown.

        Operates on a slice of the document's main_text as defined through arguments
        main_text_start and main_text_stop; defaulting to the whole main_text.

        Args:
            delim (str, optional): Delimiter to use when concatenating the various
                Markdown parts. Defaults to "\n\n".
            main_text_start (int, optional): Main-text slicing start index (inclusive).
                Defaults to 0.
            main_text_stop (Optional[int], optional): Main-text slicing stop index
                (exclusive). Defaults to None.
            main_text_labels (list[str], optional): The labels to include in the
                markdown. The default list is only read, never modified.
            strict_text (bool, optional): if true, the output will be only plain text
                without any markdown styling. Defaults to False.
            image_placeholder (str, optional): the placeholder to include to position
                images in the markdown. Defaults to a markdown comment "<!-- image -->".

        Returns:
            str: The exported Markdown representation.
        """
        has_title = False
        prev_text = ""
        md_texts: list[str] = []

        # collect all captions embedded in table and figure objects
        # to avoid repeating them
        embedded_captions = {
            item.text
            for item in self._iter_main_text(main_text_start, main_text_stop)
            if isinstance(item, (Table, Figure))
            and item.text
            and item.obj_type in main_text_labels
        }

        # serialize document to markdown
        for item in self._iter_main_text(main_text_start, main_text_stop):
            markdown_text = ""
            item_type = item.obj_type

            if isinstance(item, BaseText) and item_type in main_text_labels:
                text = item.text

                # skip captions if they are embedded in the actual
                # floating object
                if item_type == "caption" and text in embedded_captions:
                    continue

                # ignore repeated text
                if prev_text == text or text is None:
                    continue
                prev_text = text

                if item_type == "title" and not has_title:
                    # first title match
                    markdown_text = f"{text}" if strict_text else f"# {text}"
                    has_title = True
                elif item_type in {"title", "subtitle-level-1"}:
                    # secondary titles (including any title after the first)
                    markdown_text = f"{text}" if strict_text else f"## {text}"
                else:
                    # normal text
                    markdown_text = text

            elif (
                isinstance(item, Table)
                and item.data
                and item_type in main_text_labels
            ):
                table = [[cell.text for cell in row] for row in item.data]

                # The first row serves as header, so a table is only rendered
                # when at least one more row follows it.
                md_table = ""
                if len(table) > 1 and len(table[0]) > 0:
                    try:
                        md_table = tabulate(
                            table[1:], headers=table[0], tablefmt="github"
                        )
                    except ValueError:
                        # Retry without number parsing when tabulate fails
                        # on the cell contents.
                        md_table = tabulate(
                            table[1:],
                            headers=table[0],
                            tablefmt="github",
                            disable_numparse=True,
                        )

                markdown_text = item.text or ""
                if not strict_text:
                    markdown_text += "\n\n" + md_table

            elif isinstance(item, Figure) and item_type in main_text_labels:
                markdown_text = item.text or ""
                if not strict_text:
                    markdown_text += f"\n{image_placeholder}"

            if markdown_text:
                md_texts.append(markdown_text)

        return delim.join(md_texts)

    def export_to_document_tokens(
        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: list[str] = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ],
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_content: bool = True,
        add_page_index: bool = True,
        # table specific flags
        add_table_cell_location: bool = False,
        add_table_cell_label: bool = True,
        add_table_cell_text: bool = True,
    ) -> str:
        r"""Exports the document content to an DocumentToken format.

        Operates on a slice of the document's main_text as defined through arguments
        main_text_start and main_text_stop; defaulting to the whole main_text.

        Args:
            delim (str, optional): Only its truthiness is used: if non-empty,
                "\n" is used as line separator, otherwise no separator is added.
            main_text_start (int, optional): Main-text slicing start index (inclusive).
                Defaults to 0.
            main_text_stop (Optional[int], optional): Main-text slicing stop index
                (exclusive). Defaults to None.
            main_text_labels (list[str], optional): The labels to include in the
                export. The default list is only read, never modified.
            xsize (int, optional): Forwarded to the item exporters.
            ysize (int, optional): Forwarded to the item exporters.
            add_location (bool, optional): Forwarded to the item exporters; page
                dimensions are only looked up when this is true.
            add_content (bool, optional): Forwarded to the item exporters.
            add_page_index (bool, optional): Forwarded to the item exporters.
            add_table_cell_location (bool, optional): Forwarded to the table
                exporter as add_cell_location.
            add_table_cell_label (bool, optional): Forwarded to the table
                exporter as add_cell_label.
            add_table_cell_text (bool, optional): Forwarded to the table
                exporter as add_cell_text.

        Returns:
            str: The content of the document formatted as a DocTags string.
        """
        new_line = "\n" if delim else ""

        # Parts are collected in a list and joined once at the end.
        parts: list[str] = [f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"]

        for item in self._iter_main_text(main_text_start, main_text_stop):
            prov = item.prov

            page_w = 0.0
            page_h = 0.0

            if (
                add_location
                and self.page_dimensions is not None
                and prov is not None
                and len(prov) > 0
            ):
                # NOTE(review): positional lookup, which assumes that
                # page_dimensions is ordered by page number starting at 1 --
                # confirm; get_map_to_page_dimensions() would look the page
                # up by its number instead.
                page_dim = self.page_dimensions[prov[0].page - 1]

                page_w = float(page_dim.width)
                page_h = float(page_dim.height)

            item_type = item.obj_type
            if isinstance(item, BaseText) and (item_type in main_text_labels):
                parts.append(
                    item.export_to_document_tokens(
                        new_line=new_line,
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        add_location=add_location,
                        add_content=add_content,
                        add_page_index=add_page_index,
                    )
                )

            elif isinstance(item, Table) and (item_type in main_text_labels):
                parts.append(
                    item.export_to_document_tokens(
                        new_line=new_line,
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        add_caption=True,
                        add_location=add_location,
                        add_content=add_content,
                        add_cell_location=add_table_cell_location,
                        add_cell_label=add_table_cell_label,
                        add_cell_text=add_table_cell_text,
                        add_page_index=add_page_index,
                    )
                )

            elif isinstance(item, Figure) and (item_type in main_text_labels):
                parts.append(
                    item.export_to_document_tokens(
                        new_line=new_line,
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        add_caption=True,
                        add_location=add_location,
                        add_content=add_content,
                        add_page_index=add_page_index,
                    )
                )

        parts.append(DocumentToken.END_DOCUMENT.value)

        return "".join(parts)
|