docling-core 1.7.2__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (36)
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +12 -8
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1288 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/METADATA +10 -10
  20. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
  21. docling_core-2.0.0.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.2.dist-info/entry_points.txt +0 -5
  34. docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,715 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Models for the Docling Document data type."""
7
+
8
+ from datetime import datetime
9
+ from typing import Dict, Generic, Optional, Union
10
+
11
+ from pydantic import (
12
+ AnyHttpUrl,
13
+ BaseModel,
14
+ Field,
15
+ NonNegativeInt,
16
+ StrictStr,
17
+ model_validator,
18
+ )
19
+ from tabulate import tabulate
20
+
21
+ from docling_core.search.mapping import es_field
22
+ from docling_core.types.base import (
23
+ Acquisition,
24
+ CollectionDocumentInfo,
25
+ CollectionNameTypeT,
26
+ DescriptionAdvancedT,
27
+ DescriptionAnalyticsT,
28
+ FileInfoObject,
29
+ Identifier,
30
+ IdentifierTypeT,
31
+ LanguageT,
32
+ Log,
33
+ )
34
+ from docling_core.types.legacy_doc.base import (
35
+ BaseCell,
36
+ BaseText,
37
+ BitmapObject,
38
+ Figure,
39
+ PageDimensions,
40
+ PageReference,
41
+ Ref,
42
+ S3Data,
43
+ Table,
44
+ )
45
+ from docling_core.types.legacy_doc.tokens import DocumentToken
46
+ from docling_core.utils.alias import AliasModel
47
+
48
+
49
class CCSFileInfoDescription(BaseModel, extra="forbid"):
    """Descriptive metadata attached to a file info record.

    All fields are optional and default to ``None``; keys that are not
    declared here are rejected (``extra="forbid"``).
    """

    author: Optional[list[StrictStr]] = Field(default=None)
    keywords: Optional[str] = Field(default=None)
    subject: Optional[str] = Field(default=None)
    title: Optional[StrictStr] = Field(default=None)
    # Stored as a plain string; the original source annotates this field with
    # the remark "datetime" -- presumably a textual date/time, TODO confirm.
    creation_date: Optional[str] = Field(default=None)
57
+
58
+
59
class CCSFileInfoObject(FileInfoObject, extra="forbid"):
    """File info record extended with page and collection details.

    Builds on ``FileInfoObject``; every field added here is optional and
    undeclared keys are rejected (``extra="forbid"``).
    """

    # Aliased as "#-pages".
    num_pages: Optional[int] = Field(None, alias="#-pages")

    # Aliased as "collection-name".
    collection_name: Optional[str] = Field(
        None,
        alias="collection-name",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    description: Optional[CCSFileInfoDescription] = Field(
        None,
        json_schema_extra=es_field(suppress=True),
    )
    # Aliased as "page-hashes".
    page_hashes: Optional[list[PageReference]] = Field(None, alias="page-hashes")
75
+
76
+
77
class Affiliation(BaseModel, extra="forbid"):
    """Affiliation record: a required name plus optional id and source.

    Undeclared keys are rejected (``extra="forbid"``).
    """

    # Required field; the schema extras declare "lower" and "keyword"
    # sub-fields through es_field.
    name: str = Field(
        json_schema_extra=es_field(
            fields={
                "lower": {
                    "normalizer": "lowercase_asciifolding",
                    "type": "keyword",
                    "ignore_above": 8191,
                },
                "keyword": {"type": "keyword", "ignore_above": 8191},
            },
        ),
    )
    id: Optional[str] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    source: Optional[str] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
99
+
100
+
101
class Author(BaseModel, extra="forbid"):
    """Author record: a required name plus optional id, source, affiliations.

    Undeclared keys are rejected (``extra="forbid"``).
    """

    # Required field; declared as a "text" type with "lower" and "keyword"
    # sub-fields through es_field.
    name: str = Field(
        json_schema_extra=es_field(
            type="text",
            fields={
                "lower": {
                    "normalizer": "lowercase_asciifolding",
                    "type": "keyword",
                    "ignore_above": 8191,
                },
                "keyword": {"type": "keyword", "ignore_above": 8191},
            },
        ),
    )
    id: Optional[str] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    source: Optional[str] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    affiliations: Optional[list[Affiliation]] = Field(default=None)
125
+
126
+
127
class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
    """Publication details of a journal or venue.

    Only ``name`` is required; every other field defaults to ``None``.
    Undeclared keys are rejected (``extra="forbid"``).
    """

    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
        None,
        description="Unique identifiers of a publication venue.",
    )
    # Required: no default is given.
    name: StrictStr = Field(
        description="Name of the publication.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    alternate_names: Optional[list[StrictStr]] = Field(
        None,
        title="Alternate Names",
        description="Other names or abbreviations of this publication.",
        json_schema_extra=es_field(type="text"),
    )
    type: Optional[list[StrictStr]] = Field(
        None,
        description="Type of publication (journal article, conference, review,...).",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    pages: Optional[StrictStr] = Field(
        None,
        description="Page range in the publication.",
        json_schema_extra=es_field(type="text"),
    )
    issue: Optional[StrictStr] = Field(
        None,
        description="Publication issue (issue number).",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    volume: Optional[StrictStr] = Field(
        None,
        description="Publication volume.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    url: Optional[AnyHttpUrl] = Field(
        None,
        description="URL on the publication site.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
169
+
170
+
171
class DescriptionLicense(BaseModel, extra="forbid"):
    """Licence entry of a document description.

    Holds an optional licence ``code`` and optional licence ``text``;
    undeclared keys are rejected (``extra="forbid"``).
    """

    code: Optional[StrictStr] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    text: Optional[StrictStr] = Field(default=None)
178
+
179
+
180
class CCSDocumentDescription(
    AliasModel,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Description section of a document.

    Generic over the types used for the ``advanced`` and ``analytics``
    payloads, identifiers, languages and collection names. ``logs`` is the
    only required field; all others default to ``None``.
    """

    title: Optional[StrictStr] = Field(default=None)
    abstract: Optional[list[StrictStr]] = Field(default=None)
    authors: Optional[list[Author]] = Field(default=None)
    affiliations: Optional[list[Affiliation]] = Field(default=None)
    subjects: Optional[list[str]] = Field(
        None,
        json_schema_extra=es_field(
            fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
        ),
    )
    keywords: Optional[list[str]] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    publication_date: Optional[datetime] = Field(default=None)
    languages: Optional[list[LanguageT]] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Trailing underscore in the attribute name; aliased as "license".
    license_: Optional[DescriptionLicense] = Field(None, alias="license")
    publishers: Optional[list[StrictStr]] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    url_refs: Optional[list[str]] = Field(
        None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    references: Optional[list[Identifier[IdentifierTypeT]]] = Field(default=None)
    publication: Optional[list[Publication]] = Field(
        None,
        description="List of publication journals or venues.",
    )
    reference_count: Optional[NonNegativeInt] = Field(
        None,
        title="Reference Count",
        description="Total number of documents referenced by this document.",
        json_schema_extra=es_field(type="integer"),
    )
    citation_count: Optional[NonNegativeInt] = Field(
        None,
        title="Citation Count",
        description=(
            "Total number of citations that this document has received (number "
            "of documents in whose bibliography this document appears)."
        ),
        json_schema_extra=es_field(type="integer"),
    )
    citation_date: Optional[datetime] = Field(
        None,
        title="Citation Count Date",
        description="Last update date of the citation count.",
    )
    advanced: Optional[DescriptionAdvancedT] = Field(default=None)
    analytics: Optional[DescriptionAnalyticsT] = Field(default=None)
    # Required: declared without a default.
    logs: list[Log]
    collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
        None,
        description="The collection information of this document.",
    )
    acquisition: Optional[Acquisition] = Field(
        None,
        description=(
            "Information on how the document was obtained, for data governance"
            " purposes."
        ),
    )
253
+
254
+
255
class MinimalDocument(
    AliasModel,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Minimal model for a document.

    Requires a name, a description and a file info record; the main text,
    figures and tables are optional.
    """

    # Required; aliased as "_name".
    name: StrictStr = Field(alias="_name")
    # Aliased as "type"; defaults to "document".
    obj_type: Optional[StrictStr] = Field(default="document", alias="type")
    # Required.
    description: CCSDocumentDescription[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ]
    # Required; aliased as "file-info".
    file_info: FileInfoObject = Field(alias="file-info")
    # Aliased as "main-text"; entries are references or text items.
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(None, alias="main-text")
    figures: Optional[list[Figure]] = Field(default=None)
    tables: Optional[list[Table]] = Field(default=None)
282
+
283
+
284
class CCSDocument(
    MinimalDocument,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Model for a CCS-generated document.

    Extends ``MinimalDocument`` with bitmaps, equations, footnotes, page
    dimensions, page headers/footers and S3 data, and normalizes raw dict
    input before validation (see ``from_dict``).
    """

    # Aliased as "type"; defaults to "pdf-document".
    obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
    bitmaps: Optional[list[BitmapObject]] = None
    equations: Optional[list[BaseCell]] = None
    footnotes: Optional[list[BaseText]] = None
    # Required; narrows the inherited field to CCSFileInfoObject.
    file_info: CCSFileInfoObject = Field(alias="file-info")
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None,
        alias="main-text",
    )
    page_dimensions: Optional[list[PageDimensions]] = Field(
        default=None, alias="page-dimensions"
    )
    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")

    @model_validator(mode="before")
    @classmethod
    def from_dict(cls, data):
        """Validate and fix the raw input data before field validation.

        Applied only when ``data`` is a dict; any other input is returned
        untouched. The dict (and its nested ``description`` / ``main-text``
        entries) is modified in place:

        * ``description.collection`` is created when missing or ``None`` and
          receives ``type="Document"`` unless a type is already set.
        * ``description.logs`` is set to an empty list when missing or
          ``None``.
        * ``description.abstract`` given as a string is wrapped in a list;
          any other non-list value is dropped.
        * ``description.affiliations`` / ``description.authors`` given as a
          single dict are wrapped in a list; any other non-list value is
          dropped.
        * A truthy ``__ref`` key of a ``main-text`` entry is renamed to
          ``$ref`` (a falsy one is simply removed).

        A missing or non-dict ``description`` is left as-is so that regular
        field validation reports it, rather than failing here with a
        ``KeyError`` or ``AttributeError``.

        Args:
            data: The raw value handed to the model for validation.

        Returns:
            The (possibly modified) input value.
        """
        if not isinstance(data, dict):
            return data

        description = data.get("description")
        if isinstance(description, dict):
            collection = description.get("collection")
            if collection is None:
                # Assign directly: setdefault() would keep an explicit None.
                collection = description["collection"] = {}
            if isinstance(collection, dict):
                collection.setdefault("type", "Document")

            if description.get("logs") is None:
                description["logs"] = []

            abstract = description.get("abstract")
            if abstract is not None and not isinstance(abstract, list):
                if isinstance(abstract, str):
                    description["abstract"] = [abstract]
                else:
                    description.pop("abstract")

            for key in ("affiliations", "authors"):
                value = description.get(key)
                if value is not None and not isinstance(value, list):
                    if isinstance(value, dict):
                        description[key] = [value]
                    else:
                        description.pop(key)

        for item in data.get("main-text") or []:
            # Entries that are not plain dicts (e.g. already-built models)
            # have no "__ref" key to rename.
            if isinstance(item, dict) and (ref := item.pop("__ref", None)):
                item["$ref"] = ref

        return data
348
+
349
+
350
class ExportedCCSDocument(
    MinimalDocument,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Document model for Docling.

    Extends ``MinimalDocument`` with bitmaps, equations, footnotes, page
    data, S3 data and identifiers, and offers exports to a dict, to Markdown
    and to a DocTags string.
    """

    # Aliased as "type"; defaults to "pdf-document".
    obj_type: Optional[StrictStr] = Field(
        "pdf-document",
        alias="type",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    bitmaps: Optional[list[BitmapObject]] = None
    equations: Optional[list[BaseCell]] = None
    footnotes: Optional[list[BaseText]] = None
    description: CCSDocumentDescription[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ]
    # Required; narrows the inherited field to CCSFileInfoObject.
    file_info: CCSFileInfoObject = Field(alias="file-info")
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None, alias="main-text"
    )
    page_dimensions: Optional[list[PageDimensions]] = Field(
        default=None, alias="page-dimensions"
    )
    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None

    @model_validator(mode="before")
    @classmethod
    def from_dict(cls, data):
        """Fix ref in main-text.

        For dict input, a truthy ``__ref`` key of each ``main-text`` entry is
        renamed to ``$ref`` in place (a falsy one is simply removed). Entries
        that are not dicts, and non-dict input, are left untouched.

        Args:
            data: The raw value handed to the model for validation.

        Returns:
            The (possibly modified) input value.
        """
        if not isinstance(data, dict):
            return data

        for item in data.get("main-text") or []:
            if isinstance(item, dict) and (ref := item.pop("__ref", None)):
                item["$ref"] = ref

        return data

    def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
        """Return the resolved reference.

        Resolves the Ref object within the document. If the object is not
        found, None is returned; this includes an unsupported reference type,
        an empty target collection, a malformed reference path and an index
        outside the target collection.

        Args:
            item: The reference to resolve.

        Returns:
            The referenced table, figure, equation or footnote, or None.
        """
        # NOTE: currently only resolves refs explicitly, such that we can make
        # assumptions on ref parts: the index is the third "/"-separated part.
        targets: dict[str, Optional[list]] = {
            "table": self.tables,
            "figure": self.figures,
            "equation": self.equations,
            "footnote": self.footnotes,
        }
        collection = targets.get(item.obj_type)
        if not collection:
            return None

        try:
            index = int(item.ref.split("/")[2])
        except (IndexError, ValueError):
            return None

        # Reject negative indices too: they would silently wrap around.
        if not 0 <= index < len(collection):
            return None
        return collection[index]

    def _iter_main_text(self, start: int, stop: Optional[int]):
        """Yield the main-text items of the slice with references resolved.

        Items whose reference cannot be resolved are skipped. Nothing is
        yielded when the document has no main text.
        """
        if self.main_text is None:
            return
        for orig_item in self.main_text[start:stop]:
            item = (
                self._resolve_ref(orig_item)
                if isinstance(orig_item, Ref)
                else orig_item
            )
            if item is not None:
                yield item

    @staticmethod
    def _table_to_markdown(table: Table) -> str:
        """Render the table data as a GitHub-style Markdown table.

        The first row is used as the header. An empty string is returned
        unless there are at least two rows and the first row is non-empty.
        """
        rows = [[cell.text for cell in row] for row in table.data or []]
        if len(rows) <= 1 or len(rows[0]) == 0:
            return ""

        header, body = rows[0], rows[1:]
        try:
            return tabulate(body, headers=header, tablefmt="github")
        except ValueError:
            # Retry with tabulate's number parsing switched off.
            return tabulate(
                body,
                headers=header,
                tablefmt="github",
                disable_numparse=True,
            )

    def get_map_to_page_dimensions(self):
        """Get a map from page-index (start at 1) to page-dim [width, height]."""
        if self.page_dimensions is None:
            return {}
        return {dim.page: [dim.width, dim.height] for dim in self.page_dimensions}

    def export_to_dict(self) -> Dict:
        """Dump the document in JSON mode, by alias, omitting None values."""
        return self.model_dump(mode="json", by_alias=True, exclude_none=True)

    def export_to_markdown(  # noqa: C901
        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: list[str] = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ],
        strict_text: bool = False,
        image_placeholder: str = "<!-- image -->",
    ) -> str:
        r"""Serialize to Markdown.

        Operates on a slice of the document's main_text as defined through arguments
        main_text_start and main_text_stop; defaulting to the whole main_text.

        The first "title" item becomes a level-1 heading; later titles and
        "subtitle-level-1" items become level-2 headings. A text item is
        skipped when it repeats the previous emitted text, or when it is a
        caption already attached to an exported table or figure.

        Args:
            delim (str, optional): Delimiter to use when concatenating the various
                Markdown parts. Defaults to "\n\n".
            main_text_start (int, optional): Main-text slicing start index (inclusive).
                Defaults to 0.
            main_text_stop (Optional[int], optional): Main-text slicing stop index
                (exclusive). Defaults to None.
            main_text_labels (list[str], optional): The labels to include in the
                markdown. The default list is never modified.
            strict_text (bool, optional): if true, the output will be only plain text
                without any markdown styling. Defaults to False.
            image_placeholder (str, optional): the placeholder to include to position
                images in the markdown. Defaults to a markdown comment "<!-- image -->".

        Returns:
            str: The exported Markdown representation.
        """
        # Resolve every reference once; both passes below work on this list.
        items = list(self._iter_main_text(main_text_start, main_text_stop))

        # Collect all captions embedded in table and figure objects
        # to avoid repeating them.
        embedded_captions = {
            item.text
            for item in items
            if isinstance(item, (Table, Figure))
            and item.text
            and item.obj_type in main_text_labels
        }

        has_title = False
        prev_text = ""
        md_texts: list[str] = []

        for item in items:
            item_type = item.obj_type
            if item_type not in main_text_labels:
                continue

            markdown_text = ""
            if isinstance(item, BaseText):
                text = item.text

                # Skip captions if they are embedded in the actual
                # floating object.
                if item_type == "caption" and text in embedded_captions:
                    continue

                # Ignore missing and repeated text.
                if text is None or text == prev_text:
                    continue
                prev_text = text

                if strict_text:
                    markdown_text = text
                elif item_type == "title" and not has_title:
                    markdown_text = f"# {text}"
                elif item_type in {"title", "subtitle-level-1"}:
                    markdown_text = f"## {text}"
                else:
                    markdown_text = text

                if item_type == "title":
                    has_title = True

            elif isinstance(item, Table) and item.data:
                markdown_text = item.text or ""
                if not strict_text:
                    markdown_text += "\n\n" + self._table_to_markdown(item)

            elif isinstance(item, Figure):
                markdown_text = item.text or ""
                if not strict_text:
                    markdown_text += f"\n{image_placeholder}"

            if markdown_text:
                md_texts.append(markdown_text)

        return delim.join(md_texts)

    def export_to_document_tokens(
        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: list[str] = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ],
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_content: bool = True,
        add_page_index: bool = True,
        # table specific flags
        add_table_cell_location: bool = False,
        add_table_cell_label: bool = True,
        add_table_cell_text: bool = True,
    ) -> str:
        r"""Exports the document content to an DocumentToken format.

        Operates on a slice of the document's main_text as defined through arguments
        main_text_start and main_text_stop; defaulting to the whole main_text.

        Args:
            delim (str, optional): Only its truthiness is used: a non-empty
                value makes "\n" the line separator, an empty one disables it.
            main_text_start (int, optional): Main-text slicing start index
                (inclusive). Defaults to 0.
            main_text_stop (Optional[int], optional): Main-text slicing stop
                index (exclusive). Defaults to None.
            main_text_labels (list[str], optional): The labels to include.
                The default list is never modified.
            xsize (int, optional): Forwarded to each item's export.
            ysize (int, optional): Forwarded to each item's export.
            add_location (bool, optional): Forwarded to each item's export;
                also enables the page size lookup.
            add_content (bool, optional): Forwarded to each item's export.
            add_page_index (bool, optional): Forwarded to each item's export.
            add_table_cell_location (bool, optional): Forwarded to table
                exports as add_cell_location.
            add_table_cell_label (bool, optional): Forwarded to table exports
                as add_cell_label.
            add_table_cell_text (bool, optional): Forwarded to table exports
                as add_cell_text.

        Returns:
            str: The content of the document formatted as a DocTags string.
        """
        new_line = "\n" if delim else ""
        parts: list[str] = [f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"]

        for item in self._iter_main_text(main_text_start, main_text_stop):
            if item.obj_type not in main_text_labels:
                continue

            page_w = 0.0
            page_h = 0.0
            prov = item.prov
            if (
                add_location
                and self.page_dimensions is not None
                and prov is not None
                and len(prov) > 0
            ):
                # NOTE(review): looks up the page size by list position and so
                # assumes page_dimensions is ordered by 1-based page number --
                # confirm; get_map_to_page_dimensions() keys by page instead.
                page_dim = self.page_dimensions[prov[0].page - 1]
                page_w = float(page_dim.width)
                page_h = float(page_dim.height)

            if isinstance(item, BaseText):
                parts.append(
                    item.export_to_document_tokens(
                        new_line=new_line,
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        add_location=add_location,
                        add_content=add_content,
                        add_page_index=add_page_index,
                    )
                )
            elif isinstance(item, Table):
                parts.append(
                    item.export_to_document_tokens(
                        new_line=new_line,
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        add_caption=True,
                        add_location=add_location,
                        add_content=add_content,
                        add_cell_location=add_table_cell_location,
                        add_cell_label=add_table_cell_label,
                        add_cell_text=add_table_cell_text,
                        add_page_index=add_page_index,
                    )
                )
            elif isinstance(item, Figure):
                parts.append(
                    item.export_to_document_tokens(
                        new_line=new_line,
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        add_caption=True,
                        add_location=add_location,
                        add_content=add_content,
                        add_page_index=add_page_index,
                    )
                )

        parts.append(DocumentToken.END_DOCUMENT.value)
        return "".join(parts)