docling-core 1.1.3__tar.gz → 1.2.0__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of docling-core might be problematic.

Files changed (45)
  1. {docling_core-1.1.3 → docling_core-1.2.0}/PKG-INFO +11 -10
  2. {docling_core-1.1.3 → docling_core-1.2.0}/README.md +10 -8
  3. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/base.py +6 -0
  4. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/document.py +286 -11
  5. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/statement.py +34 -0
  6. {docling_core-1.1.3 → docling_core-1.2.0}/pyproject.toml +1 -2
  7. {docling_core-1.1.3 → docling_core-1.2.0}/LICENSE +0 -0
  8. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/__init__.py +0 -0
  9. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/py.typed +0 -0
  10. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  11. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  12. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  13. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  14. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  15. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  16. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  17. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  18. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/__init__.py +0 -0
  19. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  20. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/mapping.py +0 -0
  21. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/meta.py +0 -0
  22. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/search/package.py +0 -0
  23. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/__init__.py +0 -0
  24. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/base.py +0 -0
  25. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/__init__.py +0 -0
  26. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/doc_ann.py +0 -0
  27. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/doc_ocr.py +0 -0
  28. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/doc/doc_raw.py +0 -0
  29. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/gen/__init__.py +0 -0
  30. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/gen/generic.py +0 -0
  31. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/nlp/__init__.py +0 -0
  32. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/nlp/qa.py +0 -0
  33. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/nlp/qa_labels.py +0 -0
  34. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/__init__.py +0 -0
  35. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/attribute.py +0 -0
  36. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/base.py +0 -0
  37. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/predicate.py +0 -0
  38. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/record.py +0 -0
  39. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/types/rec/subject.py +0 -0
  40. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/__init__.py +0 -0
  41. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/alias.py +0 -0
  42. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/ds_generate_docs.py +0 -0
  43. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
  44. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/validate.py +0 -0
  45. {docling_core-1.1.3 → docling_core-1.2.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 1.1.3
+Version: 1.2.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -28,7 +28,6 @@ Classifier: Typing :: Typed
 Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
 Requires-Dist: jsonref (>=1.1.0,<2.0.0)
 Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
-Requires-Dist: poetry (>=1.8.3,<2.0.0)
 Requires-Dist: pydantic (>=2.6.0,<3.0.0)
 Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
@@ -47,7 +46,7 @@ Description-Content-Type: text/markdown
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 [![License MIT](https://img.shields.io/github/license/ds4sd/docling-core)](https://opensource.org/licenses/MIT)
 
-Docling Core is a library that defines the data types in [Docling](https://ds4sd.github.io), leveraging pydantic models.
+Docling Core is a library that defines the data types in [Docling](https://github.com/DS4SD/docling), leveraging pydantic models.
 
 ## Installation
 
@@ -116,13 +115,15 @@ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
 If you use Docling Core in your projects, please consider citing the following:
 
 ```bib
-@software{Docling,
-author = {Deep Search Team},
-month = {7},
-title = {{Docling}},
-url = {https://github.com/DS4SD/docling},
-version = {main},
-year = {2024}
+@techreport{Docling,
+author = "Deep Search Team",
+month = 8,
+title = "Docling Technical Report",
+url = "https://arxiv.org/abs/2408.09869",
+eprint = "2408.09869",
+doi = "10.48550/arXiv.2408.09869",
+version = "1.0.0",
+year = 2024
 }
 ```
 
@@ -10,7 +10,7 @@
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 [![License MIT](https://img.shields.io/github/license/ds4sd/docling-core)](https://opensource.org/licenses/MIT)
 
-Docling Core is a library that defines the data types in [Docling](https://ds4sd.github.io), leveraging pydantic models.
+Docling Core is a library that defines the data types in [Docling](https://github.com/DS4SD/docling), leveraging pydantic models.
 
 ## Installation
 
@@ -79,13 +79,15 @@ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
 If you use Docling Core in your projects, please consider citing the following:
 
 ```bib
-@software{Docling,
-author = {Deep Search Team},
-month = {7},
-title = {{Docling}},
-url = {https://github.com/DS4SD/docling},
-version = {main},
-year = {2024}
+@techreport{Docling,
+author = "Deep Search Team",
+month = 8,
+title = "Docling Technical Report",
+url = "https://arxiv.org/abs/2408.09869",
+eprint = "2408.09869",
+doi = "10.48550/arXiv.2408.09869",
+version = "1.0.0",
+year = 2024
 }
 ```
 
@@ -131,6 +131,7 @@ class GlmTableCell(TableCell):
 class BaseCell(AliasModel):
     """Base cell."""
 
+    # FIXME: we need to check why we have bounding_box (this should be in prov)
     bounding_box: Optional[BoundingBoxContainer] = Field(
         default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
     )
@@ -152,6 +153,11 @@ class Table(BaseCell):
     model: Optional[str] = None
 
 
+# FIXME: let's add some figure specific data-types later
+class Figure(BaseCell):
+    """Figure."""
+
+
 class BaseText(AliasModel):
     """Base model for text objects."""
 
@@ -6,7 +6,8 @@
 """Models for the Docling Document data type."""
 
 from datetime import datetime
-from typing import Generic, Optional, Union
+from enum import Enum
+from typing import Generic, Optional, Tuple, Union
 
 from pydantic import (
     AnyHttpUrl,
@@ -35,6 +36,7 @@ from docling_core.types.doc.base import (
     BaseCell,
     BaseText,
     BitmapObject,
+    Figure,
     PageDimensions,
     PageReference,
     Ref,
@@ -275,7 +277,7 @@ class MinimalDocument(
     main_text: Optional[list[Union[Ref, BaseText]]] = Field(
         default=None, alias="main-text"
     )
-    figures: Optional[list[BaseCell]] = None
+    figures: Optional[list[Figure]] = None
     tables: Optional[list[Table]] = None
 
 
@@ -311,6 +313,8 @@ class CCSDocument(
     @classmethod
     def from_dict(cls, data):
         """Validates and fixes the input data."""
+        if not isinstance(data, dict):
+            return data
         description_collection = data["description"].get("collection")
         if not description_collection:
             data["description"].setdefault("collection", {})
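The two added lines guard `from_dict` against non-dict input: if the validator receives something that is not a plain dict (for example an already-constructed model instance), it is passed through untouched so pydantic's own validation can take over. Below is a minimal, self-contained sketch of that pattern, assuming the method is wired up as a pydantic "before" validator; the class and field names are illustrative, not taken from docling-core.

```python
from pydantic import BaseModel, model_validator


class Doc(BaseModel):
    name: str

    @model_validator(mode="before")
    @classmethod
    def _fix_input(cls, data):
        # Pass non-dict payloads (e.g. an existing Doc instance) straight
        # through; pydantic's regular validation handles them downstream.
        if not isinstance(data, dict):
            return data
        data.setdefault("name", "untitled")
        return data


print(Doc.model_validate({}))              # name='untitled'
print(Doc.model_validate(Doc(name="x")))   # instance passes through unchanged
```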
@@ -343,6 +347,107 @@ class CCSDocument(
         return data
 
 
+class DocumentToken(Enum):
+    """Class to represent an LLM friendly representation of a Document."""
+
+    BEG_DOCUMENT = "<document>"
+    END_DOCUMENT = "</document>"
+
+    BEG_TITLE = "<title>"
+    END_TITLE = "</title>"
+
+    BEG_ABSTRACT = "<abstract>"
+    END_ABSTRACT = "</abstract>"
+
+    BEG_DOI = "<doi>"
+    END_DOI = "</doi>"
+    BEG_DATE = "<date>"
+    END_DATE = "</date>"
+
+    BEG_AUTHORS = "<authors>"
+    END_AUTHORS = "</authors>"
+    BEG_AUTHOR = "<author>"
+    END_AUTHOR = "</author>"
+
+    BEG_AFFILIATIONS = "<affiliations>"
+    END_AFFILIATIONS = "</affiliations>"
+    BEG_AFFILIATION = "<affiliation>"
+    END_AFFILIATION = "</affiliation>"
+
+    BEG_HEADER = "<section-header>"
+    END_HEADER = "</section-header>"
+    BEG_TEXT = "<text>"
+    END_TEXT = "</text>"
+    BEG_PARAGRAPH = "<paragraph>"
+    END_PARAGRAPH = "</paragraph>"
+    BEG_TABLE = "<table>"
+    END_TABLE = "</table>"
+    BEG_FIGURE = "<figure>"
+    END_FIGURE = "</figure>"
+    BEG_CAPTION = "<caption>"
+    END_CAPTION = "</caption>"
+    BEG_EQUATION = "<equation>"
+    END_EQUATION = "</equation>"
+    BEG_LIST = "<list>"
+    END_LIST = "</list>"
+    BEG_LISTITEM = "<list-item>"
+    END_LISTITEM = "</list-item>"
+
+    BEG_LOCATION = "<location>"
+    END_LOCATION = "</location>"
+    BEG_GROUP = "<group>"
+    END_GROUP = "</group>"
+
+    @classmethod
+    def get_special_tokens(
+        cls,
+        max_rows: int = 100,
+        max_cols: int = 100,
+        max_pages: int = 1000,
+        page_dimension: Tuple[int, int] = (100, 100),
+    ):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+
+        # Adding dynamically generated row and col tokens
+        for i in range(0, max_rows):
+            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
+
+        for i in range(0, max_cols):
+            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
+
+        for i in range(6):
+            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
+
+        # Adding dynamically generated page-tokens
+        for i in range(0, max_pages):
+            special_tokens.append(f"<page_{i}>")
+
+        # Adding dynamically generated location-tokens
+        for i in range(0, max(page_dimension[0], page_dimension[1])):
+            special_tokens.append(f"<loc_{i}>")
+
+        return special_tokens
+
+    @staticmethod
+    def get_page_token(page: int):
+        """Function to get page tokens."""
+        return f"<page_{page}>"
+
+    @staticmethod
+    def get_location_token(val: float, rnorm: int = 100):
+        """Function to get location tokens."""
+        val_ = round(rnorm * val)
+
+        if val_ < 0:
+            return "<loc_0>"
+
+        if val_ > rnorm:
+            return f"<loc_{rnorm}>"
+
+        return f"<loc_{val_}>"
+
+
 class ExportedCCSDocument(
     MinimalDocument,
     Generic[
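The new `DocumentToken` enum defines the fixed tags of the LLM-friendly document representation; `get_special_tokens` extends them with dynamically generated row, column, section-header, page, and location tags, and `get_location_token` quantizes a normalized coordinate into one of `rnorm + 1` bins, clamping at the edges. A short sketch against the new API (the printed values follow directly from the code above):

```python
from docling_core.types.doc.document import DocumentToken

tokens = DocumentToken.get_special_tokens(max_rows=2, max_cols=2, max_pages=1)
print(tokens[:4])            # ['<document>', '</document>', '<title>', '</title>']
print("<row_1>" in tokens)   # True: row/col/page/loc tags are generated on the fly

# Normalized coordinates are quantized into rnorm bins and clamped at the edges:
print(DocumentToken.get_location_token(0.237))  # <loc_24>
print(DocumentToken.get_location_token(-0.5))   # <loc_0>
print(DocumentToken.get_location_token(1.2))    # <loc_100>
```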
@@ -386,6 +491,8 @@ class ExportedCCSDocument(
     @classmethod
     def from_dict(cls, data):
         """Fix ref in main-text."""
+        if not isinstance(data, dict):
+            return data
         if data.get("main-text"):
             for item in data["main-text"]:
                 if ref := item.pop("__ref", None):
@@ -423,6 +530,14 @@ class ExportedCCSDocument(
         delim: str = "\n\n",
         main_text_start: int = 0,
         main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+        ],
+        strict_text: bool = False,
     ) -> str:
         r"""Serialize to Markdown.
 
@@ -457,12 +572,7 @@ class ExportedCCSDocument(
                 continue
 
             item_type = item.obj_type
-            if isinstance(item, BaseText) and item_type in {
-                "title",
-                "subtitle-level-1",
-                "paragraph",
-                "caption",
-            }:
+            if isinstance(item, BaseText) and item_type in main_text_labels:
                 text = item.text
 
                 # ignore repeated text
@@ -473,20 +583,31 @@ class ExportedCCSDocument(
 
                 # first title match
                 if item_type == "title" and not has_title:
-                    markdown_text = f"# {text}"
+                    if strict_text:
+                        markdown_text = f"{text}"
+                    else:
+                        markdown_text = f"# {text}"
                     has_title = True
 
                 # secondary titles
                 elif item_type in {"title", "subtitle-level-1"} or (
                     has_title and item_type == "title"
                 ):
-                    markdown_text = f"## {text}"
+                    if strict_text:
+                        markdown_text = f"{text}"
+                    else:
+                        markdown_text = f"## {text}"
 
                 # normal text
                 else:
                     markdown_text = text
 
-            elif isinstance(item, Table) and item.data:
+            elif (
+                isinstance(item, Table)
+                and item.data
+                and item_type in main_text_labels
+                and not strict_text
+            ):
                 table = []
                 for row in item.data:
                     tmp = []
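Taken together, these changes make `export_to_markdown` configurable: the previously hardcoded label set becomes the `main_text_labels` parameter, and `strict_text=True` suppresses the `#`/`##` heading markers and skips table rendering, yielding plain text. A usage sketch, where `doc.json` is a placeholder path for any serialized `ExportedCCSDocument`, not a file shipped with the package:

```python
import json

from docling_core.types.doc.document import ExportedCCSDocument

# "doc.json" is a hypothetical path to a serialized document.
with open("doc.json", encoding="utf-8") as f:
    doc = ExportedCCSDocument.model_validate(json.load(f))

md = doc.export_to_markdown()                       # headings and tables, as before
plain = doc.export_to_markdown(strict_text=True)    # no '#'/'##' markers, no tables
paras = doc.export_to_markdown(main_text_labels=["paragraph"])  # paragraphs only
```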
@@ -514,3 +635,157 @@ class ExportedCCSDocument(
 
         result = delim.join(md_texts)
         return result
+
+    def export_to_document_tokens(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        page_tagging: bool = True,
+        location_tagging: bool = True,
+        location_dimensions: Tuple[int, int] = (100, 100),
+        add_new_line: bool = True,
+    ) -> str:
+        r"""Exports the document content to a DocumentToken format.
+
+        Operates on a slice of the document's main_text as defined through arguments
+        main_text_start and main_text_stop; defaulting to the whole main_text.
+
+        Args:
+            delim (str, optional): The delimiter used to separate text blocks in the
+                exported XML. Default is two newline characters ("\n\n").
+            main_text_start (int, optional): The starting index of the main text to
+                be included in the XML. Default is 0 (the beginning of the text).
+            main_text_stop (Optional[int], optional): The stopping index of the main
+                text. If set to None, the export includes text up to the end.
+                Default is None.
+            main_text_labels (list[str], optional): A list of text labels that
+                categorize the different sections of the document (e.g., "title",
+                "subtitle-level-1", "paragraph", "caption"). Default labels are
+                "title", "subtitle-level-1", "paragraph", and "caption".
+            location_tagging (bool, optional): Determines whether to include
+                location-based tagging in the XML. If True, the exported XML will
+                contain information about the locations of the text elements.
+                Default is True.
+            location_dimensions (Tuple[int, int], optional): Specifies the dimensions
+                (width and height) for the location tagging, if enabled.
+                Default is (100, 100).
+            add_new_line (bool, optional): Whether to add new line characters after
+                each text block. If True, a new line is added after each block of
+                text in the XML. Default is True.
+
+        Returns:
+            str: The content of the document formatted as an XML string.
+        """
+        xml_str = DocumentToken.BEG_DOCUMENT.value
+
+        new_line = ""
+        if add_new_line:
+            new_line = "\n"
+
+        if self.main_text is not None:
+            for orig_item in self.main_text[main_text_start:main_text_stop]:
+
+                item = (
+                    self._resolve_ref(orig_item)
+                    if isinstance(orig_item, Ref)
+                    else orig_item
+                )
+
+                if item is None:
+                    continue
+
+                prov = item.prov
+
+                loc_str = ""  # default is zero
+                if (
+                    location_tagging
+                    and self.page_dimensions is not None
+                    and prov is not None
+                    and len(prov) > 0
+                ):
+
+                    page = prov[0].page
+                    page_dim = self.page_dimensions[page - 1]
+
+                    page_w = float(page_dim.width)
+                    page_h = float(page_dim.height)
+
+                    x0 = float(prov[0].bbox[0]) / float(page_w)
+                    y0 = float(prov[0].bbox[1]) / float(page_h)
+                    x1 = float(prov[0].bbox[2]) / float(page_w)
+                    y1 = float(prov[0].bbox[3]) / float(page_h)
+
+                    page_tok = ""
+                    if page_tagging:
+                        page_tok = DocumentToken.get_page_token(page=page)
+
+                    x0_tok = DocumentToken.get_location_token(
+                        val=min(x0, x1), rnorm=location_dimensions[0]
+                    )
+                    y0_tok = DocumentToken.get_location_token(
+                        val=min(y0, y1), rnorm=location_dimensions[1]
+                    )
+                    x1_tok = DocumentToken.get_location_token(
+                        val=max(x0, x1), rnorm=location_dimensions[0]
+                    )
+                    y1_tok = DocumentToken.get_location_token(
+                        val=max(y0, y1), rnorm=location_dimensions[1]
+                    )
+
+                    # update
+                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
+                    loc_str += f"{page_tok}"
+                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
+                    loc_str += f"{DocumentToken.END_LOCATION.value}"
+
+                item_type = item.obj_type
+                if isinstance(item, BaseText) and (item_type in main_text_labels):
+                    text = item.text
+
+                    xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
+
+                elif isinstance(item, Table) and (item_type in main_text_labels):
+
+                    xml_str += f"<{item_type}>{loc_str}"
+
+                    if item.text is not None and len(item.text) > 0:
+                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
+                        xml_str += (
+                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        )
+
+                    if item.data is not None and len(item.data) > 0:
+                        for i, row in enumerate(item.data):
+                            xml_str += f"<row_{i}>"
+                            for j, col in enumerate(row):
+                                text = col.text
+                                xml_str += f"<col_{j}>{text}</col_{j}>"
+
+                            xml_str += f"</row_{i}>{new_line}"
+
+                    xml_str += f"</{item_type}>{new_line}"
+
+                elif isinstance(item, Figure) and (item_type in main_text_labels):
+
+                    xml_str += f"<{item_type}>{loc_str}"
+
+                    if item.text is not None and len(item.text) > 0:
+                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
+                        xml_str += (
+                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        )
+
+                    xml_str += f"</{item_type}>{new_line}"
+
+        xml_str += DocumentToken.END_DOCUMENT.value
+
+        return xml_str
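The new `export_to_document_tokens` method wraps each selected main-text item in a tag named after its type, optionally prefixed by a `<location>` block containing the page token and four quantized bounding-box tokens; tables additionally serialize their caption and `<row_i>`/`<col_j>` cells. A sketch of a call, continuing from the placeholder `doc` loaded above; the exact output depends on the document's content and provenance data:

```python
doctags = doc.export_to_document_tokens(
    location_tagging=True,
    location_dimensions=(100, 100),
)
print(doctags[:300])
# Roughly of the shape:
# <document><paragraph><location><page_1><loc_12><loc_20><loc_88><loc_25></location>Some text.</paragraph>
# ...</document>
```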
@@ -4,6 +4,7 @@
 #
 
 """Define the model Statement."""
+from enum import Enum
 from typing import Generic
 
 from pydantic import Field
@@ -21,6 +22,39 @@ from docling_core.types.rec.attribute import Attribute
 from docling_core.types.rec.subject import Subject
 
 
+class StatementToken(Enum):
+    """Class to represent an LLM friendly representation of statements."""
+
+    BEG_STATEMENTS = "<statements>"
+    END_STATEMENTS = "</statements>"
+
+    BEG_STATEMENT = "<statement>"
+    END_STATEMENT = "</statement>"
+
+    BEG_PROV = "<prov>"
+    END_PROV = "</prov>"
+
+    BEG_SUBJECT = "<subject>"
+    END_SUBJECT = "</subject>"
+
+    BEG_PREDICATE = "<predicate>"
+    END_PREDICATE = "</predicate>"
+
+    BEG_PROPERTY = "<property>"
+    END_PROPERTY = "</property>"
+
+    BEG_VALUE = "<value>"
+    END_VALUE = "</value>"
+
+    BEG_UNIT = "<unit>"
+    END_UNIT = "</unit>"
+
+    @classmethod
+    def get_special_tokens(cls):
+        """Function to get all special statements tokens."""
+        return [token.value for token in cls]
+
+
 class Statement(
     Attribute,
     Generic[
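`StatementToken` is the statements counterpart of `DocumentToken`; since all of its tags are fixed, its `get_special_tokens` takes no size arguments and simply returns the enum values, in definition order. For example:

```python
from docling_core.types.rec.statement import StatementToken

print(StatementToken.get_special_tokens())
# ['<statements>', '</statements>', '<statement>', '</statement>', '<prov>', ...]
```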
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-core"
-version = "1.1.3"
+version = "1.2.0"
 description = "A python library to define and validate data types in Docling."
 license = "MIT"
 authors = [
@@ -51,7 +51,6 @@ jsonschema = "^4.16.0"
 pydantic = "^2.6.0"
 jsonref = "^1.1.0"
 json-schema-for-humans = "^1.0.0"
-poetry = "^1.8.3"
 pyproject-toml = "^0.0.10"
 tabulate = "^0.9.0"
 