docling-core 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of docling-core might be problematic; consult the package registry's advisory page for more details.

@@ -10,6 +10,7 @@ import pandas as pd
10
10
  from pydantic import BaseModel, Field, PositiveInt, StrictStr
11
11
 
12
12
  from docling_core.search.mapping import es_field
13
+ from docling_core.types.doc.tokens import DocumentToken
13
14
  from docling_core.utils.alias import AliasModel
14
15
 
15
16
  CellData = tuple[float, float, float, float, str, str]
@@ -132,10 +133,6 @@ class GlmTableCell(TableCell):
132
133
  class BaseCell(AliasModel):
133
134
  """Base cell."""
134
135
 
135
- # FIXME: we need to check why we have bounding_box (this should be in prov)
136
- bounding_box: Optional[BoundingBoxContainer] = Field(
137
- default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
138
- )
139
136
  prov: Optional[list[Prov]] = None
140
137
  text: Optional[str] = Field(
141
138
  default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
@@ -144,6 +141,38 @@ class BaseCell(AliasModel):
144
141
  alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
145
142
  )
146
143
 
144
+ def get_location_tokens(
145
+ self,
146
+ new_line: str,
147
+ page_w: float,
148
+ page_h: float,
149
+ xsize: int = 100,
150
+ ysize: int = 100,
151
+ add_page_index: bool = True,
152
+ ) -> str:
153
+ """Get the location string for the BaseCell."""
154
+ if self.prov is None:
155
+ return ""
156
+
157
+ location = ""
158
+ for prov in self.prov:
159
+
160
+ page_i = -1
161
+ if add_page_index:
162
+ page_i = prov.page
163
+
164
+ loc_str = DocumentToken.get_location(
165
+ bbox=prov.bbox,
166
+ page_w=page_w,
167
+ page_h=page_h,
168
+ xsize=xsize,
169
+ ysize=ysize,
170
+ page_i=page_i,
171
+ )
172
+ location += f"{loc_str}{new_line}"
173
+
174
+ return location
175
+
147
176
 
148
177
  class Table(BaseCell):
149
178
  """Table."""
@@ -153,6 +182,11 @@ class Table(BaseCell):
153
182
  data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
154
183
  model: Optional[str] = None
155
184
 
185
+ # FIXME: we need to check why we have bounding_box (this should be in prov)
186
+ bounding_box: Optional[BoundingBoxContainer] = Field(
187
+ default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
188
+ )
189
+
156
190
  def _get_tablecell_span(self, cell: TableCell, ix: int):
157
191
  if cell.spans is None:
158
192
  span = set()
@@ -249,26 +283,185 @@ class Table(BaseCell):
249
283
 
250
284
  return body
251
285
 
286
+ def export_to_document_tokens(
287
+ self,
288
+ new_line: str = "\n",
289
+ page_w: float = 0.0,
290
+ page_h: float = 0.0,
291
+ xsize: int = 100,
292
+ ysize: int = 100,
293
+ add_location: bool = True,
294
+ add_caption: bool = True,
295
+ add_content: bool = True,
296
+ add_cell_location: bool = True,
297
+ add_cell_label: bool = True,
298
+ add_cell_text: bool = True,
299
+ add_page_index: bool = True,
300
+ ):
301
+ """Export table to document tokens format."""
302
+ body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
303
+
304
+ if add_location:
305
+ body += self.get_location_tokens(
306
+ new_line=new_line,
307
+ page_w=page_w,
308
+ page_h=page_h,
309
+ xsize=xsize,
310
+ ysize=ysize,
311
+ add_page_index=add_page_index,
312
+ )
313
+
314
+ if add_caption and self.text is not None and len(self.text) > 0:
315
+ body += f"{DocumentToken.BEG_CAPTION.value}"
316
+ body += f"{self.text.strip()}"
317
+ body += f"{DocumentToken.END_CAPTION.value}"
318
+ body += f"{new_line}"
319
+
320
+ if add_content and self.data is not None and len(self.data) > 0:
321
+ for i, row in enumerate(self.data):
322
+ body += f"<row_{i}>"
323
+ for j, col in enumerate(row):
324
+
325
+ text = ""
326
+ if add_cell_text:
327
+ text = col.text.strip()
328
+
329
+ cell_loc = ""
330
+ if (
331
+ col.bbox is not None
332
+ and add_cell_location
333
+ and add_page_index
334
+ and self.prov is not None
335
+ and len(self.prov) > 0
336
+ ):
337
+ cell_loc = DocumentToken.get_location(
338
+ bbox=col.bbox,
339
+ page_w=page_w,
340
+ page_h=page_h,
341
+ xsize=xsize,
342
+ ysize=ysize,
343
+ page_i=self.prov[0].page,
344
+ )
345
+ elif (
346
+ col.bbox is not None
347
+ and add_cell_location
348
+ and not add_page_index
349
+ ):
350
+ cell_loc = DocumentToken.get_location(
351
+ bbox=col.bbox,
352
+ page_w=page_w,
353
+ page_h=page_h,
354
+ xsize=xsize,
355
+ ysize=ysize,
356
+ page_i=-1,
357
+ )
358
+
359
+ cell_label = ""
360
+ if (
361
+ add_cell_label
362
+ and col.obj_type is not None
363
+ and len(col.obj_type) > 0
364
+ ):
365
+ cell_label = f"<{col.obj_type}>"
366
+
367
+ body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
368
+
369
+ body += f"</row_{i}>{new_line}"
370
+
371
+ body += f"{DocumentToken.END_TABLE.value}{new_line}"
372
+
373
+ return body
374
+
252
375
 
253
376
  # FIXME: let's add some figure specific data-types later
254
377
  class Figure(BaseCell):
255
378
  """Figure."""
256
379
 
380
+ # FIXME: we need to check why we have bounding_box (this should be in prov)
381
+ bounding_box: Optional[BoundingBoxContainer] = Field(
382
+ default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
383
+ )
384
+
385
+ def export_to_document_tokens(
386
+ self,
387
+ new_line: str = "\n",
388
+ page_w: float = 0.0,
389
+ page_h: float = 0.0,
390
+ xsize: int = 100,
391
+ ysize: int = 100,
392
+ add_location: bool = True,
393
+ add_caption: bool = True,
394
+ add_content: bool = True, # not used at the moment
395
+ add_page_index: bool = True,
396
+ ):
397
+ """Export figure to document tokens format."""
398
+ body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
399
+
400
+ if add_location:
401
+ body += self.get_location_tokens(
402
+ new_line=new_line,
403
+ page_w=page_w,
404
+ page_h=page_h,
405
+ xsize=xsize,
406
+ ysize=ysize,
407
+ add_page_index=add_page_index,
408
+ )
409
+
410
+ if add_caption and self.text is not None and len(self.text) > 0:
411
+ body += f"{DocumentToken.BEG_CAPTION.value}"
412
+ body += f"{self.text.strip()}"
413
+ body += f"{DocumentToken.END_CAPTION.value}"
414
+ body += f"{new_line}"
415
+
416
+ body += f"{DocumentToken.END_FIGURE.value}{new_line}"
417
+
418
+ return body
419
+
257
420
 
258
- class BaseText(AliasModel):
421
+ class BaseText(BaseCell):
259
422
  """Base model for text objects."""
260
423
 
261
- text: StrictStr = Field(
262
- json_schema_extra=es_field(term_vector="with_positions_offsets")
263
- )
264
- obj_type: StrictStr = Field(
265
- alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
266
- )
424
+ # FIXME: do we need these ???
267
425
  name: Optional[StrictStr] = Field(
268
426
  default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
269
427
  )
270
428
  font: Optional[str] = None
271
- prov: Optional[list[Prov]] = None
429
+
430
+ def export_to_document_tokens(
431
+ self,
432
+ new_line: str = "\n",
433
+ page_w: float = 0.0,
434
+ page_h: float = 0.0,
435
+ xsize: int = 100,
436
+ ysize: int = 100,
437
+ add_location: bool = True,
438
+ add_content: bool = True,
439
+ add_page_index: bool = True,
440
+ ):
441
+ """Export text element to document tokens format."""
442
+ body = f"<{self.obj_type}>"
443
+ # body = f"<{self.name}>"
444
+
445
+ assert DocumentToken.is_known_token(
446
+ body
447
+ ), f"failed DocumentToken.is_known_token({body})"
448
+
449
+ if add_location:
450
+ body += self.get_location_tokens(
451
+ new_line="",
452
+ page_w=page_w,
453
+ page_h=page_h,
454
+ xsize=xsize,
455
+ ysize=ysize,
456
+ add_page_index=add_page_index,
457
+ )
458
+
459
+ if add_content and self.text is not None:
460
+ body += self.text.strip()
461
+
462
+ body += f"</{self.obj_type}>{new_line}"
463
+
464
+ return body
272
465
 
273
466
 
274
467
  class ListItem(BaseText):
@@ -6,8 +6,7 @@
6
6
  """Models for the Docling Document data type."""
7
7
 
8
8
  from datetime import datetime
9
- from enum import Enum
10
- from typing import Generic, Optional, Tuple, Union
9
+ from typing import Generic, Optional, Union
11
10
 
12
11
  from pydantic import (
13
12
  AnyHttpUrl,
@@ -43,6 +42,7 @@ from docling_core.types.doc.base import (
43
42
  S3Data,
44
43
  Table,
45
44
  )
45
+ from docling_core.types.doc.tokens import DocumentToken
46
46
  from docling_core.utils.alias import AliasModel
47
47
 
48
48
 
@@ -347,107 +347,6 @@ class CCSDocument(
347
347
  return data
348
348
 
349
349
 
350
- class DocumentToken(Enum):
351
- """Class to represent an LLM friendly representation of a Document."""
352
-
353
- BEG_DOCUMENT = "<document>"
354
- END_DOCUMENT = "</document>"
355
-
356
- BEG_TITLE = "<title>"
357
- END_TITLE = "</title>"
358
-
359
- BEG_ABSTRACT = "<abstract>"
360
- END_ABSTRACT = "</abstract>"
361
-
362
- BEG_DOI = "<doi>"
363
- END_DOI = "</doi>"
364
- BEG_DATE = "<date>"
365
- END_DATE = "</date>"
366
-
367
- BEG_AUTHORS = "<authors>"
368
- END_AUTHORS = "</authors>"
369
- BEG_AUTHOR = "<author>"
370
- END_AUTHOR = "</author>"
371
-
372
- BEG_AFFILIATIONS = "<affiliations>"
373
- END_AFFILIATIONS = "</affiliations>"
374
- BEG_AFFILIATION = "<affiliation>"
375
- END_AFFILIATION = "</affiliation>"
376
-
377
- BEG_HEADER = "<section-header>"
378
- END_HEADER = "</section-header>"
379
- BEG_TEXT = "<text>"
380
- END_TEXT = "</text>"
381
- BEG_PARAGRAPH = "<paragraph>"
382
- END_PARAGRAPH = "</paragraph>"
383
- BEG_TABLE = "<table>"
384
- END_TABLE = "</table>"
385
- BEG_FIGURE = "<figure>"
386
- END_FIGURE = "</figure>"
387
- BEG_CAPTION = "<caption>"
388
- END_CAPTION = "</caption>"
389
- BEG_EQUATION = "<equation>"
390
- END_EQUATION = "</equation>"
391
- BEG_LIST = "<list>"
392
- END_LIST = "</list>"
393
- BEG_LISTITEM = "<list-item>"
394
- END_LISTITEM = "</list-item>"
395
-
396
- BEG_LOCATION = "<location>"
397
- END_LOCATION = "</location>"
398
- BEG_GROUP = "<group>"
399
- END_GROUP = "</group>"
400
-
401
- @classmethod
402
- def get_special_tokens(
403
- cls,
404
- max_rows: int = 100,
405
- max_cols: int = 100,
406
- max_pages: int = 1000,
407
- page_dimension: Tuple[int, int] = (100, 100),
408
- ):
409
- """Function to get all special document tokens."""
410
- special_tokens = [token.value for token in cls]
411
-
412
- # Adding dynamically generated row and col tokens
413
- for i in range(0, max_rows):
414
- special_tokens += [f"<row_{i}>", f"</row_{i}>"]
415
-
416
- for i in range(0, max_cols):
417
- special_tokens += [f"<col_{i}>", f"</col_{i}>"]
418
-
419
- for i in range(6):
420
- special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
421
-
422
- # Adding dynamically generated page-tokens
423
- for i in range(0, max_pages):
424
- special_tokens.append(f"<page_{i}>")
425
-
426
- # Adding dynamically generated location-tokens
427
- for i in range(0, max(page_dimension[0], page_dimension[1])):
428
- special_tokens.append(f"<loc_{i}>")
429
-
430
- return special_tokens
431
-
432
- @staticmethod
433
- def get_page_token(page: int):
434
- """Function to get page tokens."""
435
- return f"<page_{page}>"
436
-
437
- @staticmethod
438
- def get_location_token(val: float, rnorm: int = 100):
439
- """Function to get location tokens."""
440
- val_ = round(rnorm * val)
441
-
442
- if val_ < 0:
443
- return "<loc_0>"
444
-
445
- if val_ > rnorm:
446
- return f"<loc_{rnorm}>"
447
-
448
- return f"<loc_{val_}>"
449
-
450
-
451
350
  class ExportedCCSDocument(
452
351
  MinimalDocument,
453
352
  Generic[
@@ -525,6 +424,16 @@ class ExportedCCSDocument(
525
424
 
526
425
  return result
527
426
 
427
+ def get_map_to_page_dimensions(self):
428
+ """Get a map from page-index (start at 1) to page-dim [width, height]."""
429
+ pagedims = {}
430
+
431
+ if self.page_dimensions is not None:
432
+ for _ in self.page_dimensions:
433
+ pagedims[_.page] = [_.width, _.height]
434
+
435
+ return pagedims
436
+
528
437
  def export_to_markdown(
529
438
  self,
530
439
  delim: str = "\n\n",
@@ -576,7 +485,7 @@ class ExportedCCSDocument(
576
485
  text = item.text
577
486
 
578
487
  # ignore repeated text
579
- if prev_text == text:
488
+ if prev_text == text or text is None:
580
489
  continue
581
490
  else:
582
491
  prev_text = text
@@ -649,48 +558,32 @@ class ExportedCCSDocument(
649
558
  "table",
650
559
  "figure",
651
560
  ],
652
- page_tagging: bool = True,
653
- location_tagging: bool = True,
654
- location_dimensions: Tuple[int, int] = (100, 100),
655
- add_new_line: bool = True,
561
+ xsize: int = 100,
562
+ ysize: int = 100,
563
+ add_location: bool = True,
564
+ add_content: bool = True,
565
+ add_page_index: bool = True,
566
+ # table specific flags
567
+ add_table_cell_location: bool = False,
568
+ add_table_cell_label: bool = True,
569
+ add_table_cell_text: bool = True,
656
570
  ) -> str:
657
571
  r"""Exports the document content to an DocumentToken format.
658
572
 
659
573
  Operates on a slice of the document's main_text as defined through arguments
660
574
  main_text_start and main_text_stop; defaulting to the whole main_text.
661
575
 
662
- Args:
663
- delim (str, optional): The delimiter used to separate text blocks in the
664
- exported XML. Default is two newline characters ("\n\n").
665
- main_text_start (int, optional): The starting index of the main text to
666
- be included in the XML. Default is 0 (the beginning of the text).
667
- main_text_stop (Optional[int], optional): The stopping index of the main
668
- text. If set to None, the export includes text up to the end.
669
- Default is None.
670
- main_text_labels (list[str], optional): A list of text labels that
671
- categorize the different sections of the document (e.g., "title",
672
- "subtitle-level-1", "paragraph", "caption"). Default labels are
673
- "title", "subtitle-level-1", "paragraph", and "caption".
674
- location_tagging (bool, optional): Determines whether to include
675
- location-based tagging in the XML. If True, the exported XML will
676
- contain information about the locations of the text elements.
677
- Default is True.
678
- location_dimensions (Tuple[int, int], optional): Specifies the dimensions
679
- (width and height) for the location tagging, if enabled.
680
- Default is [100, 100].
681
- add_new_line (bool, optional): Whether to add new line characters after
682
- each text block. If True, a new line is added after each block of
683
- text in the XML. Default is True.
684
-
685
576
  Returns:
686
- str: The content of the document formatted as an XML string.
577
+ str: The content of the document formatted as a DocTags string.
687
578
  """
688
- xml_str = DocumentToken.BEG_DOCUMENT.value
689
-
690
579
  new_line = ""
691
- if add_new_line:
580
+ if delim:
692
581
  new_line = "\n"
693
582
 
583
+ doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
584
+
585
+ # pagedims = self.get_map_to_page_dimensions()
586
+
694
587
  if self.main_text is not None:
695
588
  for orig_item in self.main_text[main_text_start:main_text_stop]:
696
589
 
@@ -705,87 +598,68 @@ class ExportedCCSDocument(
705
598
 
706
599
  prov = item.prov
707
600
 
708
- loc_str = "" # default is zero
601
+ page_i = -1
602
+ page_w = 0.0
603
+ page_h = 0.0
604
+
709
605
  if (
710
- location_tagging
606
+ add_location
711
607
  and self.page_dimensions is not None
712
608
  and prov is not None
713
609
  and len(prov) > 0
714
610
  ):
715
611
 
716
- page = prov[0].page
717
- page_dim = self.page_dimensions[page - 1]
612
+ page_i = prov[0].page
613
+ page_dim = self.page_dimensions[page_i - 1]
718
614
 
719
615
  page_w = float(page_dim.width)
720
616
  page_h = float(page_dim.height)
721
617
 
722
- x0 = float(prov[0].bbox[0]) / float(page_w)
723
- y0 = float(prov[0].bbox[1]) / float(page_h)
724
- x1 = float(prov[0].bbox[2]) / float(page_w)
725
- y1 = float(prov[0].bbox[3]) / float(page_h)
726
-
727
- page_tok = ""
728
- if page_tagging:
729
- page_tok = DocumentToken.get_page_token(page=page)
730
-
731
- x0_tok = DocumentToken.get_location_token(
732
- val=min(x0, x1), rnorm=location_dimensions[0]
733
- )
734
- y0_tok = DocumentToken.get_location_token(
735
- val=min(y0, y1), rnorm=location_dimensions[1]
736
- )
737
- x1_tok = DocumentToken.get_location_token(
738
- val=max(x0, x1), rnorm=location_dimensions[0]
739
- )
740
- y1_tok = DocumentToken.get_location_token(
741
- val=max(y0, y1), rnorm=location_dimensions[1]
742
- )
743
-
744
- # update
745
- loc_str = f"{DocumentToken.BEG_LOCATION.value}"
746
- loc_str += f"{page_tok}"
747
- loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
748
- loc_str += f"{DocumentToken.END_LOCATION.value}"
749
-
750
618
  item_type = item.obj_type
751
619
  if isinstance(item, BaseText) and (item_type in main_text_labels):
752
- text = item.text
753
620
 
754
- xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
621
+ doctags += item.export_to_document_tokens(
622
+ new_line=new_line,
623
+ page_w=page_w,
624
+ page_h=page_h,
625
+ xsize=xsize,
626
+ ysize=ysize,
627
+ add_location=add_location,
628
+ add_content=add_content,
629
+ add_page_index=add_page_index,
630
+ )
755
631
 
756
632
  elif isinstance(item, Table) and (item_type in main_text_labels):
757
633
 
758
- xml_str += f"<{item_type}>{loc_str}"
759
-
760
- if item.text is not None and len(item.text) > 0:
761
- xml_str += f"{DocumentToken.BEG_CAPTION.value}"
762
- xml_str += (
763
- f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
764
- )
765
-
766
- if item.data is not None and len(item.data) > 0:
767
- for i, row in enumerate(item.data):
768
- xml_str += f"<row_{i}>"
769
- for j, col in enumerate(row):
770
- text = col.text
771
- xml_str += f"<col_{j}>{text}</col_{j}>"
772
-
773
- xml_str += f"</row_{i}>{new_line}"
774
-
775
- xml_str += f"</{item_type}>{new_line}"
634
+ doctags += item.export_to_document_tokens(
635
+ new_line=new_line,
636
+ page_w=page_w,
637
+ page_h=page_h,
638
+ xsize=xsize,
639
+ ysize=ysize,
640
+ add_caption=True,
641
+ add_location=add_location,
642
+ add_content=add_content,
643
+ add_cell_location=add_table_cell_location,
644
+ add_cell_label=add_table_cell_label,
645
+ add_cell_text=add_table_cell_text,
646
+ add_page_index=add_page_index,
647
+ )
776
648
 
777
649
  elif isinstance(item, Figure) and (item_type in main_text_labels):
778
650
 
779
- xml_str += f"<{item_type}>{loc_str}"
780
-
781
- if item.text is not None and len(item.text) > 0:
782
- xml_str += f"{DocumentToken.BEG_CAPTION.value}"
783
- xml_str += (
784
- f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
785
- )
786
-
787
- xml_str += f"</{item_type}>{new_line}"
651
+ doctags += item.export_to_document_tokens(
652
+ new_line=new_line,
653
+ page_w=page_w,
654
+ page_h=page_h,
655
+ xsize=xsize,
656
+ ysize=ysize,
657
+ add_caption=True,
658
+ add_location=add_location,
659
+ add_content=add_content,
660
+ add_page_index=add_page_index,
661
+ )
788
662
 
789
- xml_str += DocumentToken.END_DOCUMENT.value
663
+ doctags += DocumentToken.END_DOCUMENT.value
790
664
 
791
- return xml_str
665
+ return doctags
@@ -0,0 +1,202 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Tokens used in the docling document model."""
7
+
8
+ from enum import Enum
9
+ from typing import Annotated, Tuple
10
+
11
+ from pydantic import Field
12
+
13
+
14
+ class TableToken(Enum):
15
+ """Class to represent an LLM friendly representation of a Table."""
16
+
17
+ CELL_LABEL_COLUMN_HEADER = "<column_header>"
18
+ CELL_LABEL_ROW_HEADER = "<row_header>"
19
+ CELL_LABEL_SECTION_HEADERE = "<section_header>"
20
+ CELL_LABEL_DATA = "<data>"
21
+
22
+ OTSL_ECEL = "<ecel>" # empty cell
23
+ OTSL_FCEL = "<fcel>" # cell with content
24
+ OTSL_LCEL = "<lcel>" # left looking cell,
25
+ OTSL_UCEL = "<ucel>" # up looking cell,
26
+ OTSL_XCEL = "<xcel>" # 2d extension cell (cross cell),
27
+ OTSL_NL = "<nl>" # new line,
28
+ OTSL_CHED = "<ched>" # - column header cell,
29
+ OTSL_RHED = "<rhed>" # - row header cell,
30
+ OTSL_SROW = "<srow>" # - section row cell
31
+
32
+ @classmethod
33
+ def get_special_tokens(cls):
34
+ """Function to get all special document tokens."""
35
+ special_tokens = [token.value for token in cls]
36
+ return special_tokens
37
+
38
+ @staticmethod
39
+ def is_known_token(label):
40
+ """Function to check if label is in tokens."""
41
+ return label in TableToken.get_special_tokens()
42
+
43
+
44
+ class DocumentToken(Enum):
45
+ """Class to represent an LLM friendly representation of a Document."""
46
+
47
+ BEG_DOCUMENT = "<document>"
48
+ END_DOCUMENT = "</document>"
49
+
50
+ BEG_TITLE = "<title>"
51
+ END_TITLE = "</title>"
52
+
53
+ BEG_ABSTRACT = "<abstract>"
54
+ END_ABSTRACT = "</abstract>"
55
+
56
+ BEG_DOI = "<doi>"
57
+ END_DOI = "</doi>"
58
+ BEG_DATE = "<date>"
59
+ END_DATE = "</date>"
60
+
61
+ BEG_AUTHORS = "<authors>"
62
+ END_AUTHORS = "</authors>"
63
+ BEG_AUTHOR = "<author>"
64
+ END_AUTHOR = "</author>"
65
+
66
+ BEG_AFFILIATIONS = "<affiliations>"
67
+ END_AFFILIATIONS = "</affiliations>"
68
+ BEG_AFFILIATION = "<affiliation>"
69
+ END_AFFILIATION = "</affiliation>"
70
+
71
+ BEG_HEADER = "<section-header>"
72
+ END_HEADER = "</section-header>"
73
+ BEG_TEXT = "<text>"
74
+ END_TEXT = "</text>"
75
+ BEG_PARAGRAPH = "<paragraph>"
76
+ END_PARAGRAPH = "</paragraph>"
77
+ BEG_TABLE = "<table>"
78
+ END_TABLE = "</table>"
79
+ BEG_FIGURE = "<figure>"
80
+ END_FIGURE = "</figure>"
81
+ BEG_CAPTION = "<caption>"
82
+ END_CAPTION = "</caption>"
83
+ BEG_EQUATION = "<equation>"
84
+ END_EQUATION = "</equation>"
85
+ BEG_LIST = "<list>"
86
+ END_LIST = "</list>"
87
+ BEG_LISTITEM = "<list-item>"
88
+ END_LISTITEM = "</list-item>"
89
+
90
+ BEG_LOCATION = "<location>"
91
+ END_LOCATION = "</location>"
92
+ BEG_GROUP = "<group>"
93
+ END_GROUP = "</group>"
94
+
95
+ @classmethod
96
+ def get_special_tokens(
97
+ cls,
98
+ max_rows: int = 100,
99
+ max_cols: int = 100,
100
+ max_pages: int = 1000,
101
+ page_dimension: Tuple[int, int] = (100, 100),
102
+ ):
103
+ """Function to get all special document tokens."""
104
+ special_tokens = [token.value for token in cls]
105
+
106
+ # Adding dynamically generated row and col tokens
107
+ for i in range(0, max_rows + 1):
108
+ special_tokens += [f"<row_{i}>", f"</row_{i}>"]
109
+
110
+ for i in range(0, max_cols + 1):
111
+ special_tokens += [f"<col_{i}>", f"</col_{i}>"]
112
+
113
+ for i in range(6):
114
+ special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
115
+
116
+ # FIXME: this is synonym of section header
117
+ for i in range(6):
118
+ special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
119
+
120
+ # Adding dynamically generated page-tokens
121
+ for i in range(0, max_pages + 1):
122
+ special_tokens.append(f"<page_{i}>")
123
+ special_tokens.append(f"</page_{i}>")
124
+
125
+ # Adding dynamically generated location-tokens
126
+ for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
127
+ special_tokens.append(f"<loc_{i}>")
128
+
129
+ return special_tokens
130
+
131
+ @staticmethod
132
+ def is_known_token(label):
133
+ """Function to check if label is in tokens."""
134
+ return label in DocumentToken.get_special_tokens()
135
+
136
+ @staticmethod
137
+ def get_row_token(row: int, beg=bool) -> str:
138
+ """Function to get page tokens."""
139
+ if beg:
140
+ return f"<row_{row}>"
141
+ else:
142
+ return f"</row_{row}>"
143
+
144
+ @staticmethod
145
+ def get_col_token(col: int, beg=bool) -> str:
146
+ """Function to get page tokens."""
147
+ if beg:
148
+ return f"<col_{col}>"
149
+ else:
150
+ return f"</col_{col}>"
151
+
152
+ @staticmethod
153
+ def get_page_token(page: int):
154
+ """Function to get page tokens."""
155
+ return f"<page_{page}>"
156
+
157
+ @staticmethod
158
+ def get_location_token(val: float, rnorm: int = 100):
159
+ """Function to get location tokens."""
160
+ val_ = round(rnorm * val)
161
+
162
+ if val_ < 0:
163
+ return "<loc_0>"
164
+
165
+ if val_ > rnorm:
166
+ return f"<loc_{rnorm}>"
167
+
168
+ return f"<loc_{val_}>"
169
+
170
+ @staticmethod
171
+ def get_location(
172
+ # bbox: Tuple[float, float, float, float],
173
+ bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
174
+ page_w: float,
175
+ page_h: float,
176
+ xsize: int = 100,
177
+ ysize: int = 100,
178
+ page_i: int = -1,
179
+ ):
180
+ """Get the location string give bbox and page-dim."""
181
+ assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
182
+ assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"
183
+
184
+ x0 = bbox[0] / page_w
185
+ y0 = bbox[1] / page_h
186
+ x1 = bbox[2] / page_w
187
+ y1 = bbox[3] / page_h
188
+
189
+ page_tok = ""
190
+ if page_i != -1:
191
+ page_tok = DocumentToken.get_page_token(page=page_i)
192
+
193
+ x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
194
+ y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
195
+ x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
196
+ y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
197
+
198
+ loc_str = f"{DocumentToken.BEG_LOCATION.value}"
199
+ loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
200
+ loc_str += f"{DocumentToken.END_LOCATION.value}"
201
+
202
+ return loc_str
@@ -0,0 +1,54 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """File-related utilities."""
7
+
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import Union
11
+
12
+ import requests
13
+ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
14
+
15
+
16
+ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
17
+ """Resolves the source (URL, path) of a file to a local file path.
18
+
19
+ If a URL is provided, the content is first downloaded to a temporary local file.
20
+
21
+ Args:
22
+ source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
23
+
24
+ Raises:
25
+ ValueError: If source is of unexpected type.
26
+
27
+ Returns:
28
+ Path: The local file path.
29
+ """
30
+ try:
31
+ http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
32
+ res = requests.get(http_url, stream=True)
33
+ res.raise_for_status()
34
+ fname = None
35
+ # try to get filename from response header
36
+ if cont_disp := res.headers.get("Content-Disposition"):
37
+ for par in cont_disp.strip().split(";"):
38
+ # currently only handling directive "filename" (not "*filename")
39
+ if (split := par.split("=")) and split[0].strip() == "filename":
40
+ fname = "=".join(split[1:]).strip().strip("'\"") or None
41
+ break
42
+ # otherwise, use name from URL:
43
+ if fname is None:
44
+ fname = Path(http_url.path or "file").name
45
+ local_path = Path(tempfile.mkdtemp()) / fname
46
+ with open(local_path, "wb") as f:
47
+ for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
48
+ f.write(chunk)
49
+ except ValidationError:
50
+ try:
51
+ local_path = TypeAdapter(Path).validate_python(source)
52
+ except ValidationError:
53
+ raise ValueError(f"Unexpected source type encountered: {type(source)}")
54
+ return local_path
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.4.0
3
+ Version: 1.5.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -20,11 +20,12 @@ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=lAeHgJ4relA1EU0YV
20
20
  docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
21
21
  docling_core/types/base.py,sha256=fNtfQ20NKa_RBNBWbq0DfO8o0zC1Cec8UAMu0Znsltk,8170
22
22
  docling_core/types/doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
23
- docling_core/types/doc/base.py,sha256=5qO6E9HZz_6oM2KPfDPyAoZkSrgMFKd4skomS94g9do,8628
23
+ docling_core/types/doc/base.py,sha256=QQC8KzQeYWnHFPY2_BNGcbTp6J2_rPbnLjsnbehICno,14710
24
24
  docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
25
25
  docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
26
26
  docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
27
- docling_core/types/doc/document.py,sha256=6puIPc3aK3kecklCFqVgJXgU4gvGPqjuXePx82poFYE,26934
27
+ docling_core/types/doc/document.py,sha256=FdAAyYfYnKXi3kqt0Qk2NYFAxsuGbHsNVB1EDIRVH3Y,22136
28
+ docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
28
29
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
29
30
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
30
31
  docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
@@ -41,10 +42,11 @@ docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iq
41
42
  docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
42
43
  docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0SYhFa6X7bQ,4248
43
44
  docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
45
+ docling_core/utils/file.py,sha256=VQgzjyvmJnAIHB6ex7ikcmbDAR4GA1ALreuO7Ubrp50,1895
44
46
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
45
47
  docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
46
- docling_core-1.4.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
47
- docling_core-1.4.0.dist-info/METADATA,sha256=38Mr22u1ht8OoUDwYIFc-Y43v1bs8irQqmS0b6ecXpQ,5432
48
- docling_core-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
49
- docling_core-1.4.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
50
- docling_core-1.4.0.dist-info/RECORD,,
48
+ docling_core-1.5.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
49
+ docling_core-1.5.0.dist-info/METADATA,sha256=Z5kzVlogRs8FQN_ummLsikFOrf9BBr-rR-ESfwLvwHs,5432
50
+ docling_core-1.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
51
+ docling_core-1.5.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
52
+ docling_core-1.5.0.dist-info/RECORD,,