docling 1.12.2__tar.gz → 1.13.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {docling-1.12.2 → docling-1.13.1}/PKG-INFO +5 -3
  2. {docling-1.12.2 → docling-1.13.1}/README.md +2 -0
  3. {docling-1.12.2 → docling-1.13.1}/docling/datamodel/document.py +18 -8
  4. {docling-1.12.2 → docling-1.13.1}/docling/utils/export.py +2 -63
  5. {docling-1.12.2 → docling-1.13.1}/pyproject.toml +3 -3
  6. {docling-1.12.2 → docling-1.13.1}/LICENSE +0 -0
  7. {docling-1.12.2 → docling-1.13.1}/docling/__init__.py +0 -0
  8. {docling-1.12.2 → docling-1.13.1}/docling/backend/__init__.py +0 -0
  9. {docling-1.12.2 → docling-1.13.1}/docling/backend/abstract_backend.py +0 -0
  10. {docling-1.12.2 → docling-1.13.1}/docling/backend/docling_parse_backend.py +0 -0
  11. {docling-1.12.2 → docling-1.13.1}/docling/backend/pypdfium2_backend.py +0 -0
  12. {docling-1.12.2 → docling-1.13.1}/docling/cli/__init__.py +0 -0
  13. {docling-1.12.2 → docling-1.13.1}/docling/cli/main.py +0 -0
  14. {docling-1.12.2 → docling-1.13.1}/docling/datamodel/__init__.py +0 -0
  15. {docling-1.12.2 → docling-1.13.1}/docling/datamodel/base_models.py +0 -0
  16. {docling-1.12.2 → docling-1.13.1}/docling/datamodel/settings.py +0 -0
  17. {docling-1.12.2 → docling-1.13.1}/docling/document_converter.py +0 -0
  18. {docling-1.12.2 → docling-1.13.1}/docling/models/__init__.py +0 -0
  19. {docling-1.12.2 → docling-1.13.1}/docling/models/base_ocr_model.py +0 -0
  20. {docling-1.12.2 → docling-1.13.1}/docling/models/ds_glm_model.py +0 -0
  21. {docling-1.12.2 → docling-1.13.1}/docling/models/easyocr_model.py +0 -0
  22. {docling-1.12.2 → docling-1.13.1}/docling/models/layout_model.py +0 -0
  23. {docling-1.12.2 → docling-1.13.1}/docling/models/page_assemble_model.py +0 -0
  24. {docling-1.12.2 → docling-1.13.1}/docling/models/table_structure_model.py +0 -0
  25. {docling-1.12.2 → docling-1.13.1}/docling/pipeline/__init__.py +0 -0
  26. {docling-1.12.2 → docling-1.13.1}/docling/pipeline/base_model_pipeline.py +0 -0
  27. {docling-1.12.2 → docling-1.13.1}/docling/pipeline/standard_model_pipeline.py +0 -0
  28. {docling-1.12.2 → docling-1.13.1}/docling/utils/__init__.py +0 -0
  29. {docling-1.12.2 → docling-1.13.1}/docling/utils/layout_utils.py +0 -0
  30. {docling-1.12.2 → docling-1.13.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.12.2
3
+ Version: 1.13.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -21,8 +21,8 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: examples
23
23
  Requires-Dist: certifi (>=2024.7.4)
24
- Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
25
- Requires-Dist: docling-core (>=1.3.0,<2.0.0)
24
+ Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
25
+ Requires-Dist: docling-core (>=1.5.0,<2.0.0)
26
26
  Requires-Dist: docling-ibm-models (>=1.2.0,<2.0.0)
27
27
  Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
28
28
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -122,7 +122,9 @@ from docling.document_converter import DocumentConverter
122
122
  source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
123
123
  converter = DocumentConverter()
124
124
  result = converter.convert_single(source)
125
+
125
126
  print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
127
+ print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
126
128
  ```
127
129
 
128
130
  ### Convert a batch of documents
@@ -70,7 +70,9 @@ from docling.document_converter import DocumentConverter
70
70
  source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
71
71
  converter = DocumentConverter()
72
72
  result = converter.convert_single(source)
73
+
73
74
  print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
75
+ print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
74
76
  ```
75
77
 
76
78
  ### Convert a batch of documents
@@ -368,20 +368,30 @@ class ConvertedDocument(BaseModel):
368
368
  "table",
369
369
  "figure",
370
370
  ],
371
- page_tagging: bool = True,
372
- location_tagging: bool = True,
373
- location_dimensions: Tuple[int, int] = (100, 100),
374
- add_new_line: bool = True,
371
+ xsize: int = 100,
372
+ ysize: int = 100,
373
+ add_location: bool = True,
374
+ add_content: bool = True,
375
+ add_page_index: bool = True,
376
+ # table specific flags
377
+ add_table_cell_location: bool = False,
378
+ add_table_cell_label: bool = True,
379
+ add_table_cell_text: bool = True,
375
380
  ) -> str:
376
381
  return self.output.export_to_document_tokens(
377
382
  delim=delim,
378
383
  main_text_start=main_text_start,
379
384
  main_text_stop=main_text_stop,
380
385
  main_text_labels=main_text_labels,
381
- page_tagging=page_tagging,
382
- location_tagging=location_tagging,
383
- location_dimensions=location_dimensions,
384
- add_new_line=add_new_line,
386
+ xsize=xsize,
387
+ ysize=ysize,
388
+ add_location=add_location,
389
+ add_content=add_content,
390
+ add_page_index=add_page_index,
391
+ # table specific flags
392
+ add_table_cell_location=add_table_cell_location,
393
+ add_table_cell_label=add_table_cell_label,
394
+ add_table_cell_text=add_table_cell_text,
385
395
  )
386
396
 
387
397
  def render_element_images(
@@ -9,67 +9,6 @@ from docling.datamodel.document import ConversionResult, Page
9
9
  _log = logging.getLogger(__name__)
10
10
 
11
11
 
12
- def _export_table_to_html(table: Table):
13
-
14
- # TODO: this is flagged as internal, because we will move it
15
- # to the docling-core package.
16
-
17
- def _get_tablecell_span(cell: TableCell, ix):
18
- if cell.spans is None:
19
- span = set()
20
- else:
21
- span = set([s[ix] for s in cell.spans])
22
- if len(span) == 0:
23
- return 1, None, None
24
- return len(span), min(span), max(span)
25
-
26
- body = ""
27
- nrows = table.num_rows
28
- ncols = table.num_cols
29
-
30
- if table.data is None:
31
- return ""
32
- for i in range(nrows):
33
- body += "<tr>"
34
- for j in range(ncols):
35
- cell: TableCell = table.data[i][j]
36
-
37
- rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
38
- colspan, colstart, colend = _get_tablecell_span(cell, 1)
39
-
40
- if rowstart is not None and rowstart != i:
41
- continue
42
- if colstart is not None and colstart != j:
43
- continue
44
-
45
- if rowstart is None:
46
- rowstart = i
47
- if colstart is None:
48
- colstart = j
49
-
50
- content = cell.text.strip()
51
- label = cell.obj_type
52
- label_class = "body"
53
- celltag = "td"
54
- if label in ["row_header", "row_multi_header", "row_title"]:
55
- label_class = "header"
56
- elif label in ["col_header", "col_multi_header"]:
57
- label_class = "header"
58
- celltag = "th"
59
-
60
- opening_tag = f"{celltag}"
61
- if rowspan > 1:
62
- opening_tag += f' rowspan="{rowspan}"'
63
- if colspan > 1:
64
- opening_tag += f' colspan="{colspan}"'
65
-
66
- body += f"<{opening_tag}>{content}</{celltag}>"
67
- body += "</tr>"
68
- body = f"<table>{body}</table>"
69
-
70
- return body
71
-
72
-
73
12
  def generate_multimodal_pages(
74
13
  doc_result: ConversionResult,
75
14
  ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
@@ -129,7 +68,7 @@ def generate_multimodal_pages(
129
68
  }
130
69
 
131
70
  if isinstance(item, Table):
132
- table_html = _export_table_to_html(item)
71
+ table_html = item.export_to_html()
133
72
  new_segment["data"].append(
134
73
  {
135
74
  "html_seq": table_html,
@@ -172,7 +111,7 @@ def generate_multimodal_pages(
172
111
  )
173
112
  # No page-tagging since we only do 1 page at the time
174
113
  content_dt = doc.export_to_document_tokens(
175
- main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
114
+ main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
176
115
  )
177
116
 
178
117
  return content_text, content_md, content_dt, page_cells, page_segments, page
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.12.2" # DO NOT EDIT, updated automatically
3
+ version = "1.13.1" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -23,9 +23,9 @@ packages = [{include = "docling"}]
23
23
  [tool.poetry.dependencies]
24
24
  python = "^3.10"
25
25
  pydantic = "^2.0.0"
26
- docling-core = "^1.3.0"
26
+ docling-core = "^1.5.0"
27
27
  docling-ibm-models = "^1.2.0"
28
- deepsearch-glm = "^0.21.0"
28
+ deepsearch-glm = "^0.21.1"
29
29
  filetype = "^1.2.0"
30
30
  pypdfium2 = "^4.30.0"
31
31
  pydantic-settings = "^2.3.0"
File without changes
File without changes
File without changes