docling-core 2.21.0__tar.gz → 2.21.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62)
  1. {docling_core-2.21.0 → docling_core-2.21.2}/PKG-INFO +1 -1
  2. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/transforms/chunker/hybrid_chunker.py +2 -2
  3. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/doc/document.py +24 -9
  4. {docling_core-2.21.0 → docling_core-2.21.2}/pyproject.toml +1 -1
  5. {docling_core-2.21.0 → docling_core-2.21.2}/LICENSE +0 -0
  6. {docling_core-2.21.0 → docling_core-2.21.2}/README.md +0 -0
  7. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/__init__.py +0 -0
  8. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/cli/__init__.py +0 -0
  9. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/cli/view.py +0 -0
  10. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/py.typed +0 -0
  11. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
  12. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
  13. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  14. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
  15. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  16. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  17. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  18. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  19. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/search/__init__.py +0 -0
  20. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  21. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/search/mapping.py +0 -0
  22. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/search/meta.py +0 -0
  23. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/search/package.py +0 -0
  24. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/transforms/__init__.py +0 -0
  25. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/transforms/chunker/__init__.py +0 -0
  26. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/transforms/chunker/base.py +0 -0
  27. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  28. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/__init__.py +0 -0
  29. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/base.py +0 -0
  30. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/doc/__init__.py +0 -0
  31. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/doc/base.py +0 -0
  32. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/doc/labels.py +0 -0
  33. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/doc/tokens.py +0 -0
  34. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.21.0 → docling_core-2.21.2}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.21.0
3
+ Version: 2.21.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -73,7 +73,7 @@ class HybridChunker(BaseChunker):
73
73
  for t in text:
74
74
  total += self._count_text_tokens(t)
75
75
  return total
76
- return len(self._tokenizer.tokenize(text, max_length=None))
76
+ return len(self._tokenizer.tokenize(text))
77
77
 
78
78
  class _ChunkLengthInfo(BaseModel):
79
79
  total_len: int
@@ -82,7 +82,7 @@ class HybridChunker(BaseChunker):
82
82
 
83
83
  def _count_chunk_tokens(self, doc_chunk: DocChunk):
84
84
  ser_txt = self.serialize(chunk=doc_chunk)
85
- return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
85
+ return len(self._tokenizer.tokenize(text=ser_txt))
86
86
 
87
87
  def _doc_chunk_length(self, doc_chunk: DocChunk):
88
88
  text_length = self._count_text_tokens(doc_chunk.text)
@@ -800,7 +800,7 @@ class CodeItem(FloatingItem, TextItem):
800
800
  :param add_content: bool: (Default value = True)
801
801
 
802
802
  """
803
- body = f"<{self.label.value}{new_line}"
803
+ body = f"<{self.label.value}>{new_line}"
804
804
 
805
805
  if add_location:
806
806
  body += self.get_location_tokens(
@@ -813,7 +813,7 @@ class CodeItem(FloatingItem, TextItem):
813
813
  if add_content and self.text is not None:
814
814
  body += f"<_{self.code_language.value}_>{self.text}{new_line}"
815
815
 
816
- body += f"</{self.label.value}\n"
816
+ body += f"</{self.label.value}>\n"
817
817
 
818
818
  return body
819
819
 
@@ -2487,16 +2487,25 @@ class DoclingDocument(BaseModel):
2487
2487
  is_inline_scope=is_inline_scope,
2488
2488
  visited=visited,
2489
2489
  )
2490
- # NOTE: assumes unordered (flag & marker currently in ListItem)
2491
2490
  indent_str = list_level * indent * " "
2491
+ is_ol = item.label == GroupLabel.ORDERED_LIST
2492
2492
  text = "\n".join(
2493
2493
  [
2494
2494
  # avoid additional marker on already evaled sublists
2495
- cpt if cpt and cpt[0] == " " else f"{indent_str}- {cpt}"
2496
- for cpt in comps
2495
+ (
2496
+ c
2497
+ if c and c[0] == " "
2498
+ else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c}"
2499
+ )
2500
+ for i, c in enumerate(comps)
2497
2501
  ]
2498
2502
  )
2499
- _ingest_text(text=text)
2503
+ _ingest_text(
2504
+ text=text,
2505
+ # special chars have already been escaped as needed
2506
+ do_escape_html=False,
2507
+ do_escape_underscores=False,
2508
+ )
2500
2509
  elif item.label == GroupLabel.INLINE:
2501
2510
  comps = self._get_markdown_components(
2502
2511
  node=item,
@@ -2515,7 +2524,13 @@ class DoclingDocument(BaseModel):
2515
2524
  is_inline_scope=True,
2516
2525
  visited=visited,
2517
2526
  )
2518
- _ingest_text(" ".join(comps))
2527
+ text = " ".join(comps)
2528
+ _ingest_text(
2529
+ text=text,
2530
+ # special chars have already been escaped as needed
2531
+ do_escape_html=False,
2532
+ do_escape_underscores=False,
2533
+ )
2519
2534
  else:
2520
2535
  continue
2521
2536
 
@@ -2833,7 +2848,7 @@ class DoclingDocument(BaseModel):
2833
2848
 
2834
2849
  # Building a math equation in MathML format
2835
2850
  # ref https://www.w3.org/TR/wai-aria-1.1/#math
2836
- elif formula_to_mathml:
2851
+ elif formula_to_mathml and len(math_formula) > 0:
2837
2852
  try:
2838
2853
  mathml_element = latex2mathml.converter.convert_to_element(
2839
2854
  math_formula, display="block"
@@ -2855,7 +2870,7 @@ class DoclingDocument(BaseModel):
2855
2870
  and img_fallback is not None
2856
2871
  ):
2857
2872
  text = img_fallback
2858
- elif len(math_formula) > 0:
2873
+ else:
2859
2874
  text = f"<pre>{math_formula}</pre>"
2860
2875
 
2861
2876
  elif math_formula != "":
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.21.0"
3
+ version = "2.21.2"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes