docling-core 2.21.1__tar.gz → 2.21.2__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62)
  1. {docling_core-2.21.1 → docling_core-2.21.2}/PKG-INFO +1 -1
  2. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/transforms/chunker/hybrid_chunker.py +2 -2
  3. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/doc/document.py +17 -7
  4. {docling_core-2.21.1 → docling_core-2.21.2}/pyproject.toml +1 -1
  5. {docling_core-2.21.1 → docling_core-2.21.2}/LICENSE +0 -0
  6. {docling_core-2.21.1 → docling_core-2.21.2}/README.md +0 -0
  7. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/__init__.py +0 -0
  8. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/cli/__init__.py +0 -0
  9. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/cli/view.py +0 -0
  10. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/py.typed +0 -0
  11. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
  12. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
  13. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  14. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
  15. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  16. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  17. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  18. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  19. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/search/__init__.py +0 -0
  20. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  21. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/search/mapping.py +0 -0
  22. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/search/meta.py +0 -0
  23. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/search/package.py +0 -0
  24. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/transforms/__init__.py +0 -0
  25. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/transforms/chunker/__init__.py +0 -0
  26. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/transforms/chunker/base.py +0 -0
  27. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  28. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/__init__.py +0 -0
  29. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/base.py +0 -0
  30. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/doc/__init__.py +0 -0
  31. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/doc/base.py +0 -0
  32. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/doc/labels.py +0 -0
  33. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/doc/tokens.py +0 -0
  34. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.21.1 → docling_core-2.21.2}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.21.1
3
+ Version: 2.21.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -73,7 +73,7 @@ class HybridChunker(BaseChunker):
73
73
  for t in text:
74
74
  total += self._count_text_tokens(t)
75
75
  return total
76
- return len(self._tokenizer.tokenize(text, max_length=None))
76
+ return len(self._tokenizer.tokenize(text))
77
77
 
78
78
  class _ChunkLengthInfo(BaseModel):
79
79
  total_len: int
@@ -82,7 +82,7 @@ class HybridChunker(BaseChunker):
82
82
 
83
83
  def _count_chunk_tokens(self, doc_chunk: DocChunk):
84
84
  ser_txt = self.serialize(chunk=doc_chunk)
85
- return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
85
+ return len(self._tokenizer.tokenize(text=ser_txt))
86
86
 
87
87
  def _doc_chunk_length(self, doc_chunk: DocChunk):
88
88
  text_length = self._count_text_tokens(doc_chunk.text)
@@ -800,7 +800,7 @@ class CodeItem(FloatingItem, TextItem):
800
800
  :param add_content: bool: (Default value = True)
801
801
 
802
802
  """
803
- body = f"<{self.label.value}{new_line}"
803
+ body = f"<{self.label.value}>{new_line}"
804
804
 
805
805
  if add_location:
806
806
  body += self.get_location_tokens(
@@ -813,7 +813,7 @@ class CodeItem(FloatingItem, TextItem):
813
813
  if add_content and self.text is not None:
814
814
  body += f"<_{self.code_language.value}_>{self.text}{new_line}"
815
815
 
816
- body += f"</{self.label.value}\n"
816
+ body += f"</{self.label.value}>\n"
817
817
 
818
818
  return body
819
819
 
@@ -2487,7 +2487,6 @@ class DoclingDocument(BaseModel):
2487
2487
  is_inline_scope=is_inline_scope,
2488
2488
  visited=visited,
2489
2489
  )
2490
- # NOTE: assumes unordered (flag & marker currently in ListItem)
2491
2490
  indent_str = list_level * indent * " "
2492
2491
  is_ol = item.label == GroupLabel.ORDERED_LIST
2493
2492
  text = "\n".join(
@@ -2501,7 +2500,12 @@ class DoclingDocument(BaseModel):
2501
2500
  for i, c in enumerate(comps)
2502
2501
  ]
2503
2502
  )
2504
- _ingest_text(text=text)
2503
+ _ingest_text(
2504
+ text=text,
2505
+ # special chars have already been escaped as needed
2506
+ do_escape_html=False,
2507
+ do_escape_underscores=False,
2508
+ )
2505
2509
  elif item.label == GroupLabel.INLINE:
2506
2510
  comps = self._get_markdown_components(
2507
2511
  node=item,
@@ -2520,7 +2524,13 @@ class DoclingDocument(BaseModel):
2520
2524
  is_inline_scope=True,
2521
2525
  visited=visited,
2522
2526
  )
2523
- _ingest_text(" ".join(comps))
2527
+ text = " ".join(comps)
2528
+ _ingest_text(
2529
+ text=text,
2530
+ # special chars have already been escaped as needed
2531
+ do_escape_html=False,
2532
+ do_escape_underscores=False,
2533
+ )
2524
2534
  else:
2525
2535
  continue
2526
2536
 
@@ -2838,7 +2848,7 @@ class DoclingDocument(BaseModel):
2838
2848
 
2839
2849
  # Building a math equation in MathML format
2840
2850
  # ref https://www.w3.org/TR/wai-aria-1.1/#math
2841
- elif formula_to_mathml:
2851
+ elif formula_to_mathml and len(math_formula) > 0:
2842
2852
  try:
2843
2853
  mathml_element = latex2mathml.converter.convert_to_element(
2844
2854
  math_formula, display="block"
@@ -2860,7 +2870,7 @@ class DoclingDocument(BaseModel):
2860
2870
  and img_fallback is not None
2861
2871
  ):
2862
2872
  text = img_fallback
2863
- elif len(math_formula) > 0:
2873
+ else:
2864
2874
  text = f"<pre>{math_formula}</pre>"
2865
2875
 
2866
2876
  elif math_formula != "":
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.21.1"
3
+ version = "2.21.2"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes