dataknobs-xization 1.0.1-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.


@@ -1,11 +1,34 @@
 """Text normalization and tokenization tools."""
 
-from dataknobs_xization import annotations, authorities, lexicon, masking_tokenizer, normalize
+from dataknobs_xization import (
+    annotations,
+    authorities,
+    lexicon,
+    markdown,
+    masking_tokenizer,
+    normalize,
+)
+from dataknobs_xization.markdown import (
+    AdaptiveStreamingProcessor,
+    Chunk,
+    ChunkFormat,
+    ChunkMetadata,
+    HeadingInclusion,
+    MarkdownChunker,
+    MarkdownNode,
+    MarkdownParser,
+    StreamingMarkdownProcessor,
+    chunk_markdown_tree,
+    parse_markdown,
+    stream_markdown_file,
+    stream_markdown_string,
+)
 from dataknobs_xization.masking_tokenizer import CharacterFeatures, TextFeatures
 
 __version__ = "1.0.0"
 
 __all__ = [
+    # Existing exports
     "CharacterFeatures",
     "TextFeatures",
     "annotations",
@@ -13,4 +36,20 @@ __all__ = [
     "lexicon",
     "masking_tokenizer",
     "normalize",
+    # Markdown module
+    "markdown",
+    # Markdown chunking classes and functions
+    "AdaptiveStreamingProcessor",
+    "Chunk",
+    "ChunkFormat",
+    "ChunkMetadata",
+    "HeadingInclusion",
+    "MarkdownChunker",
+    "MarkdownNode",
+    "MarkdownParser",
+    "StreamingMarkdownProcessor",
+    "chunk_markdown_tree",
+    "parse_markdown",
+    "stream_markdown_file",
+    "stream_markdown_string",
 ]
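The hunks above widen the package's top-level import surface: everything the new `markdown` subpackage exports is now also re-exported from `dataknobs_xization` itself. A minimal sketch of what this enables (assuming the 1.1.0 wheel is installed):

```python
# Top-level and subpackage imports now resolve to the same objects.
from dataknobs_xization import MarkdownChunker, chunk_markdown_tree
from dataknobs_xization.markdown import MarkdownChunker as SubpackageChunker

assert MarkdownChunker is SubpackageChunker
```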
@@ -237,7 +237,7 @@ class Annotations:
         if self._df is not None:
             alist = self._df.to_dict(orient="records")
             self._df = None
-        return alist if alist is not None else list()
+        return alist if alist is not None else []
 
     def _build_df(self) -> pd.DataFrame:
         """Get the annotations as a df."""
@@ -303,7 +303,7 @@ class AnnotationsBuilder:
         :param key_fields: The dictionary of key fields
         :param kwargs: Any extra fields to add
         """
-        result = dict()
+        result = {}
         result.update(key_fields)
         if self.data_defaults is not None:
             # Add data_defaults
@@ -392,7 +392,7 @@ class AnnotationsGroup:
         :param autolock: True to automatically lock this group when (1) at
             least one row has been added and (2) a row is rejected.
         """
-        self.rows = list()  # List[RowData]
+        self.rows = []  # List[RowData]
         self.row_accessor = row_accessor
         self.field_col_type = field_col_type
         self.accept_fn = accept_fn
@@ -732,17 +732,17 @@ class AnnotationsGroupList:
     def __init__(
         self,
        groups: List[AnnotationsGroup] = None,
-        accept_fn: Callable[["AnnotationsGroupList", AnnotationsGroup], bool] = lambda l, g: l.size
+        accept_fn: Callable[["AnnotationsGroupList", AnnotationsGroup], bool] = lambda lst, g: lst.size
         == 0
-        or not g.is_subset_of_any(l.groups),
+        or not g.is_subset_of_any(lst.groups),
     ):
         """:param groups: The initial groups for this list
-        :param accept_fn: A fn(l, g) that returns True to accept the group, g,
-            into this list, l, or False to reject the group. If None, then all
+        :param accept_fn: A fn(lst, g) that returns True to accept the group, g,
+            into this list, lst, or False to reject the group. If None, then all
             groups are always accepted. The default function will reject any
             group that is a subset of any existing group in the list.
         """
-        self.groups = groups if groups is not None else list()
+        self.groups = groups if groups is not None else []
         self.accept_fn = accept_fn
         self._coverage = None
 
@@ -838,7 +838,7 @@ class AnnotatedText(dk_doc.Text):
     def bookmarks(self) -> Dict[str, pd.DataFrame]:
         """Get this object's bookmarks"""
         if self._bookmarks is None:
-            self._bookmarks = dict()
+            self._bookmarks = {}
         return self._bookmarks
 
     def get_text(
@@ -1134,13 +1134,14 @@ class EntityAnnotator(BasicAnnotator):
         :param largest_only: True to only mark largest records.
         :return: The annotations added to the text object
         """
-        annot2mask = (
-            None
-            if annot_mask_cols is None
-            else {  # TODO: Use this?!
-                col: self.mask_char for col in annot_mask_cols
-            }
-        )
+        # TODO: Use annot_mask_cols to mask annotations
+        # annot2mask = (
+        #     None
+        #     if annot_mask_cols is None
+        #     else {
+        #         col: self.mask_char for col in annot_mask_cols
+        #     }
+        # )
 
         annots = self.annotate_text(text_obj.text)
         if annots is None:
@@ -653,7 +653,7 @@ class RegexAuthority(Authority):
         :return: The added Annotations
         """
         for match in re.finditer(self.regex, text_obj.text):
-            ann_dicts = list()
+            ann_dicts = []
             if match.lastindex is not None:
                 if len(self.regex.groupindex) > 0:  # we have named groups
                     for group_name, group_num in self.regex.groupindex.items():
@@ -735,7 +735,7 @@ class AuthoritiesBundle(Authority):
             anns_validator=anns_validator,
             parent_auth=parent_auth,
         )
-        self.auths = auths.copy() if auths is not None else list()
+        self.auths = auths.copy() if auths is not None else []
 
     def add(self, auth: Authority):
         """Add the authority to this bundle
@@ -56,7 +56,7 @@ class LexicalExpander:
         variations = {self.normalize_fn(v) for v in variations}
         # Add a mapping from each variation to its original term
         if variations is not None and len(variations) > 0:
-            more_itertools.consume(map(lambda v: self.v2t[v].add(term), variations))
+            more_itertools.consume(self.v2t[v].add(term) for v in variations)
         return variations
 
     def normalize(self, input_term: str) -> str:
@@ -92,7 +92,7 @@ class TokenMatch:
 
         self.varparts = var.split()
         self.matches = True
-        self.tokens = list()
+        self.tokens = []
         t = token
         for v in self.varparts:
             if t is not None and v == t.norm_text:
@@ -133,7 +133,7 @@ class TokenAligner:
     def __init__(self, first_token: dk_tok.Token, authority: dk_auth.LexicalAuthority):
        self.first_token = first_token
         self.auth = authority
-        self.annotations = list()  # List[Dict[str, Any]]
+        self.annotations = []  # List[Dict[str, Any]]
         self._processed_idx = set()
         self._process(self.first_token)
 
@@ -147,7 +147,7 @@ class TokenAligner:
             self._process(token.next_token)
 
     def _get_token_matches(self, token):
-        token_matches = list()
+        token_matches = []
         vs = self.auth.find_variations(token.norm_text, starts_with=True)
         if len(vs) > 0:
             for val_idx, var in vs.items():
@@ -169,7 +169,7 @@ class DataframeAuthority(dk_auth.LexicalAuthority):
         authdata: dk_auth.AuthorityData,
         auth_anns_builder: dk_auth.AuthorityAnnotationsBuilder = None,
         field_groups: dk_auth.DerivedFieldGroups = None,
-        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
+        anns_validator: Callable[[dk_auth.Authority, Dict[str, Any]], bool] = None,
         parent_auth: dk_auth.Authority = None,
     ):
         """Initialize with the name, values, and associated ids of the authority;
@@ -351,7 +351,7 @@ class CorrelatedAuthorityData(dk_auth.AuthorityData):
 
     def __init__(self, df: pd.DataFrame, name: str):
         super().__init__(df, name)
-        self._authority_data = dict()
+        self._authority_data = {}
 
     def sub_authority_names(self) -> List[str]:
         """Get the "sub" authority names."""
@@ -406,7 +406,7 @@ class MultiAuthorityData(CorrelatedAuthorityData):
 
     def __init__(self, df: pd.DataFrame, name: str):
         super().__init__(df, name)
-        self._authority_data = dict()
+        self._authority_data = {}
 
     @abstractmethod
     def build_authority_data(self, name: str) -> dk_auth.AuthorityData:
@@ -0,0 +1,44 @@
+"""Markdown chunking utilities for RAG applications.
+
+This module provides comprehensive utilities for parsing and chunking markdown
+documents while preserving semantic structure and heading hierarchy.
+"""
+
+from dataknobs_xization.markdown.md_chunker import (
+    Chunk,
+    ChunkFormat,
+    ChunkMetadata,
+    HeadingInclusion,
+    MarkdownChunker,
+    chunk_markdown_tree,
+)
+from dataknobs_xization.markdown.md_parser import (
+    MarkdownNode,
+    MarkdownParser,
+    parse_markdown,
+)
+from dataknobs_xization.markdown.md_streaming import (
+    AdaptiveStreamingProcessor,
+    StreamingMarkdownProcessor,
+    stream_markdown_file,
+    stream_markdown_string,
+)
+
+__all__ = [
+    # Parser
+    "MarkdownNode",
+    "MarkdownParser",
+    "parse_markdown",
+    # Chunker
+    "Chunk",
+    "ChunkFormat",
+    "ChunkMetadata",
+    "HeadingInclusion",
+    "MarkdownChunker",
+    "chunk_markdown_tree",
+    # Streaming
+    "AdaptiveStreamingProcessor",
+    "StreamingMarkdownProcessor",
+    "stream_markdown_file",
+    "stream_markdown_string",
+]
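This new subpackage `__init__` stitches together three modules: `md_parser` (tree construction), `md_chunker` (chunk generation, shown in full below), and `md_streaming` (incremental processing). A minimal sketch of the parse step, assuming `parse_markdown` returns the `dataknobs_structures` `Tree` that the chunker's docstrings say it consumes; the sample document is invented for illustration:

```python
from dataknobs_xization.markdown import parse_markdown

doc = """# Guide

## Install

Run the installer, then verify the version.
"""

# Assumption: the returned Tree carries MarkdownNode data on each node
# (heading level, text, source line number), per md_chunker's usage.
tree = parse_markdown(doc)
```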
@@ -0,0 +1,429 @@
+"""Markdown chunker for generating RAG-optimized chunks from tree structures.
+
+This module provides functionality to traverse markdown tree structures and
+generate chunks suitable for RAG (Retrieval-Augmented Generation) applications.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Iterator
+
+from dataknobs_structures.tree import Tree
+
+from dataknobs_xization.markdown.md_parser import MarkdownNode
+
+
+class ChunkFormat(Enum):
+    """Output format for chunk text."""
+
+    MARKDOWN = "markdown"  # Include headings as markdown
+    PLAIN = "plain"  # Plain text without markdown formatting
+    DICT = "dict"  # Return as dictionary
+
+
+class HeadingInclusion(Enum):
+    """Strategy for including headings in chunks."""
+
+    IN_TEXT = "in_text"  # Include headings in chunk text
+    IN_METADATA = "in_metadata"  # Include headings only in metadata
+    BOTH = "both"  # Include in both text and metadata
+    NONE = "none"  # Don't include headings
+
+
+@dataclass
+class ChunkMetadata:
+    """Metadata for a document chunk.
+
+    Attributes:
+        headings: List of heading texts from root to chunk
+        heading_levels: List of heading levels corresponding to headings
+        line_number: Starting line number in source document
+        chunk_index: Index of this chunk in the sequence
+        chunk_size: Size of chunk text in characters
+        custom: Additional custom metadata
+    """
+
+    headings: list[str] = field(default_factory=list)
+    heading_levels: list[int] = field(default_factory=list)
+    line_number: int = 0
+    chunk_index: int = 0
+    chunk_size: int = 0
+    custom: dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert metadata to dictionary."""
+        return {
+            "headings": self.headings,
+            "heading_levels": self.heading_levels,
+            "line_number": self.line_number,
+            "chunk_index": self.chunk_index,
+            "chunk_size": self.chunk_size,
+            **self.custom,
+        }
+
+    def get_heading_path(self, separator: str = " > ") -> str:
+        """Get heading hierarchy as a single string.
+
+        Args:
+            separator: String to use between headings
+
+        Returns:
+            Formatted heading path
+        """
+        return separator.join(self.headings)
+
+
+@dataclass
+class Chunk:
+    """A chunk of text with associated metadata.
+
+    Attributes:
+        text: The chunk text content
+        metadata: Metadata for this chunk
+    """
+
+    text: str
+    metadata: ChunkMetadata
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert chunk to dictionary representation."""
+        return {
+            "text": self.text,
+            "metadata": self.metadata.to_dict(),
+        }
+
+    def to_markdown(self, include_headings: bool = True) -> str:
+        """Convert chunk to markdown format.
+
+        Args:
+            include_headings: Whether to include heading hierarchy
+
+        Returns:
+            Markdown-formatted string
+        """
+        if not include_headings or not self.metadata.headings:
+            return self.text
+
+        # Build heading hierarchy
+        lines = []
+        for heading, level in zip(
+            self.metadata.headings, self.metadata.heading_levels
+        ):
+            lines.append(f"{'#' * level} {heading}")
+
+        # Add body text
+        if self.text:
+            lines.append("")
+            lines.append(self.text)
+
+        return "\n".join(lines)
+
+
+class MarkdownChunker:
+    """Chunker for generating chunks from markdown tree structures.
+
+    Traverses a Tree built from markdown and generates chunks with
+    configurable size, heading inclusion, and output format.
+    """
+
+    def __init__(
+        self,
+        max_chunk_size: int = 1000,
+        chunk_overlap: int = 100,
+        heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+        chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+        combine_under_heading: bool = True,
+    ):
+        """Initialize the markdown chunker.
+
+        Args:
+            max_chunk_size: Maximum size of chunk text in characters
+            chunk_overlap: Number of characters to overlap between chunks
+            heading_inclusion: How to include headings in chunks
+            chunk_format: Output format for chunks
+            combine_under_heading: Whether to combine body text under same heading
+        """
+        self.max_chunk_size = max_chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.heading_inclusion = heading_inclusion
+        self.chunk_format = chunk_format
+        self.combine_under_heading = combine_under_heading
+        self._chunk_index = 0
+
+    def chunk(self, tree: Tree) -> Iterator[Chunk]:
+        """Generate chunks from a markdown tree.
+
+        Args:
+            tree: Tree structure built from markdown
+
+        Yields:
+            Chunk objects with text and metadata
+        """
+        self._chunk_index = 0
+
+        # Get all terminal (leaf) nodes - not headings or root
+        terminal_nodes = tree.collect_terminal_nodes(
+            accept_node_fn=lambda n: (
+                isinstance(n.data, MarkdownNode)
+                and not n.data.is_heading()
+                and n.data.node_type != "root"
+            )
+        )
+
+        if self.combine_under_heading:
+            # Group terminal nodes by their parent heading
+            yield from self._chunk_by_heading(terminal_nodes)
+        else:
+            # Process each terminal node individually
+            yield from self._chunk_individually(terminal_nodes)
+
+    def _chunk_by_heading(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
+        """Group nodes under same heading and chunk them.
+
+        Args:
+            terminal_nodes: List of terminal tree nodes
+
+        Yields:
+            Chunk objects
+        """
+        # Group nodes by their immediate parent
+        parent_groups: dict[Tree, list[Tree]] = {}
+        for node in terminal_nodes:
+            parent = node.parent
+            if parent not in parent_groups:
+                parent_groups[parent] = []
+            parent_groups[parent].append(node)
+
+        # Process each group
+        for parent, nodes in parent_groups.items():
+            # Get heading path for this group
+            headings, levels = self._get_heading_path(parent)
+
+            # Separate atomic constructs from regular body text
+            atomic_nodes = [n for n in nodes if n.data.is_atomic()]
+            body_nodes = [n for n in nodes if not n.data.is_atomic()]
+
+            # Process body text nodes (can be combined and split)
+            if body_nodes:
+                combined_text = "\n".join(
+                    node.data.text for node in body_nodes if node.data.text.strip()
+                )
+
+                if combined_text.strip():
+                    for chunk_text in self._split_text(combined_text):
+                        yield self._create_chunk(
+                            text=chunk_text,
+                            headings=headings,
+                            heading_levels=levels,
+                            line_number=body_nodes[0].data.line_number if body_nodes else 0,
+                        )
+
+            # Process atomic constructs (keep as complete units)
+            for atomic_node in atomic_nodes:
+                # Don't split atomic constructs, even if they exceed max_chunk_size
+                yield self._create_chunk(
+                    text=atomic_node.data.text,
+                    headings=headings,
+                    heading_levels=levels,
+                    line_number=atomic_node.data.line_number,
+                    metadata=atomic_node.data.metadata,
+                    node_type=atomic_node.data.node_type,
+                )
+
+    def _chunk_individually(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
+        """Process each terminal node individually.
+
+        Args:
+            terminal_nodes: List of terminal tree nodes
+
+        Yields:
+            Chunk objects
+        """
+        for node in terminal_nodes:
+            if not node.data.text.strip():
+                continue
+
+            headings, levels = self._get_heading_path(node.parent)
+
+            # Atomic constructs are kept whole
+            if node.data.is_atomic():
+                yield self._create_chunk(
+                    text=node.data.text,
+                    headings=headings,
+                    heading_levels=levels,
+                    line_number=node.data.line_number,
+                    metadata=node.data.metadata,
+                    node_type=node.data.node_type,
+                )
+            else:
+                # Regular body text can be split
+                for chunk_text in self._split_text(node.data.text):
+                    yield self._create_chunk(
+                        text=chunk_text,
+                        headings=headings,
+                        heading_levels=levels,
+                        line_number=node.data.line_number,
+                    )
+
+    def _get_heading_path(self, node: Tree | None) -> tuple[list[str], list[int]]:
+        """Get the heading path from root to this node.
+
+        Args:
+            node: Tree node to get path for
+
+        Returns:
+            Tuple of (heading_texts, heading_levels)
+        """
+        headings = []
+        levels = []
+
+        current = node
+        while current is not None:
+            if isinstance(current.data, MarkdownNode):
+                if current.data.is_heading():
+                    headings.insert(0, current.data.text)
+                    levels.insert(0, current.data.level)
+            current = current.parent
+
+        return headings, levels
+
+    def _split_text(self, text: str) -> list[str]:
+        """Split text into chunks respecting max_chunk_size.
+
+        Args:
+            text: Text to split
+
+        Returns:
+            List of text chunks
+        """
+        if len(text) <= self.max_chunk_size:
+            return [text]
+
+        chunks = []
+        start = 0
+
+        while start < len(text):
+            end = start + self.max_chunk_size
+
+            # If not at the end, try to break at a good boundary
+            if end < len(text):
+                # Try to break at paragraph boundary (double newline)
+                break_pos = text.rfind("\n\n", start, end)
+                if break_pos > start:
+                    end = break_pos + 2
+                else:
+                    # Try to break at sentence boundary
+                    for punct in [". ", "! ", "? ", ".\n", "!\n", "?\n"]:
+                        break_pos = text.rfind(punct, start, end)
+                        if break_pos > start:
+                            end = break_pos + len(punct)
+                            break
+                    else:
+                        # Try to break at word boundary
+                        break_pos = text.rfind(" ", start, end)
+                        if break_pos > start:
+                            end = break_pos + 1
+
+            chunks.append(text[start:end].strip())
+
+            # Move start position, accounting for overlap
+            start = max(start + 1, end - self.chunk_overlap)
+
+        return [c for c in chunks if c]  # Filter out empty chunks
+
+    def _create_chunk(
+        self,
+        text: str,
+        headings: list[str],
+        heading_levels: list[int],
+        line_number: int,
+        metadata: dict[str, Any] | None = None,
+        node_type: str = "body",
+    ) -> Chunk:
+        """Create a chunk with appropriate format and metadata.
+
+        Args:
+            text: Body text for chunk
+            headings: List of heading texts
+            heading_levels: List of heading levels
+            line_number: Source line number
+            metadata: Optional metadata from the source node
+            node_type: Type of node ('body', 'code', 'list', 'table', etc.)
+
+        Returns:
+            Formatted Chunk object
+        """
+        # Build chunk text based on heading inclusion setting
+        chunk_text = text
+
+        if self.heading_inclusion in (HeadingInclusion.IN_TEXT, HeadingInclusion.BOTH):
+            # Prepend headings to text
+            heading_lines = []
+            for heading, level in zip(headings, heading_levels):
+                if self.chunk_format == ChunkFormat.MARKDOWN:
+                    heading_lines.append(f"{'#' * level} {heading}")
+                else:
+                    heading_lines.append(heading)
+
+            if heading_lines:
+                chunk_text = "\n".join(heading_lines) + "\n\n" + text
+
+        # Create custom metadata dict with node type and additional metadata
+        custom_metadata = {"node_type": node_type}
+        if metadata:
+            custom_metadata.update(metadata)
+
+        # Create chunk metadata
+        chunk_metadata = ChunkMetadata(
+            headings=headings if self.heading_inclusion in (
+                HeadingInclusion.IN_METADATA,
+                HeadingInclusion.BOTH,
+            ) else [],
+            heading_levels=heading_levels if self.heading_inclusion in (
+                HeadingInclusion.IN_METADATA,
+                HeadingInclusion.BOTH,
+            ) else [],
+            line_number=line_number,
+            chunk_index=self._chunk_index,
+            chunk_size=len(chunk_text),
+            custom=custom_metadata,
+        )
+
+        self._chunk_index += 1
+
+        return Chunk(text=chunk_text, metadata=chunk_metadata)
+
+
+def chunk_markdown_tree(
+    tree: Tree,
+    max_chunk_size: int = 1000,
+    chunk_overlap: int = 100,
+    heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+    chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+    combine_under_heading: bool = True,
+) -> list[Chunk]:
+    """Generate chunks from a markdown tree.
+
+    Convenience function for creating and using a MarkdownChunker.
+
+    Args:
+        tree: Tree structure built from markdown
+        max_chunk_size: Maximum size of chunk text in characters
+        chunk_overlap: Number of characters to overlap between chunks
+        heading_inclusion: How to include headings in chunks
+        chunk_format: Output format for chunks
+        combine_under_heading: Whether to combine body text under same heading
+
+    Returns:
+        List of Chunk objects
+    """
+    chunker = MarkdownChunker(
+        max_chunk_size=max_chunk_size,
+        chunk_overlap=chunk_overlap,
+        heading_inclusion=heading_inclusion,
+        chunk_format=chunk_format,
+        combine_under_heading=combine_under_heading,
+    )
+    return list(chunker.chunk(tree))
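Taken together, the new module supports an end-to-end parse-then-chunk flow. A minimal sketch using only the APIs shown in this diff; the sample markdown is invented, and `parse_markdown` returning the `Tree` that `chunk_markdown_tree` expects is assumed from the docstrings above:

```python
from dataknobs_xization.markdown import (
    ChunkFormat,
    HeadingInclusion,
    chunk_markdown_tree,
    parse_markdown,
)

doc = """# Guide

## Install

Run the installer, then verify the version.
"""

tree = parse_markdown(doc)  # assumed to return the Tree the chunker consumes
chunks = chunk_markdown_tree(
    tree,
    max_chunk_size=200,
    heading_inclusion=HeadingInclusion.BOTH,
    chunk_format=ChunkFormat.MARKDOWN,
)
for chunk in chunks:
    # get_heading_path joins the heading hierarchy, e.g. "Guide > Install"
    print(chunk.metadata.get_heading_path(), "->", len(chunk.text), "chars")
```

Note the splitting behavior in `_split_text`: when a section exceeds `max_chunk_size`, the next window starts at `end - chunk_overlap`, so consecutive chunks of the same section share up to `chunk_overlap` characters, while atomic nodes (e.g. code blocks and tables, per the `node_type` docstring) are emitted whole even when oversized.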