dataknobs-xization 1.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,478 @@
1
+ """Markdown chunker for generating RAG-optimized chunks from tree structures.
2
+
3
+ This module provides functionality to traverse markdown tree structures and
4
+ generate chunks suitable for RAG (Retrieval-Augmented Generation) applications.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from enum import Enum
11
+ from typing import Any, Iterator
12
+
13
+ from dataknobs_structures.tree import Tree
14
+
15
+ from dataknobs_xization.markdown.md_parser import MarkdownNode
16
+ from dataknobs_xization.markdown.enrichment import build_enriched_text
17
+ from dataknobs_xization.markdown.filters import ChunkQualityConfig, ChunkQualityFilter
18
+
19
+
20
class ChunkFormat(Enum):
    """Enumeration of the supported output formats for chunk text."""

    # Render chunk text with markdown heading syntax preserved.
    MARKDOWN = "markdown"
    # Render chunk text as plain text without markdown formatting.
    PLAIN = "plain"
    # Return the chunk as a dictionary instead of formatted text.
    DICT = "dict"
26
+
27
+
28
class HeadingInclusion(Enum):
    """Enumeration of strategies for attaching headings to chunks."""

    # Headings are prepended to the chunk text itself.
    IN_TEXT = "in_text"
    # Headings are recorded only in the chunk metadata.
    IN_METADATA = "in_metadata"
    # Headings appear in both the text and the metadata.
    BOTH = "both"
    # Headings are omitted entirely.
    NONE = "none"
35
+
36
+
37
@dataclass
class ChunkMetadata:
    """Metadata describing a single document chunk.

    Attributes:
        headings: Heading texts along the path from root to the chunk
        heading_levels: Heading levels parallel to ``headings``
        line_number: Starting line number in the source document
        chunk_index: Position of this chunk in the generated sequence
        chunk_size: Length of the full chunk text in characters
        content_length: Length of the content excluding headings
            (used for quality decisions)
        heading_display: Pre-formatted heading path for display
        embedding_text: Heading-enriched text for embedding (optional)
        custom: Arbitrary additional metadata entries
    """

    headings: list[str] = field(default_factory=list)
    heading_levels: list[int] = field(default_factory=list)
    line_number: int = 0
    chunk_index: int = 0
    chunk_size: int = 0
    content_length: int = 0
    heading_display: str = ""
    embedding_text: str = ""
    custom: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a dictionary view of this metadata."""
        data: dict[str, Any] = {
            "headings": self.headings,
            "heading_levels": self.heading_levels,
            "line_number": self.line_number,
            "chunk_index": self.chunk_index,
            "chunk_size": self.chunk_size,
            "content_length": self.content_length,
            "heading_display": self.heading_display,
        }
        # Custom entries may deliberately override the standard keys.
        data.update(self.custom)
        # embedding_text is optional output; omit the key when absent.
        if self.embedding_text:
            data["embedding_text"] = self.embedding_text
        return data

    def get_heading_path(self, separator: str = " > ") -> str:
        """Join the heading hierarchy into a single string.

        Args:
            separator: Text placed between consecutive headings

        Returns:
            The joined heading path
        """
        return separator.join(self.headings)
90
+
91
+
92
@dataclass
class Chunk:
    """A piece of document text paired with its metadata.

    Attributes:
        text: The chunk's textual content
        metadata: The ChunkMetadata describing this chunk
    """

    text: str
    metadata: ChunkMetadata

    def to_dict(self) -> dict[str, Any]:
        """Return the chunk as a plain dictionary."""
        return {"text": self.text, "metadata": self.metadata.to_dict()}

    def to_markdown(self, include_headings: bool = True) -> str:
        """Render the chunk as markdown text.

        Args:
            include_headings: Whether to prefix the heading hierarchy

        Returns:
            Markdown-formatted string
        """
        headings = self.metadata.headings
        if not (include_headings and headings):
            return self.text

        # One markdown heading line per ancestor heading.
        parts = [
            f"{'#' * level} {title}"
            for title, level in zip(headings, self.metadata.heading_levels)
        ]

        # Separate the body from the headings with a blank line.
        if self.text:
            parts.append("")
            parts.append(self.text)

        return "\n".join(parts)
136
+
137
+
138
class MarkdownChunker:
    """Chunker for generating chunks from markdown tree structures.

    Traverses a Tree built from markdown and generates chunks with
    configurable size, heading inclusion, and output format. Nodes whose
    data reports ``is_atomic()`` (e.g. code/table constructs) are always
    emitted as whole chunks, even past ``max_chunk_size``; regular body
    text is split at paragraph, sentence, or word boundaries.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        chunk_overlap: int = 100,
        heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
        chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
        combine_under_heading: bool = True,
        quality_filter: ChunkQualityConfig | None = None,
        generate_embeddings: bool = False,
    ):
        """Initialize the markdown chunker.

        Args:
            max_chunk_size: Maximum size of chunk text in characters
            chunk_overlap: Number of characters to overlap between chunks
            heading_inclusion: How to include headings in chunks
            chunk_format: Output format for chunks
            combine_under_heading: Whether to combine body text under same heading
            quality_filter: Optional config for filtering low-quality chunks
            generate_embeddings: Whether to generate heading-enriched embedding text
        """
        self.max_chunk_size = max_chunk_size
        self.chunk_overlap = chunk_overlap
        self.heading_inclusion = heading_inclusion
        self.chunk_format = chunk_format
        self.combine_under_heading = combine_under_heading
        self.generate_embeddings = generate_embeddings
        # Running index assigned to chunks; reset at the start of chunk().
        self._chunk_index = 0

        # Quality filtering is opt-in: only built when a config is supplied.
        self._quality_filter = None
        if quality_filter is not None:
            self._quality_filter = ChunkQualityFilter(quality_filter)

    def chunk(self, tree: Tree) -> Iterator[Chunk]:
        """Generate chunks from a markdown tree.

        Args:
            tree: Tree structure built from markdown

        Yields:
            Chunk objects with text and metadata, excluding any chunks
            rejected by the configured quality filter
        """
        self._chunk_index = 0

        # Collect terminal (leaf) content nodes. Headings and the root are
        # structural and never produce chunks themselves.
        terminal_nodes = tree.collect_terminal_nodes(
            accept_node_fn=lambda n: (
                isinstance(n.data, MarkdownNode)
                and not n.data.is_heading()
                and n.data.node_type != "root"
            )
        )

        if self.combine_under_heading:
            # Group terminal nodes by their parent heading
            chunk_iter = self._chunk_by_heading(terminal_nodes)
        else:
            # Process each terminal node individually
            chunk_iter = self._chunk_individually(terminal_nodes)

        # Apply the optional quality filter as a final gate.
        for chunk in chunk_iter:
            if self._quality_filter is None or self._quality_filter.is_valid(chunk):
                yield chunk

    def _chunk_by_heading(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
        """Group nodes under the same heading and chunk each group.

        Args:
            terminal_nodes: List of terminal tree nodes

        Yields:
            Chunk objects
        """
        # Group nodes by their immediate parent; dict preserves the
        # document order in which parents are first seen.
        parent_groups: dict[Tree, list[Tree]] = {}
        for node in terminal_nodes:
            parent_groups.setdefault(node.parent, []).append(node)

        for parent, nodes in parent_groups.items():
            # Heading path shared by every node in this group.
            headings, levels = self._get_heading_path(parent)

            # Atomic constructs must stay whole; only regular body text
            # may be combined and re-split.
            atomic_nodes = [n for n in nodes if n.data.is_atomic()]
            body_nodes = [n for n in nodes if not n.data.is_atomic()]

            if body_nodes:
                combined_text = "\n".join(
                    node.data.text for node in body_nodes if node.data.text.strip()
                )

                if combined_text.strip():
                    # Every split piece inherits the first body node's line.
                    first_line = body_nodes[0].data.line_number
                    for chunk_text in self._split_text(combined_text):
                        yield self._create_chunk(
                            text=chunk_text,
                            headings=headings,
                            heading_levels=levels,
                            line_number=first_line,
                        )

            # Emit atomic constructs unsplit, even if they exceed
            # max_chunk_size.
            for atomic_node in atomic_nodes:
                yield self._create_chunk(
                    text=atomic_node.data.text,
                    headings=headings,
                    heading_levels=levels,
                    line_number=atomic_node.data.line_number,
                    metadata=atomic_node.data.metadata,
                    node_type=atomic_node.data.node_type,
                )

    def _chunk_individually(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
        """Process each terminal node individually.

        Args:
            terminal_nodes: List of terminal tree nodes

        Yields:
            Chunk objects
        """
        for node in terminal_nodes:
            # Skip whitespace-only nodes entirely.
            if not node.data.text.strip():
                continue

            headings, levels = self._get_heading_path(node.parent)

            if node.data.is_atomic():
                # Atomic constructs are kept whole.
                yield self._create_chunk(
                    text=node.data.text,
                    headings=headings,
                    heading_levels=levels,
                    line_number=node.data.line_number,
                    metadata=node.data.metadata,
                    node_type=node.data.node_type,
                )
            else:
                # Regular body text can be split.
                for chunk_text in self._split_text(node.data.text):
                    yield self._create_chunk(
                        text=chunk_text,
                        headings=headings,
                        heading_levels=levels,
                        line_number=node.data.line_number,
                    )

    def _get_heading_path(self, node: Tree | None) -> tuple[list[str], list[int]]:
        """Get the heading path from root to this node.

        Args:
            node: Tree node to get path for

        Returns:
            Tuple of (heading_texts, heading_levels), ordered root-first
        """
        headings: list[str] = []
        levels: list[int] = []

        # Walk leaf-to-root collecting heading ancestors.
        current = node
        while current is not None:
            data = current.data
            if isinstance(data, MarkdownNode) and data.is_heading():
                headings.append(data.text)
                levels.append(data.level)
            current = current.parent

        # Collected bottom-up; present top-down (append+reverse avoids the
        # quadratic cost of repeated insert(0, ...)).
        headings.reverse()
        levels.reverse()
        return headings, levels

    def _split_text(self, text: str) -> list[str]:
        """Split text into chunks respecting max_chunk_size.

        Break points are chosen in preference order: paragraph boundary,
        then the sentence boundary closest to the size limit, then a word
        boundary. Consecutive chunks overlap by up to ``chunk_overlap``
        characters.

        Args:
            text: Text to split

        Returns:
            List of non-empty text chunks
        """
        if len(text) <= self.max_chunk_size:
            return [text]

        chunks = []
        start = 0

        while start < len(text):
            end = start + self.max_chunk_size

            # If not at the end, try to break at a good boundary
            if end < len(text):
                # Prefer a paragraph boundary (double newline)
                break_pos = text.rfind("\n\n", start, end)
                if break_pos > start:
                    end = break_pos + 2
                else:
                    # Take the sentence boundary closest to the limit,
                    # considering every sentence-ending delimiter (the old
                    # code stopped at the first delimiter *kind* that
                    # matched, which could pick a needlessly early break).
                    best = max(
                        text.rfind(punct, start, end)
                        for punct in (". ", "! ", "? ", ".\n", "!\n", "?\n")
                    )
                    if best > start:
                        end = best + 2  # every delimiter is two characters
                    else:
                        # Fall back to a word boundary
                        break_pos = text.rfind(" ", start, end)
                        if break_pos > start:
                            end = break_pos + 1

            chunks.append(text[start:end].strip())

            # Move start forward, overlapping windows, while guaranteeing
            # progress even when overlap >= window size.
            start = max(start + 1, end - self.chunk_overlap)

        return [c for c in chunks if c]  # Filter out empty chunks

    def _create_chunk(
        self,
        text: str,
        headings: list[str],
        heading_levels: list[int],
        line_number: int,
        metadata: dict[str, Any] | None = None,
        node_type: str = "body",
    ) -> Chunk:
        """Create a chunk with appropriate format and metadata.

        Args:
            text: Body text for chunk
            headings: List of heading texts
            heading_levels: List of heading levels
            line_number: Source line number
            metadata: Optional metadata from the source node
            node_type: Type of node ('body', 'code', 'list', 'table', etc.)

        Returns:
            Formatted Chunk object
        """
        # Record the content length before headings are prepended so
        # quality decisions can ignore heading overhead.
        content_length = len(text)

        chunk_text = text

        if self.heading_inclusion in (HeadingInclusion.IN_TEXT, HeadingInclusion.BOTH):
            # Prepend headings, as markdown syntax or plain lines
            # depending on the configured chunk format.
            heading_lines = []
            for heading, level in zip(headings, heading_levels):
                if self.chunk_format == ChunkFormat.MARKDOWN:
                    heading_lines.append(f"{'#' * level} {heading}")
                else:
                    heading_lines.append(heading)

            if heading_lines:
                chunk_text = "\n".join(heading_lines) + "\n\n" + text

        # Node type plus any source-node metadata travels in `custom`.
        custom_metadata = {"node_type": node_type}
        if metadata:
            custom_metadata.update(metadata)

        heading_display = " > ".join(headings) if headings else ""

        # Heading-enriched embedding text is generated only on request.
        embedding_text = ""
        if self.generate_embeddings:
            embedding_text = build_enriched_text(headings, text)

        # Headings go into metadata only for the metadata-bearing modes.
        include_headings = self.heading_inclusion in (
            HeadingInclusion.IN_METADATA,
            HeadingInclusion.BOTH,
        )

        chunk_metadata = ChunkMetadata(
            headings=headings if include_headings else [],
            heading_levels=heading_levels if include_headings else [],
            line_number=line_number,
            chunk_index=self._chunk_index,
            chunk_size=len(chunk_text),
            content_length=content_length,
            heading_display=heading_display,
            embedding_text=embedding_text,
            custom=custom_metadata,
        )

        self._chunk_index += 1

        return Chunk(text=chunk_text, metadata=chunk_metadata)
440
+
441
+
442
def chunk_markdown_tree(
    tree: Tree,
    max_chunk_size: int = 1000,
    chunk_overlap: int = 100,
    heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
    chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
    combine_under_heading: bool = True,
    quality_filter: ChunkQualityConfig | None = None,
    generate_embeddings: bool = False,
) -> list[Chunk]:
    """Generate chunks from a markdown tree in a single call.

    Convenience wrapper that builds a MarkdownChunker and drains its
    chunk iterator into a list.

    Args:
        tree: Tree structure built from markdown
        max_chunk_size: Maximum size of chunk text in characters
        chunk_overlap: Number of characters to overlap between chunks
        heading_inclusion: How to include headings in chunks
        chunk_format: Output format for chunks
        combine_under_heading: Whether to combine body text under same heading
        quality_filter: Optional config for filtering low-quality chunks
        generate_embeddings: Whether to generate heading-enriched embedding text

    Returns:
        List of Chunk objects
    """
    return list(
        MarkdownChunker(
            max_chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            heading_inclusion=heading_inclusion,
            chunk_format=chunk_format,
            combine_under_heading=combine_under_heading,
            quality_filter=quality_filter,
            generate_embeddings=generate_embeddings,
        ).chunk(tree)
    )