dataknobs-xization 1.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ """Markdown chunking utilities for RAG applications.
2
+
3
+ This module provides comprehensive utilities for parsing and chunking markdown
4
+ documents while preserving semantic structure and heading hierarchy.
5
+ """
6
+
7
+ from dataknobs_xization.markdown.md_chunker import (
8
+ Chunk,
9
+ ChunkFormat,
10
+ ChunkMetadata,
11
+ HeadingInclusion,
12
+ MarkdownChunker,
13
+ chunk_markdown_tree,
14
+ )
15
+ from dataknobs_xization.markdown.md_parser import (
16
+ MarkdownNode,
17
+ MarkdownParser,
18
+ parse_markdown,
19
+ )
20
+ from dataknobs_xization.markdown.md_streaming import (
21
+ AdaptiveStreamingProcessor,
22
+ StreamingMarkdownProcessor,
23
+ stream_markdown_file,
24
+ stream_markdown_string,
25
+ )
26
+ from dataknobs_xization.markdown.filters import (
27
+ ChunkQualityConfig,
28
+ ChunkQualityFilter,
29
+ )
30
+ from dataknobs_xization.markdown.enrichment import (
31
+ EnrichedChunkData,
32
+ build_enriched_text,
33
+ enrich_chunk,
34
+ extract_heading_metadata,
35
+ format_heading_display,
36
+ format_heading_for_display,
37
+ get_dynamic_heading_display,
38
+ get_relevant_headings_for_display,
39
+ is_multiword,
40
+ )
41
+
42
+ __all__ = [
43
+ # Parser
44
+ "MarkdownNode",
45
+ "MarkdownParser",
46
+ "parse_markdown",
47
+ # Chunker
48
+ "Chunk",
49
+ "ChunkFormat",
50
+ "ChunkMetadata",
51
+ "HeadingInclusion",
52
+ "MarkdownChunker",
53
+ "chunk_markdown_tree",
54
+ # Streaming
55
+ "AdaptiveStreamingProcessor",
56
+ "StreamingMarkdownProcessor",
57
+ "stream_markdown_file",
58
+ "stream_markdown_string",
59
+ # Filters
60
+ "ChunkQualityConfig",
61
+ "ChunkQualityFilter",
62
+ # Enrichment
63
+ "EnrichedChunkData",
64
+ "build_enriched_text",
65
+ "enrich_chunk",
66
+ "extract_heading_metadata",
67
+ "format_heading_display",
68
+ "format_heading_for_display",
69
+ "get_dynamic_heading_display",
70
+ "get_relevant_headings_for_display",
71
+ "is_multiword",
72
+ ]
@@ -0,0 +1,260 @@
1
+ """Heading enrichment utilities for RAG-optimized chunk embeddings.
2
+
3
+ This module provides utilities to enrich chunk content with heading context
4
+ for improved semantic search, while keeping headings out of the displayed content.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from typing import Any
11
+
12
+
13
def is_multiword(heading: str) -> bool:
    """Tell whether a heading consists of more than one word.

    Words are the whitespace-delimited tokens of ``heading``, so a
    hyphenated term such as ``"Chain-of-Thought"`` counts as a single word.

    Args:
        heading: The heading text to inspect

    Returns:
        True when the heading splits into two or more tokens
    """
    tokens = heading.split()
    return len(tokens) >= 2
23
+
24
+
25
def format_heading_display(
    heading_path: list[str],
    separator: str = " > ",
) -> str:
    """Render a heading path as a single display string.

    Args:
        heading_path: Headings ordered from the document root to the chunk
        separator: Text placed between consecutive headings

    Returns:
        The headings joined by ``separator``, or an empty string when the
        path is empty
    """
    return separator.join(heading_path) if heading_path else ""
41
+
42
+
43
def get_dynamic_heading_display(
    heading_path: list[str],
    content_length: int,
    small_threshold: int = 200,
    medium_threshold: int = 800,
) -> str:
    """Get heading display based on content length.

    Dynamic heading inclusion:
    - Small chunks (< small_threshold): Full heading path
    - Medium chunks (< medium_threshold): Last 2 headings
    - Large chunks: No headings

    Both thresholds are exclusive upper bounds, matching the rule used by
    ``get_relevant_headings_for_display`` so the two functions classify a
    chunk of any given length identically.

    Args:
        heading_path: List of headings from root to chunk
        content_length: Length of chunk content in characters
        small_threshold: Exclusive upper bound (chars) for "small" chunks
        medium_threshold: Exclusive upper bound (chars) for "medium" chunks

    Returns:
        Headings joined with " > ", or an empty string when there are no
        headings or the chunk is large
    """
    if not heading_path:
        return ""

    if content_length < small_threshold:
        # Small chunks: include full heading path
        return " > ".join(heading_path)

    if content_length < medium_threshold:
        # Medium chunks: the slice already copes with paths of 1 or 2 headings
        return " > ".join(heading_path[-2:])

    # Large chunks: omit headings
    return ""
78
+
79
+
80
@dataclass
class EnrichedChunkData:
    """A chunk bundled with the heading context computed for it.

    Attributes:
        content: The chunk's clean text, without headings
        embedding_text: Heading-enriched text intended for embedding
        heading_path: Headings ordered from the document root to the chunk
        heading_display: The heading path formatted for display
        content_length: Number of characters in the clean content
    """

    content: str
    embedding_text: str
    heading_path: list[str]
    heading_display: str
    content_length: int
97
+
98
+
99
def build_enriched_text(heading_path: list[str], content: str) -> str:
    """Build text for embedding with relevant heading context.

    Headings are collected starting at the deepest one and walking towards
    the root, stopping right after the first multi-word heading has been
    included. This provides semantic context without over-weighting deep,
    single-word labels like "Example".

    A heading is multi-word when it contains more than one
    whitespace-separated token, so a hyphenated term such as
    "Chain-of-Thought" counts as a single word. If no heading in the path
    is multi-word, the whole path is used.

    Args:
        heading_path: List of heading texts from root to chunk
        content: The chunk content text

    Returns:
        The selected headings and the content joined with ": ", or the
        content unchanged when there are no headings

    Examples:
        >>> build_enriched_text(["Patterns", "Chain-of-Thought", "Example"], "code here")
        'Patterns: Chain-of-Thought: Example: code here'

        >>> build_enriched_text(["Setup"], "install steps")
        'Setup: install steps'

        >>> build_enriched_text(["API Reference", "Authentication", "OAuth 2.0"], "...")
        'OAuth 2.0: ...'

        >>> build_enriched_text(["Guide", "API Reference", "Authentication", "Tokens"], "...")
        'API Reference: Authentication: Tokens: ...'

        >>> build_enriched_text([], "standalone content")
        'standalone content'
    """
    if not heading_path:
        return content

    # Collect deepest-first, then flip once; avoids repeated front insertion.
    selected = []
    for heading in reversed(heading_path):
        selected.append(heading)
        # Stop after including a multi-word heading
        if len(heading.split()) > 1:
            break
    selected.reverse()

    # heading_path is non-empty here, so at least one heading was selected.
    prefix = ": ".join(selected)
    return f"{prefix}: {content}"
143
+
144
+
145
def extract_heading_metadata(
    headings: list[str],
    heading_levels: list[int],
    separator: str = " > ",
) -> dict[str, Any]:
    """Assemble the heading fields that are stored alongside a chunk.

    Args:
        headings: Heading texts ordered from the document root to the chunk
        heading_levels: Heading levels (1-6) corresponding to ``headings``
        separator: Text placed between headings in the display string

    Returns:
        Dictionary with the keys ``heading_path``, ``heading_levels``,
        ``heading_display`` and ``heading_depth``
    """
    display = separator.join(headings) if headings else ""
    depth = len(headings)

    metadata: dict[str, Any] = {}
    metadata["heading_path"] = headings
    metadata["heading_levels"] = heading_levels
    metadata["heading_display"] = display
    metadata["heading_depth"] = depth
    return metadata
166
+
167
+
168
def get_relevant_headings_for_display(
    heading_path: list[str],
    content_length: int,
    small_threshold: int = 200,
    medium_threshold: int = 800,
) -> list[str]:
    """Select which headings to show for a chunk of the given length.

    The shorter the chunk, the more heading context it is given:
    - below ``small_threshold``: the full heading path
    - below ``medium_threshold``: the last 2 heading levels
    - otherwise: no headings

    Args:
        heading_path: Heading texts ordered from the document root to the chunk
        content_length: Length of the chunk content in characters
        small_threshold: Exclusive upper bound (chars) for "small" chunks
        medium_threshold: Exclusive upper bound (chars) for "medium" chunks

    Returns:
        The headings to display; the input list itself is returned whenever
        no trimming is needed
    """
    if not heading_path:
        return []

    # Small chunk: the whole path is shown as-is.
    if content_length < small_threshold:
        return heading_path

    # Medium chunk: trim to the two deepest levels only when there are more.
    if content_length < medium_threshold:
        if len(heading_path) <= 2:
            return heading_path
        return heading_path[-2:]

    # Large chunk: no heading context.
    return []
202
+
203
+
204
def format_heading_for_display(
    headings: list[str],
    heading_levels: list[int] | None = None,
    format_style: str = "markdown",
) -> str:
    """Render headings as text for inclusion in LLM context.

    Args:
        headings: Heading texts to render
        heading_levels: Levels corresponding to ``headings``; only consulted
            for the markdown style
        format_style: "markdown" for ``#`` syntax, "path" for a
            ``>``-separated path

    Returns:
        One ``#``-prefixed line per heading/level pair when the markdown
        style is requested and levels are supplied; otherwise the headings
        joined with " > ". An empty string when there are no headings.
    """
    if not headings:
        return ""

    if format_style == "markdown" and heading_levels:
        # zip pairs headings with levels and stops at the shorter of the two.
        return "\n".join(
            f"{'#' * level} {heading}"
            for heading, level in zip(headings, heading_levels)
        )

    # "path" style, markdown without levels, and unknown styles all end here.
    return " > ".join(headings)
233
+
234
+
235
def enrich_chunk(
    content: str,
    headings: list[str],
    heading_levels: list[int],
) -> EnrichedChunkData:
    """Build an ``EnrichedChunkData`` from a chunk's raw components.

    Convenience wrapper: the embedding text comes from
    ``build_enriched_text`` and the display string is the headings joined
    with " > ".

    Args:
        content: Raw chunk content text
        headings: Heading texts ordered from the document root to the chunk
        heading_levels: Levels corresponding to ``headings``; accepted but
            not read by this function

    Returns:
        EnrichedChunkData with all computed fields
    """
    enriched = build_enriched_text(headings, content)
    display = " > ".join(headings) if headings else ""
    size = len(content)

    return EnrichedChunkData(
        content=content,
        embedding_text=enriched,
        heading_path=headings,
        heading_display=display,
        content_length=size,
    )
@@ -0,0 +1,236 @@
1
+ """Quality filters for markdown chunks.
2
+
3
+ This module provides filtering utilities to identify and remove low-quality
4
+ chunks that would not contribute meaningful content to RAG retrieval.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from dataclasses import dataclass
11
+ from typing import TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ from dataknobs_xization.markdown.md_chunker import Chunk
15
+
16
+
17
@dataclass
class ChunkQualityConfig:
    """Thresholds and switches that control chunk quality filtering.

    Attributes:
        min_content_chars: Fewest characters of non-heading content a chunk
            may have (default 50)
        min_alphanumeric_ratio: Lowest accepted ratio of alphanumeric
            characters to all characters (default 0.3)
        skip_heading_only: Whether chunks made up only of headings, with no
            body content, are skipped (default True)
        min_words: Fewest words the content may have (default 5)
        allow_code_blocks: Whether short code blocks that would otherwise be
            filtered are allowed (default True)
        allow_tables: Whether short tables that would otherwise be filtered
            are allowed (default True)
    """

    min_content_chars: int = 50
    min_alphanumeric_ratio: float = 0.3
    skip_heading_only: bool = True
    min_words: int = 5
    allow_code_blocks: bool = True
    allow_tables: bool = True
36
+
37
+
38
class ChunkQualityFilter:
    """Filter for identifying and removing low-quality chunks.

    This filter helps ensure that only meaningful content is indexed
    for RAG retrieval, reducing noise and improving retrieval quality.

    All acceptance rules live in ``get_rejection_reason``; ``is_valid`` and
    ``filter_chunks`` are derived from it, so the boolean verdict and the
    reported reason cannot drift apart.
    """

    # A markdown heading line: one or more '#' at the line start, then whitespace.
    _HEADING_LINE = re.compile(r"^#+\s+")

    def __init__(self, config: ChunkQualityConfig | None = None):
        """Initialize the quality filter.

        Args:
            config: Quality configuration, uses defaults if not provided
        """
        self.config = config or ChunkQualityConfig()

    def is_valid(self, chunk: Chunk) -> bool:
        """Check if a chunk meets quality thresholds.

        Args:
            chunk: The chunk to evaluate

        Returns:
            True if chunk should be kept, False if it should be filtered
        """
        return self.get_rejection_reason(chunk) is None

    def _extract_content_text(self, text: str) -> str:
        """Extract content text, removing markdown heading lines.

        Args:
            text: Raw chunk text

        Returns:
            The text with every line that starts with ``#`` characters
            followed by whitespace dropped; remaining lines are kept in
            order and re-joined with newlines
        """
        return "\n".join(
            line for line in text.split("\n") if not self._HEADING_LINE.match(line)
        )

    def _meets_alphanumeric_threshold(self, text: str) -> bool:
        """Check if text meets minimum alphanumeric ratio.

        The ratio is taken over every character of ``text``, whitespace
        included.

        Args:
            text: Text to check

        Returns:
            True if ratio is met; always False for empty text
        """
        if not text:
            return False

        alphanumeric_count = sum(1 for c in text if c.isalnum())
        ratio = alphanumeric_count / len(text)
        return ratio >= self.config.min_alphanumeric_ratio

    def _meets_word_count(self, text: str) -> bool:
        """Check if text meets minimum word count.

        Args:
            text: Text to check

        Returns:
            True if the number of whitespace-separated words is at least
            ``config.min_words``
        """
        return len(text.split()) >= self.config.min_words

    def _is_valid_code_block(self, chunk: Chunk) -> bool:
        """Check if a code block chunk is valid.

        Code blocks are given more lenient filtering since they may be
        short but still valuable (e.g., single function definitions).

        Args:
            chunk: Code block chunk

        Returns:
            True if the chunk text contains any non-whitespace character
        """
        # Non-empty after stripping implies at least one non-blank line.
        return bool(chunk.text.strip())

    def _is_valid_table(self, chunk: Chunk) -> bool:
        """Check if a table chunk is valid.

        Tables are given more lenient filtering since they may be
        compact but information-rich.

        Args:
            chunk: Table chunk

        Returns:
            True if the chunk text has at least two non-blank lines
        """
        # Tables should have at least header row and one data row
        rows = [line for line in chunk.text.strip().split("\n") if line.strip()]
        return len(rows) >= 2

    def filter_chunks(self, chunks: list[Chunk]) -> list[Chunk]:
        """Filter a list of chunks, keeping only valid ones.

        Args:
            chunks: List of chunks to filter

        Returns:
            List of chunks that pass quality thresholds, in input order
        """
        return [chunk for chunk in chunks if self.is_valid(chunk)]

    def get_rejection_reason(self, chunk: Chunk) -> str | None:
        """Get the reason a chunk would be rejected.

        Useful for debugging and understanding filtering behavior. Checks
        run in a fixed order and the first failing one is reported.

        Args:
            chunk: The chunk to evaluate; ``chunk.text`` and
                ``chunk.metadata.custom`` are read

        Returns:
            Rejection reason string, or None if chunk is valid
        """
        # Node type comes from custom metadata; chunks without one are "body".
        node_type = chunk.metadata.custom.get("node_type", "body")

        # Code blocks and tables bypass the general checks when allowed.
        if node_type == "code" and self.config.allow_code_blocks:
            if not self._is_valid_code_block(chunk):
                return "Empty code block"
            return None

        if node_type == "table" and self.config.allow_tables:
            if not self._is_valid_table(chunk):
                return "Empty or single-row table"
            return None

        content = self._extract_content_text(chunk.text)

        # NOTE(review): with the default thresholds a heading-only chunk is
        # still rejected by the checks below when skip_heading_only is False.
        if self.config.skip_heading_only and not content.strip():
            return "Heading-only chunk (no body content)"

        if len(content) < self.config.min_content_chars:
            return f"Content too short ({len(content)} < {self.config.min_content_chars} chars)"

        if not self._meets_alphanumeric_threshold(content):
            return f"Alphanumeric ratio below threshold ({self.config.min_alphanumeric_ratio})"

        if not self._meets_word_count(content):
            words = len(content.split())
            return f"Word count too low ({words} < {self.config.min_words} words)"

        return None