dataknobs-xization 1.2.3 (dataknobs_xization-1.2.3-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/0.readme.txt +66 -0
- dataknobs_xization/__init__.py +110 -0
- dataknobs_xization/annotations.py +1476 -0
- dataknobs_xization/authorities.py +860 -0
- dataknobs_xization/content_transformer.py +570 -0
- dataknobs_xization/ingestion/__init__.py +27 -0
- dataknobs_xization/ingestion/config.py +352 -0
- dataknobs_xization/ingestion/processor.py +367 -0
- dataknobs_xization/json/__init__.py +17 -0
- dataknobs_xization/json/json_chunker.py +591 -0
- dataknobs_xization/lexicon.py +723 -0
- dataknobs_xization/markdown/__init__.py +72 -0
- dataknobs_xization/markdown/enrichment.py +260 -0
- dataknobs_xization/markdown/filters.py +236 -0
- dataknobs_xization/markdown/md_chunker.py +478 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +768 -0
- dataknobs_xization/normalize.py +520 -0
- dataknobs_xization/py.typed +0 -0
- dataknobs_xization-1.2.3.dist-info/METADATA +170 -0
- dataknobs_xization-1.2.3.dist-info/RECORD +23 -0
- dataknobs_xization-1.2.3.dist-info/WHEEL +4 -0
dataknobs_xization/markdown/__init__.py
@@ -0,0 +1,72 @@
"""Markdown chunking utilities for RAG applications.

This module provides comprehensive utilities for parsing and chunking markdown
documents while preserving semantic structure and heading hierarchy.
"""

from dataknobs_xization.markdown.md_chunker import (
    Chunk,
    ChunkFormat,
    ChunkMetadata,
    HeadingInclusion,
    MarkdownChunker,
    chunk_markdown_tree,
)
from dataknobs_xization.markdown.md_parser import (
    MarkdownNode,
    MarkdownParser,
    parse_markdown,
)
from dataknobs_xization.markdown.md_streaming import (
    AdaptiveStreamingProcessor,
    StreamingMarkdownProcessor,
    stream_markdown_file,
    stream_markdown_string,
)
from dataknobs_xization.markdown.filters import (
    ChunkQualityConfig,
    ChunkQualityFilter,
)
from dataknobs_xization.markdown.enrichment import (
    EnrichedChunkData,
    build_enriched_text,
    enrich_chunk,
    extract_heading_metadata,
    format_heading_display,
    format_heading_for_display,
    get_dynamic_heading_display,
    get_relevant_headings_for_display,
    is_multiword,
)

__all__ = [
    # Parser
    "MarkdownNode",
    "MarkdownParser",
    "parse_markdown",
    # Chunker
    "Chunk",
    "ChunkFormat",
    "ChunkMetadata",
    "HeadingInclusion",
    "MarkdownChunker",
    "chunk_markdown_tree",
    # Streaming
    "AdaptiveStreamingProcessor",
    "StreamingMarkdownProcessor",
    "stream_markdown_file",
    "stream_markdown_string",
    # Filters
    "ChunkQualityConfig",
    "ChunkQualityFilter",
    # Enrichment
    "EnrichedChunkData",
    "build_enriched_text",
    "enrich_chunk",
    "extract_heading_metadata",
    "format_heading_display",
    "format_heading_for_display",
    "get_dynamic_heading_display",
    "get_relevant_headings_for_display",
    "is_multiword",
]
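For orientation, the names imported above are re-exported at the package level via __all__, so downstream code can import them without reaching into the submodules. A minimal sketch (the particular selection of names is illustrative):

# These names are re-exported by dataknobs_xization/markdown/__init__.py (see __all__ above).
from dataknobs_xization.markdown import (
    ChunkQualityConfig,
    ChunkQualityFilter,
    MarkdownParser,
    build_enriched_text,
)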
dataknobs_xization/markdown/enrichment.py
@@ -0,0 +1,260 @@
"""Heading enrichment utilities for RAG-optimized chunk embeddings.

This module provides utilities to enrich chunk content with heading context
for improved semantic search, while keeping headings out of the displayed content.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any


def is_multiword(heading: str) -> bool:
    """Check if a heading contains multiple words.

    Args:
        heading: The heading text to check

    Returns:
        True if the heading has more than one word
    """
    return len(heading.split()) > 1


def format_heading_display(
    heading_path: list[str],
    separator: str = " > ",
) -> str:
    """Format a heading path for display.

    Args:
        heading_path: List of headings from root to chunk
        separator: Separator to use between headings

    Returns:
        Formatted heading path string
    """
    if not heading_path:
        return ""
    return separator.join(heading_path)


def get_dynamic_heading_display(
    heading_path: list[str],
    content_length: int,
    small_threshold: int = 200,
    medium_threshold: int = 800,
) -> str:
    """Get heading display based on content length.

    Dynamic heading inclusion:
    - Small chunks (< small_threshold): Full heading path
    - Medium chunks (< medium_threshold): Last 2 headings
    - Large chunks: No headings

    Args:
        heading_path: List of headings from root to chunk
        content_length: Length of chunk content in characters
        small_threshold: Max chars for "small" chunks
        medium_threshold: Max chars for "medium" chunks

    Returns:
        Formatted heading display string
    """
    if not heading_path:
        return ""

    if content_length <= small_threshold:
        # Small chunks: include full heading path
        return format_heading_display(heading_path)
    elif content_length <= medium_threshold:
        # Medium chunks: include last 2 heading levels
        relevant = heading_path[-2:] if len(heading_path) > 2 else heading_path
        return format_heading_display(relevant)
    else:
        # Large chunks: omit headings
        return ""


@dataclass
class EnrichedChunkData:
    """Data for a chunk enriched with heading context.

    Attributes:
        content: Clean content text (no headings)
        embedding_text: Text to use for embedding (heading-enriched)
        heading_path: List of headings from root to chunk
        heading_display: Formatted heading path for display
        content_length: Length of clean content in characters
    """

    content: str
    embedding_text: str
    heading_path: list[str]
    heading_display: str
    content_length: int


def build_enriched_text(heading_path: list[str], content: str) -> str:
    """Build text for embedding with relevant heading context.

    Uses a modified approach where headings are included up from the chunk
    until and including the first multi-word heading. This provides semantic
    context without over-weighting deep, single-word labels like "Example".

    Args:
        heading_path: List of heading texts from root to chunk
        content: The chunk content text

    Returns:
        Enriched text suitable for embedding

    Examples:
        >>> build_enriched_text(["Patterns", "Chain-of-Thought", "Example"], "code here")
        'Chain-of-Thought: Example: code here'

        >>> build_enriched_text(["Setup"], "install steps")
        'Setup: install steps'

        >>> build_enriched_text(["API Reference", "Authentication", "OAuth 2.0"], "...")
        'Authentication: OAuth 2.0: ...'

        >>> build_enriched_text([], "standalone content")
        'standalone content'
    """
    if not heading_path:
        return content

    # Walk backwards from deepest heading to find relevant context
    relevant_headings = []
    for heading in reversed(heading_path):
        relevant_headings.insert(0, heading)
        # Stop after including a multi-word heading
        if len(heading.split()) > 1:
            break

    # Build the enriched text
    if relevant_headings:
        prefix = ": ".join(relevant_headings)
        return f"{prefix}: {content}"

    return content


def extract_heading_metadata(
    headings: list[str],
    heading_levels: list[int],
    separator: str = " > ",
) -> dict[str, Any]:
    """Extract heading metadata for storage.

    Args:
        headings: List of heading texts from root to chunk
        heading_levels: Corresponding heading levels (1-6)
        separator: Separator for display string

    Returns:
        Dictionary with heading metadata fields
    """
    return {
        "heading_path": headings,
        "heading_levels": heading_levels,
        "heading_display": separator.join(headings) if headings else "",
        "heading_depth": len(headings),
    }


def get_relevant_headings_for_display(
    heading_path: list[str],
    content_length: int,
    small_threshold: int = 200,
    medium_threshold: int = 800,
) -> list[str]:
    """Get headings to display based on content length.

    Implements dynamic heading inclusion:
    - Small chunks: Full heading path (need context)
    - Medium chunks: Last 2 heading levels
    - Large chunks: No headings (content is self-contained)

    Args:
        heading_path: List of heading texts from root to chunk
        content_length: Length of chunk content in characters
        small_threshold: Max chars for "small" chunks
        medium_threshold: Max chars for "medium" chunks

    Returns:
        List of headings to display
    """
    if not heading_path:
        return []

    if content_length < small_threshold:
        # Small chunks: include full heading path
        return heading_path
    elif content_length < medium_threshold:
        # Medium chunks: include last 2 heading levels
        return heading_path[-2:] if len(heading_path) > 2 else heading_path
    else:
        # Large chunks: omit headings
        return []


def format_heading_for_display(
    headings: list[str],
    heading_levels: list[int] | None = None,
    format_style: str = "markdown",
) -> str:
    """Format headings for display in LLM context.

    Args:
        headings: List of heading texts to display
        heading_levels: Corresponding levels (used for markdown format)
        format_style: "markdown" for # syntax, "path" for > separated

    Returns:
        Formatted heading string
    """
    if not headings:
        return ""

    if format_style == "path":
        return " > ".join(headings)

    if format_style == "markdown" and heading_levels:
        lines = []
        for heading, level in zip(headings, heading_levels):
            lines.append(f"{'#' * level} {heading}")
        return "\n".join(lines)

    # Default: just join with separator
    return " > ".join(headings)


def enrich_chunk(
    content: str,
    headings: list[str],
    heading_levels: list[int],
) -> EnrichedChunkData:
    """Create fully enriched chunk data from raw components.

    Convenience function that combines all enrichment operations.

    Args:
        content: Raw chunk content text
        headings: List of heading texts from root to chunk
        heading_levels: Corresponding heading levels

    Returns:
        EnrichedChunkData with all computed fields
    """
    embedding_text = build_enriched_text(headings, content)

    return EnrichedChunkData(
        content=content,
        embedding_text=embedding_text,
        heading_path=headings,
        heading_display=" > ".join(headings) if headings else "",
        content_length=len(content),
    )
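A small usage sketch of the enrichment helpers above. The heading path and chunk text are made up for illustration; the expected outputs follow directly from the code shown:

from dataknobs_xization.markdown.enrichment import (
    build_enriched_text,
    enrich_chunk,
    get_relevant_headings_for_display,
)

# Illustrative heading path and chunk text (not from the package).
path = ["API Reference", "Authentication", "Tokens"]
text = "Use a bearer token."

# Walks up from the deepest heading and stops once a multi-word heading is included.
print(build_enriched_text(path, text))
# API Reference: Authentication: Tokens: Use a bearer token.

# Short chunks get the full heading path for display; large chunks get none.
print(get_relevant_headings_for_display(path, content_length=120))   # full path
print(get_relevant_headings_for_display(path, content_length=5000))  # []

data = enrich_chunk(text, path, [1, 2, 3])
print(data.heading_display)  # API Reference > Authentication > Tokens
print(data.content_length)   # 19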
dataknobs_xization/markdown/filters.py
@@ -0,0 +1,236 @@
"""Quality filters for markdown chunks.

This module provides filtering utilities to identify and remove low-quality
chunks that would not contribute meaningful content to RAG retrieval.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from dataknobs_xization.markdown.md_chunker import Chunk


@dataclass
class ChunkQualityConfig:
    """Configuration for chunk quality filtering.

    Attributes:
        min_content_chars: Minimum characters of non-heading content
        min_alphanumeric_ratio: Minimum ratio of alphanumeric to total chars
        skip_heading_only: Skip chunks with only headings (no body content)
        min_words: Minimum word count for content
        allow_code_blocks: Allow short code blocks that would otherwise be filtered
        allow_tables: Allow short tables that would otherwise be filtered
    """

    min_content_chars: int = 50
    min_alphanumeric_ratio: float = 0.3
    skip_heading_only: bool = True
    min_words: int = 5
    allow_code_blocks: bool = True
    allow_tables: bool = True


class ChunkQualityFilter:
    """Filter for identifying and removing low-quality chunks.

    This filter helps ensure that only meaningful content is indexed
    for RAG retrieval, reducing noise and improving retrieval quality.
    """

    def __init__(self, config: ChunkQualityConfig | None = None):
        """Initialize the quality filter.

        Args:
            config: Quality configuration, uses defaults if not provided
        """
        self.config = config or ChunkQualityConfig()

    def is_valid(self, chunk: Chunk) -> bool:
        """Check if a chunk meets quality thresholds.

        Args:
            chunk: The chunk to evaluate

        Returns:
            True if chunk should be kept, False if it should be filtered
        """
        # Get node type from custom metadata
        node_type = chunk.metadata.custom.get("node_type", "body")

        # Special handling for code blocks and tables
        if node_type == "code" and self.config.allow_code_blocks:
            return self._is_valid_code_block(chunk)
        if node_type == "table" and self.config.allow_tables:
            return self._is_valid_table(chunk)

        # Extract content without heading markers
        content = self._extract_content_text(chunk.text)

        # Check for heading-only chunks
        if self.config.skip_heading_only and not content.strip():
            return False

        # Check minimum content length
        if len(content) < self.config.min_content_chars:
            return False

        # Check alphanumeric ratio
        if not self._meets_alphanumeric_threshold(content):
            return False

        # Check word count
        if not self._meets_word_count(content):
            return False

        return True

    def _extract_content_text(self, text: str) -> str:
        """Extract content text, removing markdown heading markers.

        Args:
            text: Raw chunk text

        Returns:
            Content without heading lines
        """
        lines = text.split("\n")
        content_lines = []

        for line in lines:
            # Skip markdown heading lines
            if re.match(r"^#+\s+", line):
                continue
            content_lines.append(line)

        return "\n".join(content_lines)

    def _meets_alphanumeric_threshold(self, text: str) -> bool:
        """Check if text meets minimum alphanumeric ratio.

        Args:
            text: Text to check

        Returns:
            True if ratio is met
        """
        if not text:
            return False

        alphanumeric_count = sum(1 for c in text if c.isalnum())
        total_count = len(text)

        if total_count == 0:
            return False

        ratio = alphanumeric_count / total_count
        return ratio >= self.config.min_alphanumeric_ratio

    def _meets_word_count(self, text: str) -> bool:
        """Check if text meets minimum word count.

        Args:
            text: Text to check

        Returns:
            True if word count is met
        """
        words = text.split()
        return len(words) >= self.config.min_words

    def _is_valid_code_block(self, chunk: Chunk) -> bool:
        """Check if a code block chunk is valid.

        Code blocks are given more lenient filtering since they may be
        short but still valuable (e.g., single function definitions).

        Args:
            chunk: Code block chunk

        Returns:
            True if code block should be kept
        """
        # Code blocks must have at least some content
        content = chunk.text.strip()
        if not content:
            return False

        # Allow code blocks with at least one non-whitespace line
        lines = [line for line in content.split("\n") if line.strip()]
        return len(lines) >= 1

    def _is_valid_table(self, chunk: Chunk) -> bool:
        """Check if a table chunk is valid.

        Tables are given more lenient filtering since they may be
        compact but information-rich.

        Args:
            chunk: Table chunk

        Returns:
            True if table should be kept
        """
        # Tables must have at least some content
        content = chunk.text.strip()
        if not content:
            return False

        # Tables should have at least header row and one data row
        lines = [line for line in content.split("\n") if line.strip()]
        return len(lines) >= 2

    def filter_chunks(self, chunks: list[Chunk]) -> list[Chunk]:
        """Filter a list of chunks, keeping only valid ones.

        Args:
            chunks: List of chunks to filter

        Returns:
            List of chunks that pass quality thresholds
        """
        return [chunk for chunk in chunks if self.is_valid(chunk)]

    def get_rejection_reason(self, chunk: Chunk) -> str | None:
        """Get the reason a chunk would be rejected.

        Useful for debugging and understanding filtering behavior.

        Args:
            chunk: The chunk to evaluate

        Returns:
            Rejection reason string, or None if chunk is valid
        """
        node_type = chunk.metadata.custom.get("node_type", "body")

        if node_type == "code" and self.config.allow_code_blocks:
            if not self._is_valid_code_block(chunk):
                return "Empty code block"
            return None

        if node_type == "table" and self.config.allow_tables:
            if not self._is_valid_table(chunk):
                return "Empty or single-row table"
            return None

        content = self._extract_content_text(chunk.text)

        if self.config.skip_heading_only and not content.strip():
            return "Heading-only chunk (no body content)"

        if len(content) < self.config.min_content_chars:
            return f"Content too short ({len(content)} < {self.config.min_content_chars} chars)"

        if not self._meets_alphanumeric_threshold(content):
            return f"Alphanumeric ratio below threshold ({self.config.min_alphanumeric_ratio})"

        if not self._meets_word_count(content):
            words = len(content.split())
            return f"Word count too low ({words} < {self.config.min_words} words)"

        return None