inkmark 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Cargo.lock +940 -0
- data/Cargo.toml +27 -0
- data/LICENSE.txt +21 -0
- data/NOTICE +16 -0
- data/README.md +1166 -0
- data/ext/inkmark/Cargo.toml +31 -0
- data/ext/inkmark/build.rs +5 -0
- data/ext/inkmark/extconf.rb +6 -0
- data/ext/inkmark/src/autolink.rs +167 -0
- data/ext/inkmark/src/chunks_by_heading.rs +325 -0
- data/ext/inkmark/src/chunks_by_size.rs +302 -0
- data/ext/inkmark/src/document.rs +411 -0
- data/ext/inkmark/src/emoji.rs +197 -0
- data/ext/inkmark/src/handler.rs +758 -0
- data/ext/inkmark/src/heading.rs +262 -0
- data/ext/inkmark/src/highlight.rs +202 -0
- data/ext/inkmark/src/image.rs +284 -0
- data/ext/inkmark/src/lib.rs +54 -0
- data/ext/inkmark/src/link.rs +291 -0
- data/ext/inkmark/src/options.rs +231 -0
- data/ext/inkmark/src/plain_text.rs +445 -0
- data/ext/inkmark/src/scheme_filter.rs +319 -0
- data/ext/inkmark/src/stats.rs +453 -0
- data/ext/inkmark/src/tag_filter.rs +226 -0
- data/ext/inkmark/src/toc.rs +221 -0
- data/ext/inkmark/src/truncate.rs +267 -0
- data/ext/inkmark/src/url_match.rs +178 -0
- data/lib/inkmark/event.rb +342 -0
- data/lib/inkmark/native.rb +8 -0
- data/lib/inkmark/options.rb +698 -0
- data/lib/inkmark/toc.rb +40 -0
- data/lib/inkmark/version.rb +6 -0
- data/lib/inkmark.rb +711 -0
- data/sig/inkmark.rbs +219 -0
- metadata +208 -0
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
//! Sliding-window chunking for LLM / RAG pipelines.
|
|
2
|
+
//!
|
|
3
|
+
//! Splits a document into fixed-size chunks with optional overlap.
|
|
4
|
+
//! Unlike `chunks_by_heading` (which uses document structure), this
|
|
5
|
+
//! walks the filter-applied Markdown sequentially and emits windows
|
|
6
|
+
//! bounded by a character and/or word budget.
|
|
7
|
+
//!
|
|
8
|
+
//! Two boundary modes:
|
|
9
|
+
//! - [`BoundaryAt::Block`]: cut only between top-level Markdown blocks.
|
|
10
|
+
//! Output is always valid Markdown. Oversized blocks are emitted
|
|
11
|
+
//! as their own windows (decision A).
|
|
12
|
+
//! - [`BoundaryAt::Word`]: serialize the full filtered Markdown, cut
|
|
13
|
+
//! at the last Unicode word boundary that fits. Tighter fit but may
|
|
14
|
+
//! split open constructs (code fences, links).
|
|
15
|
+
//!
|
|
16
|
+
//! Overlap is measured in chars. Each new window begins with the
|
|
17
|
+
//! trailing `overlap` chars of the previous window's content, so
|
|
18
|
+
//! adjacent chunks share context.
|
|
19
|
+
|
|
20
|
+
use magnus::{Error, RArray, RHash, Ruby};
|
|
21
|
+
use pulldown_cmark::{Event, Parser};
|
|
22
|
+
use unicode_segmentation::UnicodeSegmentation;
|
|
23
|
+
|
|
24
|
+
use crate::document::apply_filters;
|
|
25
|
+
use crate::options::build_options;
|
|
26
|
+
|
|
27
|
+
/// Where a size-bounded window is allowed to end.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BoundaryAt {
    /// Cut only between top-level Markdown blocks.
    Block,
    /// Cut at the last word boundary that still fits the budget.
    Word,
}
|
|
32
|
+
|
|
33
|
+
pub struct WindowParams {
|
|
34
|
+
pub chars: Option<usize>,
|
|
35
|
+
pub words: Option<usize>,
|
|
36
|
+
pub overlap: usize,
|
|
37
|
+
pub at: BoundaryAt,
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
pub fn native_chunks_by_size(
|
|
41
|
+
ruby: &Ruby,
|
|
42
|
+
source: String,
|
|
43
|
+
opts_hash: RHash,
|
|
44
|
+
) -> Result<RArray, Error> {
|
|
45
|
+
let params = parse_params(ruby, &opts_hash)?;
|
|
46
|
+
let (cm_opts, flags) = build_options(ruby, opts_hash)?;
|
|
47
|
+
|
|
48
|
+
let events: Vec<Event> = Parser::new_ext(&source, cm_opts).collect();
|
|
49
|
+
let events = apply_filters(events, &flags);
|
|
50
|
+
|
|
51
|
+
let windows = match params.at {
|
|
52
|
+
BoundaryAt::Block => chunk_blocks(&events, ¶ms),
|
|
53
|
+
BoundaryAt::Word => chunk_words(&events, ¶ms),
|
|
54
|
+
};
|
|
55
|
+
|
|
56
|
+
build_result(ruby, &windows, flags.statistics)
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
fn parse_params(ruby: &Ruby, hash: &RHash) -> Result<WindowParams, Error> {
|
|
60
|
+
let nested: Option<RHash> = hash.lookup(ruby.to_symbol("__window"))?;
|
|
61
|
+
let params =
|
|
62
|
+
nested.ok_or_else(|| Error::new(ruby.exception_arg_error(), "missing window params"))?;
|
|
63
|
+
|
|
64
|
+
let chars: Option<usize> = params.lookup(ruby.to_symbol("chars"))?;
|
|
65
|
+
let words: Option<usize> = params.lookup(ruby.to_symbol("words"))?;
|
|
66
|
+
let overlap: Option<usize> = params.lookup(ruby.to_symbol("overlap"))?;
|
|
67
|
+
let at_str: Option<String> = params.lookup(ruby.to_symbol("at"))?;
|
|
68
|
+
|
|
69
|
+
let at = match at_str.as_deref() {
|
|
70
|
+
Some("word") => BoundaryAt::Word,
|
|
71
|
+
_ => BoundaryAt::Block,
|
|
72
|
+
};
|
|
73
|
+
|
|
74
|
+
Ok(WindowParams {
|
|
75
|
+
chars,
|
|
76
|
+
words,
|
|
77
|
+
overlap: overlap.unwrap_or(0),
|
|
78
|
+
at,
|
|
79
|
+
})
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
fn chunk_blocks(events: &[Event<'_>], params: &WindowParams) -> Vec<String> {
|
|
83
|
+
let mut out: Vec<String> = Vec::new();
|
|
84
|
+
let mut current = String::new();
|
|
85
|
+
let mut current_chars: usize = 0;
|
|
86
|
+
let mut current_words: usize = 0;
|
|
87
|
+
|
|
88
|
+
for (start, end) in top_level_blocks(events) {
|
|
89
|
+
let block = render_markdown(&events[start..=end]);
|
|
90
|
+
let block_chars = block.chars().count();
|
|
91
|
+
let block_words = block.unicode_words().count();
|
|
92
|
+
|
|
93
|
+
let would_exceed_chars = params
|
|
94
|
+
.chars
|
|
95
|
+
.map(|b| current_chars + block_chars > b)
|
|
96
|
+
.unwrap_or(false);
|
|
97
|
+
let would_exceed_words = params
|
|
98
|
+
.words
|
|
99
|
+
.map(|b| current_words + block_words > b)
|
|
100
|
+
.unwrap_or(false);
|
|
101
|
+
|
|
102
|
+
// Oversized blocks (a single block larger than the
|
|
103
|
+
// budget) get emitted as their own window, never silently
|
|
104
|
+
// dropped or truncated.
|
|
105
|
+
if (would_exceed_chars || would_exceed_words) && !current.is_empty() {
|
|
106
|
+
let finished = std::mem::take(&mut current);
|
|
107
|
+
current = seed_with_overlap(&finished, params.overlap);
|
|
108
|
+
current_chars = current.chars().count();
|
|
109
|
+
current_words = current.unicode_words().count();
|
|
110
|
+
out.push(finished);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
current.push_str(&block);
|
|
114
|
+
current_chars += block_chars;
|
|
115
|
+
current_words += block_words;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
if !current.is_empty() {
|
|
119
|
+
out.push(current);
|
|
120
|
+
}
|
|
121
|
+
out
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/// Return the last `overlap` characters of `s` as an owned string.
///
/// Counting is done in `char`s, so the cut always falls on a char
/// boundary. An empty string is returned when `overlap` is `0` or when
/// `s` holds `overlap` characters or fewer (the whole of `s` is never
/// used as a seed).
fn seed_with_overlap(s: &str, overlap: usize) -> String {
    if overlap == 0 {
        return String::new();
    }

    // Walk backwards to the first char of the tail. Index 0 means the
    // tail would be all of `s`; `None` means `s` is shorter than the
    // overlap. Both cases yield no seed.
    match s.char_indices().rev().nth(overlap - 1) {
        Some((tail_start, _)) if tail_start > 0 => s[tail_start..].to_owned(),
        _ => String::new(),
    }
}
|
|
138
|
+
|
|
139
|
+
fn top_level_blocks(events: &[Event<'_>]) -> Vec<(usize, usize)> {
|
|
140
|
+
let mut blocks = Vec::new();
|
|
141
|
+
let mut depth: i32 = 0;
|
|
142
|
+
let mut current_start: Option<usize> = None;
|
|
143
|
+
|
|
144
|
+
for (i, event) in events.iter().enumerate() {
|
|
145
|
+
match event {
|
|
146
|
+
Event::Start(_) => {
|
|
147
|
+
if depth == 0 {
|
|
148
|
+
current_start = Some(i);
|
|
149
|
+
}
|
|
150
|
+
depth += 1;
|
|
151
|
+
}
|
|
152
|
+
Event::End(_) => {
|
|
153
|
+
depth -= 1;
|
|
154
|
+
if depth == 0 {
|
|
155
|
+
if let Some(start) = current_start.take() {
|
|
156
|
+
blocks.push((start, i));
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
_ => {
|
|
161
|
+
if depth == 0 {
|
|
162
|
+
blocks.push((i, i));
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
blocks
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
fn render_markdown(events: &[Event<'_>]) -> String {
|
|
171
|
+
let mut buf = String::new();
|
|
172
|
+
pulldown_cmark_to_cmark::cmark(events.iter().cloned(), &mut buf)
|
|
173
|
+
.expect("markdown serialization failed");
|
|
174
|
+
buf
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
// Serialize the full filtered Markdown into one string, then walk
|
|
178
|
+
// word boundaries. Each window is a byte-aligned slice of the
|
|
179
|
+
// serialized output ending at a word boundary that fits the budget.
|
|
180
|
+
// Overlap is implemented by advancing the window start backward by
|
|
181
|
+
// `overlap` chars (to the next word boundary) before taking the
|
|
182
|
+
// next window.
|
|
183
|
+
|
|
184
|
+
fn chunk_words(events: &[Event<'_>], params: &WindowParams) -> Vec<String> {
|
|
185
|
+
let rendered = render_markdown(events);
|
|
186
|
+
if rendered.is_empty() {
|
|
187
|
+
return Vec::new();
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
let mut out: Vec<String> = Vec::new();
|
|
191
|
+
let mut cursor: usize = 0; // byte offset into `rendered`
|
|
192
|
+
let bytes_len = rendered.len();
|
|
193
|
+
|
|
194
|
+
while cursor < bytes_len {
|
|
195
|
+
// Find the largest byte offset `end_byte` such that the chars
|
|
196
|
+
// in rendered[cursor..end_byte] fit both char and word budgets
|
|
197
|
+
// and end on a word boundary.
|
|
198
|
+
let slice = &rendered[cursor..];
|
|
199
|
+
let mut used_chars: usize = 0;
|
|
200
|
+
let mut used_words: usize = 0;
|
|
201
|
+
let mut last_good_byte: usize = 0;
|
|
202
|
+
|
|
203
|
+
for (offset, segment) in slice.split_word_bound_indices() {
|
|
204
|
+
let seg_chars = segment.chars().count();
|
|
205
|
+
let seg_is_word = segment.unicode_words().next().is_some();
|
|
206
|
+
|
|
207
|
+
let next_chars = used_chars + seg_chars;
|
|
208
|
+
let next_words = if seg_is_word {
|
|
209
|
+
used_words + 1
|
|
210
|
+
} else {
|
|
211
|
+
used_words
|
|
212
|
+
};
|
|
213
|
+
|
|
214
|
+
let over_chars = params.chars.map(|b| next_chars > b).unwrap_or(false);
|
|
215
|
+
let over_words = params.words.map(|b| next_words > b).unwrap_or(false);
|
|
216
|
+
if over_chars || over_words {
|
|
217
|
+
break;
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
used_chars = next_chars;
|
|
221
|
+
used_words = next_words;
|
|
222
|
+
last_good_byte = offset + segment.len();
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// If no progress at all, take the next whole segment to avoid
|
|
226
|
+
// an infinite loop (can happen if the first segment's char
|
|
227
|
+
// count already exceeds the budget).
|
|
228
|
+
if last_good_byte == 0 {
|
|
229
|
+
if let Some((_, segment)) = slice.split_word_bound_indices().next() {
|
|
230
|
+
last_good_byte = segment.len();
|
|
231
|
+
} else {
|
|
232
|
+
break;
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
let window = slice[..last_good_byte].to_string();
|
|
237
|
+
if !window.is_empty() {
|
|
238
|
+
out.push(window);
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// Advance cursor by the full window length, then step back by
|
|
242
|
+
// `overlap` chars (to a word boundary) for the next window.
|
|
243
|
+
// Guarantee forward progress so we can't loop forever.
|
|
244
|
+
let next_cursor = cursor + last_good_byte;
|
|
245
|
+
let candidate = advance_with_overlap(&rendered, next_cursor, params.overlap);
|
|
246
|
+
cursor = if candidate <= cursor {
|
|
247
|
+
next_cursor
|
|
248
|
+
} else {
|
|
249
|
+
candidate
|
|
250
|
+
};
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
out
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
/// Step the cursor back by roughly `overlap` characters, then land on
|
|
257
|
+
/// the next word boundary to keep slices aligned. Returns the new
|
|
258
|
+
/// cursor position (byte offset).
|
|
259
|
+
fn advance_with_overlap(rendered: &str, end_byte: usize, overlap: usize) -> usize {
|
|
260
|
+
if overlap == 0 {
|
|
261
|
+
return end_byte;
|
|
262
|
+
}
|
|
263
|
+
let prefix = &rendered[..end_byte];
|
|
264
|
+
let prefix_chars = prefix.chars().count();
|
|
265
|
+
if overlap >= prefix_chars {
|
|
266
|
+
return 0;
|
|
267
|
+
}
|
|
268
|
+
let target_char_index = prefix_chars - overlap;
|
|
269
|
+
|
|
270
|
+
// Find the word boundary at or before target_char_index.
|
|
271
|
+
let mut char_idx: usize = 0;
|
|
272
|
+
let mut last_boundary: usize = 0;
|
|
273
|
+
for (offset, segment) in prefix.split_word_bound_indices() {
|
|
274
|
+
if char_idx >= target_char_index {
|
|
275
|
+
return offset;
|
|
276
|
+
}
|
|
277
|
+
char_idx += segment.chars().count();
|
|
278
|
+
last_boundary = offset + segment.len();
|
|
279
|
+
}
|
|
280
|
+
last_boundary
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
fn build_result(ruby: &Ruby, windows: &[String], with_counts: bool) -> Result<RArray, Error> {
|
|
284
|
+
let arr = ruby.ary_new_capa(windows.len());
|
|
285
|
+
for (i, content) in windows.iter().enumerate() {
|
|
286
|
+
let hash = ruby.hash_new();
|
|
287
|
+
hash.aset(ruby.to_symbol("index"), i)?;
|
|
288
|
+
hash.aset(ruby.to_symbol("content"), content.as_str())?;
|
|
289
|
+
if with_counts {
|
|
290
|
+
hash.aset(
|
|
291
|
+
ruby.to_symbol("character_count"),
|
|
292
|
+
content.trim().chars().count(),
|
|
293
|
+
)?;
|
|
294
|
+
hash.aset(
|
|
295
|
+
ruby.to_symbol("word_count"),
|
|
296
|
+
content.unicode_words().count(),
|
|
297
|
+
)?;
|
|
298
|
+
}
|
|
299
|
+
arr.push(hash)?;
|
|
300
|
+
}
|
|
301
|
+
Ok(arr)
|
|
302
|
+
}
|