inkmark 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,302 @@
1
+ //! Sliding-window chunking for LLM / RAG pipelines.
2
+ //!
3
+ //! Splits a document into fixed-size chunks with optional overlap.
4
+ //! Unlike `chunks_by_heading` (which uses document structure), this
5
+ //! walks the filter-applied Markdown sequentially and emits windows
6
+ //! bounded by a character and/or word budget.
7
+ //!
8
+ //! Two boundary modes:
9
+ //! - [`BoundaryAt::Block`]: cut only between top-level Markdown blocks.
10
+ //! Output is always valid Markdown. Oversized blocks are emitted
11
+ //! as their own windows (decision A).
12
+ //! - [`BoundaryAt::Word`]: serialize the full filtered Markdown, cut
13
+ //! at the last Unicode word boundary that fits. Tighter fit but may
14
+ //! split open constructs (code fences, links).
15
+ //!
16
+ //! Overlap is measured in chars. Each new window begins with up to
17
+ //! `overlap` trailing chars of the previous window's content, so
18
+ //! adjacent chunks share context.
19
+
20
+ use magnus::{Error, RArray, RHash, Ruby};
21
+ use pulldown_cmark::{Event, Parser};
22
+ use unicode_segmentation::UnicodeSegmentation;
23
+
24
+ use crate::document::apply_filters;
25
+ use crate::options::build_options;
26
+
27
/// Where a window is allowed to end.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BoundaryAt {
    /// Cut only between top-level Markdown blocks.
    Block,
    /// Cut at Unicode word boundaries of the serialized Markdown.
    Word,
}
32
+
33
+ pub struct WindowParams {
34
+ pub chars: Option<usize>,
35
+ pub words: Option<usize>,
36
+ pub overlap: usize,
37
+ pub at: BoundaryAt,
38
+ }
39
+
40
+ pub fn native_chunks_by_size(
41
+ ruby: &Ruby,
42
+ source: String,
43
+ opts_hash: RHash,
44
+ ) -> Result<RArray, Error> {
45
+ let params = parse_params(ruby, &opts_hash)?;
46
+ let (cm_opts, flags) = build_options(ruby, opts_hash)?;
47
+
48
+ let events: Vec<Event> = Parser::new_ext(&source, cm_opts).collect();
49
+ let events = apply_filters(events, &flags);
50
+
51
+ let windows = match params.at {
52
+ BoundaryAt::Block => chunk_blocks(&events, &params),
53
+ BoundaryAt::Word => chunk_words(&events, &params),
54
+ };
55
+
56
+ build_result(ruby, &windows, flags.statistics)
57
+ }
58
+
59
+ fn parse_params(ruby: &Ruby, hash: &RHash) -> Result<WindowParams, Error> {
60
+ let nested: Option<RHash> = hash.lookup(ruby.to_symbol("__window"))?;
61
+ let params =
62
+ nested.ok_or_else(|| Error::new(ruby.exception_arg_error(), "missing window params"))?;
63
+
64
+ let chars: Option<usize> = params.lookup(ruby.to_symbol("chars"))?;
65
+ let words: Option<usize> = params.lookup(ruby.to_symbol("words"))?;
66
+ let overlap: Option<usize> = params.lookup(ruby.to_symbol("overlap"))?;
67
+ let at_str: Option<String> = params.lookup(ruby.to_symbol("at"))?;
68
+
69
+ let at = match at_str.as_deref() {
70
+ Some("word") => BoundaryAt::Word,
71
+ _ => BoundaryAt::Block,
72
+ };
73
+
74
+ Ok(WindowParams {
75
+ chars,
76
+ words,
77
+ overlap: overlap.unwrap_or(0),
78
+ at,
79
+ })
80
+ }
81
+
82
+ fn chunk_blocks(events: &[Event<'_>], params: &WindowParams) -> Vec<String> {
83
+ let mut out: Vec<String> = Vec::new();
84
+ let mut current = String::new();
85
+ let mut current_chars: usize = 0;
86
+ let mut current_words: usize = 0;
87
+
88
+ for (start, end) in top_level_blocks(events) {
89
+ let block = render_markdown(&events[start..=end]);
90
+ let block_chars = block.chars().count();
91
+ let block_words = block.unicode_words().count();
92
+
93
+ let would_exceed_chars = params
94
+ .chars
95
+ .map(|b| current_chars + block_chars > b)
96
+ .unwrap_or(false);
97
+ let would_exceed_words = params
98
+ .words
99
+ .map(|b| current_words + block_words > b)
100
+ .unwrap_or(false);
101
+
102
+ // Oversized blocks (a single block larger than the
103
+ // budget) get emitted as their own window, never silently
104
+ // dropped or truncated.
105
+ if (would_exceed_chars || would_exceed_words) && !current.is_empty() {
106
+ let finished = std::mem::take(&mut current);
107
+ current = seed_with_overlap(&finished, params.overlap);
108
+ current_chars = current.chars().count();
109
+ current_words = current.unicode_words().count();
110
+ out.push(finished);
111
+ }
112
+
113
+ current.push_str(&block);
114
+ current_chars += block_chars;
115
+ current_words += block_words;
116
+ }
117
+
118
+ if !current.is_empty() {
119
+ out.push(current);
120
+ }
121
+ out
122
+ }
123
+
124
/// Return the last `overlap` characters of `s` as an owned string.
///
/// The cut always falls on a `char` boundary. An empty string is
/// returned when `overlap` is `0` or when `s` has no more than
/// `overlap` characters.
fn seed_with_overlap(s: &str, overlap: usize) -> String {
    if overlap == 0 {
        return String::new();
    }
    // Walking backwards, the `overlap`-th character marks where the tail
    // starts. It is missing when `s` is too short, and sits at byte 0
    // when `s` has exactly `overlap` characters; both cases yield "".
    match s.char_indices().rev().nth(overlap - 1) {
        Some((start, _)) if start > 0 => s[start..].to_string(),
        _ => String::new(),
    }
}
138
+
139
+ fn top_level_blocks(events: &[Event<'_>]) -> Vec<(usize, usize)> {
140
+ let mut blocks = Vec::new();
141
+ let mut depth: i32 = 0;
142
+ let mut current_start: Option<usize> = None;
143
+
144
+ for (i, event) in events.iter().enumerate() {
145
+ match event {
146
+ Event::Start(_) => {
147
+ if depth == 0 {
148
+ current_start = Some(i);
149
+ }
150
+ depth += 1;
151
+ }
152
+ Event::End(_) => {
153
+ depth -= 1;
154
+ if depth == 0 {
155
+ if let Some(start) = current_start.take() {
156
+ blocks.push((start, i));
157
+ }
158
+ }
159
+ }
160
+ _ => {
161
+ if depth == 0 {
162
+ blocks.push((i, i));
163
+ }
164
+ }
165
+ }
166
+ }
167
+ blocks
168
+ }
169
+
170
+ fn render_markdown(events: &[Event<'_>]) -> String {
171
+ let mut buf = String::new();
172
+ pulldown_cmark_to_cmark::cmark(events.iter().cloned(), &mut buf)
173
+ .expect("markdown serialization failed");
174
+ buf
175
+ }
176
+
177
+ // Serialize the full filtered Markdown into one string, then walk
178
+ // word boundaries. Each window is a byte-aligned slice of the
179
+ // serialized output ending at a word boundary that fits the budget.
180
+ // Overlap is implemented by advancing the window start backward by
181
+ // `overlap` chars (to the next word boundary) before taking the
182
+ // next window.
183
+
184
+ fn chunk_words(events: &[Event<'_>], params: &WindowParams) -> Vec<String> {
185
+ let rendered = render_markdown(events);
186
+ if rendered.is_empty() {
187
+ return Vec::new();
188
+ }
189
+
190
+ let mut out: Vec<String> = Vec::new();
191
+ let mut cursor: usize = 0; // byte offset into `rendered`
192
+ let bytes_len = rendered.len();
193
+
194
+ while cursor < bytes_len {
195
+ // Find the largest byte offset `end_byte` such that the chars
196
+ // in rendered[cursor..end_byte] fit both char and word budgets
197
+ // and end on a word boundary.
198
+ let slice = &rendered[cursor..];
199
+ let mut used_chars: usize = 0;
200
+ let mut used_words: usize = 0;
201
+ let mut last_good_byte: usize = 0;
202
+
203
+ for (offset, segment) in slice.split_word_bound_indices() {
204
+ let seg_chars = segment.chars().count();
205
+ let seg_is_word = segment.unicode_words().next().is_some();
206
+
207
+ let next_chars = used_chars + seg_chars;
208
+ let next_words = if seg_is_word {
209
+ used_words + 1
210
+ } else {
211
+ used_words
212
+ };
213
+
214
+ let over_chars = params.chars.map(|b| next_chars > b).unwrap_or(false);
215
+ let over_words = params.words.map(|b| next_words > b).unwrap_or(false);
216
+ if over_chars || over_words {
217
+ break;
218
+ }
219
+
220
+ used_chars = next_chars;
221
+ used_words = next_words;
222
+ last_good_byte = offset + segment.len();
223
+ }
224
+
225
+ // If no progress at all, take the next whole segment to avoid
226
+ // an infinite loop (can happen if the first segment's char
227
+ // count already exceeds the budget).
228
+ if last_good_byte == 0 {
229
+ if let Some((_, segment)) = slice.split_word_bound_indices().next() {
230
+ last_good_byte = segment.len();
231
+ } else {
232
+ break;
233
+ }
234
+ }
235
+
236
+ let window = slice[..last_good_byte].to_string();
237
+ if !window.is_empty() {
238
+ out.push(window);
239
+ }
240
+
241
+ // Advance cursor by the full window length, then step back by
242
+ // `overlap` chars (to a word boundary) for the next window.
243
+ // Guarantee forward progress so we can't loop forever.
244
+ let next_cursor = cursor + last_good_byte;
245
+ let candidate = advance_with_overlap(&rendered, next_cursor, params.overlap);
246
+ cursor = if candidate <= cursor {
247
+ next_cursor
248
+ } else {
249
+ candidate
250
+ };
251
+ }
252
+
253
+ out
254
+ }
255
+
256
+ /// Step the cursor back by roughly `overlap` characters, then land on
257
+ /// the next word boundary to keep slices aligned. Returns the new
258
+ /// cursor position (byte offset).
259
+ fn advance_with_overlap(rendered: &str, end_byte: usize, overlap: usize) -> usize {
260
+ if overlap == 0 {
261
+ return end_byte;
262
+ }
263
+ let prefix = &rendered[..end_byte];
264
+ let prefix_chars = prefix.chars().count();
265
+ if overlap >= prefix_chars {
266
+ return 0;
267
+ }
268
+ let target_char_index = prefix_chars - overlap;
269
+
270
+ // Find the word boundary at or before target_char_index.
271
+ let mut char_idx: usize = 0;
272
+ let mut last_boundary: usize = 0;
273
+ for (offset, segment) in prefix.split_word_bound_indices() {
274
+ if char_idx >= target_char_index {
275
+ return offset;
276
+ }
277
+ char_idx += segment.chars().count();
278
+ last_boundary = offset + segment.len();
279
+ }
280
+ last_boundary
281
+ }
282
+
283
+ fn build_result(ruby: &Ruby, windows: &[String], with_counts: bool) -> Result<RArray, Error> {
284
+ let arr = ruby.ary_new_capa(windows.len());
285
+ for (i, content) in windows.iter().enumerate() {
286
+ let hash = ruby.hash_new();
287
+ hash.aset(ruby.to_symbol("index"), i)?;
288
+ hash.aset(ruby.to_symbol("content"), content.as_str())?;
289
+ if with_counts {
290
+ hash.aset(
291
+ ruby.to_symbol("character_count"),
292
+ content.trim().chars().count(),
293
+ )?;
294
+ hash.aset(
295
+ ruby.to_symbol("word_count"),
296
+ content.unicode_words().count(),
297
+ )?;
298
+ }
299
+ arr.push(hash)?;
300
+ }
301
+ Ok(arr)
302
+ }