lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,468 @@
1
+ //! LexStream - High-performance streaming lexer
2
+ //!
3
+ //! This module implements the performance-critical parts of LexStream in Rust.
4
+ //! Higher-level operations like token_name, text, and line_col are implemented in Ruby.
5
+
6
+ use crate::types::CompiledProgram;
7
+ use crate::vm;
8
+ use rb_sys::*;
9
+ use std::ptr;
10
+
/// Registers an instance method that takes no arguments besides the receiver.
///
/// Expands to an `rb_define_method` call with arity `0`. `$callback` must be an
/// `unsafe extern "C" fn(VALUE) -> VALUE`; it is transmuted to the untyped
/// function-pointer shape that `rb_define_method` accepts. `$method_name` must
/// provide `as_ptr()` yielding a NUL-terminated name (the call sites in this
/// file pass `c"..."` literals).
///
/// The expansion calls unsafe functions, so it has to be used inside an
/// `unsafe` context.
macro_rules! define_ruby_method_noarg {
    ($klass:expr, $method_name:expr, $callback:expr) => {
        rb_define_method(
            $klass,
            $method_name.as_ptr(),
            Some(std::mem::transmute::<
                unsafe extern "C" fn(VALUE) -> VALUE,
                unsafe extern "C" fn() -> VALUE,
            >($callback)),
            0,
        )
    };
}
25
+
/// Registers an instance method that receives its arguments as `(argc, argv, self)`.
///
/// Expands to an `rb_define_method` call with arity `-1`. `$callback` must be an
/// `unsafe extern "C" fn(libc::c_int, *const VALUE, VALUE) -> VALUE`; it is
/// transmuted to the untyped function-pointer shape that `rb_define_method`
/// accepts. `$method_name` must provide `as_ptr()` yielding a NUL-terminated
/// name (the call sites in this file pass `c"..."` literals).
///
/// The expansion calls unsafe functions, so it has to be used inside an
/// `unsafe` context.
macro_rules! define_ruby_method_vararg {
    ($klass:expr, $method_name:expr, $callback:expr) => {
        rb_define_method(
            $klass,
            $method_name.as_ptr(),
            Some(std::mem::transmute::<
                unsafe extern "C" fn(libc::c_int, *const VALUE, VALUE) -> VALUE,
                unsafe extern "C" fn() -> VALUE,
            >($callback)),
            -1,
        )
    };
}
40
+
/// Number of `i32` slots stored per token record: `[token_id, start, len]`.
const STRIDE: usize = 3;

/// Token buffer backed by a flat `Vec<i32>` plus a read cursor.
///
/// Memory allocation and deallocation is handled by `Vec`; no manual `Drop`
/// is required.
pub(crate) struct FastTokenBuffer {
    /// Flat array: `[token_id, start, len] * count`.
    data: Vec<i32>,
    /// Number of tokens (`data.len() / STRIDE`).
    count: usize,
    /// Index of the current token; equals `count` once the buffer is exhausted.
    index: usize,
}

impl FastTokenBuffer {
    /// Slot offsets inside one token record.
    const SLOT_TOKEN_ID: usize = 0;
    const SLOT_START: usize = 1;
    const SLOT_LEN: usize = 2;

    /// Builds a buffer from `(token_id, start, length)` triples.
    ///
    /// # Errors
    /// Returns an error when a `start` or `length` does not fit in an `i32`,
    /// because the buffer stores every field as `i32`.
    pub(crate) fn from_tokens(tokens: Vec<(u16, usize, usize)>) -> Result<Self, &'static str> {
        const TOO_LARGE: &str = "token position exceeds maximum value (input too large)";

        let count = tokens.len();
        let mut data = Vec::with_capacity(count * STRIDE);

        for (token_id, start, length) in tokens {
            let start = i32::try_from(start).map_err(|_| TOO_LARGE)?;
            let length = i32::try_from(length).map_err(|_| TOO_LARGE)?;
            data.extend_from_slice(&[i32::from(token_id), start, length]);
        }

        Ok(Self { data, count, index: 0 })
    }

    /// Reads one slot of the token `ahead` positions past the cursor.
    ///
    /// Returns `-1` when that position lies beyond the last token. The offset
    /// is added with `checked_add`, so an arbitrarily large `ahead` is reported
    /// as out of range instead of overflowing (which would panic in debug
    /// builds and wrap around to an unrelated token in release builds).
    #[inline(always)]
    fn slot(&self, ahead: usize, slot: usize) -> i32 {
        match self.index.checked_add(ahead) {
            Some(i) if i < self.count => self.data[i * STRIDE + slot],
            _ => -1,
        }
    }

    /// True once the cursor has moved past the last token.
    #[inline(always)]
    pub(crate) fn eof(&self) -> bool {
        self.index >= self.count
    }

    /// Token id at the cursor, or `-1` at end of buffer.
    #[inline(always)]
    pub(crate) fn token_id(&self) -> i32 {
        self.slot(0, Self::SLOT_TOKEN_ID)
    }

    /// Start offset of the token at the cursor, or `-1` at end of buffer.
    #[inline(always)]
    pub(crate) fn start(&self) -> i32 {
        self.slot(0, Self::SLOT_START)
    }

    /// Length of the token at the cursor, or `-1` at end of buffer.
    #[inline(always)]
    pub(crate) fn len(&self) -> i32 {
        self.slot(0, Self::SLOT_LEN)
    }

    /// Token id `n` positions past the cursor (`0` is the current token),
    /// or `-1` when out of range.
    #[inline(always)]
    pub(crate) fn peek_token_id(&self, n: usize) -> i32 {
        self.slot(n, Self::SLOT_TOKEN_ID)
    }

    /// Start offset `n` positions past the cursor, or `-1` when out of range.
    #[inline(always)]
    pub(crate) fn peek_start(&self, n: usize) -> i32 {
        self.slot(n, Self::SLOT_START)
    }

    /// Length `n` positions past the cursor, or `-1` when out of range.
    #[inline(always)]
    pub(crate) fn peek_len(&self, n: usize) -> i32 {
        self.slot(n, Self::SLOT_LEN)
    }

    /// Moves the cursor forward by one token; a no-op at end of buffer.
    #[inline(always)]
    pub(crate) fn advance(&mut self) {
        if self.index < self.count {
            self.index += 1;
        }
    }

    /// Get total number of tokens (for testing)
    #[cfg(test)]
    pub(crate) fn count(&self) -> usize {
        self.count
    }

    /// Get current index (for testing)
    #[cfg(test)]
    pub(crate) fn index(&self) -> usize {
        self.index
    }
}
154
+
155
+ // No manual Drop needed - Vec handles memory deallocation automatically
156
+
157
+ // ============================================================================
158
+ // LexStream - Minimal Rust structure for fast VM execution
159
+ // ============================================================================
160
+
161
+ /// LexStream holds token buffer. Higher-level state is managed in Ruby.
162
+ #[repr(C)]
163
+ pub struct LexStream {
164
+ buffer: FastTokenBuffer,
165
+ }
166
+
167
+ // ============================================================================
168
+ // Ruby TypedData integration
169
+ // ============================================================================
170
+
171
+ static mut LEX_STREAM_CLASS: VALUE = 0;
172
+ static mut LEX_STREAM_DATA_TYPE: *const rb_data_type_t = ptr::null();
173
+
174
+ // No mark function needed - LexStream no longer holds Ruby VALUEs
175
+
176
+ unsafe extern "C" fn lex_stream_free(ptr: *mut libc::c_void) {
177
+ if !ptr.is_null() {
178
+ let _ = Box::from_raw(ptr as *mut LexStream);
179
+ }
180
+ }
181
+
182
+ unsafe extern "C" fn lex_stream_size(ptr: *const libc::c_void) -> size_t {
183
+ if ptr.is_null() {
184
+ 0
185
+ } else {
186
+ let stream = &*(ptr as *const LexStream);
187
+ (std::mem::size_of::<LexStream>()
188
+ + stream.buffer.data.capacity() * std::mem::size_of::<i32>()) as size_t
189
+ }
190
+ }
191
+
192
+ #[inline(always)]
193
+ unsafe fn get_stream(obj: VALUE) -> *mut LexStream {
194
+ RTYPEDDATA_GET_DATA(obj) as *mut LexStream
195
+ }
196
+
197
+ // ============================================================================
198
+ // Thread-local storage for passing program between magnus and rb-sys
199
+ // ============================================================================
200
+
201
+ thread_local! {
202
+ static CURRENT_PROGRAM: std::cell::RefCell<*const CompiledProgram> = const { std::cell::RefCell::new(ptr::null()) };
203
+ }
204
+
205
+ pub fn set_current_program(prog: *const CompiledProgram) {
206
+ CURRENT_PROGRAM.with(|p| {
207
+ *p.borrow_mut() = prog;
208
+ });
209
+ }
210
+
211
+ fn take_current_program() -> *const CompiledProgram {
212
+ CURRENT_PROGRAM.with(|p| {
213
+ let ptr = *p.borrow();
214
+ *p.borrow_mut() = ptr::null(); // Clear after taking
215
+ ptr
216
+ })
217
+ }
218
+
219
+ // ============================================================================
220
+ // Ruby method implementations
221
+ // ============================================================================
222
+
223
+ /// LexStream.create(program, input) -> LexStream
224
+ ///
225
+ /// SAFETY: This method requires CURRENT_PROGRAM to be set via set_current_program()
226
+ /// before calling. It is intended to be called only from CompiledProgram#stream.
227
+ /// Direct calls will fail with an error.
228
+ unsafe extern "C" fn lex_stream_create(
229
+ _klass: VALUE,
230
+ program: VALUE,
231
+ input: VALUE,
232
+ ) -> VALUE {
233
+ // Take the compiled program from thread-local (clears it to prevent reuse)
234
+ let prog_ptr = take_current_program();
235
+ if prog_ptr.is_null() {
236
+ rb_raise(
237
+ rb_eArgError,
238
+ c"LexStream.create must be called via CompiledProgram#stream".as_ptr(),
239
+ );
240
+ }
241
+
242
+ // Get input bytes
243
+ let input_ptr = rb_string_value_ptr(&input as *const _ as *mut _);
244
+ let input_len = RSTRING_LEN(input) as usize;
245
+ let bytes = std::slice::from_raw_parts(input_ptr as *const u8, input_len);
246
+
247
+ // Collect tokens using VM
248
+ let prog = &*prog_ptr;
249
+ let tokens = match vm::collect_tokens(prog, bytes) {
250
+ Ok(t) => t,
251
+ Err(msg) => {
252
+ let err_msg = std::ffi::CString::new(msg)
253
+ .unwrap_or_else(|_| std::ffi::CString::new("VM error").unwrap());
254
+ rb_raise(rb_eRuntimeError, err_msg.as_ptr());
255
+ unreachable!()
256
+ }
257
+ };
258
+
259
+ // Create buffer
260
+ let buffer = match FastTokenBuffer::from_tokens(tokens) {
261
+ Ok(b) => b,
262
+ Err(_msg) => {
263
+ rb_raise(rb_eRuntimeError, c"Lexer execution failed".as_ptr());
264
+ unreachable!()
265
+ }
266
+ };
267
+
268
+ // Create LexStream (minimal Rust struct)
269
+ let stream = Box::new(LexStream { buffer });
270
+ let stream_ptr = Box::into_raw(stream);
271
+
272
+ let obj = rb_data_typed_object_wrap(
273
+ LEX_STREAM_CLASS,
274
+ stream_ptr as *mut libc::c_void,
275
+ LEX_STREAM_DATA_TYPE,
276
+ );
277
+
278
+ // Get token_names from program
279
+ let token_names = rb_funcall(program, rb_intern(c"token_names".as_ptr()), 0);
280
+
281
+ // Set instance variables directly (avoids rb_funcall overhead)
282
+ rb_ivar_set(obj, rb_intern(c"@input".as_ptr()), input);
283
+ rb_ivar_set(obj, rb_intern(c"@token_names".as_ptr()), token_names);
284
+
285
+ obj
286
+ }
287
+
288
+ /// eof? -> bool
289
+ unsafe extern "C" fn lex_stream_eof(slf: VALUE) -> VALUE {
290
+ let stream = get_stream(slf);
291
+ if (*stream).buffer.eof() {
292
+ Qtrue as VALUE
293
+ } else {
294
+ Qfalse as VALUE
295
+ }
296
+ }
297
+
298
+ /// advance -> self
299
+ unsafe extern "C" fn lex_stream_advance(slf: VALUE) -> VALUE {
300
+ let stream = get_stream(slf);
301
+ (*stream).buffer.advance();
302
+ slf
303
+ }
304
+
305
+ /// token_id -> Integer or nil
306
+ unsafe extern "C" fn lex_stream_token_id(slf: VALUE) -> VALUE {
307
+ let stream = get_stream(slf);
308
+ let id = (*stream).buffer.token_id();
309
+ if id < 0 {
310
+ Qnil as VALUE
311
+ } else {
312
+ rb_int2inum(id as isize)
313
+ }
314
+ }
315
+
316
+ /// input -> String
317
+ unsafe extern "C" fn lex_stream_input(slf: VALUE) -> VALUE {
318
+ rb_ivar_get(slf, rb_intern(c"@input".as_ptr()))
319
+ }
320
+
321
+ /// start -> Integer or nil
322
+ unsafe extern "C" fn lex_stream_start(slf: VALUE) -> VALUE {
323
+ let stream = get_stream(slf);
324
+ let s = (*stream).buffer.start();
325
+ if s < 0 {
326
+ Qnil as VALUE
327
+ } else {
328
+ rb_int2inum(s as isize)
329
+ }
330
+ }
331
+
332
+ /// len -> Integer or nil
333
+ unsafe extern "C" fn lex_stream_len(slf: VALUE) -> VALUE {
334
+ let stream = get_stream(slf);
335
+ let l = (*stream).buffer.len();
336
+ if l < 0 {
337
+ Qnil as VALUE
338
+ } else {
339
+ rb_int2inum(l as isize)
340
+ }
341
+ }
342
+
343
+ /// peek_token_id(n = 1) -> Integer or nil
344
+ unsafe extern "C" fn lex_stream_peek_token_id(argc: libc::c_int, argv: *const VALUE, slf: VALUE) -> VALUE {
345
+ let n = if argc == 0 {
346
+ 1
347
+ } else {
348
+ let n_val = rb_num2long(*argv);
349
+ if n_val < 0 {
350
+ rb_raise(rb_eArgError, c"offset must be non-negative".as_ptr());
351
+ unreachable!()
352
+ }
353
+ n_val as usize
354
+ };
355
+
356
+ let stream = get_stream(slf);
357
+ let id = (*stream).buffer.peek_token_id(n);
358
+ if id < 0 {
359
+ Qnil as VALUE
360
+ } else {
361
+ rb_int2inum(id as isize)
362
+ }
363
+ }
364
+
365
+ /// peek_start(n = 1) -> Integer or nil
366
+ unsafe extern "C" fn lex_stream_peek_start(argc: libc::c_int, argv: *const VALUE, slf: VALUE) -> VALUE {
367
+ let n = if argc == 0 {
368
+ 1
369
+ } else {
370
+ let n_val = rb_num2long(*argv);
371
+ if n_val < 0 {
372
+ rb_raise(rb_eArgError, c"offset must be non-negative".as_ptr());
373
+ unreachable!()
374
+ }
375
+ n_val as usize
376
+ };
377
+
378
+ let stream = get_stream(slf);
379
+ let s = (*stream).buffer.peek_start(n);
380
+ if s < 0 {
381
+ Qnil as VALUE
382
+ } else {
383
+ rb_int2inum(s as isize)
384
+ }
385
+ }
386
+
387
+ /// peek_len(n = 1) -> Integer or nil
388
+ unsafe extern "C" fn lex_stream_peek_len(argc: libc::c_int, argv: *const VALUE, slf: VALUE) -> VALUE {
389
+ let n = if argc == 0 {
390
+ 1
391
+ } else {
392
+ let n_val = rb_num2long(*argv);
393
+ if n_val < 0 {
394
+ rb_raise(rb_eArgError, c"offset must be non-negative".as_ptr());
395
+ unreachable!()
396
+ }
397
+ n_val as usize
398
+ };
399
+
400
+ let stream = get_stream(slf);
401
+ let l = (*stream).buffer.peek_len(n);
402
+ if l < 0 {
403
+ Qnil as VALUE
404
+ } else {
405
+ rb_int2inum(l as isize)
406
+ }
407
+ }
408
+
409
+ // ============================================================================
410
+ // Initialization
411
+ // ============================================================================
412
+
413
+ pub fn init_lex_stream() {
414
+ unsafe {
415
+ let data_type = Box::new(rb_data_type_t {
416
+ wrap_struct_name: c"LexerKit::LexStream".as_ptr(),
417
+ function: rb_data_type_struct__bindgen_ty_1 {
418
+ dmark: None, // No Ruby VALUEs to mark
419
+ dfree: Some(lex_stream_free),
420
+ dsize: Some(lex_stream_size),
421
+ dcompact: None,
422
+ reserved: [ptr::null_mut(); 1],
423
+ },
424
+ parent: ptr::null(),
425
+ data: ptr::null_mut(),
426
+ flags: 0,
427
+ });
428
+ LEX_STREAM_DATA_TYPE = Box::into_raw(data_type);
429
+
430
+ let lexer_kit_id = rb_intern(c"LexerKit".as_ptr());
431
+ let lexer_kit = if rb_const_defined(rb_cObject, lexer_kit_id) != 0 {
432
+ rb_const_get(rb_cObject, lexer_kit_id)
433
+ } else {
434
+ rb_define_module(c"LexerKit".as_ptr())
435
+ };
436
+
437
+ LEX_STREAM_CLASS = rb_define_class_under(
438
+ lexer_kit,
439
+ c"LexStream".as_ptr(),
440
+ rb_cObject,
441
+ );
442
+
443
+ rb_undef_alloc_func(LEX_STREAM_CLASS);
444
+
445
+ // Class method
446
+ rb_define_singleton_method(
447
+ LEX_STREAM_CLASS,
448
+ c"create".as_ptr(),
449
+ Some(std::mem::transmute::<
450
+ unsafe extern "C" fn(VALUE, VALUE, VALUE) -> VALUE,
451
+ unsafe extern "C" fn() -> VALUE,
452
+ >(lex_stream_create)),
453
+ 2,
454
+ );
455
+
456
+ // Instance methods (performance-critical, kept in Rust)
457
+ define_ruby_method_noarg!(LEX_STREAM_CLASS, c"eof?", lex_stream_eof);
458
+ define_ruby_method_noarg!(LEX_STREAM_CLASS, c"advance", lex_stream_advance);
459
+ define_ruby_method_noarg!(LEX_STREAM_CLASS, c"token_id", lex_stream_token_id);
460
+ define_ruby_method_noarg!(LEX_STREAM_CLASS, c"start", lex_stream_start);
461
+ define_ruby_method_noarg!(LEX_STREAM_CLASS, c"len", lex_stream_len);
462
+ define_ruby_method_noarg!(LEX_STREAM_CLASS, c"input", lex_stream_input);
463
+
464
+ define_ruby_method_vararg!(LEX_STREAM_CLASS, c"peek_token_id", lex_stream_peek_token_id);
465
+ define_ruby_method_vararg!(LEX_STREAM_CLASS, c"peek_start", lex_stream_peek_start);
466
+ define_ruby_method_vararg!(LEX_STREAM_CLASS, c"peek_len", lex_stream_peek_len);
467
+ }
468
+ }
@@ -0,0 +1,248 @@
1
+ //! LexerKit Rust Extension
2
+ //!
3
+ //! High-performance VM for executing compiled lexer programs.
4
+
5
+ mod deserializer;
6
+ #[cfg(not(test))]
7
+ mod dfa;
8
+ #[cfg(test)]
9
+ pub mod dfa; // テスト時は公開
10
+ #[cfg(not(test))]
11
+ mod fast_stream;
12
+ #[cfg(test)]
13
+ pub mod fast_stream; // テスト時は公開
14
+ #[cfg(not(test))]
15
+ mod opcodes;
16
+ #[cfg(test)]
17
+ pub mod opcodes; // テスト時は公開
18
+ #[cfg(not(test))]
19
+ mod trie;
20
+ #[cfg(test)]
21
+ pub mod trie; // テスト時は公開
22
+ #[cfg(not(test))]
23
+ mod types;
24
+ #[cfg(test)]
25
+ pub mod types; // テスト時は公開
26
+ #[cfg(not(test))]
27
+ mod vm;
28
+ #[cfg(test)]
29
+ pub mod vm; // テスト時は公開
30
+
31
+ #[cfg(test)]
32
+ mod safety_test;
33
+
34
+ use magnus::{
35
+ function, method, prelude::*, Error, IntoValue, RArray, RHash, RString, Ruby, TryConvert, Value,
36
+ };
37
+ use std::cell::RefCell;
38
+ use types::CompiledProgram;
39
+
40
+ /// Wrapper for CompiledProgram to make it shareable with Ruby
41
+ #[magnus::wrap(class = "LexerKit::IR::RustHandle", free_immediately, size)]
42
+ struct RustHandle {
43
+ prog: RefCell<CompiledProgram>,
44
+ }
45
+
46
+ impl RustHandle {
47
+ /// Load program from Ruby data hash
48
+ fn load_from_ruby(rb_data: RHash) -> Result<Self, Error> {
49
+ let prog = deserializer::parse_ruby_data(rb_data)?;
50
+ Ok(Self {
51
+ prog: RefCell::new(prog),
52
+ })
53
+ }
54
+
55
+ /// Tokenize input using the VM and yield tokens
56
+ fn lex_native(&self, rb_bytes: RString) -> Result<RArray, Error> {
57
+ let ruby = Ruby::get().unwrap();
58
+ let bytes = unsafe { rb_bytes.as_slice() };
59
+ let prog = self.prog.borrow();
60
+
61
+ let tokens = vm::collect_tokens(&prog, bytes)
62
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e))?;
63
+
64
+ let result = ruby.ary_new_capa(tokens.len());
65
+ for (token_id, start, length) in tokens {
66
+ let token = ruby.ary_new_from_values(&[
67
+ ruby.into_value(token_id),
68
+ ruby.into_value(start),
69
+ ruby.into_value(length),
70
+ ]);
71
+ result.push(token)?;
72
+ }
73
+
74
+ Ok(result)
75
+ }
76
+
77
+ /// Tokenize input and yield each token to a block
78
+ fn lex_native_block(&self, rb_bytes: RString) -> Result<Value, Error> {
79
+ let ruby = Ruby::get().unwrap();
80
+ let bytes = unsafe { rb_bytes.as_slice() };
81
+ let prog = self.prog.borrow();
82
+
83
+ let tokens = vm::collect_tokens(&prog, bytes)
84
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), e))?;
85
+
86
+ for (token_id, start, length) in tokens {
87
+ let args = (
88
+ ruby.into_value(token_id),
89
+ ruby.into_value(start),
90
+ ruby.into_value(length),
91
+ );
92
+ let _: Value = ruby.yield_values(args)?;
93
+ }
94
+
95
+ Ok(ruby.qnil().as_value())
96
+ }
97
+
98
+ }
99
+
100
+ /// Internal variable name for Rust handle (non-@ name, inaccessible from Ruby)
101
+ static RUST_HANDLE_IVAR: &std::ffi::CStr = c"__lkt_rust__";
102
+
103
+ /// RustNative module methods - added to CompiledProgram
104
+ struct RustNative;
105
+
106
+ impl RustNative {
107
+ /// Load Rust native handle from Ruby data
108
+ /// compiled_program.load_rust_native(data) -> self
109
+ fn load_rust_native(rb_self: Value, rb_data: RHash) -> Result<Value, Error> {
110
+ let ruby = Ruby::get().unwrap();
111
+ let handle = RustHandle::load_from_ruby(rb_data)?;
112
+
113
+ // Store handle as internal variable (non-@ name)
114
+ unsafe {
115
+ let ivar_id = rb_sys::rb_intern(RUST_HANDLE_IVAR.as_ptr());
116
+ let handle_value: rb_sys::VALUE = std::mem::transmute::<Value, rb_sys::VALUE>(handle.into_value_with(&ruby));
117
+ rb_sys::rb_ivar_set(std::mem::transmute::<Value, rb_sys::VALUE>(rb_self), ivar_id, handle_value);
118
+ }
119
+
120
+ Ok(rb_self)
121
+ }
122
+
123
+ /// Check if Rust native handle is loaded
124
+ fn rust_native_loaded(rb_self: Value) -> bool {
125
+ let handle = unsafe {
126
+ let ivar_id = rb_sys::rb_intern(RUST_HANDLE_IVAR.as_ptr());
127
+ rb_sys::rb_ivar_get(std::mem::transmute::<Value, rb_sys::VALUE>(rb_self), ivar_id)
128
+ };
129
+ handle != rb_sys::Qnil as rb_sys::VALUE
130
+ }
131
+
132
+ /// Tokenize using Rust VM and yield to block
133
+ fn lex_rust_native(rb_self: Value, rb_bytes: RString) -> Result<Value, Error> {
134
+ let handle = Self::get_handle(rb_self)?;
135
+ handle.lex_native_block(rb_bytes)
136
+ }
137
+
138
+ /// Create a LexStream
139
+ fn create_rust_stream(rb_self: Value, rb_bytes: RString) -> Result<Value, Error> {
140
+ use magnus::IntoValue;
141
+ let handle = Self::get_handle(rb_self)?;
142
+ let prog = handle.prog.borrow();
143
+ let ruby = Ruby::get().unwrap();
144
+
145
+ // Set current program for LexStream to use
146
+ fast_stream::set_current_program(&*prog as *const CompiledProgram);
147
+
148
+ // Create LexStream directly using rb-sys
149
+ let result = unsafe {
150
+ let klass = rb_sys::rb_const_get(
151
+ rb_sys::rb_const_get(
152
+ rb_sys::rb_cObject,
153
+ rb_sys::rb_intern(c"LexerKit".as_ptr()),
154
+ ),
155
+ rb_sys::rb_intern(c"LexStream".as_ptr()),
156
+ );
157
+ let program_value: rb_sys::VALUE = std::mem::transmute(rb_self);
158
+ let input_value: rb_sys::VALUE = std::mem::transmute(rb_bytes.into_value_with(&ruby));
159
+ rb_sys::rb_funcall(
160
+ klass,
161
+ rb_sys::rb_intern(c"create".as_ptr()),
162
+ 2,
163
+ program_value,
164
+ input_value,
165
+ )
166
+ };
167
+
168
+ Ok(unsafe { std::mem::transmute::<rb_sys::VALUE, Value>(result) })
169
+ }
170
+
171
+ /// Get handle from internal variable
172
+ ///
173
+ /// # Safety
174
+ /// The returned reference is valid as long as `rb_self` is kept alive by Ruby's GC.
175
+ /// Since this is called within Ruby method invocations where `rb_self` is rooted,
176
+ /// the reference is guaranteed to be valid for the duration of the method call.
177
+ ///
178
+ /// Note: The lifetime is tied to magnus's TypedData wrapper, not truly 'static.
179
+ /// Callers must not store this reference beyond the current method call.
180
+ fn get_handle<'a>(rb_self: Value) -> Result<&'a RustHandle, Error> {
181
+ let ruby = Ruby::get().unwrap();
182
+ let handle_value = unsafe {
183
+ let ivar_id = rb_sys::rb_intern(RUST_HANDLE_IVAR.as_ptr());
184
+ rb_sys::rb_ivar_get(std::mem::transmute::<Value, rb_sys::VALUE>(rb_self), ivar_id)
185
+ };
186
+
187
+ if handle_value == rb_sys::Qnil as rb_sys::VALUE {
188
+ return Err(Error::new(
189
+ ruby.exception_runtime_error(),
190
+ "Rust native not loaded. Call load_rust_native first.",
191
+ ));
192
+ }
193
+
194
+ // Convert VALUE back to RustHandle reference.
195
+ // Safety: The handle is stored as an instance variable of rb_self.
196
+ // Ruby's GC keeps rb_self alive during method calls, so the handle
197
+ // is valid for the duration of the caller's use.
198
+ let handle: &RustHandle = unsafe {
199
+ let value: Value = std::mem::transmute(handle_value);
200
+ TryConvert::try_convert(value)?
201
+ };
202
+
203
+ Ok(handle)
204
+ }
205
+ }
206
+
207
+ #[magnus::init]
208
+ fn init(ruby: &Ruby) -> Result<(), Error> {
209
+ // Define LexerKit module and IR submodule
210
+ let lexer_kit = ruby.define_module("LexerKit")?;
211
+ let ir = lexer_kit.define_module("IR")?;
212
+
213
+ // Define RustHandle class (internal, not meant to be used directly)
214
+ let rust_handle = ir.define_class("RustHandle", ruby.class_object())?;
215
+ rust_handle
216
+ .define_singleton_method("load_from_ruby", function!(RustHandle::load_from_ruby, 1))?;
217
+ rust_handle.define_method("lex_native", method!(RustHandle::lex_native, 1))?;
218
+ rust_handle.define_method("lex_native_block", method!(RustHandle::lex_native_block, 1))?;
219
+
220
+ // Define RustNative module (to be included in CompiledProgram)
221
+ let rust_native = ir.define_module("RustNative")?;
222
+ rust_native.define_method("load_rust_native", method!(RustNative::load_rust_native, 1))?;
223
+ rust_native.define_method(
224
+ "rust_native_loaded?",
225
+ method!(RustNative::rust_native_loaded, 0),
226
+ )?;
227
+ rust_native.define_method("lex_rust_native", method!(RustNative::lex_rust_native, 1))?;
228
+ rust_native.define_method(
229
+ "create_rust_stream",
230
+ method!(RustNative::create_rust_stream, 1),
231
+ )?;
232
+
233
+ // Include RustNative into CompiledProgram if it exists
234
+ let compiled_program = ir.const_get::<_, Value>("CompiledProgram");
235
+ if let Ok(cp) = compiled_program {
236
+ unsafe {
237
+ rb_sys::rb_include_module(
238
+ std::mem::transmute::<Value, rb_sys::VALUE>(cp),
239
+ std::mem::transmute::<Value, rb_sys::VALUE>(rust_native.as_value()),
240
+ );
241
+ }
242
+ }
243
+
244
+ // Initialize LexStream class (rb-sys based, no RefCell overhead)
245
+ fast_stream::init_lex_stream();
246
+
247
+ Ok(())
248
+ }