lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
//! LexStream - High-performance streaming lexer
|
|
2
|
+
//!
|
|
3
|
+
//! This module implements the performance-critical parts of LexStream in Rust.
|
|
4
|
+
//! Higher-level operations like token_name, text, and line_col are implemented in Ruby.
|
|
5
|
+
|
|
6
|
+
use crate::types::CompiledProgram;
|
|
7
|
+
use crate::vm;
|
|
8
|
+
use rb_sys::*;
|
|
9
|
+
use std::ptr;
|
|
10
|
+
|
|
11
|
+
/// Registers `$fn` as an instance method of `$class` that takes no arguments.
///
/// `$name` must expose `as_ptr()` yielding a C string (the call sites pass
/// `c"..."` literals) and `$fn` must be an
/// `unsafe extern "C" fn(VALUE) -> VALUE` that receives only the receiver.
/// The function pointer is transmuted to the argument-less pointer type handed
/// to `rb_define_method`, and the method is registered with arity `0`.
///
/// Must be expanded inside an `unsafe` context.
macro_rules! define_ruby_method_noarg {
    ($class:expr, $name:expr, $fn:expr) => {{
        // Evaluate the operands in the same order as the call they feed.
        let owner = $class;
        let method_name = $name.as_ptr();
        let callback = std::mem::transmute::<
            unsafe extern "C" fn(VALUE) -> VALUE,
            unsafe extern "C" fn() -> VALUE,
        >($fn);
        rb_define_method(owner, method_name, Some(callback), 0)
    }};
}
|
|
25
|
+
|
|
26
|
+
/// Registers `$fn` as an instance method of `$class` that accepts a variable
/// number of arguments.
///
/// `$fn` must be an
/// `unsafe extern "C" fn(libc::c_int, *const VALUE, VALUE) -> VALUE`
/// (argument count, argument vector, receiver). The method is registered with
/// arity `-1`, so the callback itself is responsible for inspecting `argc`.
///
/// Must be expanded inside an `unsafe` context.
macro_rules! define_ruby_method_vararg {
    ($class:expr, $name:expr, $fn:expr) => {{
        // Evaluate the operands in the same order as the call they feed.
        let owner = $class;
        let method_name = $name.as_ptr();
        let callback = std::mem::transmute::<
            unsafe extern "C" fn(libc::c_int, *const VALUE, VALUE) -> VALUE,
            unsafe extern "C" fn() -> VALUE,
        >($fn);
        rb_define_method(owner, method_name, Some(callback), -1)
    }};
}
|
|
40
|
+
|
|
41
|
+
/// Number of `i32` slots each token occupies in [`FastTokenBuffer`]'s flat array.
const STRIDE: usize = 3;

/// Cursor over a flat, `Vec`-backed list of lexed tokens.
///
/// Every token is stored as three consecutive `i32` values
/// (`[token_id, start, len]`). The `Vec` owns the memory, so no manual `Drop`
/// implementation is needed. All accessors return `-1` (rather than panicking)
/// when the requested token lies past the end of the buffer.
pub(crate) struct FastTokenBuffer {
    /// Flat array: [token_id, start, len] * count
    data: Vec<i32>,
    /// Number of tokens (data.len() / STRIDE)
    count: usize,
    /// Index of the current token; never grows beyond `count`
    index: usize,
}

impl FastTokenBuffer {
    /// Offset of the token id inside one token record.
    const ID_FIELD: usize = 0;
    /// Offset of the start position inside one token record.
    const START_FIELD: usize = 1;
    /// Offset of the length inside one token record.
    const LEN_FIELD: usize = 2;

    /// Builds a buffer from `(token_id, start, length)` triples, positioned at
    /// the first token.
    ///
    /// # Errors
    /// Returns an error message when any `start` or `length` does not fit in
    /// an `i32`.
    pub(crate) fn from_tokens(tokens: Vec<(u16, usize, usize)>) -> Result<Self, &'static str> {
        const TOO_LARGE: &str = "token position exceeds maximum value (input too large)";

        let count = tokens.len();
        let mut data = Vec::with_capacity(count * STRIDE);

        for (token_id, start, length) in tokens {
            // Checked narrowing: a failed conversion is reported, never truncated.
            let start = i32::try_from(start).map_err(|_| TOO_LARGE)?;
            let length = i32::try_from(length).map_err(|_| TOO_LARGE)?;
            data.extend_from_slice(&[i32::from(token_id), start, length]);
        }

        Ok(Self { data, count, index: 0 })
    }

    /// Reads `field` of the token `n` positions after the current one.
    ///
    /// Returns `-1` when that token does not exist. The offset is added with
    /// `checked_add`, so an enormous `n` can neither panic on overflow nor
    /// wrap around to an earlier token.
    #[inline(always)]
    fn field_at(&self, n: usize, field: usize) -> i32 {
        match self.index.checked_add(n) {
            Some(slot) if slot < self.count => self.data[slot * STRIDE + field],
            _ => -1,
        }
    }

    /// Returns `true` once the cursor has moved past the last token.
    #[inline(always)]
    pub(crate) fn eof(&self) -> bool {
        self.index >= self.count
    }

    /// Id of the current token, or `-1` at end of input.
    #[inline(always)]
    pub(crate) fn token_id(&self) -> i32 {
        self.field_at(0, Self::ID_FIELD)
    }

    /// Start position of the current token, or `-1` at end of input.
    #[inline(always)]
    pub(crate) fn start(&self) -> i32 {
        self.field_at(0, Self::START_FIELD)
    }

    /// Length of the current token, or `-1` at end of input.
    #[inline(always)]
    pub(crate) fn len(&self) -> i32 {
        self.field_at(0, Self::LEN_FIELD)
    }

    /// Id of the token `n` positions ahead (`0` is the current token), or `-1`
    /// when there is no such token.
    #[inline(always)]
    pub(crate) fn peek_token_id(&self, n: usize) -> i32 {
        self.field_at(n, Self::ID_FIELD)
    }

    /// Start position of the token `n` positions ahead, or `-1` when there is
    /// no such token.
    #[inline(always)]
    pub(crate) fn peek_start(&self, n: usize) -> i32 {
        self.field_at(n, Self::START_FIELD)
    }

    /// Length of the token `n` positions ahead, or `-1` when there is no such
    /// token.
    #[inline(always)]
    pub(crate) fn peek_len(&self, n: usize) -> i32 {
        self.field_at(n, Self::LEN_FIELD)
    }

    /// Moves the cursor to the next token; does nothing once at end of input.
    #[inline(always)]
    pub(crate) fn advance(&mut self) {
        if self.index < self.count {
            self.index += 1;
        }
    }

    /// Get total number of tokens (for testing)
    #[cfg(test)]
    pub(crate) fn count(&self) -> usize {
        self.count
    }

    /// Get current index (for testing)
    #[cfg(test)]
    pub(crate) fn index(&self) -> usize {
        self.index
    }
}
|
|
154
|
+
|
|
155
|
+
// No manual Drop needed - Vec handles memory deallocation automatically
|
|
156
|
+
|
|
157
|
+
// ============================================================================
|
|
158
|
+
// LexStream - Minimal Rust structure for fast VM execution
|
|
159
|
+
// ============================================================================
|
|
160
|
+
|
|
161
|
+
/// Native payload wrapped by each Ruby `LexerKit::LexStream` object.
///
/// It holds only the token buffer. `lex_stream_create` stores the input string
/// and token names as instance variables on the Ruby object rather than in
/// this struct.
#[repr(C)]
pub struct LexStream {
    // Token triples plus the read cursor used by the instance methods.
    buffer: FastTokenBuffer,
}
|
|
166
|
+
|
|
167
|
+
// ============================================================================
|
|
168
|
+
// Ruby TypedData integration
|
|
169
|
+
// ============================================================================
|
|
170
|
+
|
|
171
|
+
// Ruby class object for LexerKit::LexStream; assigned in `init_lex_stream`.
static mut LEX_STREAM_CLASS: VALUE = 0;
|
|
172
|
+
// Typed-data descriptor handed to `rb_data_typed_object_wrap`; `init_lex_stream`
// points it at a boxed `rb_data_type_t` released with `Box::into_raw`.
static mut LEX_STREAM_DATA_TYPE: *const rb_data_type_t = ptr::null();
|
|
173
|
+
|
|
174
|
+
// No mark function needed - LexStream no longer holds Ruby VALUEs
|
|
175
|
+
|
|
176
|
+
unsafe extern "C" fn lex_stream_free(ptr: *mut libc::c_void) {
|
|
177
|
+
if !ptr.is_null() {
|
|
178
|
+
let _ = Box::from_raw(ptr as *mut LexStream);
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
unsafe extern "C" fn lex_stream_size(ptr: *const libc::c_void) -> size_t {
|
|
183
|
+
if ptr.is_null() {
|
|
184
|
+
0
|
|
185
|
+
} else {
|
|
186
|
+
let stream = &*(ptr as *const LexStream);
|
|
187
|
+
(std::mem::size_of::<LexStream>()
|
|
188
|
+
+ stream.buffer.data.capacity() * std::mem::size_of::<i32>()) as size_t
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
#[inline(always)]
|
|
193
|
+
unsafe fn get_stream(obj: VALUE) -> *mut LexStream {
|
|
194
|
+
RTYPEDDATA_GET_DATA(obj) as *mut LexStream
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
// ============================================================================
|
|
198
|
+
// Thread-local storage for passing program between magnus and rb-sys
|
|
199
|
+
// ============================================================================
|
|
200
|
+
|
|
201
|
+
thread_local! {
|
|
202
|
+
// Raw pointer to the program the next `LexStream.create` call should run.
// Written by `set_current_program`, read and reset to null by
// `take_current_program`; null means nothing is pending.
static CURRENT_PROGRAM: std::cell::RefCell<*const CompiledProgram> = const { std::cell::RefCell::new(ptr::null()) };
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
pub fn set_current_program(prog: *const CompiledProgram) {
|
|
206
|
+
CURRENT_PROGRAM.with(|p| {
|
|
207
|
+
*p.borrow_mut() = prog;
|
|
208
|
+
});
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
fn take_current_program() -> *const CompiledProgram {
|
|
212
|
+
CURRENT_PROGRAM.with(|p| {
|
|
213
|
+
let ptr = *p.borrow();
|
|
214
|
+
*p.borrow_mut() = ptr::null(); // Clear after taking
|
|
215
|
+
ptr
|
|
216
|
+
})
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// ============================================================================
|
|
220
|
+
// Ruby method implementations
|
|
221
|
+
// ============================================================================
|
|
222
|
+
|
|
223
|
+
/// LexStream.create(program, input) -> LexStream
|
|
224
|
+
///
|
|
225
|
+
/// SAFETY: This method requires CURRENT_PROGRAM to be set via set_current_program()
|
|
226
|
+
/// before calling. It is intended to be called only from CompiledProgram#stream.
|
|
227
|
+
/// Direct calls will fail with an error.
|
|
228
|
+
unsafe extern "C" fn lex_stream_create(
|
|
229
|
+
_klass: VALUE,
|
|
230
|
+
program: VALUE,
|
|
231
|
+
input: VALUE,
|
|
232
|
+
) -> VALUE {
|
|
233
|
+
// Take the compiled program from thread-local (clears it to prevent reuse)
|
|
234
|
+
let prog_ptr = take_current_program();
|
|
235
|
+
if prog_ptr.is_null() {
|
|
236
|
+
rb_raise(
|
|
237
|
+
rb_eArgError,
|
|
238
|
+
c"LexStream.create must be called via CompiledProgram#stream".as_ptr(),
|
|
239
|
+
);
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
// Get input bytes
|
|
243
|
+
let input_ptr = rb_string_value_ptr(&input as *const _ as *mut _);
|
|
244
|
+
let input_len = RSTRING_LEN(input) as usize;
|
|
245
|
+
let bytes = std::slice::from_raw_parts(input_ptr as *const u8, input_len);
|
|
246
|
+
|
|
247
|
+
// Collect tokens using VM
|
|
248
|
+
let prog = &*prog_ptr;
|
|
249
|
+
let tokens = match vm::collect_tokens(prog, bytes) {
|
|
250
|
+
Ok(t) => t,
|
|
251
|
+
Err(msg) => {
|
|
252
|
+
let err_msg = std::ffi::CString::new(msg)
|
|
253
|
+
.unwrap_or_else(|_| std::ffi::CString::new("VM error").unwrap());
|
|
254
|
+
rb_raise(rb_eRuntimeError, err_msg.as_ptr());
|
|
255
|
+
unreachable!()
|
|
256
|
+
}
|
|
257
|
+
};
|
|
258
|
+
|
|
259
|
+
// Create buffer
|
|
260
|
+
let buffer = match FastTokenBuffer::from_tokens(tokens) {
|
|
261
|
+
Ok(b) => b,
|
|
262
|
+
Err(_msg) => {
|
|
263
|
+
rb_raise(rb_eRuntimeError, c"Lexer execution failed".as_ptr());
|
|
264
|
+
unreachable!()
|
|
265
|
+
}
|
|
266
|
+
};
|
|
267
|
+
|
|
268
|
+
// Create LexStream (minimal Rust struct)
|
|
269
|
+
let stream = Box::new(LexStream { buffer });
|
|
270
|
+
let stream_ptr = Box::into_raw(stream);
|
|
271
|
+
|
|
272
|
+
let obj = rb_data_typed_object_wrap(
|
|
273
|
+
LEX_STREAM_CLASS,
|
|
274
|
+
stream_ptr as *mut libc::c_void,
|
|
275
|
+
LEX_STREAM_DATA_TYPE,
|
|
276
|
+
);
|
|
277
|
+
|
|
278
|
+
// Get token_names from program
|
|
279
|
+
let token_names = rb_funcall(program, rb_intern(c"token_names".as_ptr()), 0);
|
|
280
|
+
|
|
281
|
+
// Set instance variables directly (avoids rb_funcall overhead)
|
|
282
|
+
rb_ivar_set(obj, rb_intern(c"@input".as_ptr()), input);
|
|
283
|
+
rb_ivar_set(obj, rb_intern(c"@token_names".as_ptr()), token_names);
|
|
284
|
+
|
|
285
|
+
obj
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
/// eof? -> bool
|
|
289
|
+
unsafe extern "C" fn lex_stream_eof(slf: VALUE) -> VALUE {
|
|
290
|
+
let stream = get_stream(slf);
|
|
291
|
+
if (*stream).buffer.eof() {
|
|
292
|
+
Qtrue as VALUE
|
|
293
|
+
} else {
|
|
294
|
+
Qfalse as VALUE
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
/// advance -> self
|
|
299
|
+
unsafe extern "C" fn lex_stream_advance(slf: VALUE) -> VALUE {
|
|
300
|
+
let stream = get_stream(slf);
|
|
301
|
+
(*stream).buffer.advance();
|
|
302
|
+
slf
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
/// token_id -> Integer or nil
|
|
306
|
+
unsafe extern "C" fn lex_stream_token_id(slf: VALUE) -> VALUE {
|
|
307
|
+
let stream = get_stream(slf);
|
|
308
|
+
let id = (*stream).buffer.token_id();
|
|
309
|
+
if id < 0 {
|
|
310
|
+
Qnil as VALUE
|
|
311
|
+
} else {
|
|
312
|
+
rb_int2inum(id as isize)
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/// input -> String
|
|
317
|
+
unsafe extern "C" fn lex_stream_input(slf: VALUE) -> VALUE {
|
|
318
|
+
rb_ivar_get(slf, rb_intern(c"@input".as_ptr()))
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
/// start -> Integer or nil
|
|
322
|
+
unsafe extern "C" fn lex_stream_start(slf: VALUE) -> VALUE {
|
|
323
|
+
let stream = get_stream(slf);
|
|
324
|
+
let s = (*stream).buffer.start();
|
|
325
|
+
if s < 0 {
|
|
326
|
+
Qnil as VALUE
|
|
327
|
+
} else {
|
|
328
|
+
rb_int2inum(s as isize)
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
/// len -> Integer or nil
|
|
333
|
+
unsafe extern "C" fn lex_stream_len(slf: VALUE) -> VALUE {
|
|
334
|
+
let stream = get_stream(slf);
|
|
335
|
+
let l = (*stream).buffer.len();
|
|
336
|
+
if l < 0 {
|
|
337
|
+
Qnil as VALUE
|
|
338
|
+
} else {
|
|
339
|
+
rb_int2inum(l as isize)
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
/// peek_token_id(n = 1) -> Integer or nil
|
|
344
|
+
unsafe extern "C" fn lex_stream_peek_token_id(argc: libc::c_int, argv: *const VALUE, slf: VALUE) -> VALUE {
|
|
345
|
+
let n = if argc == 0 {
|
|
346
|
+
1
|
|
347
|
+
} else {
|
|
348
|
+
let n_val = rb_num2long(*argv);
|
|
349
|
+
if n_val < 0 {
|
|
350
|
+
rb_raise(rb_eArgError, c"offset must be non-negative".as_ptr());
|
|
351
|
+
unreachable!()
|
|
352
|
+
}
|
|
353
|
+
n_val as usize
|
|
354
|
+
};
|
|
355
|
+
|
|
356
|
+
let stream = get_stream(slf);
|
|
357
|
+
let id = (*stream).buffer.peek_token_id(n);
|
|
358
|
+
if id < 0 {
|
|
359
|
+
Qnil as VALUE
|
|
360
|
+
} else {
|
|
361
|
+
rb_int2inum(id as isize)
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
/// peek_start(n = 1) -> Integer or nil
|
|
366
|
+
unsafe extern "C" fn lex_stream_peek_start(argc: libc::c_int, argv: *const VALUE, slf: VALUE) -> VALUE {
|
|
367
|
+
let n = if argc == 0 {
|
|
368
|
+
1
|
|
369
|
+
} else {
|
|
370
|
+
let n_val = rb_num2long(*argv);
|
|
371
|
+
if n_val < 0 {
|
|
372
|
+
rb_raise(rb_eArgError, c"offset must be non-negative".as_ptr());
|
|
373
|
+
unreachable!()
|
|
374
|
+
}
|
|
375
|
+
n_val as usize
|
|
376
|
+
};
|
|
377
|
+
|
|
378
|
+
let stream = get_stream(slf);
|
|
379
|
+
let s = (*stream).buffer.peek_start(n);
|
|
380
|
+
if s < 0 {
|
|
381
|
+
Qnil as VALUE
|
|
382
|
+
} else {
|
|
383
|
+
rb_int2inum(s as isize)
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
/// peek_len(n = 1) -> Integer or nil
|
|
388
|
+
unsafe extern "C" fn lex_stream_peek_len(argc: libc::c_int, argv: *const VALUE, slf: VALUE) -> VALUE {
|
|
389
|
+
let n = if argc == 0 {
|
|
390
|
+
1
|
|
391
|
+
} else {
|
|
392
|
+
let n_val = rb_num2long(*argv);
|
|
393
|
+
if n_val < 0 {
|
|
394
|
+
rb_raise(rb_eArgError, c"offset must be non-negative".as_ptr());
|
|
395
|
+
unreachable!()
|
|
396
|
+
}
|
|
397
|
+
n_val as usize
|
|
398
|
+
};
|
|
399
|
+
|
|
400
|
+
let stream = get_stream(slf);
|
|
401
|
+
let l = (*stream).buffer.peek_len(n);
|
|
402
|
+
if l < 0 {
|
|
403
|
+
Qnil as VALUE
|
|
404
|
+
} else {
|
|
405
|
+
rb_int2inum(l as isize)
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
// ============================================================================
|
|
410
|
+
// Initialization
|
|
411
|
+
// ============================================================================
|
|
412
|
+
|
|
413
|
+
pub fn init_lex_stream() {
|
|
414
|
+
unsafe {
|
|
415
|
+
let data_type = Box::new(rb_data_type_t {
|
|
416
|
+
wrap_struct_name: c"LexerKit::LexStream".as_ptr(),
|
|
417
|
+
function: rb_data_type_struct__bindgen_ty_1 {
|
|
418
|
+
dmark: None, // No Ruby VALUEs to mark
|
|
419
|
+
dfree: Some(lex_stream_free),
|
|
420
|
+
dsize: Some(lex_stream_size),
|
|
421
|
+
dcompact: None,
|
|
422
|
+
reserved: [ptr::null_mut(); 1],
|
|
423
|
+
},
|
|
424
|
+
parent: ptr::null(),
|
|
425
|
+
data: ptr::null_mut(),
|
|
426
|
+
flags: 0,
|
|
427
|
+
});
|
|
428
|
+
LEX_STREAM_DATA_TYPE = Box::into_raw(data_type);
|
|
429
|
+
|
|
430
|
+
let lexer_kit_id = rb_intern(c"LexerKit".as_ptr());
|
|
431
|
+
let lexer_kit = if rb_const_defined(rb_cObject, lexer_kit_id) != 0 {
|
|
432
|
+
rb_const_get(rb_cObject, lexer_kit_id)
|
|
433
|
+
} else {
|
|
434
|
+
rb_define_module(c"LexerKit".as_ptr())
|
|
435
|
+
};
|
|
436
|
+
|
|
437
|
+
LEX_STREAM_CLASS = rb_define_class_under(
|
|
438
|
+
lexer_kit,
|
|
439
|
+
c"LexStream".as_ptr(),
|
|
440
|
+
rb_cObject,
|
|
441
|
+
);
|
|
442
|
+
|
|
443
|
+
rb_undef_alloc_func(LEX_STREAM_CLASS);
|
|
444
|
+
|
|
445
|
+
// Class method
|
|
446
|
+
rb_define_singleton_method(
|
|
447
|
+
LEX_STREAM_CLASS,
|
|
448
|
+
c"create".as_ptr(),
|
|
449
|
+
Some(std::mem::transmute::<
|
|
450
|
+
unsafe extern "C" fn(VALUE, VALUE, VALUE) -> VALUE,
|
|
451
|
+
unsafe extern "C" fn() -> VALUE,
|
|
452
|
+
>(lex_stream_create)),
|
|
453
|
+
2,
|
|
454
|
+
);
|
|
455
|
+
|
|
456
|
+
// Instance methods (performance-critical, kept in Rust)
|
|
457
|
+
define_ruby_method_noarg!(LEX_STREAM_CLASS, c"eof?", lex_stream_eof);
|
|
458
|
+
define_ruby_method_noarg!(LEX_STREAM_CLASS, c"advance", lex_stream_advance);
|
|
459
|
+
define_ruby_method_noarg!(LEX_STREAM_CLASS, c"token_id", lex_stream_token_id);
|
|
460
|
+
define_ruby_method_noarg!(LEX_STREAM_CLASS, c"start", lex_stream_start);
|
|
461
|
+
define_ruby_method_noarg!(LEX_STREAM_CLASS, c"len", lex_stream_len);
|
|
462
|
+
define_ruby_method_noarg!(LEX_STREAM_CLASS, c"input", lex_stream_input);
|
|
463
|
+
|
|
464
|
+
define_ruby_method_vararg!(LEX_STREAM_CLASS, c"peek_token_id", lex_stream_peek_token_id);
|
|
465
|
+
define_ruby_method_vararg!(LEX_STREAM_CLASS, c"peek_start", lex_stream_peek_start);
|
|
466
|
+
define_ruby_method_vararg!(LEX_STREAM_CLASS, c"peek_len", lex_stream_peek_len);
|
|
467
|
+
}
|
|
468
|
+
}
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
//! LexerKit Rust Extension
|
|
2
|
+
//!
|
|
3
|
+
//! High-performance VM for executing compiled lexer programs.
|
|
4
|
+
|
|
5
|
+
mod deserializer;
|
|
6
|
+
#[cfg(not(test))]
|
|
7
|
+
mod dfa;
|
|
8
|
+
#[cfg(test)]
|
|
9
|
+
pub mod dfa; // public in test builds
|
|
10
|
+
#[cfg(not(test))]
|
|
11
|
+
mod fast_stream;
|
|
12
|
+
#[cfg(test)]
|
|
13
|
+
pub mod fast_stream; // public in test builds
|
|
14
|
+
#[cfg(not(test))]
|
|
15
|
+
mod opcodes;
|
|
16
|
+
#[cfg(test)]
|
|
17
|
+
pub mod opcodes; // public in test builds
|
|
18
|
+
#[cfg(not(test))]
|
|
19
|
+
mod trie;
|
|
20
|
+
#[cfg(test)]
|
|
21
|
+
pub mod trie; // public in test builds
|
|
22
|
+
#[cfg(not(test))]
|
|
23
|
+
mod types;
|
|
24
|
+
#[cfg(test)]
|
|
25
|
+
pub mod types; // public in test builds
|
|
26
|
+
#[cfg(not(test))]
|
|
27
|
+
mod vm;
|
|
28
|
+
#[cfg(test)]
|
|
29
|
+
pub mod vm; // public in test builds
|
|
30
|
+
|
|
31
|
+
#[cfg(test)]
|
|
32
|
+
mod safety_test;
|
|
33
|
+
|
|
34
|
+
use magnus::{
|
|
35
|
+
function, method, prelude::*, Error, IntoValue, RArray, RHash, RString, Ruby, TryConvert, Value,
|
|
36
|
+
};
|
|
37
|
+
use std::cell::RefCell;
|
|
38
|
+
use types::CompiledProgram;
|
|
39
|
+
|
|
40
|
+
/// Wrapper for CompiledProgram to make it shareable with Ruby
|
|
41
|
+
#[magnus::wrap(class = "LexerKit::IR::RustHandle", free_immediately, size)]
|
|
42
|
+
struct RustHandle {
|
|
43
|
+
prog: RefCell<CompiledProgram>,
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
impl RustHandle {
|
|
47
|
+
/// Load program from Ruby data hash
|
|
48
|
+
fn load_from_ruby(rb_data: RHash) -> Result<Self, Error> {
|
|
49
|
+
let prog = deserializer::parse_ruby_data(rb_data)?;
|
|
50
|
+
Ok(Self {
|
|
51
|
+
prog: RefCell::new(prog),
|
|
52
|
+
})
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/// Tokenize input using the VM and yield tokens
|
|
56
|
+
fn lex_native(&self, rb_bytes: RString) -> Result<RArray, Error> {
|
|
57
|
+
let ruby = Ruby::get().unwrap();
|
|
58
|
+
let bytes = unsafe { rb_bytes.as_slice() };
|
|
59
|
+
let prog = self.prog.borrow();
|
|
60
|
+
|
|
61
|
+
let tokens = vm::collect_tokens(&prog, bytes)
|
|
62
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e))?;
|
|
63
|
+
|
|
64
|
+
let result = ruby.ary_new_capa(tokens.len());
|
|
65
|
+
for (token_id, start, length) in tokens {
|
|
66
|
+
let token = ruby.ary_new_from_values(&[
|
|
67
|
+
ruby.into_value(token_id),
|
|
68
|
+
ruby.into_value(start),
|
|
69
|
+
ruby.into_value(length),
|
|
70
|
+
]);
|
|
71
|
+
result.push(token)?;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
Ok(result)
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/// Tokenize input and yield each token to a block
|
|
78
|
+
fn lex_native_block(&self, rb_bytes: RString) -> Result<Value, Error> {
|
|
79
|
+
let ruby = Ruby::get().unwrap();
|
|
80
|
+
let bytes = unsafe { rb_bytes.as_slice() };
|
|
81
|
+
let prog = self.prog.borrow();
|
|
82
|
+
|
|
83
|
+
let tokens = vm::collect_tokens(&prog, bytes)
|
|
84
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e))?;
|
|
85
|
+
|
|
86
|
+
for (token_id, start, length) in tokens {
|
|
87
|
+
let args = (
|
|
88
|
+
ruby.into_value(token_id),
|
|
89
|
+
ruby.into_value(start),
|
|
90
|
+
ruby.into_value(length),
|
|
91
|
+
);
|
|
92
|
+
let _: Value = ruby.yield_values(args)?;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
Ok(ruby.qnil().as_value())
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
/// Internal variable name for Rust handle (non-@ name, inaccessible from Ruby)
|
|
101
|
+
static RUST_HANDLE_IVAR: &std::ffi::CStr = c"__lkt_rust__";
|
|
102
|
+
|
|
103
|
+
/// RustNative module methods - added to CompiledProgram
|
|
104
|
+
struct RustNative;
|
|
105
|
+
|
|
106
|
+
impl RustNative {
|
|
107
|
+
/// Load Rust native handle from Ruby data
|
|
108
|
+
/// compiled_program.load_rust_native(data) -> self
|
|
109
|
+
fn load_rust_native(rb_self: Value, rb_data: RHash) -> Result<Value, Error> {
|
|
110
|
+
let ruby = Ruby::get().unwrap();
|
|
111
|
+
let handle = RustHandle::load_from_ruby(rb_data)?;
|
|
112
|
+
|
|
113
|
+
// Store handle as internal variable (non-@ name)
|
|
114
|
+
unsafe {
|
|
115
|
+
let ivar_id = rb_sys::rb_intern(RUST_HANDLE_IVAR.as_ptr());
|
|
116
|
+
let handle_value: rb_sys::VALUE = std::mem::transmute::<Value, rb_sys::VALUE>(handle.into_value_with(&ruby));
|
|
117
|
+
rb_sys::rb_ivar_set(std::mem::transmute::<Value, rb_sys::VALUE>(rb_self), ivar_id, handle_value);
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
Ok(rb_self)
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/// Check if Rust native handle is loaded
|
|
124
|
+
fn rust_native_loaded(rb_self: Value) -> bool {
|
|
125
|
+
let handle = unsafe {
|
|
126
|
+
let ivar_id = rb_sys::rb_intern(RUST_HANDLE_IVAR.as_ptr());
|
|
127
|
+
rb_sys::rb_ivar_get(std::mem::transmute::<Value, rb_sys::VALUE>(rb_self), ivar_id)
|
|
128
|
+
};
|
|
129
|
+
handle != rb_sys::Qnil as rb_sys::VALUE
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/// Tokenize using Rust VM and yield to block
|
|
133
|
+
fn lex_rust_native(rb_self: Value, rb_bytes: RString) -> Result<Value, Error> {
|
|
134
|
+
let handle = Self::get_handle(rb_self)?;
|
|
135
|
+
handle.lex_native_block(rb_bytes)
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/// Create a LexStream
|
|
139
|
+
fn create_rust_stream(rb_self: Value, rb_bytes: RString) -> Result<Value, Error> {
|
|
140
|
+
use magnus::IntoValue;
|
|
141
|
+
let handle = Self::get_handle(rb_self)?;
|
|
142
|
+
let prog = handle.prog.borrow();
|
|
143
|
+
let ruby = Ruby::get().unwrap();
|
|
144
|
+
|
|
145
|
+
// Set current program for LexStream to use
|
|
146
|
+
fast_stream::set_current_program(&*prog as *const CompiledProgram);
|
|
147
|
+
|
|
148
|
+
// Create LexStream directly using rb-sys
|
|
149
|
+
let result = unsafe {
|
|
150
|
+
let klass = rb_sys::rb_const_get(
|
|
151
|
+
rb_sys::rb_const_get(
|
|
152
|
+
rb_sys::rb_cObject,
|
|
153
|
+
rb_sys::rb_intern(c"LexerKit".as_ptr()),
|
|
154
|
+
),
|
|
155
|
+
rb_sys::rb_intern(c"LexStream".as_ptr()),
|
|
156
|
+
);
|
|
157
|
+
let program_value: rb_sys::VALUE = std::mem::transmute(rb_self);
|
|
158
|
+
let input_value: rb_sys::VALUE = std::mem::transmute(rb_bytes.into_value_with(&ruby));
|
|
159
|
+
rb_sys::rb_funcall(
|
|
160
|
+
klass,
|
|
161
|
+
rb_sys::rb_intern(c"create".as_ptr()),
|
|
162
|
+
2,
|
|
163
|
+
program_value,
|
|
164
|
+
input_value,
|
|
165
|
+
)
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
Ok(unsafe { std::mem::transmute::<rb_sys::VALUE, Value>(result) })
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/// Get handle from internal variable
|
|
172
|
+
///
|
|
173
|
+
/// # Safety
|
|
174
|
+
/// The returned reference is valid as long as `rb_self` is kept alive by Ruby's GC.
|
|
175
|
+
/// Since this is called within Ruby method invocations where `rb_self` is rooted,
|
|
176
|
+
/// the reference is guaranteed to be valid for the duration of the method call.
|
|
177
|
+
///
|
|
178
|
+
/// Note: The lifetime is tied to magnus's TypedData wrapper, not truly 'static.
|
|
179
|
+
/// Callers must not store this reference beyond the current method call.
|
|
180
|
+
fn get_handle<'a>(rb_self: Value) -> Result<&'a RustHandle, Error> {
|
|
181
|
+
let ruby = Ruby::get().unwrap();
|
|
182
|
+
let handle_value = unsafe {
|
|
183
|
+
let ivar_id = rb_sys::rb_intern(RUST_HANDLE_IVAR.as_ptr());
|
|
184
|
+
rb_sys::rb_ivar_get(std::mem::transmute::<Value, rb_sys::VALUE>(rb_self), ivar_id)
|
|
185
|
+
};
|
|
186
|
+
|
|
187
|
+
if handle_value == rb_sys::Qnil as rb_sys::VALUE {
|
|
188
|
+
return Err(Error::new(
|
|
189
|
+
ruby.exception_runtime_error(),
|
|
190
|
+
"Rust native not loaded. Call load_rust_native first.",
|
|
191
|
+
));
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// Convert VALUE back to RustHandle reference.
|
|
195
|
+
// Safety: The handle is stored as an instance variable of rb_self.
|
|
196
|
+
// Ruby's GC keeps rb_self alive during method calls, so the handle
|
|
197
|
+
// is valid for the duration of the caller's use.
|
|
198
|
+
let handle: &RustHandle = unsafe {
|
|
199
|
+
let value: Value = std::mem::transmute(handle_value);
|
|
200
|
+
TryConvert::try_convert(value)?
|
|
201
|
+
};
|
|
202
|
+
|
|
203
|
+
Ok(handle)
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
#[magnus::init]
|
|
208
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
209
|
+
// Define LexerKit module and IR submodule
|
|
210
|
+
let lexer_kit = ruby.define_module("LexerKit")?;
|
|
211
|
+
let ir = lexer_kit.define_module("IR")?;
|
|
212
|
+
|
|
213
|
+
// Define RustHandle class (internal, not meant to be used directly)
|
|
214
|
+
let rust_handle = ir.define_class("RustHandle", ruby.class_object())?;
|
|
215
|
+
rust_handle
|
|
216
|
+
.define_singleton_method("load_from_ruby", function!(RustHandle::load_from_ruby, 1))?;
|
|
217
|
+
rust_handle.define_method("lex_native", method!(RustHandle::lex_native, 1))?;
|
|
218
|
+
rust_handle.define_method("lex_native_block", method!(RustHandle::lex_native_block, 1))?;
|
|
219
|
+
|
|
220
|
+
// Define RustNative module (to be included in CompiledProgram)
|
|
221
|
+
let rust_native = ir.define_module("RustNative")?;
|
|
222
|
+
rust_native.define_method("load_rust_native", method!(RustNative::load_rust_native, 1))?;
|
|
223
|
+
rust_native.define_method(
|
|
224
|
+
"rust_native_loaded?",
|
|
225
|
+
method!(RustNative::rust_native_loaded, 0),
|
|
226
|
+
)?;
|
|
227
|
+
rust_native.define_method("lex_rust_native", method!(RustNative::lex_rust_native, 1))?;
|
|
228
|
+
rust_native.define_method(
|
|
229
|
+
"create_rust_stream",
|
|
230
|
+
method!(RustNative::create_rust_stream, 1),
|
|
231
|
+
)?;
|
|
232
|
+
|
|
233
|
+
// Include RustNative into CompiledProgram if it exists
|
|
234
|
+
let compiled_program = ir.const_get::<_, Value>("CompiledProgram");
|
|
235
|
+
if let Ok(cp) = compiled_program {
|
|
236
|
+
unsafe {
|
|
237
|
+
rb_sys::rb_include_module(
|
|
238
|
+
std::mem::transmute::<Value, rb_sys::VALUE>(cp),
|
|
239
|
+
std::mem::transmute::<Value, rb_sys::VALUE>(rust_native.as_value()),
|
|
240
|
+
);
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
// Initialize LexStream class (rb-sys based, no RefCell overhead)
|
|
245
|
+
fast_stream::init_lex_stream();
|
|
246
|
+
|
|
247
|
+
Ok(())
|
|
248
|
+
}
|