RubyGems - lexer_kit - Versions diffs - 0.5.0 - Mend

lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

checksums.yaml +7 -0
data/LICENSE.txt +21 -0
data/README.md +157 -0
data/exe/lexer_kit +7 -0
data/ext/lexer_kit_rust/Cargo.toml +17 -0
data/ext/lexer_kit_rust/extconf.rb +6 -0
data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
data/ext/lexer_kit_rust/src/dfa.rs +217 -0
data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
data/ext/lexer_kit_rust/src/lib.rs +248 -0
data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
data/ext/lexer_kit_rust/src/trie.rs +206 -0
data/ext/lexer_kit_rust/src/types.rs +319 -0
data/ext/lexer_kit_rust/src/vm.rs +258 -0
data/lib/lexer_kit/builder/compiler.rb +596 -0
data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
data/lib/lexer_kit/builder/mode_def.rb +36 -0
data/lib/lexer_kit/builder/token_def.rb +65 -0
data/lib/lexer_kit/builder/validator.rb +84 -0
data/lib/lexer_kit/builder.rb +230 -0
data/lib/lexer_kit/cli/commands.rb +389 -0
data/lib/lexer_kit/cli.rb +88 -0
data/lib/lexer_kit/core/diagnostic.rb +103 -0
data/lib/lexer_kit/core/source.rb +154 -0
data/lib/lexer_kit/core/span.rb +80 -0
data/lib/lexer_kit/core/token.rb +120 -0
data/lib/lexer_kit/core.rb +13 -0
data/lib/lexer_kit/debug/disassembler.rb +143 -0
data/lib/lexer_kit/debug/visualizer.rb +203 -0
data/lib/lexer_kit/debug.rb +11 -0
data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
data/lib/lexer_kit/dfa/case_folding.rb +45 -0
data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
data/lib/lexer_kit/dfa/nfa.rb +304 -0
data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
data/lib/lexer_kit/dfa.rb +37 -0
data/lib/lexer_kit/errors.rb +76 -0
data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
data/lib/lexer_kit/format/lkb1.rb +199 -0
data/lib/lexer_kit/format/lkt1.rb +111 -0
data/lib/lexer_kit/format.rb +19 -0
data/lib/lexer_kit/ir/compiled_program.rb +228 -0
data/lib/lexer_kit/ir/constant_pool.rb +107 -0
data/lib/lexer_kit/ir/dfa_table.rb +125 -0
data/lib/lexer_kit/ir/instruction.rb +50 -0
data/lib/lexer_kit/ir/jump_table.rb +94 -0
data/lib/lexer_kit/ir/keyword_table.rb +168 -0
data/lib/lexer_kit/ir/opcode.rb +96 -0
data/lib/lexer_kit/ir/serializer.rb +249 -0
data/lib/lexer_kit/ir.rb +16 -0
data/lib/lexer_kit/runner.rb +114 -0
data/lib/lexer_kit/trie.rb +170 -0
data/lib/lexer_kit/version.rb +5 -0
data/lib/lexer_kit.rb +155 -0
metadata +119 -0

data/ext/lexer_kit_rust/src/types.rs ADDED Viewed

@@ -0,0 +1,319 @@
+//! Core data structures for the LexerKit Rust VM
+//!
+//! These structures mirror the C implementation for binary compatibility.
+/// DFA special value: state index of the dead state.
+pub const DFA_DEAD_STATE: u16 = 0;
+/// DFA special value: `accept_tokens` entry marking a state as not accepting.
+pub const DFA_NO_ACCEPT: u16 = 0xFFFF;
+/// Maximum mode stack depth; sizes the fixed `VmState::mode_stack` array.
+pub const MAX_MODE_STACK: usize = 64;
+/// Maximum steps per token (prevents infinite loops).
+///
+/// `vm::step` counts the instructions it runs in one call against this limit
+/// and halts once the count exceeds it.
+pub const MAX_STEPS_PER_TOKEN: u32 = 10_000;
+/// Maximum consecutive zero-length tokens before halting.
+// NOTE(review): the enforcement site is not in this file - presumably the emit
+// handlers in `opcodes.rs`; confirm.
+pub const MAX_ZERO_PROGRESS_TOKENS: u16 = 100;
+/// VM return codes
+///
+/// `opcodes::execute` returns one of these (together with an optional
+/// `EmitResult`) to tell the execution loop in `vm::step` what to do next.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum VmResult {
+    /// Keep executing: the loop fetches the next instruction at `ip`.
+    Continue,
+    /// A token may be ready: the loop returns it when an `EmitResult` was
+    /// supplied, and otherwise keeps executing.
+    Emit,
+    /// Stop executing: the loop returns `StepResult::Halt`.
+    Halt,
+}
+/// Emit result structure (filled by emit handlers)
+///
+/// `vm::collect_tokens` and `vm::TokenIterator` turn each accepted result into
+/// a `(token_id, start, length)` tuple.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct EmitResult {
+    /// ID of the emitted token.
+    pub token_id: u16,
+    /// Where the token starts - presumably a byte offset into the input;
+    /// confirm against the emit handlers in `opcodes.rs`.
+    pub start: usize,
+    /// Token length; results with length 0 are dropped by the collectors in `vm.rs`.
+    pub length: usize,
+}
+/// A single VM instruction: one opcode byte followed by a 24-bit argument
+/// split into three bytes, most significant first.
+#[derive(Debug, Clone, Copy)]
+pub struct Instruction {
+    /// Operation selector.
+    pub opcode: u8,
+    /// Argument bits 23..16 (most significant byte).
+    pub arg_hi: u8,
+    /// Argument bits 15..8.
+    pub arg_mid: u8,
+    /// Argument bits 7..0 (least significant byte).
+    pub arg_lo: u8,
+}
+impl Instruction {
+    /// Reassemble the 24-bit argument; the top byte of the returned `u32` is always zero.
+    #[inline]
+    pub fn arg(&self) -> u32 {
+        u32::from_be_bytes([0, self.arg_hi, self.arg_mid, self.arg_lo])
+    }
+    /// Decode an instruction from the first four bytes of `bytes`
+    /// (opcode, then the argument from high byte to low byte).
+    /// Any bytes after the fourth are ignored.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `bytes` holds fewer than four bytes.
+    pub fn from_bytes(bytes: &[u8]) -> Self {
+        let opcode = bytes[0];
+        let arg_hi = bytes[1];
+        let arg_mid = bytes[2];
+        let arg_lo = bytes[3];
+        Self {
+            opcode,
+            arg_hi,
+            arg_mid,
+            arg_lo,
+        }
+    }
+}
+/// DFA table
+///
+/// Bytes are grouped into classes through `byte_class`; `transitions` and
+/// `accept_tokens` are flat arrays with per-state entries.
+#[derive(Debug, Clone)]
+pub struct DfaTable {
+    /// Number of DFA states.
+    pub state_count: u16,
+    /// Number of byte classes.
+    pub class_count: u16,
+    /// Byte value -> class (256 entries).
+    pub byte_class: Vec<u8>,
+    /// `state_count * class_count` entries.
+    pub transitions: Vec<u16>,
+    /// `state_count` entries; 0xFFFF = not accepting.
+    pub accept_tokens: Vec<u16>,
+}
+impl DfaTable {
+    /// Create a new empty DFA table: no states, no classes, no transitions or
+    /// accept entries, and all 256 bytes assigned to class 0.
+    pub fn new() -> Self {
+        let byte_class = vec![0u8; 256];
+        DfaTable {
+            state_count: 0,
+            class_count: 0,
+            byte_class,
+            transitions: vec![],
+            accept_tokens: vec![],
+        }
+    }
+}
+impl Default for DfaTable {
+    /// Same as `DfaTable::new`.
+    fn default() -> Self {
+        DfaTable::new()
+    }
+}
+/// Jump table: a dense 256-entry array giving O(1) lookup by byte value.
+#[derive(Debug, Clone)]
+pub struct JumpTable {
+    /// Direct byte -> offset mapping.
+    pub lookup: [u32; 256],
+    /// Offset for EOF case.
+    pub default_offset: u32,
+}
+impl JumpTable {
+    /// Build a table whose offsets, including the default, are all zero.
+    pub fn new() -> Self {
+        JumpTable {
+            default_offset: 0,
+            lookup: [0u32; 256],
+        }
+    }
+}
+impl Default for JumpTable {
+    /// Same as `JumpTable::new`.
+    fn default() -> Self {
+        JumpTable::new()
+    }
+}
+/// Constant pool entry: one owned byte string.
+#[derive(Debug, Clone)]
+pub struct ConstantEntry {
+    pub data: Vec<u8>,
+}
+/// Constant pool: constants addressed by their position in `entries`.
+#[derive(Debug, Clone, Default)]
+pub struct ConstantPool {
+    pub entries: Vec<ConstantEntry>,
+}
+impl ConstantPool {
+    /// Create an empty pool.
+    pub fn new() -> Self {
+        ConstantPool { entries: vec![] }
+    }
+    /// Borrow the bytes of the constant at `index`, or `None` when `index` is
+    /// out of range.
+    #[inline]
+    pub fn get(&self, index: usize) -> Option<&[u8]> {
+        let entry = self.entries.get(index)?;
+        Some(entry.data.as_slice())
+    }
+}
+/// Keyword table entry: a keyword's bytes paired with the token ID it maps to.
+#[derive(Debug, Clone)]
+pub struct KeywordEntry {
+    pub keyword: Vec<u8>,
+    pub token_id: u16,
+}
+/// Keyword table
+#[derive(Debug, Clone)]
+pub struct KeywordTable {
+    /// Token ID when no keyword matches.
+    pub base_token_id: u16,
+    pub entries: Vec<KeywordEntry>,
+}
+impl KeywordTable {
+    /// Create a table with the given fallback token ID and no keyword entries.
+    pub fn new(base_token_id: u16) -> Self {
+        let entries = vec![];
+        KeywordTable {
+            base_token_id,
+            entries,
+        }
+    }
+}
+/// Mode entry - only offset needed at runtime
+#[derive(Debug, Clone, Copy)]
+pub struct Mode {
+    /// Instruction offset of this mode's entry point.
+    /// `CompiledProgram::find_mode_offset` returns it only when it is a valid
+    /// index into `instructions`, and returns 0 otherwise.
+    pub start_offset: u32,
+}
+/// Compiled program: the instruction stream together with the tables,
+/// constants and mode entry points that belong to it.
+#[derive(Debug, Clone)]
+pub struct CompiledProgram {
+    pub instructions: Vec<Instruction>,
+    pub dfa_tables: Vec<DfaTable>,
+    pub jump_tables: Vec<JumpTable>,
+    pub keyword_tables: Vec<KeywordTable>,
+    pub constant_pool: ConstantPool,
+    pub modes: Vec<Mode>,
+    /// Offset used when a mode index has no entry in `modes`; it is also the
+    /// VM's initial instruction pointer.
+    pub default_mode_offset: u32,
+}
+impl CompiledProgram {
+    /// Create an empty program: no instructions, tables, constants or modes,
+    /// with the default mode offset at 0.
+    pub fn new() -> Self {
+        CompiledProgram {
+            default_mode_offset: 0,
+            modes: vec![],
+            constant_pool: ConstantPool::new(),
+            keyword_tables: vec![],
+            jump_tables: vec![],
+            dfa_tables: vec![],
+            instructions: vec![],
+        }
+    }
+    /// Find mode offset by index
+    ///
+    /// An index with no entry in `modes` resolves to `default_mode_offset`.
+    /// If the resolved offset is not a valid index into `instructions`,
+    /// 0 is returned instead.
+    #[inline]
+    pub fn find_mode_offset(&self, mode_idx: u16) -> u32 {
+        let candidate = self
+            .modes
+            .get(usize::from(mode_idx))
+            .map_or(self.default_mode_offset, |mode| mode.start_offset);
+        // Never hand back an offset that points past the instruction stream.
+        if (candidate as usize) >= self.instructions.len() {
+            return 0;
+        }
+        candidate
+    }
+}
+impl Default for CompiledProgram {
+    /// Same as `CompiledProgram::new`.
+    fn default() -> Self {
+        CompiledProgram::new()
+    }
+}
+/// VM execution state
+///
+/// Borrows the program and the input for `'a`; all other fields are plain
+/// values that are public and mutated while lexing.
+#[derive(Debug)]
+pub struct VmState<'a> {
+    /// Program being executed.
+    pub prog: &'a CompiledProgram,
+    /// Input being tokenized.
+    pub bytes: &'a [u8],
+    /// Current read position in `bytes`.
+    pub pos: usize,
+    /// Saved position that `reset_pos_to_mark` copies back into `pos`.
+    pub mark: usize,
+    // Most recent match (length / order / ip). `update_best_if_better` compares
+    // it against the `best_match_*` fields below.
+    pub last_match_len: usize,
+    pub last_match_order: u16,
+    pub last_match_ip: u32,
+    // Best match so far: the longest wins, ties go to the lower order.
+    // An order of 0xFFFF is the initial / cleared value.
+    pub best_match_len: usize,
+    pub best_match_order: u16,
+    pub best_match_ip: u32,
+    pub ip: u32, // instruction pointer
+    // Current mode and mode stack - presumably maintained by mode push/pop
+    // handlers in `opcodes.rs`; confirm there.
+    pub mode: u16,
+    pub mode_stack: [u16; MAX_MODE_STACK],
+    pub mode_sp: u8,
+    // NOTE(review): `failed`, `pending_token_id` and `has_pending_token` are
+    // only initialized in this file; their readers/writers live elsewhere.
+    pub failed: bool,
+    pub pending_token_id: u16,
+    pub has_pending_token: bool,
+    // Zero-progress detection for infinite loop prevention
+    pub last_emit_pos: usize,
+    pub zero_progress_count: u16,
+    // Error message (set on runtime errors like stack overflow)
+    pub error: Option<&'static str>,
+}
+impl<'a> VmState<'a> {
+    /// Initialize VM state
+    ///
+    /// Execution starts at `prog.default_mode_offset`, or at instruction 0 when
+    /// that offset is not a valid instruction index. Positions, counters and
+    /// flags start at zero/false, both match orders start at 0xFFFF, and no
+    /// error is recorded.
+    pub fn new(prog: &'a CompiledProgram, bytes: &'a [u8]) -> Self {
+        let ip = if (prog.default_mode_offset as usize) < prog.instructions.len() {
+            prog.default_mode_offset
+        } else {
+            0
+        };
+        Self {
+            prog,
+            bytes,
+            pos: 0,
+            mark: 0,
+            last_match_len: 0,
+            last_match_order: 0xFFFF,
+            last_match_ip: 0,
+            best_match_len: 0,
+            best_match_order: 0xFFFF,
+            best_match_ip: 0,
+            ip,
+            mode: 0,
+            mode_stack: [0; MAX_MODE_STACK],
+            mode_sp: 0,
+            failed: false,
+            pending_token_id: 0,
+            has_pending_token: false,
+            last_emit_pos: 0,
+            zero_progress_count: 0,
+            error: None,
+        }
+    }
+    /// Get remaining bytes from current position
+    ///
+    /// Returns an empty slice when `pos` is at or past the end of the input,
+    /// matching `is_eof` and `current_byte`, which both accept
+    /// `pos > bytes.len()` without panicking.
+    #[inline]
+    pub fn remaining_bytes(&self) -> &'a [u8] {
+        // `pos` is a public field, so use a checked range: slicing
+        // `bytes[pos..]` directly would panic for `pos > bytes.len()`.
+        self.bytes.get(self.pos..).unwrap_or(&[])
+    }
+    /// Check if at end of input (`pos` at or past `bytes.len()`)
+    #[inline]
+    pub fn is_eof(&self) -> bool {
+        self.pos >= self.bytes.len()
+    }
+    /// Get current byte (if available); `None` at or past the end of input
+    #[inline]
+    pub fn current_byte(&self) -> Option<u8> {
+        self.bytes.get(self.pos).copied()
+    }
+    /// Update best match if current last_match is better (longer, or same length with earlier order)
+    ///
+    /// "Earlier" means a numerically smaller `last_match_order`. When the last
+    /// match wins, its length, order and ip are copied into the `best_match_*`
+    /// fields; otherwise nothing changes.
+    #[inline]
+    pub fn update_best_if_better(&mut self) {
+        if self.last_match_len > self.best_match_len
+            || (self.last_match_len == self.best_match_len
+                && self.last_match_order < self.best_match_order)
+        {
+            self.best_match_len = self.last_match_len;
+            self.best_match_order = self.last_match_order;
+            self.best_match_ip = self.last_match_ip;
+        }
+    }
+    /// Clear best match tracking state (length 0, order 0xFFFF, ip 0)
+    #[inline]
+    pub fn clear_best(&mut self) {
+        self.best_match_len = 0;
+        self.best_match_order = 0xFFFF;
+        self.best_match_ip = 0;
+    }
+    /// Reset position to mark
+    #[inline]
+    pub fn reset_pos_to_mark(&mut self) {
+        self.pos = self.mark;
+    }
+}

data/ext/lexer_kit_rust/src/vm.rs ADDED Viewed

@@ -0,0 +1,258 @@
+//! VM execution loop
+//!
+//! This module implements the main VM execution logic.
+use crate::opcodes::{self, OP_HALT};
+use crate::types::{CompiledProgram, EmitResult, VmResult, VmState, MAX_STEPS_PER_TOKEN};
+/// Reserved token IDs (must match lib/lexer_kit.rb)
+/// - 0:   Internal sentinel (never emitted)
+/// - 1:   INVALID (error token)
+/// - 2-7: Reserved for future use
+/// - 8+:  User-defined tokens
+const INVALID_TOKEN_ID: u16 = 1;
+/// Lowest token ID available to user-defined tokens.
+const FIRST_USER_TOKEN_ID: u16 = 8;
+/// Decide whether a token ID may be handed to callers.
+///
+/// Accepts the INVALID error token (1) and every user token (>= 8);
+/// rejects the sentinel 0 and the reserved range 2-7.
+#[inline]
+fn is_valid_token_id(token_id: u16) -> bool {
+    match token_id {
+        INVALID_TOKEN_ID => true,
+        other => other >= FIRST_USER_TOKEN_ID,
+    }
+}
+/// VM step result
+///
+/// Outcome of one call to `step`.
+#[derive(Debug, Clone, Copy)]
+pub enum StepResult {
+    /// Token was emitted
+    Emit(EmitResult),
+    /// Execution halted (EOF or error). `step` also returns this for `OP_HALT`,
+    /// when `ip` is past the last instruction, and when the per-token step
+    /// limit is exceeded.
+    Halt,
+}
+/// Run the VM until it produces one token or stops.
+///
+/// Instructions are fetched at `vm.ip` and dispatched to `opcodes::execute`.
+/// Returns `StepResult::Emit` as soon as a handler reports `VmResult::Emit`
+/// together with an `EmitResult`. Returns `StepResult::Halt` when:
+/// - `vm.ip` is not a valid instruction index,
+/// - the fetched instruction is `OP_HALT`,
+/// - a handler reports `VmResult::Halt`, or
+/// - this call has already run `MAX_STEPS_PER_TOKEN` instructions.
+///
+/// The step-limit halt does not write `vm.error` in this function, so a
+/// caller that only inspects `vm.error` cannot tell it from a normal halt.
+pub fn step(vm: &mut VmState) -> StepResult {
+    let mut executed = 0u32;
+    // `Instruction` is `Copy`; copying it out releases the borrow on `vm`
+    // before the handler takes `vm` mutably.
+    while let Some(instr) = vm.prog.instructions.get(vm.ip as usize).copied() {
+        executed += 1;
+        // Budget exhausted or explicit stop: either way this call ends in a halt.
+        if executed > MAX_STEPS_PER_TOKEN || instr.opcode == OP_HALT {
+            return StepResult::Halt;
+        }
+        match opcodes::execute(vm, instr.opcode, instr.arg()) {
+            (VmResult::Emit, Some(token)) => return StepResult::Emit(token),
+            (VmResult::Halt, _) => return StepResult::Halt,
+            // `Continue`, or `Emit` without a payload: keep running.
+            (VmResult::Emit, None) | (VmResult::Continue, _) => {}
+        }
+    }
+    StepResult::Halt
+}
+/// Tokenize the whole input in one call.
+///
+/// This is the batch collection function that avoids Ruby boundary crossings.
+/// Each kept token is a `(token_id, start, length)` tuple. Emitted results are
+/// dropped when their length is 0 or their token ID is reserved (anything
+/// other than INVALID=1 or a user token >= 8), which guards against malformed
+/// serialized data.
+///
+/// # Errors
+///
+/// Returns the message stored in `vm.error` if one is set when the VM halts;
+/// tokens collected up to that point are discarded.
+pub fn collect_tokens(
+    prog: &CompiledProgram,
+    bytes: &[u8],
+) -> Result<Vec<(u16, usize, usize)>, &'static str> {
+    let mut vm = VmState::new(prog, bytes);
+    let mut collected = Vec::new();
+    // `StepResult` has exactly two variants, so this loop ends on the first `Halt`.
+    while let StepResult::Emit(emit) = step(&mut vm) {
+        let keep = emit.length > 0 && is_valid_token_id(emit.token_id);
+        if keep {
+            collected.push((emit.token_id, emit.start, emit.length));
+        }
+    }
+    match vm.error {
+        Some(message) => Err(message),
+        None => Ok(collected),
+    }
+}
+/// Token iterator for streaming tokenization
+///
+/// Yields `(token_id, start, length)` tuples and applies the same filter as
+/// `collect_tokens`: zero-length results and reserved token IDs are skipped.
+///
+/// An `Iterator` cannot say why it stopped, so once `next` has returned `None`
+/// call `TokenIterator::error` to tell a clean end of input from a VM runtime
+/// error.
+pub struct TokenIterator<'a> {
+    vm: VmState<'a>,
+    /// Set once the VM halts; `next` returns `None` from then on.
+    done: bool,
+}
+impl<'a> TokenIterator<'a> {
+    /// Create an iterator over the tokens of `bytes` as lexed by `prog`.
+    #[allow(dead_code)] // Used in tests
+    pub fn new(prog: &'a CompiledProgram, bytes: &'a [u8]) -> Self {
+        Self {
+            vm: VmState::new(prog, bytes),
+            done: false,
+        }
+    }
+    /// Runtime error recorded by the VM, if any.
+    ///
+    /// `collect_tokens` turns this value into `Err`; the iterator can only end
+    /// with `None`, so the error is exposed here instead of being dropped.
+    #[allow(dead_code)] // Public accessor; not called inside the crate yet
+    pub fn error(&self) -> Option<&'static str> {
+        self.vm.error
+    }
+}
+impl<'a> Iterator for TokenIterator<'a> {
+    type Item = (u16, usize, usize);
+    /// Run the VM until the next token that passes the filter, or return
+    /// `None` once the VM has halted.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.done {
+            return None;
+        }
+        loop {
+            match step(&mut self.vm) {
+                StepResult::Emit(emit) => {
+                    // Filter tokens:
+                    // - Must have length > 0 (skip zero-length tokens)
+                    // - Must have valid token_id (INVALID=1 or user tokens >= 8)
+                    // This guards against malformed serialized data with reserved token IDs
+                    if emit.length > 0 && is_valid_token_id(emit.token_id) {
+                        return Some((emit.token_id, emit.start, emit.length));
+                    }
+                }
+                StepResult::Halt => {
+                    // Latch the halt so the VM is never stepped again; any
+                    // error stays readable through `error()`.
+                    self.done = true;
+                    return None;
+                }
+            }
+        }
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{DfaTable, Instruction, Mode};
+    /// Build the fixture shared by all tests in this module: a one-DFA,
+    /// five-instruction program that lexes runs of the byte `a`.
+    fn create_simple_program() -> CompiledProgram {
+        // Create a simple program that matches "a" and emits token 1.
+        // Token 1 equals INVALID_TOKEN_ID, which is_valid_token_id accepts,
+        // so these tokens are not filtered out by the collectors.
+        let mut prog = CompiledProgram::new();
+        // DFA that matches 'a'
+        let mut dfa = DfaTable::new();
+        dfa.state_count = 3;
+        dfa.class_count = 2;
+        dfa.byte_class = vec![0; 256];
+        dfa.byte_class[b'a' as usize] = 1; // 'a' is class 1
+        // Transitions: state 0 is dead, state 1 is start, state 2 is accept
+        // transitions[state * class_count + class]
+        dfa.transitions = vec![
+            0, 0, // state 0 (dead)
+            0, 2, // state 1: class 0 -> dead, class 1 ('a') -> state 2
+            0, 0, // state 2 (accept): all -> dead
+        ];
+        dfa.accept_tokens = vec![
+            0xFFFF, // state 0: not accepting
+            0xFFFF, // state 1: not accepting
+            1,      // state 2: token 1
+        ];
+        prog.dfa_tables.push(dfa);
+        // Instructions:
+        // 0: MARK
+        // 1: DFA_RUN_IF_MATCH (dfa=0, fail->4)
+        // 2: EMIT 1
+        // 3: JUMP 0 (loop)
+        // 4: HALT
+        prog.instructions = vec![
+            Instruction {
+                opcode: opcodes::OP_MARK,
+                arg_hi: 0,
+                arg_mid: 0,
+                arg_lo: 0,
+            },
+            Instruction {
+                opcode: opcodes::OP_DFA_RUN_IF_MATCH,
+                arg_hi: 0,
+                arg_mid: 0,
+                arg_lo: 4,
+            },
+            Instruction {
+                opcode: opcodes::OP_EMIT,
+                arg_hi: 0,
+                arg_mid: 0,
+                arg_lo: 1,
+            },
+            Instruction {
+                opcode: opcodes::OP_JUMP,
+                arg_hi: 0,
+                arg_mid: 0,
+                arg_lo: 0,
+            },
+            Instruction {
+                opcode: opcodes::OP_HALT,
+                arg_hi: 0,
+                arg_mid: 0,
+                arg_lo: 0,
+            },
+        ];
+        prog.modes.push(Mode { start_offset: 0 });
+        prog.default_mode_offset = 0;
+        prog
+    }
+    // A single 'a' produces exactly one token covering byte 0.
+    #[test]
+    fn test_vm_single_token() {
+        let prog = create_simple_program();
+        let tokens = collect_tokens(&prog, b"a").unwrap();
+        assert_eq!(tokens.len(), 1);
+        assert_eq!(tokens[0], (1, 0, 1));
+    }
+    // Each 'a' becomes its own one-byte token at consecutive offsets.
+    #[test]
+    fn test_vm_multiple_tokens() {
+        let prog = create_simple_program();
+        let tokens = collect_tokens(&prog, b"aaa").unwrap();
+        assert_eq!(tokens.len(), 3);
+        assert_eq!(tokens[0], (1, 0, 1));
+        assert_eq!(tokens[1], (1, 1, 1));
+        assert_eq!(tokens[2], (1, 2, 1));
+    }
+    // Input the DFA does not match yields no tokens and no error.
+    #[test]
+    fn test_vm_no_match() {
+        let prog = create_simple_program();
+        let tokens = collect_tokens(&prog, b"b").unwrap();
+        assert_eq!(tokens.len(), 0);
+    }
+    // The streaming iterator yields the same tuples as batch collection.
+    #[test]
+    fn test_token_iterator() {
+        let prog = create_simple_program();
+        let iter = TokenIterator::new(&prog, b"aa");
+        let tokens: Vec<_> = iter.collect();
+        assert_eq!(tokens.len(), 2);
+        assert_eq!(tokens[0], (1, 0, 1));
+        assert_eq!(tokens[1], (1, 1, 1));
+    }
+}