tokenkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.yardopts +12 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +644 -0
- data/Rakefile +18 -0
- data/benchmarks/cache_test.rb +63 -0
- data/benchmarks/final_comparison.rb +83 -0
- data/benchmarks/tokenizer_benchmark.rb +250 -0
- data/docs/ARCHITECTURE.md +469 -0
- data/docs/PERFORMANCE.md +382 -0
- data/docs/README.md +118 -0
- data/ext/tokenkit/Cargo.toml +21 -0
- data/ext/tokenkit/extconf.rb +4 -0
- data/ext/tokenkit/src/config.rs +37 -0
- data/ext/tokenkit/src/error.rs +67 -0
- data/ext/tokenkit/src/lib.rs +346 -0
- data/ext/tokenkit/src/tokenizer/base.rs +41 -0
- data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
- data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
- data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
- data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
- data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
- data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
- data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
- data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
- data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
- data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
- data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
- data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
- data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
- data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
- data/lib/tokenkit/config.rb +74 -0
- data/lib/tokenkit/config_builder.rb +209 -0
- data/lib/tokenkit/config_compat.rb +52 -0
- data/lib/tokenkit/configuration.rb +194 -0
- data/lib/tokenkit/regex_converter.rb +58 -0
- data/lib/tokenkit/version.rb +5 -0
- data/lib/tokenkit.rb +336 -0
- data/sig/tokenkit.rbs +4 -0
- metadata +172 -0
| @@ -0,0 +1,108 @@ | |
| 1 | 
            +
            use super::{merge_overlapping_spans, post_process, Tokenizer};
         | 
| 2 | 
            +
            use crate::config::TokenizerConfig;
         | 
| 3 | 
            +
            use linkify::{LinkFinder, LinkKind};
         | 
| 4 | 
            +
            use regex::Regex;
         | 
| 5 | 
            +
            use unicode_segmentation::UnicodeSegmentation;
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            pub struct UrlEmailTokenizer {
         | 
| 8 | 
            +
                config: TokenizerConfig,
         | 
| 9 | 
            +
                preserve_patterns: Vec<Regex>,
         | 
| 10 | 
            +
            }
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            impl UrlEmailTokenizer {
         | 
| 13 | 
            +
                pub fn new(config: TokenizerConfig) -> Self {
         | 
| 14 | 
            +
                    let preserve_patterns = config
         | 
| 15 | 
            +
                        .preserve_patterns
         | 
| 16 | 
            +
                        .iter()
         | 
| 17 | 
            +
                        .filter_map(|p| Regex::new(p).ok())
         | 
| 18 | 
            +
                        .collect();
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                    Self {
         | 
| 21 | 
            +
                        config,
         | 
| 22 | 
            +
                        preserve_patterns,
         | 
| 23 | 
            +
                    }
         | 
| 24 | 
            +
                }
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                fn extract_url_email_spans(&self, text: &str) -> Vec<(usize, usize, String)> {
         | 
| 27 | 
            +
                    let finder = LinkFinder::new();
         | 
| 28 | 
            +
                    let mut spans = Vec::new();
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                    for link in finder.links(text) {
         | 
| 31 | 
            +
                        match link.kind() {
         | 
| 32 | 
            +
                            LinkKind::Url | LinkKind::Email => {
         | 
| 33 | 
            +
                                let (start, end) = (link.start(), link.end());
         | 
| 34 | 
            +
                                spans.push((start, end, link.as_str().to_string()));
         | 
| 35 | 
            +
                            }
         | 
| 36 | 
            +
                            _ => {}
         | 
| 37 | 
            +
                        }
         | 
| 38 | 
            +
                    }
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                    spans
         | 
| 41 | 
            +
                }
         | 
| 42 | 
            +
            }
         | 
| 43 | 
            +
             | 
| 44 | 
            +
            impl Tokenizer for UrlEmailTokenizer {
         | 
| 45 | 
            +
                fn tokenize(&self, text: &str) -> Vec<String> {
         | 
| 46 | 
            +
                    let mut spans = self.extract_url_email_spans(text);
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                    // Add preserve_pattern matches to spans
         | 
| 49 | 
            +
                    for pattern in &self.preserve_patterns {
         | 
| 50 | 
            +
                        for mat in pattern.find_iter(text) {
         | 
| 51 | 
            +
                            spans.push((mat.start(), mat.end(), mat.as_str().to_string()));
         | 
| 52 | 
            +
                        }
         | 
| 53 | 
            +
                    }
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                    // Merge overlapping spans to handle conflicts
         | 
| 56 | 
            +
                    let spans = if !spans.is_empty() {
         | 
| 57 | 
            +
                        merge_overlapping_spans(spans)
         | 
| 58 | 
            +
                    } else {
         | 
| 59 | 
            +
                        spans
         | 
| 60 | 
            +
                    };
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    if spans.is_empty() {
         | 
| 63 | 
            +
                        let tokens: Vec<String> = text
         | 
| 64 | 
            +
                            .unicode_words()
         | 
| 65 | 
            +
                            .map(|s| s.to_string())
         | 
| 66 | 
            +
                            .collect();
         | 
| 67 | 
            +
                        return post_process(tokens, &self.config);
         | 
| 68 | 
            +
                    }
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                    let mut result = Vec::new();
         | 
| 71 | 
            +
                    let mut pos = 0;
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                    for (start, end, url_or_email) in spans {
         | 
| 74 | 
            +
                        if start > pos {
         | 
| 75 | 
            +
                            let before = &text[pos..start];
         | 
| 76 | 
            +
                            let before_tokens: Vec<String> = before
         | 
| 77 | 
            +
                                .unicode_words()
         | 
| 78 | 
            +
                                .map(|s| s.to_string())
         | 
| 79 | 
            +
                                .collect();
         | 
| 80 | 
            +
                            let before_tokens = post_process(before_tokens, &self.config);
         | 
| 81 | 
            +
                            result.extend(before_tokens);
         | 
| 82 | 
            +
                        }
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                        // Don't lowercase preserved patterns, but do lowercase URLs/emails if config says so
         | 
| 85 | 
            +
                        // unless they are from preserve_patterns
         | 
| 86 | 
            +
                        let preserved = if self.config.lowercase && !self.preserve_patterns.iter().any(|p| p.is_match(&url_or_email)) {
         | 
| 87 | 
            +
                            url_or_email.to_lowercase()
         | 
| 88 | 
            +
                        } else {
         | 
| 89 | 
            +
                            url_or_email
         | 
| 90 | 
            +
                        };
         | 
| 91 | 
            +
                        result.push(preserved);
         | 
| 92 | 
            +
                        pos = end;
         | 
| 93 | 
            +
                    }
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                    if pos < text.len() {
         | 
| 96 | 
            +
                        let remaining = &text[pos..];
         | 
| 97 | 
            +
                        let remaining_tokens: Vec<String> = remaining
         | 
| 98 | 
            +
                            .unicode_words()
         | 
| 99 | 
            +
                            .map(|s| s.to_string())
         | 
| 100 | 
            +
                            .collect();
         | 
| 101 | 
            +
                        let remaining_tokens = post_process(remaining_tokens, &self.config);
         | 
| 102 | 
            +
                        result.extend(remaining_tokens);
         | 
| 103 | 
            +
                    }
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                    result
         | 
| 106 | 
            +
                }
         | 
| 107 | 
            +
             | 
| 108 | 
            +
            }
         | 
| @@ -0,0 +1,31 @@ | |
| 1 | 
            +
            use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
         | 
| 2 | 
            +
            use crate::config::TokenizerConfig;
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            pub struct WhitespaceTokenizer {
         | 
| 5 | 
            +
                base: BaseTokenizerFields,
         | 
| 6 | 
            +
            }
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            impl WhitespaceTokenizer {
         | 
| 9 | 
            +
                pub fn new(config: TokenizerConfig) -> Self {
         | 
| 10 | 
            +
                    Self {
         | 
| 11 | 
            +
                        base: BaseTokenizerFields::new(config),
         | 
| 12 | 
            +
                    }
         | 
| 13 | 
            +
                }
         | 
| 14 | 
            +
            }
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            impl Tokenizer for WhitespaceTokenizer {
         | 
| 17 | 
            +
                fn tokenize(&self, text: &str) -> Vec<String> {
         | 
| 18 | 
            +
                    let tokens: Vec<String> = text
         | 
| 19 | 
            +
                        .split_whitespace()
         | 
| 20 | 
            +
                        .filter(|s| !s.is_empty())
         | 
| 21 | 
            +
                        .map(|s| s.to_string())
         | 
| 22 | 
            +
                        .collect();
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                    if self.base.has_preserve_patterns() {
         | 
| 25 | 
            +
                        apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
         | 
| 26 | 
            +
                    } else {
         | 
| 27 | 
            +
                        post_process(tokens, &self.base.config)
         | 
| 28 | 
            +
                    }
         | 
| 29 | 
            +
                }
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            }
         | 
| @@ -0,0 +1,74 @@ | |
| 1 | 
            +
            module TokenKit
              # Mutable tokenizer settings. Call {#apply!} to hand the current values to
              # TokenKit.configure.
              class Config
                attr_accessor :strategy, :regex, :grapheme_extended, :min_gram, :max_gram, :delimiter, :split_on_chars, :lowercase, :remove_punctuation, :preserve_patterns

                # @return [Config] the lazily created shared instance
                def self.instance
                  @instance ||= new
                end

                # Sets every option except +regex+ to its default value.
                def initialize
                  @strategy = :unicode
                  @lowercase = true
                  @remove_punctuation = false
                  @preserve_patterns = []
                  @grapheme_extended = true
                  @min_gram = 2
                  @max_gram = 10
                  @delimiter = "/"
                  @split_on_chars = " \t\n\r"
                end

                # Passes the common options plus the options relevant to the current
                # strategy to TokenKit.configure.
                #
                # @return whatever TokenKit.configure returns
                def apply!
                  TokenKit.configure(shared_options.merge(strategy_options))
                end

                # @return [Hash{Symbol => Object}] all options that are not nil
                def to_h
                  names = %i[
                    strategy regex grapheme_extended min_gram max_gram
                    delimiter split_on_chars lowercase remove_punctuation preserve_patterns
                  ]
                  names.each_with_object({}) do |name, result|
                    value = public_send(name)
                    result[name] = value unless value.nil?
                  end
                end

                private

                # Options sent regardless of strategy. Preserve patterns are sent as strings.
                def shared_options
                  {
                    "strategy" => strategy.to_s,
                    "lowercase" => lowercase,
                    "remove_punctuation" => remove_punctuation,
                    "preserve_patterns" => preserve_patterns.map { |entry| pattern_to_string(entry) }
                  }
                end

                # Extra options that only apply to the selected strategy; empty for
                # strategies without extra options.
                def strategy_options
                  case strategy
                  when :pattern
                    # The regex is only sent when one has been set.
                    regex ? { "regex" => regex } : {}
                  when :grapheme
                    { "extended" => grapheme_extended }
                  when :edge_ngram, :ngram
                    { "min_gram" => min_gram, "max_gram" => max_gram }
                  when :path_hierarchy
                    { "delimiter" => delimiter }
                  when :char_group
                    { "split_on_chars" => split_on_chars }
                  else
                    {}
                  end
                end

                # Regexp objects are reduced to their source; anything else is stringified.
                def pattern_to_string(pattern)
                  return pattern.source if pattern.is_a?(Regexp)

                  pattern.to_s
                end
              end
            end
         | 
| @@ -0,0 +1,209 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require_relative 'regex_converter'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module TokenKit
         | 
| 6 | 
            +
              # Builder for creating immutable Configuration objects.
              #
              # Set the options through the accessors, then call {#build} to validate them
              # and obtain a Configuration.
              class ConfigBuilder
                attr_accessor :strategy, :lowercase, :remove_punctuation, :preserve_patterns
                attr_accessor :regex, :grapheme_extended, :min_gram, :max_gram
                attr_accessor :delimiter, :split_on_chars

                # Default values
                DEFAULTS = {
                  strategy: :unicode,
                  lowercase: true,
                  remove_punctuation: false,
                  preserve_patterns: [],
                  grapheme_extended: true,
                  min_gram: 2,
                  max_gram: 10,
                  delimiter: "/",
                  split_on_chars: " \t\n\r"
                }.freeze

                VALID_STRATEGIES = [
                  :unicode, :whitespace, :pattern, :sentence, :grapheme, :keyword,
                  :edge_ngram, :ngram, :path_hierarchy, :url_email, :char_group,
                  :letter, :lowercase
                ].freeze

                # @param base_config [#strategy, #lowercase, #remove_punctuation, #preserve_patterns, nil]
                #   configuration to copy from; when nil the builder starts from DEFAULTS
                def initialize(base_config = nil)
                  if base_config
                    # Copy from existing config
                    @strategy = base_config.strategy
                    @lowercase = base_config.lowercase
                    @remove_punctuation = base_config.remove_punctuation
                    @preserve_patterns = base_config.preserve_patterns.dup
                    @regex = base_config.instance_variable_get(:@regex) if base_config.instance_variable_defined?(:@regex)
                    # Fall back to the default only when the base has no value at all,
                    # so an explicit `false` (e.g. grapheme_extended) is carried over.
                    @grapheme_extended = inherited_setting(base_config, :grapheme_extended)
                    @min_gram = inherited_setting(base_config, :min_gram)
                    @max_gram = inherited_setting(base_config, :max_gram)
                    @delimiter = inherited_setting(base_config, :delimiter)
                    @split_on_chars = inherited_setting(base_config, :split_on_chars)
                  else
                    # Start with defaults; arrays are copied so DEFAULTS is never shared.
                    DEFAULTS.each do |key, value|
                      instance_variable_set("@#{key}", value.is_a?(Array) ? value.dup : value)
                    end
                  end
                end

                # Build an immutable Configuration object
                # @return [Configuration] The built configuration
                # @raise [Error] if configuration is invalid
                def build
                  validate!

                  config_hash = build_config_hash
                  Configuration.new(config_hash, self)
                end

                private

                # Reads the instance variable +name+ from +base_config+, substituting the
                # default only when the variable is nil or undefined.
                def inherited_setting(base_config, name)
                  value = base_config.instance_variable_get(:"@#{name}")
                  value.nil? ? DEFAULTS[name] : value
                end

                # Checks the strategy name and the options the selected strategy needs.
                # @raise [Error] on the first violated rule
                def validate!
                  # Validate strategy
                  unless VALID_STRATEGIES.include?(@strategy)
                    raise Error, "Invalid strategy: #{@strategy}. Valid strategies are: #{VALID_STRATEGIES.join(', ')}"
                  end

                  # Strategy-specific validations
                  case @strategy
                  when :pattern
                    raise Error, "Pattern strategy requires a regex" unless @regex
                    # Only string patterns are passed to the converter's validation.
                    if @regex.is_a?(String)
                      RegexConverter.validate!(@regex)
                    end
                  when :edge_ngram, :ngram
                    raise Error, "min_gram must be positive, got #{@min_gram}" if @min_gram < 1
                    raise Error, "max_gram (#{@max_gram}) must be >= min_gram (#{@min_gram})" if @max_gram < @min_gram
                  when :path_hierarchy
                    raise Error, "Path hierarchy requires a delimiter" if @delimiter.nil? || @delimiter.empty?
                  when :lowercase
                    # Warn if lowercase: false with :lowercase strategy
                    unless @lowercase
                      warn "Warning: The :lowercase strategy always lowercases text. The 'lowercase: false' setting will be ignored."
                    end
                  end
                end

                # Assembles the string-keyed hash handed to Configuration: the common
                # options plus those that belong to the selected strategy.
                def build_config_hash
                  config = {
                    "strategy" => @strategy.to_s,
                    "lowercase" => @lowercase,
                    "remove_punctuation" => @remove_punctuation,
                    "preserve_patterns" => RegexConverter.patterns_to_rust(@preserve_patterns)
                  }

                  # Add strategy-specific parameters
                  case @strategy
                  when :pattern
                    config["regex"] = @regex.is_a?(Regexp) ? RegexConverter.to_rust(@regex) : @regex.to_s
                  when :grapheme
                    config["extended"] = @grapheme_extended
                  when :edge_ngram, :ngram
                    config["min_gram"] = @min_gram
                    config["max_gram"] = @max_gram
                  when :path_hierarchy
                    config["delimiter"] = @delimiter
                  when :char_group
                    config["split_on_chars"] = @split_on_chars
                  end

                  config
                end
              end
         | 
| 116 | 
            +
             | 
| 117 | 
            +
              # Immutable configuration object
         | 
| 118 | 
            +
              class Configuration
         | 
| 119 | 
            +
                attr_reader :strategy, :lowercase, :remove_punctuation, :preserve_patterns
         | 
| 120 | 
            +
                attr_reader :regex, :grapheme_extended, :min_gram, :max_gram, :delimiter, :split_on_chars
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                def initialize(config_hash, builder = nil)
         | 
| 123 | 
            +
                  @strategy = config_hash["strategy"]&.to_sym || :unicode
         | 
| 124 | 
            +
                  @lowercase = config_hash.fetch("lowercase", true)
         | 
| 125 | 
            +
                  @remove_punctuation = config_hash.fetch("remove_punctuation", false)
         | 
| 126 | 
            +
                  @raw_hash = config_hash.freeze
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                  # Store builder data for creating new builders from this config
         | 
| 129 | 
            +
                  if builder
         | 
| 130 | 
            +
                    # Store original Ruby patterns, not the converted strings
         | 
| 131 | 
            +
                    @preserve_patterns = builder.preserve_patterns.freeze
         | 
| 132 | 
            +
                    @regex = builder.regex
         | 
| 133 | 
            +
                    @grapheme_extended = builder.grapheme_extended
         | 
| 134 | 
            +
                    @min_gram = builder.min_gram
         | 
| 135 | 
            +
                    @max_gram = builder.max_gram
         | 
| 136 | 
            +
                    @delimiter = builder.delimiter
         | 
| 137 | 
            +
                    @split_on_chars = builder.split_on_chars
         | 
| 138 | 
            +
                  else
         | 
| 139 | 
            +
                    # Extract from raw_hash for backward compatibility
         | 
| 140 | 
            +
                    @preserve_patterns = config_hash.fetch("preserve_patterns", []).freeze
         | 
| 141 | 
            +
                    @regex = config_hash["regex"]
         | 
| 142 | 
            +
                    @grapheme_extended = config_hash.fetch("extended", ConfigBuilder::DEFAULTS[:grapheme_extended])
         | 
| 143 | 
            +
                    @min_gram = config_hash.fetch("min_gram", ConfigBuilder::DEFAULTS[:min_gram])
         | 
| 144 | 
            +
                    @max_gram = config_hash.fetch("max_gram", ConfigBuilder::DEFAULTS[:max_gram])
         | 
| 145 | 
            +
                    @delimiter = config_hash.fetch("delimiter", ConfigBuilder::DEFAULTS[:delimiter])
         | 
| 146 | 
            +
                    @split_on_chars = config_hash.fetch("split_on_chars", ConfigBuilder::DEFAULTS[:split_on_chars])
         | 
| 147 | 
            +
                  end
         | 
| 148 | 
            +
                end
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                # Create a new builder initialized with this configuration
         | 
| 151 | 
            +
                def to_builder
         | 
| 152 | 
            +
                  ConfigBuilder.new(self)
         | 
| 153 | 
            +
                end
         | 
| 154 | 
            +
             | 
| 155 | 
            +
                # Strategy-specific accessors
         | 
| 156 | 
            +
                def pattern?
         | 
| 157 | 
            +
                  strategy == :pattern
         | 
| 158 | 
            +
                end
         | 
| 159 | 
            +
             | 
| 160 | 
            +
                def grapheme?
         | 
| 161 | 
            +
                  strategy == :grapheme
         | 
| 162 | 
            +
                end
         | 
| 163 | 
            +
             | 
| 164 | 
            +
                def extended
         | 
| 165 | 
            +
                  @grapheme_extended
         | 
| 166 | 
            +
                end
         | 
| 167 | 
            +
             | 
| 168 | 
            +
                def edge_ngram?
         | 
| 169 | 
            +
                  strategy == :edge_ngram
         | 
| 170 | 
            +
                end
         | 
| 171 | 
            +
             | 
| 172 | 
            +
                def ngram?
         | 
| 173 | 
            +
                  strategy == :ngram
         | 
| 174 | 
            +
                end
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                def path_hierarchy?
         | 
| 177 | 
            +
                  strategy == :path_hierarchy
         | 
| 178 | 
            +
                end
         | 
| 179 | 
            +
             | 
| 180 | 
            +
                # True when the configured strategy is :char_group.
                def char_group?
                  active = strategy
                  active == :char_group
                end
         | 
| 183 | 
            +
             | 
| 184 | 
            +
                # True when the configured strategy is :letter.
                def letter?
                  active = strategy
                  active == :letter
                end
         | 
| 187 | 
            +
             | 
| 188 | 
            +
                # True when the configured strategy is :lowercase. This checks the
                # strategy only; it does not read the `lowercase` option.
                def lowercase?
                  active = strategy
                  active == :lowercase
                end
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                # Returns a shallow copy (via #dup) of the raw configuration hash.
                def to_h
                  snapshot = @raw_hash.dup
                  snapshot
                end
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                # Returns the raw configuration hash itself (no copy is made).
                # NOTE(review): presumably this is what gets handed to the native
                # extension -- confirm against the caller.
                def to_rust_config
                  payload = @raw_hash
                  payload
                end
         | 
| 199 | 
            +
             | 
| 200 | 
            +
                def inspect
         | 
| 201 | 
            +
                  "#<TokenKit::Configuration strategy=#{strategy} lowercase=#{lowercase} remove_punctuation=#{remove_punctuation}>"
         | 
| 202 | 
            +
                end
         | 
| 203 | 
            +
             | 
| 204 | 
            +
                # Check equality with another configuration
         | 
| 205 | 
            +
                def ==(other)
                  # Anything that is not a Configuration can never be equal.
                  return false unless other.is_a?(Configuration)

                  # Two configurations are equal when their hash forms match.
                  to_h == other.to_h
                end
         | 
| 208 | 
            +
              end
         | 
| 209 | 
            +
            end
         | 
| @@ -0,0 +1,52 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module TokenKit
              # Compatibility wrapper that mimics the old Config singleton API.
              # This allows us to migrate gradually.
              #
              # Writes are collected in a temporary builder (created lazily from
              # TokenKit.config_hash.to_builder); reads are served from that
              # builder while it exists, otherwise from TokenKit.config_hash.
              # Names that neither object responds to are ignored and yield nil.
              class Config
                # Attribute names advertised through respond_to?.
                ATTRIBUTES = %i[
                  strategy lowercase remove_punctuation preserve_patterns
                  regex grapheme_extended min_gram max_gram
                  delimiter split_on_chars
                ].freeze

                # Setter and getter names advertised through respond_to?.
                # Built once instead of on every respond_to_missing? call.
                DELEGATED = (ATTRIBUTES.map { |name| :"#{name}=" } + ATTRIBUTES).freeze

                # Singleton pattern for backward compatibility.
                #
                # @return [Config] the shared instance
                def self.instance
                  @instance ||= new
                end

                # Delegate all accessors to the global config builder.
                #
                # @param method [Symbol] name of the accessor that was called
                # @param args [Array] arguments forwarded to setters
                # @return [Object, nil] the delegate's result, or nil when no
                #   delegate responds to +method+
                def method_missing(method, *args, &block)
                  if method.to_s.end_with?('=')
                    # Setter - store in temporary builder
                    @temp_builder ||= TokenKit.config_hash.to_builder
                    @temp_builder.send(method, *args, &block) if @temp_builder.respond_to?(method)
                  elsif @temp_builder && @temp_builder.respond_to?(method)
                    # Getter - pending (not yet built) values win
                    @temp_builder.send(method)
                  else
                    # Getter - fall back to the current configuration. Fetch it
                    # once so respond_to? and send hit the same object.
                    current = TokenKit.config_hash
                    current.send(method) if current.respond_to?(method)
                  end
                end

                # Reports the known configuration accessors as supported.
                #
                # @param method [Symbol] method name being queried
                # @param include_private [Boolean] forwarded to super
                # @return [Boolean]
                def respond_to_missing?(method, include_private = false)
                  # Avoid infinite recursion by checking a static list instead of config
                  return true if DELEGATED.include?(method)
                  super
                end

                # Called by TokenKit.configure to get the built config.
                #
                # @return [Object] the pending builder, or a fresh one derived
                #   from TokenKit.config_hash when nothing was set
                def build_config
                  builder = @temp_builder || TokenKit.config_hash.to_builder
                  @temp_builder = nil  # Clear after building
                  builder
                end

                # Reset temporary builder, discarding any pending changes.
                def reset_temp
                  @temp_builder = nil
                end
              end
            end
         | 
| @@ -0,0 +1,194 @@ | |
| 1 | 
            +
            module TokenKit
              # Immutable configuration object representing tokenizer settings.
              #
              # This class provides read-only access to configuration values and
              # convenient predicate methods for checking the current strategy.
              #
              # @example Access configuration
              #   config = TokenKit.config_hash
              #   config.strategy           # => :unicode
              #   config.lowercase          # => true
              #   config.preserve_patterns  # => [/\d+mg/i]
              #
              # @example Check strategy type
              #   config.unicode?           # => true
              #   config.edge_ngram?        # => false
              #
              class Configuration
                # @return [Symbol] The tokenization strategy
                attr_reader :strategy

                # @return [Boolean] Whether to lowercase tokens
                attr_reader :lowercase

                # @return [Boolean] Whether to remove punctuation
                attr_reader :remove_punctuation

                # @return [Array<Regexp>] Patterns to preserve from modification
                attr_reader :preserve_patterns

                # Creates a new configuration from a hash.
                #
                # The hash is shallow-copied so that later changes to the caller's
                # hash cannot alter this object, and the pattern list is copied
                # before freezing so the caller's own array is left untouched.
                #
                # @param config_hash [Hash] Configuration values from Rust
                # @api private
                #
                def initialize(config_hash)
                  @raw_hash = config_hash.dup
                  @strategy = @raw_hash["strategy"]&.to_sym || :unicode
                  @lowercase = @raw_hash.fetch("lowercase", true)
                  @remove_punctuation = @raw_hash.fetch("remove_punctuation", false)
                  @preserve_patterns = @raw_hash.fetch("preserve_patterns", []).dup.freeze
                end

                # @return [Boolean] true if using pattern tokenization strategy
                def pattern?
                  strategy == :pattern
                end

                # @return [String, nil] The regex pattern for pattern strategy
                def regex
                  @raw_hash["regex"]
                end

                # @return [Boolean] true if using grapheme tokenization strategy
                def grapheme?
                  strategy == :grapheme
                end

                # @return [Boolean, nil] Whether to use extended grapheme clusters
                def extended
                  @raw_hash["extended"]
                end

                # @return [Boolean] true if using edge n-gram tokenization strategy
                def edge_ngram?
                  strategy == :edge_ngram
                end

                # @return [Integer, nil] Minimum n-gram size for n-gram strategies
                def min_gram
                  @raw_hash["min_gram"]
                end

                # @return [Integer, nil] Maximum n-gram size for n-gram strategies
                def max_gram
                  @raw_hash["max_gram"]
                end

                # @return [Boolean] true if using path hierarchy tokenization strategy
                def path_hierarchy?
                  strategy == :path_hierarchy
                end

                # @return [String, nil] Delimiter for path hierarchy strategy
                def delimiter
                  @raw_hash["delimiter"]
                end

                # @return [Boolean] true if using n-gram tokenization strategy
                def ngram?
                  strategy == :ngram
                end

                # @return [Boolean] true if using character group tokenization strategy
                def char_group?
                  strategy == :char_group
                end

                # @return [String, nil] Characters to split on for char_group strategy
                def split_on_chars
                  @raw_hash["split_on_chars"]
                end

                # @return [Boolean] true if using letter tokenization strategy
                def letter?
                  strategy == :letter
                end

                # @return [Boolean] true if using lowercase tokenization strategy
                def lowercase?
                  strategy == :lowercase
                end

                # @return [Boolean] true if using unicode tokenization strategy
                def unicode?
                  strategy == :unicode
                end

                # @return [Boolean] true if using whitespace tokenization strategy
                def whitespace?
                  strategy == :whitespace
                end

                # @return [Boolean] true if using sentence tokenization strategy
                def sentence?
                  strategy == :sentence
                end

                # @return [Boolean] true if using keyword tokenization strategy
                def keyword?
                  strategy == :keyword
                end

                # @return [Boolean] true if using url_email tokenization strategy
                def url_email?
                  strategy == :url_email
                end

                # Converts configuration to a hash.
                #
                # @return [Hash] Configuration as a hash (a shallow copy)
                #
                # @example
                #   config.to_h
                #   # => {"strategy" => "unicode", "lowercase" => true, ...}
                #
                def to_h
                  @raw_hash.dup
                end

                # Returns a string representation of the configuration.
                #
                # @return [String] Human-readable configuration summary
                #
                def inspect
                  "#<TokenKit::Configuration strategy=#{strategy} lowercase=#{lowercase} remove_punctuation=#{remove_punctuation}>"
                end

                # Converts configuration to format expected by Rust.
                #
                # Returns this object's internal hash without copying it.
                #
                # @return [Hash] Configuration hash for Rust FFI
                # @api private
                #
                def to_rust_config
                  @raw_hash
                end

                # Creates a ConfigBuilder from this configuration for modification.
                #
                # @return [ConfigBuilder] A builder initialized with this configuration
                #
                # @example
                #   builder = config.to_builder
                #   builder.lowercase = false
                #   new_config = builder.build
                #
                def to_builder
                  builder = ConfigBuilder.new
                  builder.strategy = strategy
                  builder.lowercase = lowercase
                  builder.remove_punctuation = remove_punctuation
                  builder.preserve_patterns = preserve_patterns.dup

                  # Copy strategy-specific settings
                  builder.regex = regex if pattern?
                  builder.extended = extended if grapheme?
                  builder.min_gram = min_gram if edge_ngram? || ngram?
                  builder.max_gram = max_gram if edge_ngram? || ngram?
                  builder.delimiter = delimiter if path_hierarchy?
                  builder.split_on_chars = split_on_chars if char_group?

                  builder
                end
              end
            end
         |