tokenkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.standard.yml +3 -0
  4. data/.yardopts +12 -0
  5. data/CODE_OF_CONDUCT.md +132 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +644 -0
  8. data/Rakefile +18 -0
  9. data/benchmarks/cache_test.rb +63 -0
  10. data/benchmarks/final_comparison.rb +83 -0
  11. data/benchmarks/tokenizer_benchmark.rb +250 -0
  12. data/docs/ARCHITECTURE.md +469 -0
  13. data/docs/PERFORMANCE.md +382 -0
  14. data/docs/README.md +118 -0
  15. data/ext/tokenkit/Cargo.toml +21 -0
  16. data/ext/tokenkit/extconf.rb +4 -0
  17. data/ext/tokenkit/src/config.rs +37 -0
  18. data/ext/tokenkit/src/error.rs +67 -0
  19. data/ext/tokenkit/src/lib.rs +346 -0
  20. data/ext/tokenkit/src/tokenizer/base.rs +41 -0
  21. data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
  22. data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
  23. data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
  24. data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
  25. data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
  26. data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
  27. data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
  28. data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
  29. data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
  30. data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
  31. data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
  32. data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
  33. data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
  34. data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
  35. data/lib/tokenkit/config.rb +74 -0
  36. data/lib/tokenkit/config_builder.rb +209 -0
  37. data/lib/tokenkit/config_compat.rb +52 -0
  38. data/lib/tokenkit/configuration.rb +194 -0
  39. data/lib/tokenkit/regex_converter.rb +58 -0
  40. data/lib/tokenkit/version.rb +5 -0
  41. data/lib/tokenkit.rb +336 -0
  42. data/sig/tokenkit.rbs +4 -0
  43. metadata +172 -0
@@ -0,0 +1,108 @@
1
+ use super::{merge_overlapping_spans, post_process, Tokenizer};
2
+ use crate::config::TokenizerConfig;
3
+ use linkify::{LinkFinder, LinkKind};
4
+ use regex::Regex;
5
+ use unicode_segmentation::UnicodeSegmentation;
6
+
7
/// Tokenizer that emits URLs, e-mail addresses and preserve-pattern matches
/// as single tokens and word-splits the text between them.
pub struct UrlEmailTokenizer {
    /// Tokenizer settings; the `lowercase` flag is consulted directly and the
    /// whole config is handed to `post_process`.
    config: TokenizerConfig,
    /// Compiled regexes built from `config.preserve_patterns`.
    preserve_patterns: Vec<Regex>,
}
11
+
12
+ impl UrlEmailTokenizer {
13
+ pub fn new(config: TokenizerConfig) -> Self {
14
+ let preserve_patterns = config
15
+ .preserve_patterns
16
+ .iter()
17
+ .filter_map(|p| Regex::new(p).ok())
18
+ .collect();
19
+
20
+ Self {
21
+ config,
22
+ preserve_patterns,
23
+ }
24
+ }
25
+
26
+ fn extract_url_email_spans(&self, text: &str) -> Vec<(usize, usize, String)> {
27
+ let finder = LinkFinder::new();
28
+ let mut spans = Vec::new();
29
+
30
+ for link in finder.links(text) {
31
+ match link.kind() {
32
+ LinkKind::Url | LinkKind::Email => {
33
+ let (start, end) = (link.start(), link.end());
34
+ spans.push((start, end, link.as_str().to_string()));
35
+ }
36
+ _ => {}
37
+ }
38
+ }
39
+
40
+ spans
41
+ }
42
+ }
43
+
44
+ impl Tokenizer for UrlEmailTokenizer {
45
+ fn tokenize(&self, text: &str) -> Vec<String> {
46
+ let mut spans = self.extract_url_email_spans(text);
47
+
48
+ // Add preserve_pattern matches to spans
49
+ for pattern in &self.preserve_patterns {
50
+ for mat in pattern.find_iter(text) {
51
+ spans.push((mat.start(), mat.end(), mat.as_str().to_string()));
52
+ }
53
+ }
54
+
55
+ // Merge overlapping spans to handle conflicts
56
+ let spans = if !spans.is_empty() {
57
+ merge_overlapping_spans(spans)
58
+ } else {
59
+ spans
60
+ };
61
+
62
+ if spans.is_empty() {
63
+ let tokens: Vec<String> = text
64
+ .unicode_words()
65
+ .map(|s| s.to_string())
66
+ .collect();
67
+ return post_process(tokens, &self.config);
68
+ }
69
+
70
+ let mut result = Vec::new();
71
+ let mut pos = 0;
72
+
73
+ for (start, end, url_or_email) in spans {
74
+ if start > pos {
75
+ let before = &text[pos..start];
76
+ let before_tokens: Vec<String> = before
77
+ .unicode_words()
78
+ .map(|s| s.to_string())
79
+ .collect();
80
+ let before_tokens = post_process(before_tokens, &self.config);
81
+ result.extend(before_tokens);
82
+ }
83
+
84
+ // Don't lowercase preserved patterns, but do lowercase URLs/emails if config says so
85
+ // unless they are from preserve_patterns
86
+ let preserved = if self.config.lowercase && !self.preserve_patterns.iter().any(|p| p.is_match(&url_or_email)) {
87
+ url_or_email.to_lowercase()
88
+ } else {
89
+ url_or_email
90
+ };
91
+ result.push(preserved);
92
+ pos = end;
93
+ }
94
+
95
+ if pos < text.len() {
96
+ let remaining = &text[pos..];
97
+ let remaining_tokens: Vec<String> = remaining
98
+ .unicode_words()
99
+ .map(|s| s.to_string())
100
+ .collect();
101
+ let remaining_tokens = post_process(remaining_tokens, &self.config);
102
+ result.extend(remaining_tokens);
103
+ }
104
+
105
+ result
106
+ }
107
+
108
+ }
@@ -0,0 +1,31 @@
1
+ use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
2
+ use crate::config::TokenizerConfig;
3
+
4
/// Tokenizer that splits text on Unicode whitespace.
pub struct WhitespaceTokenizer {
    /// Shared tokenizer state; supplies the config and the preserve patterns
    /// used by `tokenize`.
    base: BaseTokenizerFields,
}
7
+
8
+ impl WhitespaceTokenizer {
9
+ pub fn new(config: TokenizerConfig) -> Self {
10
+ Self {
11
+ base: BaseTokenizerFields::new(config),
12
+ }
13
+ }
14
+ }
15
+
16
+ impl Tokenizer for WhitespaceTokenizer {
17
+ fn tokenize(&self, text: &str) -> Vec<String> {
18
+ let tokens: Vec<String> = text
19
+ .split_whitespace()
20
+ .filter(|s| !s.is_empty())
21
+ .map(|s| s.to_string())
22
+ .collect();
23
+
24
+ if self.base.has_preserve_patterns() {
25
+ apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
26
+ } else {
27
+ post_process(tokens, &self.base.config)
28
+ }
29
+ }
30
+
31
+ }
@@ -0,0 +1,74 @@
1
+ module TokenKit
2
# Mutable tokenizer settings with a lazily created shared instance.
#
# Set the attributes, then call #apply! to push them to TokenKit.configure.
class Config
  attr_accessor :strategy, :regex, :grapheme_extended, :min_gram, :max_gram, :delimiter, :split_on_chars, :lowercase, :remove_punctuation, :preserve_patterns

  # @return [Config] the shared instance, created on first use
  def self.instance
    @instance ||= new
  end

  # Initializes every setting except +regex+ to its default.
  def initialize
    @strategy = :unicode
    @lowercase = true
    @remove_punctuation = false
    @preserve_patterns = []
    @grapheme_extended = true
    @min_gram = 2
    @max_gram = 10
    @delimiter = "/"
    @split_on_chars = " \t\n\r"
  end

  # Builds the string-keyed settings hash and passes it to TokenKit.configure.
  #
  # Strategy-specific keys are only included for the strategy that uses them.
  #
  # @return whatever TokenKit.configure returns
  def apply!
    config_hash = {
      "strategy" => strategy.to_s,
      "lowercase" => lowercase,
      "remove_punctuation" => remove_punctuation,
      "preserve_patterns" => preserve_patterns.map { |p| pattern_to_string(p) }
    }

    case strategy
    when :pattern
      # Stringify like preserve_patterns so a Regexp object is never passed on.
      config_hash["regex"] = pattern_to_string(regex) if regex
    when :grapheme
      config_hash["extended"] = grapheme_extended
    when :edge_ngram, :ngram
      config_hash["min_gram"] = min_gram
      config_hash["max_gram"] = max_gram
    when :path_hierarchy
      config_hash["delimiter"] = delimiter
    when :char_group
      config_hash["split_on_chars"] = split_on_chars
    end

    TokenKit.configure(config_hash)
  end

  # @return [Hash{Symbol => Object}] all settings, with nil values omitted
  def to_h
    {
      strategy: strategy,
      regex: regex,
      grapheme_extended: grapheme_extended,
      min_gram: min_gram,
      max_gram: max_gram,
      delimiter: delimiter,
      split_on_chars: split_on_chars,
      lowercase: lowercase,
      remove_punctuation: remove_punctuation,
      preserve_patterns: preserve_patterns
    }.compact
  end

  private

  # Returns the pattern source for a Regexp (Regexp#source drops options such
  # as /i), or +to_s+ for anything else.
  def pattern_to_string(pattern)
    pattern.is_a?(Regexp) ? pattern.source : pattern.to_s
  end
end
74
+ end
@@ -0,0 +1,209 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'regex_converter'
4
+
5
+ module TokenKit
6
# Mutable builder that validates tokenizer settings and turns them into a
# Configuration via #build.
class ConfigBuilder
  attr_accessor :strategy, :lowercase, :remove_punctuation, :preserve_patterns
  attr_accessor :regex, :grapheme_extended, :min_gram, :max_gram
  attr_accessor :delimiter, :split_on_chars

  # Default values
  DEFAULTS = {
    strategy: :unicode,
    lowercase: true,
    remove_punctuation: false,
    preserve_patterns: [],
    grapheme_extended: true,
    min_gram: 2,
    max_gram: 10,
    delimiter: "/",
    split_on_chars: " \t\n\r"
  }.freeze

  VALID_STRATEGIES = [
    :unicode, :whitespace, :pattern, :sentence, :grapheme, :keyword,
    :edge_ngram, :ngram, :path_hierarchy, :url_email, :char_group,
    :letter, :lowercase
  ].freeze

  # @param base_config [#strategy, #lowercase, #remove_punctuation, #preserve_patterns, nil]
  #   configuration to copy from; when nil the builder starts from DEFAULTS
  def initialize(base_config = nil)
    if base_config
      # Copy from existing config
      @strategy = base_config.strategy
      @lowercase = base_config.lowercase
      @remove_punctuation = base_config.remove_punctuation
      @preserve_patterns = base_config.preserve_patterns.dup
      @regex = base_config.instance_variable_get(:@regex) if base_config.instance_variable_defined?(:@regex)

      # Fall back to the default only when the base has no value (nil).
      # Using `||` here would turn an explicit `false` back into the default.
      %i[grapheme_extended min_gram max_gram delimiter split_on_chars].each do |key|
        inherited = base_config.instance_variable_get("@#{key}")
        instance_variable_set("@#{key}", inherited.nil? ? DEFAULTS[key] : inherited)
      end
    else
      # Start with defaults; arrays are duplicated so DEFAULTS is never shared.
      DEFAULTS.each do |key, value|
        instance_variable_set("@#{key}", value.is_a?(Array) ? value.dup : value)
      end
    end
  end

  # Build an immutable Configuration object
  # @return [Configuration] The built configuration
  # @raise [Error] if configuration is invalid
  def build
    validate!

    config_hash = build_config_hash
    Configuration.new(config_hash, self)
  end

  private

  # Checks the strategy and its strategy-specific settings.
  # @raise [Error] on an unknown strategy or an invalid strategy-specific value
  def validate!
    # Validate strategy
    unless VALID_STRATEGIES.include?(@strategy)
      raise Error, "Invalid strategy: #{@strategy}. Valid strategies are: #{VALID_STRATEGIES.join(', ')}"
    end

    # Strategy-specific validations
    case @strategy
    when :pattern
      raise Error, "Pattern strategy requires a regex" unless @regex
      if @regex.is_a?(String)
        RegexConverter.validate!(@regex)
      end
    when :edge_ngram, :ngram
      raise Error, "min_gram must be positive, got #{@min_gram}" if @min_gram < 1
      raise Error, "max_gram (#{@max_gram}) must be >= min_gram (#{@min_gram})" if @max_gram < @min_gram
    when :path_hierarchy
      raise Error, "Path hierarchy requires a delimiter" if @delimiter.nil? || @delimiter.empty?
    when :lowercase
      # Warn if lowercase: false with :lowercase strategy
      if !@lowercase
        warn "Warning: The :lowercase strategy always lowercases text. The 'lowercase: false' setting will be ignored."
      end
    end
  end

  # Assembles the string-keyed hash handed to Configuration; strategy-specific
  # keys are only present for the strategy that uses them.
  def build_config_hash
    config = {
      "strategy" => @strategy.to_s,
      "lowercase" => @lowercase,
      "remove_punctuation" => @remove_punctuation,
      "preserve_patterns" => RegexConverter.patterns_to_rust(@preserve_patterns)
    }

    # Add strategy-specific parameters
    case @strategy
    when :pattern
      config["regex"] = @regex.is_a?(Regexp) ? RegexConverter.to_rust(@regex) : @regex.to_s
    when :grapheme
      config["extended"] = @grapheme_extended
    when :edge_ngram, :ngram
      config["min_gram"] = @min_gram
      config["max_gram"] = @max_gram
    when :path_hierarchy
      config["delimiter"] = @delimiter
    when :char_group
      config["split_on_chars"] = @split_on_chars
    end

    config
  end
end
116
+
117
# Immutable configuration object.
#
# Built either from a ConfigBuilder (which supplies the original Ruby-side
# values) or from a plain string-keyed hash.
class Configuration
  attr_reader :strategy, :lowercase, :remove_punctuation, :preserve_patterns
  attr_reader :regex, :grapheme_extended, :min_gram, :max_gram, :delimiter, :split_on_chars

  # @param config_hash [Hash{String => Object}] string-keyed settings
  # @param builder [ConfigBuilder, nil] builder the hash was produced from, if any
  def initialize(config_hash, builder = nil)
    @strategy = config_hash["strategy"]&.to_sym || :unicode
    @lowercase = config_hash.fetch("lowercase", true)
    @remove_punctuation = config_hash.fetch("remove_punctuation", false)
    # Freeze a private copy so the caller's hash stays usable.
    @raw_hash = config_hash.dup.freeze

    # Store builder data for creating new builders from this config
    if builder
      # Store original Ruby patterns, not the converted strings. Duplicate
      # before freezing so the builder's own array stays mutable.
      @preserve_patterns = builder.preserve_patterns.dup.freeze
      @regex = builder.regex
      @grapheme_extended = builder.grapheme_extended
      @min_gram = builder.min_gram
      @max_gram = builder.max_gram
      @delimiter = builder.delimiter
      @split_on_chars = builder.split_on_chars
    else
      # Extract from raw_hash for backward compatibility
      @preserve_patterns = config_hash.fetch("preserve_patterns", []).dup.freeze
      @regex = config_hash["regex"]
      @grapheme_extended = config_hash.fetch("extended", ConfigBuilder::DEFAULTS[:grapheme_extended])
      @min_gram = config_hash.fetch("min_gram", ConfigBuilder::DEFAULTS[:min_gram])
      @max_gram = config_hash.fetch("max_gram", ConfigBuilder::DEFAULTS[:max_gram])
      @delimiter = config_hash.fetch("delimiter", ConfigBuilder::DEFAULTS[:delimiter])
      @split_on_chars = config_hash.fetch("split_on_chars", ConfigBuilder::DEFAULTS[:split_on_chars])
    end
  end

  # Create a new builder initialized with this configuration
  def to_builder
    ConfigBuilder.new(self)
  end

  # Strategy predicates: each is true when #strategy equals the named strategy.
  def pattern?
    strategy == :pattern
  end

  def grapheme?
    strategy == :grapheme
  end

  # @return the grapheme_extended setting
  def extended
    @grapheme_extended
  end

  def edge_ngram?
    strategy == :edge_ngram
  end

  def ngram?
    strategy == :ngram
  end

  def path_hierarchy?
    strategy == :path_hierarchy
  end

  def char_group?
    strategy == :char_group
  end

  def letter?
    strategy == :letter
  end

  # True when the *strategy* is :lowercase; the lowercase flag itself is #lowercase.
  def lowercase?
    strategy == :lowercase
  end

  def unicode?
    strategy == :unicode
  end

  def whitespace?
    strategy == :whitespace
  end

  def sentence?
    strategy == :sentence
  end

  def keyword?
    strategy == :keyword
  end

  def url_email?
    strategy == :url_email
  end

  # @return [Hash] unfrozen shallow copy of the string-keyed settings
  def to_h
    @raw_hash.dup
  end

  # @return [Hash] the frozen string-keyed settings hash
  def to_rust_config
    @raw_hash
  end

  def inspect
    "#<TokenKit::Configuration strategy=#{strategy} lowercase=#{lowercase} remove_punctuation=#{remove_punctuation}>"
  end

  # Check equality with another configuration
  def ==(other)
    other.is_a?(Configuration) && to_h == other.to_h
  end
end
209
+ end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
+ module TokenKit
4
# Compatibility wrapper that mimics the old Config singleton API
# This allows us to migrate gradually
class Config
  # Attribute names forwarded to the builder / current configuration.
  DELEGATED_ATTRIBUTES = %i[
    strategy lowercase remove_punctuation preserve_patterns
    regex grapheme_extended min_gram max_gram
    delimiter split_on_chars
  ].freeze

  # Readers and writers this wrapper claims to respond to.
  DELEGATED_METHODS = (DELEGATED_ATTRIBUTES.map { |name| :"#{name}=" } + DELEGATED_ATTRIBUTES).freeze

  # Singleton pattern for backward compatibility
  def self.instance
    @instance ||= new
  end

  # Delegate all accessors to the global config builder.
  #
  # Setters go to a temporary builder created from TokenKit.config_hash on
  # first use. Getters read from that builder when it exists, otherwise from
  # TokenKit.config_hash. A method neither target responds to yields nil.
  def method_missing(method, *args, &block)
    if method.to_s.end_with?('=')
      # Setter - store in temporary builder
      @temp_builder ||= TokenKit.config_hash.to_builder
      @temp_builder.send(method, *args, &block) if @temp_builder.respond_to?(method)
    elsif @temp_builder && @temp_builder.respond_to?(method)
      # Getter - pending value from the temp builder
      @temp_builder.send(method)
    elsif TokenKit.config_hash.respond_to?(method)
      # Getter - value from the current config
      TokenKit.config_hash.send(method)
    end
  end

  def respond_to_missing?(method, include_private = false)
    # Avoid infinite recursion by checking a fixed list instead of the config
    return true if DELEGATED_METHODS.include?(method)
    super
  end

  # Called by TokenKit.configure to get the built config
  #
  # @return the pending builder, or a fresh one from TokenKit.config_hash;
  #   the pending builder is cleared either way
  def build_config
    builder = @temp_builder || TokenKit.config_hash.to_builder
    @temp_builder = nil # Clear after building
    builder
  end

  # Reset temporary builder
  def reset_temp
    @temp_builder = nil
  end
end
52
+ end
@@ -0,0 +1,194 @@
1
+ module TokenKit
2
# Immutable configuration object representing tokenizer settings.
#
# This class provides read-only access to configuration values and
# convenient predicate methods for checking the current strategy.
#
# @example Access configuration
#   config = TokenKit.config_hash
#   config.strategy  # => :unicode
#   config.lowercase # => true
#   config.preserve_patterns # => [/\d+mg/i]
#
# @example Check strategy type
#   config.unicode?    # => true
#   config.edge_ngram? # => false
#
class Configuration
  # @return [Symbol] The tokenization strategy
  attr_reader :strategy

  # @return [Boolean] Whether to lowercase tokens
  attr_reader :lowercase

  # @return [Boolean] Whether to remove punctuation
  attr_reader :remove_punctuation

  # @return [Array<Regexp>] Patterns to preserve from modification
  attr_reader :preserve_patterns

  # Creates a new configuration from a hash.
  #
  # @param config_hash [Hash] Configuration values from Rust
  # @api private
  #
  def initialize(config_hash)
    @strategy = config_hash["strategy"]&.to_sym || :unicode
    @lowercase = config_hash.fetch("lowercase", true)
    @remove_punctuation = config_hash.fetch("remove_punctuation", false)
    @preserve_patterns = config_hash.fetch("preserve_patterns", []).freeze
    @raw_hash = config_hash
  end

  # @return [Boolean] true if using pattern tokenization strategy
  def pattern?
    strategy == :pattern
  end

  # @return [String, nil] The regex pattern for pattern strategy
  def regex
    @raw_hash["regex"]
  end

  # @return [Boolean] true if using grapheme tokenization strategy
  def grapheme?
    strategy == :grapheme
  end

  # @return [Boolean, nil] Whether to use extended grapheme clusters
  def extended
    @raw_hash["extended"]
  end

  # @return [Boolean] true if using edge n-gram tokenization strategy
  def edge_ngram?
    strategy == :edge_ngram
  end

  # @return [Integer, nil] Minimum n-gram size for n-gram strategies
  def min_gram
    @raw_hash["min_gram"]
  end

  # @return [Integer, nil] Maximum n-gram size for n-gram strategies
  def max_gram
    @raw_hash["max_gram"]
  end

  # @return [Boolean] true if using path hierarchy tokenization strategy
  def path_hierarchy?
    strategy == :path_hierarchy
  end

  # @return [String, nil] Delimiter for path hierarchy strategy
  def delimiter
    @raw_hash["delimiter"]
  end

  # @return [Boolean] true if using n-gram tokenization strategy
  def ngram?
    strategy == :ngram
  end

  # @return [Boolean] true if using character group tokenization strategy
  def char_group?
    strategy == :char_group
  end

  # @return [String, nil] Characters to split on for char_group strategy
  def split_on_chars
    @raw_hash["split_on_chars"]
  end

  # @return [Boolean] true if using letter tokenization strategy
  def letter?
    strategy == :letter
  end

  # @return [Boolean] true if using lowercase tokenization strategy
  def lowercase?
    strategy == :lowercase
  end

  # @return [Boolean] true if using unicode tokenization strategy
  def unicode?
    strategy == :unicode
  end

  # @return [Boolean] true if using whitespace tokenization strategy
  def whitespace?
    strategy == :whitespace
  end

  # @return [Boolean] true if using sentence tokenization strategy
  def sentence?
    strategy == :sentence
  end

  # @return [Boolean] true if using keyword tokenization strategy
  def keyword?
    strategy == :keyword
  end

  # @return [Boolean] true if using url_email tokenization strategy
  def url_email?
    strategy == :url_email
  end

  # Converts configuration to a hash.
  #
  # @return [Hash] Configuration as a hash
  #
  # @example
  #   config.to_h
  #   # => {"strategy" => "unicode", "lowercase" => true, ...}
  #
  def to_h
    @raw_hash.dup
  end

  # Returns a string representation of the configuration.
  #
  # @return [String] Human-readable configuration summary
  #
  def inspect
    "#<TokenKit::Configuration strategy=#{strategy} lowercase=#{lowercase} remove_punctuation=#{remove_punctuation}>"
  end

  # Converts configuration to format expected by Rust.
  #
  # @return [Hash] Configuration hash for Rust FFI
  # @api private
  #
  def to_rust_config
    @raw_hash
  end

  # Creates a ConfigBuilder from this configuration for modification.
  #
  # Strategy-specific values are copied only for the active strategy and only
  # when present in the underlying hash, so the builder keeps its own defaults
  # for anything missing.
  #
  # @return [ConfigBuilder] A builder initialized with this configuration
  #
  # @example
  #   builder = config.to_builder
  #   builder.lowercase = false
  #   new_config = builder.build
  #
  def to_builder
    builder = ConfigBuilder.new
    builder.strategy = strategy
    builder.lowercase = lowercase
    builder.remove_punctuation = remove_punctuation
    builder.preserve_patterns = preserve_patterns.dup

    # Copy strategy-specific settings
    builder.regex = regex if pattern?
    # The builder's accessor is grapheme_extended; it has no `extended=`.
    builder.grapheme_extended = extended if grapheme? && !extended.nil?
    if edge_ngram? || ngram?
      builder.min_gram = min_gram unless min_gram.nil?
      builder.max_gram = max_gram unless max_gram.nil?
    end
    builder.delimiter = delimiter if path_hierarchy? && !delimiter.nil?
    builder.split_on_chars = split_on_chars if char_group? && !split_on_chars.nil?

    builder
  end
end
194
+ end