tokenkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.yardopts +12 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +644 -0
- data/Rakefile +18 -0
- data/benchmarks/cache_test.rb +63 -0
- data/benchmarks/final_comparison.rb +83 -0
- data/benchmarks/tokenizer_benchmark.rb +250 -0
- data/docs/ARCHITECTURE.md +469 -0
- data/docs/PERFORMANCE.md +382 -0
- data/docs/README.md +118 -0
- data/ext/tokenkit/Cargo.toml +21 -0
- data/ext/tokenkit/extconf.rb +4 -0
- data/ext/tokenkit/src/config.rs +37 -0
- data/ext/tokenkit/src/error.rs +67 -0
- data/ext/tokenkit/src/lib.rs +346 -0
- data/ext/tokenkit/src/tokenizer/base.rs +41 -0
- data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
- data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
- data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
- data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
- data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
- data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
- data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
- data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
- data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
- data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
- data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
- data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
- data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
- data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
- data/lib/tokenkit/config.rb +74 -0
- data/lib/tokenkit/config_builder.rb +209 -0
- data/lib/tokenkit/config_compat.rb +52 -0
- data/lib/tokenkit/configuration.rb +194 -0
- data/lib/tokenkit/regex_converter.rb +58 -0
- data/lib/tokenkit/version.rb +5 -0
- data/lib/tokenkit.rb +336 -0
- data/sig/tokenkit.rbs +4 -0
- metadata +172 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
use super::{merge_overlapping_spans, post_process, Tokenizer};
|
|
2
|
+
use crate::config::TokenizerConfig;
|
|
3
|
+
use linkify::{LinkFinder, LinkKind};
|
|
4
|
+
use regex::Regex;
|
|
5
|
+
use unicode_segmentation::UnicodeSegmentation;
|
|
6
|
+
|
|
7
|
+
pub struct UrlEmailTokenizer {
|
|
8
|
+
config: TokenizerConfig,
|
|
9
|
+
preserve_patterns: Vec<Regex>,
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
impl UrlEmailTokenizer {
|
|
13
|
+
pub fn new(config: TokenizerConfig) -> Self {
|
|
14
|
+
let preserve_patterns = config
|
|
15
|
+
.preserve_patterns
|
|
16
|
+
.iter()
|
|
17
|
+
.filter_map(|p| Regex::new(p).ok())
|
|
18
|
+
.collect();
|
|
19
|
+
|
|
20
|
+
Self {
|
|
21
|
+
config,
|
|
22
|
+
preserve_patterns,
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
fn extract_url_email_spans(&self, text: &str) -> Vec<(usize, usize, String)> {
|
|
27
|
+
let finder = LinkFinder::new();
|
|
28
|
+
let mut spans = Vec::new();
|
|
29
|
+
|
|
30
|
+
for link in finder.links(text) {
|
|
31
|
+
match link.kind() {
|
|
32
|
+
LinkKind::Url | LinkKind::Email => {
|
|
33
|
+
let (start, end) = (link.start(), link.end());
|
|
34
|
+
spans.push((start, end, link.as_str().to_string()));
|
|
35
|
+
}
|
|
36
|
+
_ => {}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
spans
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
impl Tokenizer for UrlEmailTokenizer {
|
|
45
|
+
fn tokenize(&self, text: &str) -> Vec<String> {
|
|
46
|
+
let mut spans = self.extract_url_email_spans(text);
|
|
47
|
+
|
|
48
|
+
// Add preserve_pattern matches to spans
|
|
49
|
+
for pattern in &self.preserve_patterns {
|
|
50
|
+
for mat in pattern.find_iter(text) {
|
|
51
|
+
spans.push((mat.start(), mat.end(), mat.as_str().to_string()));
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// Merge overlapping spans to handle conflicts
|
|
56
|
+
let spans = if !spans.is_empty() {
|
|
57
|
+
merge_overlapping_spans(spans)
|
|
58
|
+
} else {
|
|
59
|
+
spans
|
|
60
|
+
};
|
|
61
|
+
|
|
62
|
+
if spans.is_empty() {
|
|
63
|
+
let tokens: Vec<String> = text
|
|
64
|
+
.unicode_words()
|
|
65
|
+
.map(|s| s.to_string())
|
|
66
|
+
.collect();
|
|
67
|
+
return post_process(tokens, &self.config);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
let mut result = Vec::new();
|
|
71
|
+
let mut pos = 0;
|
|
72
|
+
|
|
73
|
+
for (start, end, url_or_email) in spans {
|
|
74
|
+
if start > pos {
|
|
75
|
+
let before = &text[pos..start];
|
|
76
|
+
let before_tokens: Vec<String> = before
|
|
77
|
+
.unicode_words()
|
|
78
|
+
.map(|s| s.to_string())
|
|
79
|
+
.collect();
|
|
80
|
+
let before_tokens = post_process(before_tokens, &self.config);
|
|
81
|
+
result.extend(before_tokens);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Don't lowercase preserved patterns, but do lowercase URLs/emails if config says so
|
|
85
|
+
// unless they are from preserve_patterns
|
|
86
|
+
let preserved = if self.config.lowercase && !self.preserve_patterns.iter().any(|p| p.is_match(&url_or_email)) {
|
|
87
|
+
url_or_email.to_lowercase()
|
|
88
|
+
} else {
|
|
89
|
+
url_or_email
|
|
90
|
+
};
|
|
91
|
+
result.push(preserved);
|
|
92
|
+
pos = end;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if pos < text.len() {
|
|
96
|
+
let remaining = &text[pos..];
|
|
97
|
+
let remaining_tokens: Vec<String> = remaining
|
|
98
|
+
.unicode_words()
|
|
99
|
+
.map(|s| s.to_string())
|
|
100
|
+
.collect();
|
|
101
|
+
let remaining_tokens = post_process(remaining_tokens, &self.config);
|
|
102
|
+
result.extend(remaining_tokens);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
result
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
|
|
2
|
+
use crate::config::TokenizerConfig;
|
|
3
|
+
|
|
4
|
+
pub struct WhitespaceTokenizer {
|
|
5
|
+
base: BaseTokenizerFields,
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
impl WhitespaceTokenizer {
|
|
9
|
+
pub fn new(config: TokenizerConfig) -> Self {
|
|
10
|
+
Self {
|
|
11
|
+
base: BaseTokenizerFields::new(config),
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
impl Tokenizer for WhitespaceTokenizer {
|
|
17
|
+
fn tokenize(&self, text: &str) -> Vec<String> {
|
|
18
|
+
let tokens: Vec<String> = text
|
|
19
|
+
.split_whitespace()
|
|
20
|
+
.filter(|s| !s.is_empty())
|
|
21
|
+
.map(|s| s.to_string())
|
|
22
|
+
.collect();
|
|
23
|
+
|
|
24
|
+
if self.base.has_preserve_patterns() {
|
|
25
|
+
apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
|
|
26
|
+
} else {
|
|
27
|
+
post_process(tokens, &self.base.config)
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
module TokenKit
  # Mutable, process-wide tokenizer settings.
  #
  # Values are plain read/write attributes; #apply! turns them into the
  # string-keyed hash handed to TokenKit.configure.
  class Config
    attr_accessor :strategy, :regex, :grapheme_extended,
                  :min_gram, :max_gram,
                  :delimiter, :split_on_chars,
                  :lowercase, :remove_punctuation, :preserve_patterns

    class << self
      # Returns the shared Config, creating it on first use.
      def instance
        @instance ||= new
      end
    end

    # Sets every option to its default; +regex+ stays unset.
    def initialize
      @strategy = :unicode
      @lowercase = true
      @remove_punctuation = false
      @preserve_patterns = []
      @grapheme_extended = true
      @min_gram = 2
      @max_gram = 10
      @delimiter = "/"
      @split_on_chars = " \t\n\r"
    end

    # Sends the current settings to TokenKit.configure.
    #
    # The hash always carries the four common keys; the option(s) belonging
    # to the active strategy are appended after them.
    def apply!
      settings = {
        "strategy" => strategy.to_s,
        "lowercase" => lowercase,
        "remove_punctuation" => remove_punctuation,
        "preserve_patterns" => preserve_patterns.map { |entry| pattern_to_string(entry) }
      }

      case strategy
      when :pattern
        settings["regex"] = regex if regex
      when :grapheme
        settings["extended"] = grapheme_extended
      when :edge_ngram, :ngram
        settings["min_gram"] = min_gram
        settings["max_gram"] = max_gram
      when :path_hierarchy
        settings["delimiter"] = delimiter
      when :char_group
        settings["split_on_chars"] = split_on_chars
      end

      TokenKit.configure(settings)
    end

    # Returns the settings as a symbol-keyed hash, leaving out nil values.
    def to_h
      names = %i[
        strategy regex grapheme_extended min_gram max_gram
        delimiter split_on_chars lowercase remove_punctuation preserve_patterns
      ]

      names.each_with_object({}) do |name, result|
        value = public_send(name)
        result[name] = value unless value.nil?
      end
    end

    private

    # A Regexp contributes its source text; anything else is stringified.
    def pattern_to_string(pattern)
      return pattern.source if pattern.is_a?(Regexp)

      pattern.to_s
    end
  end
end
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'regex_converter'
|
|
4
|
+
|
|
5
|
+
module TokenKit
|
|
6
|
+
# Builder for creating immutable Configuration objects.
#
# Settings are plain read/write attributes. #build validates them and
# returns a Configuration carrying the string-keyed hash form.
class ConfigBuilder
  attr_accessor :strategy, :lowercase, :remove_punctuation, :preserve_patterns
  attr_accessor :regex, :grapheme_extended, :min_gram, :max_gram
  attr_accessor :delimiter, :split_on_chars

  # Default values
  DEFAULTS = {
    strategy: :unicode,
    lowercase: true,
    remove_punctuation: false,
    preserve_patterns: [],
    grapheme_extended: true,
    min_gram: 2,
    max_gram: 10,
    delimiter: "/",
    split_on_chars: " \t\n\r"
  }.freeze

  VALID_STRATEGIES = [
    :unicode, :whitespace, :pattern, :sentence, :grapheme, :keyword,
    :edge_ngram, :ngram, :path_hierarchy, :url_email, :char_group,
    :letter, :lowercase
  ].freeze

  # @param base_config [#strategy, #lowercase, #remove_punctuation, #preserve_patterns, nil]
  #   existing configuration to copy from; when nil the builder starts from DEFAULTS
  def initialize(base_config = nil)
    if base_config
      # Copy from existing config
      @strategy = base_config.strategy
      @lowercase = base_config.lowercase
      @remove_punctuation = base_config.remove_punctuation
      @preserve_patterns = base_config.preserve_patterns.dup
      @regex = base_config.instance_variable_get(:@regex) if base_config.instance_variable_defined?(:@regex)
      # Fall back to the default only when the value is missing (nil).
      # `value || default` would turn an explicit `false` back into `true`.
      @grapheme_extended = inherited_setting(base_config, :grapheme_extended)
      @min_gram = inherited_setting(base_config, :min_gram)
      @max_gram = inherited_setting(base_config, :max_gram)
      @delimiter = inherited_setting(base_config, :delimiter)
      @split_on_chars = inherited_setting(base_config, :split_on_chars)
    else
      # Start with defaults; arrays are duplicated so DEFAULTS is never shared
      DEFAULTS.each do |key, value|
        instance_variable_set("@#{key}", value.is_a?(Array) ? value.dup : value)
      end
    end
  end

  # Build an immutable Configuration object
  # @return [Configuration] The built configuration
  # @raise [Error] if configuration is invalid
  def build
    validate!

    config_hash = build_config_hash
    Configuration.new(config_hash, self)
  end

  private

  # Reads the instance variable +name+ from +base_config+ and substitutes
  # the entry from DEFAULTS only when that variable is unset or nil.
  def inherited_setting(base_config, name)
    value = base_config.instance_variable_get(:"@#{name}")
    value.nil? ? DEFAULTS[name] : value
  end

  # Raises Error when the strategy is unknown or a strategy-specific
  # option is missing or out of range.
  def validate!
    # Validate strategy
    unless VALID_STRATEGIES.include?(@strategy)
      raise Error, "Invalid strategy: #{@strategy}. Valid strategies are: #{VALID_STRATEGIES.join(', ')}"
    end

    # Strategy-specific validations
    case @strategy
    when :pattern
      raise Error, "Pattern strategy requires a regex" unless @regex
      if @regex.is_a?(String)
        RegexConverter.validate!(@regex)
      end
    when :edge_ngram, :ngram
      raise Error, "min_gram must be positive, got #{@min_gram}" if @min_gram < 1
      raise Error, "max_gram (#{@max_gram}) must be >= min_gram (#{@min_gram})" if @max_gram < @min_gram
    when :path_hierarchy
      raise Error, "Path hierarchy requires a delimiter" if @delimiter.nil? || @delimiter.empty?
    when :lowercase
      # Warn if lowercase: false with :lowercase strategy
      if !@lowercase
        warn "Warning: The :lowercase strategy always lowercases text. The 'lowercase: false' setting will be ignored."
      end
    end
  end

  # Produces the string-keyed hash form: four common keys followed by the
  # option(s) belonging to the selected strategy.
  def build_config_hash
    config = {
      "strategy" => @strategy.to_s,
      "lowercase" => @lowercase,
      "remove_punctuation" => @remove_punctuation,
      "preserve_patterns" => RegexConverter.patterns_to_rust(@preserve_patterns)
    }

    # Add strategy-specific parameters
    case @strategy
    when :pattern
      config["regex"] = @regex.is_a?(Regexp) ? RegexConverter.to_rust(@regex) : @regex.to_s
    when :grapheme
      config["extended"] = @grapheme_extended
    when :edge_ngram, :ngram
      config["min_gram"] = @min_gram
      config["max_gram"] = @max_gram
    when :path_hierarchy
      config["delimiter"] = @delimiter
    when :char_group
      config["split_on_chars"] = @split_on_chars
    end

    config
  end
end
|
|
116
|
+
|
|
117
|
+
# Immutable configuration object.
#
# Holds a frozen copy of the string-keyed settings hash plus the original
# Ruby-side values needed to create a new builder later.
class Configuration
  attr_reader :strategy, :lowercase, :remove_punctuation, :preserve_patterns
  attr_reader :regex, :grapheme_extended, :min_gram, :max_gram, :delimiter, :split_on_chars

  # @param config_hash [Hash] string-keyed settings
  # @param builder [ConfigBuilder, nil] builder that produced +config_hash+;
  #   when given, strategy options are taken from it instead of the hash
  def initialize(config_hash, builder = nil)
    @strategy = config_hash["strategy"]&.to_sym || :unicode
    @lowercase = config_hash.fetch("lowercase", true)
    @remove_punctuation = config_hash.fetch("remove_punctuation", false)
    # Freeze a private copy so the caller's hash is left untouched.
    @raw_hash = config_hash.dup.freeze

    # Store builder data for creating new builders from this config
    if builder
      # Store original Ruby patterns, not the converted strings.
      # Duplicate before freezing: freezing the builder's own array would
      # make the builder unusable for further changes after #build.
      @preserve_patterns = builder.preserve_patterns.dup.freeze
      @regex = builder.regex
      @grapheme_extended = builder.grapheme_extended
      @min_gram = builder.min_gram
      @max_gram = builder.max_gram
      @delimiter = builder.delimiter
      @split_on_chars = builder.split_on_chars
    else
      # Extract from raw_hash for backward compatibility
      @preserve_patterns = config_hash.fetch("preserve_patterns", []).dup.freeze
      @regex = config_hash["regex"]
      @grapheme_extended = config_hash.fetch("extended", ConfigBuilder::DEFAULTS[:grapheme_extended])
      @min_gram = config_hash.fetch("min_gram", ConfigBuilder::DEFAULTS[:min_gram])
      @max_gram = config_hash.fetch("max_gram", ConfigBuilder::DEFAULTS[:max_gram])
      @delimiter = config_hash.fetch("delimiter", ConfigBuilder::DEFAULTS[:delimiter])
      @split_on_chars = config_hash.fetch("split_on_chars", ConfigBuilder::DEFAULTS[:split_on_chars])
    end
  end

  # Create a new builder initialized with this configuration
  def to_builder
    ConfigBuilder.new(self)
  end

  # Strategy-specific accessors
  def pattern?
    strategy == :pattern
  end

  def grapheme?
    strategy == :grapheme
  end

  # Alias-style reader for +grapheme_extended+.
  def extended
    @grapheme_extended
  end

  def edge_ngram?
    strategy == :edge_ngram
  end

  def ngram?
    strategy == :ngram
  end

  def path_hierarchy?
    strategy == :path_hierarchy
  end

  def char_group?
    strategy == :char_group
  end

  def letter?
    strategy == :letter
  end

  # True when the *strategy* is :lowercase (not the +lowercase+ flag).
  def lowercase?
    strategy == :lowercase
  end

  # @return [Hash] unfrozen copy of the string-keyed settings
  def to_h
    @raw_hash.dup
  end

  # @return [Hash] the frozen string-keyed settings themselves
  def to_rust_config
    @raw_hash
  end

  def inspect
    "#<TokenKit::Configuration strategy=#{strategy} lowercase=#{lowercase} remove_punctuation=#{remove_punctuation}>"
  end

  # Check equality with another configuration (compares the settings hashes)
  def ==(other)
    other.is_a?(Configuration) && to_h == other.to_h
  end
end
|
|
209
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module TokenKit
  # Compatibility wrapper that mimics the old Config singleton API.
  # This allows us to migrate gradually.
  #
  # Setters are collected in a temporary builder obtained from
  # TokenKit.config_hash.to_builder; getters read from that builder when one
  # exists and from TokenKit.config_hash otherwise.
  class Config
    # Singleton pattern for backward compatibility
    def self.instance
      @instance ||= new
    end

    # Delegates accessors to the temporary builder / current configuration.
    #
    # Names that neither target understands fall through to +super+ and
    # raise NoMethodError, as an ordinary accessor-based object would,
    # instead of silently returning nil.
    def method_missing(method, *args, &block)
      if method.to_s.end_with?('=')
        # Setter - store in temporary builder
        @temp_builder ||= TokenKit.config_hash.to_builder
        return super unless @temp_builder.respond_to?(method)

        @temp_builder.send(method, *args, &block)
      elsif @temp_builder && @temp_builder.respond_to?(method)
        # Getter - pending (not yet built) value wins
        @temp_builder.send(method)
      elsif TokenKit.config_hash.respond_to?(method)
        # Getter - fall back to the current configuration
        TokenKit.config_hash.send(method)
      else
        super
      end
    end

    # Reports the known setting readers and writers as supported.
    def respond_to_missing?(method, include_private = false)
      # Avoid infinite recursion by checking a fixed list instead of the config
      return true if %i[
        strategy= lowercase= remove_punctuation= preserve_patterns=
        regex= grapheme_extended= min_gram= max_gram=
        delimiter= split_on_chars=
        strategy lowercase remove_punctuation preserve_patterns
        regex grapheme_extended min_gram max_gram
        delimiter split_on_chars
      ].include?(method)

      super
    end

    # Called by TokenKit.configure to get the built config.
    # Returns the pending builder (or a fresh one) and clears the pending state.
    def build_config
      builder = @temp_builder || TokenKit.config_hash.to_builder
      @temp_builder = nil # Clear after building
      builder
    end

    # Reset temporary builder
    def reset_temp
      @temp_builder = nil
    end
  end
end
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
module TokenKit
  # Configuration object representing tokenizer settings.
  #
  # This class provides read-only access to configuration values and
  # convenient predicate methods for checking the current strategy.
  #
  # @example Access configuration
  #   config = TokenKit.config_hash
  #   config.strategy          # => :unicode
  #   config.lowercase         # => true
  #   config.preserve_patterns # => [/\d+mg/i]
  #
  # @example Check strategy type
  #   config.unicode?          # => true
  #   config.edge_ngram?       # => false
  #
  class Configuration
    # @return [Symbol] The tokenization strategy
    attr_reader :strategy

    # @return [Boolean] Whether to lowercase tokens
    attr_reader :lowercase

    # @return [Boolean] Whether to remove punctuation
    attr_reader :remove_punctuation

    # @return [Array<Regexp>] Patterns to preserve from modification
    attr_reader :preserve_patterns

    # Creates a new configuration from a hash.
    #
    # @param config_hash [Hash] Configuration values from Rust
    # @api private
    #
    def initialize(config_hash)
      @strategy = config_hash["strategy"]&.to_sym || :unicode
      @lowercase = config_hash.fetch("lowercase", true)
      @remove_punctuation = config_hash.fetch("remove_punctuation", false)
      @preserve_patterns = config_hash.fetch("preserve_patterns", []).freeze
      @raw_hash = config_hash
    end

    # @return [Boolean] true if using pattern tokenization strategy
    def pattern?
      strategy == :pattern
    end

    # @return [String, nil] The regex pattern for pattern strategy
    def regex
      @raw_hash["regex"]
    end

    # @return [Boolean] true if using grapheme tokenization strategy
    def grapheme?
      strategy == :grapheme
    end

    # @return [Boolean, nil] Whether to use extended grapheme clusters
    def extended
      @raw_hash["extended"]
    end

    # @return [Boolean] true if using edge n-gram tokenization strategy
    def edge_ngram?
      strategy == :edge_ngram
    end

    # @return [Integer, nil] Minimum n-gram size for n-gram strategies
    def min_gram
      @raw_hash["min_gram"]
    end

    # @return [Integer, nil] Maximum n-gram size for n-gram strategies
    def max_gram
      @raw_hash["max_gram"]
    end

    # @return [Boolean] true if using path hierarchy tokenization strategy
    def path_hierarchy?
      strategy == :path_hierarchy
    end

    # @return [String, nil] Delimiter for path hierarchy strategy
    def delimiter
      @raw_hash["delimiter"]
    end

    # @return [Boolean] true if using n-gram tokenization strategy
    def ngram?
      strategy == :ngram
    end

    # @return [Boolean] true if using character group tokenization strategy
    def char_group?
      strategy == :char_group
    end

    # @return [String, nil] Characters to split on for char_group strategy
    def split_on_chars
      @raw_hash["split_on_chars"]
    end

    # @return [Boolean] true if using letter tokenization strategy
    def letter?
      strategy == :letter
    end

    # @return [Boolean] true if using lowercase tokenization strategy
    def lowercase?
      strategy == :lowercase
    end

    # @return [Boolean] true if using unicode tokenization strategy
    def unicode?
      strategy == :unicode
    end

    # @return [Boolean] true if using whitespace tokenization strategy
    def whitespace?
      strategy == :whitespace
    end

    # @return [Boolean] true if using sentence tokenization strategy
    def sentence?
      strategy == :sentence
    end

    # @return [Boolean] true if using keyword tokenization strategy
    def keyword?
      strategy == :keyword
    end

    # @return [Boolean] true if using url_email tokenization strategy
    def url_email?
      strategy == :url_email
    end

    # Converts configuration to a hash.
    #
    # @return [Hash] Configuration as a hash
    #
    # @example
    #   config.to_h
    #   # => {"strategy" => "unicode", "lowercase" => true, ...}
    #
    def to_h
      @raw_hash.dup
    end

    # Returns a string representation of the configuration.
    #
    # @return [String] Human-readable configuration summary
    #
    def inspect
      "#<TokenKit::Configuration strategy=#{strategy} lowercase=#{lowercase} remove_punctuation=#{remove_punctuation}>"
    end

    # Converts configuration to format expected by Rust.
    #
    # @return [Hash] Configuration hash for Rust FFI
    # @api private
    #
    def to_rust_config
      @raw_hash
    end

    # Creates a ConfigBuilder from this configuration for modification.
    #
    # Strategy-specific values are copied only for the active strategy and
    # only when the hash actually carries them, so a missing key leaves the
    # builder's own default in place.
    #
    # @return [ConfigBuilder] A builder initialized with this configuration
    #
    # @example
    #   builder = config.to_builder
    #   builder.lowercase = false
    #   new_config = builder.build
    #
    def to_builder
      builder = ConfigBuilder.new
      builder.strategy = strategy
      builder.lowercase = lowercase
      builder.remove_punctuation = remove_punctuation
      builder.preserve_patterns = preserve_patterns.dup

      # Copy strategy-specific settings
      builder.regex = regex if pattern?
      # The builder attribute is `grapheme_extended`; it has no `extended=` writer.
      builder.grapheme_extended = extended if grapheme? && !extended.nil?
      if edge_ngram? || ngram?
        builder.min_gram = min_gram unless min_gram.nil?
        builder.max_gram = max_gram unless max_gram.nil?
      end
      builder.delimiter = delimiter if path_hierarchy? && !delimiter.nil?
      builder.split_on_chars = split_on_chars if char_group? && !split_on_chars.nil?

      builder
    end
  end
end
|