tokenkit 0.1.0.pre.2-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.standard.yml +3 -0
  4. data/.yardopts +12 -0
  5. data/CODE_OF_CONDUCT.md +132 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +644 -0
  8. data/Rakefile +37 -0
  9. data/benchmarks/cache_test.rb +63 -0
  10. data/benchmarks/final_comparison.rb +83 -0
  11. data/benchmarks/tokenizer_benchmark.rb +250 -0
  12. data/docs/ARCHITECTURE.md +469 -0
  13. data/docs/PERFORMANCE.md +382 -0
  14. data/docs/README.md +118 -0
  15. data/docs/assets/tokenkit-wide.png +0 -0
  16. data/docs/assets/tokenkit.png +0 -0
  17. data/ext/tokenkit/Cargo.toml +21 -0
  18. data/ext/tokenkit/extconf.rb +4 -0
  19. data/ext/tokenkit/src/config.rs +37 -0
  20. data/ext/tokenkit/src/error.rs +67 -0
  21. data/ext/tokenkit/src/lib.rs +346 -0
  22. data/ext/tokenkit/src/tokenizer/base.rs +41 -0
  23. data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
  24. data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
  25. data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
  26. data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
  27. data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
  28. data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
  29. data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
  30. data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
  31. data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
  32. data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
  33. data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
  34. data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
  35. data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
  36. data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
  37. data/lib/tokenkit/3.1/tokenkit.bundle +0 -0
  38. data/lib/tokenkit/3.2/tokenkit.bundle +0 -0
  39. data/lib/tokenkit/3.3/tokenkit.bundle +0 -0
  40. data/lib/tokenkit/3.4/tokenkit.bundle +0 -0
  41. data/lib/tokenkit/config.rb +74 -0
  42. data/lib/tokenkit/config_builder.rb +209 -0
  43. data/lib/tokenkit/config_compat.rb +52 -0
  44. data/lib/tokenkit/configuration.rb +194 -0
  45. data/lib/tokenkit/regex_converter.rb +58 -0
  46. data/lib/tokenkit/version.rb +5 -0
  47. data/lib/tokenkit.rb +342 -0
  48. data/sig/tokenkit.rbs +4 -0
  49. metadata +175 -0
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module TokenKit
  # Converts Ruby Regexp objects to Rust-compatible regex strings.
  #
  # Only the option flags are translated (as a leading inline "(?flags)" group);
  # the pattern source itself is passed through unchanged.
  module RegexConverter
    extend self

    # Convert a Ruby Regexp to Rust regex syntax.
    #
    # Non-Regexp input is returned via +to_s+ without any flag handling.
    #
    # @param pattern [Regexp, String] The pattern to convert
    # @return [String] Rust-compatible regex string
    def to_rust(pattern)
      return pattern.to_s unless pattern.is_a?(Regexp)

      flags = extract_flags(pattern)
      return pattern.source if flags.empty?

      "(?#{flags})#{pattern.source}"
    end

    # Convert an array of patterns to Rust regex strings.
    #
    # @param patterns [Array<Regexp, String>, nil] The patterns to convert
    # @return [Array<String>] Rust-compatible regex strings (empty when +patterns+ is nil/false)
    def patterns_to_rust(patterns)
      return [] unless patterns

      patterns.map { |p| to_rust(p) }
    end

    # Validate a regex pattern by compiling it with Ruby's engine.
    #
    # Only Ruby syntax is checked here; nothing in this method compiles the
    # pattern with the Rust engine.
    #
    # @param pattern [String] The regex pattern to validate
    # @return [Boolean] true if valid
    # @raise [Error] if Ruby rejects the pattern
    def validate!(pattern)
      Regexp.new(pattern)
      true
    rescue RegexpError => e
      raise Error, "Invalid regex pattern '#{pattern}': #{e.message}"
    end

    private

    # Extract flags from a Ruby Regexp as Rust inline-flag letters.
    #
    # Ruby's /m option makes "." match newlines, which is Rust's "s" flag;
    # Rust's "m" flag only makes ^/$ match at line boundaries. MULTILINE is
    # therefore emitted as "ms": "m" is kept (Ruby's ^/$ are line anchors
    # anyway, and earlier output already carried it) and "s" is added so the
    # dot behaves as it does in Ruby.
    #
    # @param regexp [Regexp] The regexp to extract flags from
    # @return [String] Rust-compatible flag string, in the order i, ms, x
    def extract_flags(regexp)
      options = regexp.options
      letters = []
      letters << "i" if (options & Regexp::IGNORECASE) != 0
      letters << "ms" if (options & Regexp::MULTILINE) != 0
      letters << "x" if (options & Regexp::EXTENDED) != 0
      letters.join
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module TokenKit
4
+ VERSION = "0.1.0.pre.2" # Version string of the tokenkit gem
5
+ end
data/lib/tokenkit.rb ADDED
@@ -0,0 +1,342 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "tokenkit/version"
4
+ require_relative "tokenkit/regex_converter"
5
+ require_relative "tokenkit/config_builder"
6
+ require_relative "tokenkit/config_compat"
7
+
8
+ # Load the compiled Rust extension. Precompiled (platform) gems install it into a
9
+ # Ruby-ABI-versioned subdir (lib/tokenkit/<major.minor>/tokenkit.{so,bundle}) so a single
10
+ # fat gem can carry a binary per Ruby version; source/dev builds place it flat at
11
+ # lib/tokenkit/tokenkit.{so,bundle}. Try the versioned path first, fall back to the flat
12
+ # one. Resolution goes through $LOAD_PATH (`require`, never `require_relative`) because
13
+ # RubyGems installs native extensions outside the gem's lib/ dir.
14
begin
  # "<major>.<minor>" of the running interpreter, e.g. "3.3" for Ruby 3.3.x.
  ruby_abi = RUBY_VERSION[/(\d+\.\d+)/, 1]
  # Preferred location: per-Ruby-version subdirectory.
  require "tokenkit/#{ruby_abi}/tokenkit"
rescue LoadError
  # Fallback location: extension placed directly under tokenkit/.
  require "tokenkit/tokenkit"
end
20
+
21
+ # TokenKit provides fast, Rust-backed tokenization for Ruby with pattern preservation.
22
+ #
23
+ # @example Basic usage
24
+ # TokenKit.tokenize("Hello, world!")
25
+ # # => ["hello", "world"]
26
+ #
27
+ # @example Configuration
28
+ # TokenKit.configure do |config|
29
+ # config.strategy = :unicode
30
+ # config.lowercase = true
31
+ # config.preserve_patterns = [/\d+mg/i]
32
+ # end
33
+ #
34
+ # @example Instance-based tokenization
35
+ # tokenizer = TokenKit::Tokenizer.new(strategy: :unicode)
36
+ # tokenizer.tokenize("test text")
37
+ #
38
module TokenKit
  # Base error class for TokenKit exceptions
  class Error < StandardError; end

  # Instance-based tokenizer for thread-safe tokenization with specific configuration.
  #
  # @example Create a tokenizer with custom config
  #   tokenizer = TokenKit::Tokenizer.new(
  #     strategy: :unicode,
  #     lowercase: true,
  #     preserve_patterns: [/\d+mg/i]
  #   )
  #   tokenizer.tokenize("Patient received 100mg")
  #   # => ["patient", "received", "100mg"]
  #
  class Tokenizer
    # @return [Configuration] The tokenizer's configuration
    attr_reader :config

    # Creates a new tokenizer instance with the specified configuration.
    #
    # A Configuration is used as-is, a ConfigBuilder is built, and a Hash is
    # layered on top of the current global configuration. Any other argument
    # falls back to the current global configuration.
    #
    # @param config [Hash, Configuration, ConfigBuilder] The configuration for this tokenizer
    # @option config [Symbol] :strategy (:unicode) The tokenization strategy
    # @option config [Boolean] :lowercase (true) Whether to lowercase tokens
    # @option config [Boolean] :remove_punctuation (false) Whether to remove punctuation
    # @option config [Array<Regexp>] :preserve_patterns ([]) Patterns to preserve
    #
    # @example With hash configuration
    #   tokenizer = TokenKit::Tokenizer.new(strategy: :whitespace)
    #
    # @example With existing configuration
    #   config = TokenKit.config_hash
    #   tokenizer = TokenKit::Tokenizer.new(config)
    #
    def initialize(config = {})
      @config =
        case config
        when Configuration then config
        when ConfigBuilder then config.build
        when Hash then build_from_overrides(config)
        else TokenKit.config_hash
        end
    end

    # Tokenizes the given text using this tokenizer's configuration.
    #
    # @param text [String] The text to tokenize
    # @return [Array<String>] An array of tokens
    #
    # @example
    #   tokenizer = TokenKit::Tokenizer.new(strategy: :unicode)
    #   tokenizer.tokenize("Hello world")
    #   # => ["hello", "world"]
    #
    def tokenize(text)
      TokenKit._tokenize_with_config(text, @config.to_rust_config)
    end

    private

    # Starts from the global configuration and applies each hash entry through
    # the builder's matching public "<key>=" setter. Keys without such a setter
    # are skipped silently.
    def build_from_overrides(overrides)
      builder = TokenKit.config_hash.to_builder
      overrides.each do |key, value|
        setter = "#{key}="
        builder.public_send(setter, value) if builder.respond_to?(setter)
      end
      builder.build
    end
  end

  extend self

  # Per-call option name => builder setter. :preserve and :extended are
  # accepted as shorthand for :preserve_patterns and :grapheme_extended.
  OVERRIDE_SETTERS = {
    strategy: :strategy=,
    lowercase: :lowercase=,
    remove_punctuation: :remove_punctuation=,
    preserve: :preserve_patterns=,
    preserve_patterns: :preserve_patterns=,
    regex: :regex=,
    extended: :grapheme_extended=,
    grapheme_extended: :grapheme_extended=,
    min_gram: :min_gram=,
    max_gram: :max_gram=,
    delimiter: :delimiter=,
    split_on_chars: :split_on_chars=
  }.freeze
  private_constant :OVERRIDE_SETTERS

  # Current global configuration; reads and writes go through @config_mutex.
  @current_config = nil
  @config_mutex = Mutex.new

  # Tokenizes text using the global configuration or with temporary overrides.
  #
  # @param text [String] The text to tokenize
  # @param opts [Hash] Optional configuration overrides for this tokenization only
  # @option opts [Symbol] :strategy The tokenization strategy to use
  # @option opts [Boolean] :lowercase Whether to lowercase tokens
  # @option opts [Boolean] :remove_punctuation Whether to remove punctuation
  # @option opts [Array<Regexp>] :preserve_patterns Patterns to preserve
  # @option opts [String, Regexp] :regex Pattern for :pattern strategy
  # @option opts [Integer] :min_gram Minimum n-gram size (for n-gram strategies)
  # @option opts [Integer] :max_gram Maximum n-gram size (for n-gram strategies)
  # @option opts [String] :delimiter Delimiter for :path_hierarchy strategy
  # @option opts [String] :split_on_chars Characters to split on for :char_group strategy
  # @option opts [Boolean] :extended Extended grapheme clusters for :grapheme strategy
  #
  # @return [Array<String>] An array of tokens
  #
  # @example Basic tokenization
  #   TokenKit.tokenize("Hello, world!")
  #   # => ["hello", "world"]
  #
  # @example With temporary overrides
  #   TokenKit.tokenize("Hello World", lowercase: false)
  #   # => ["Hello", "World"]
  #
  # @example With strategy override
  #   TokenKit.tokenize("test-case", strategy: :char_group, split_on_chars: "-")
  #   # => ["test", "case"]
  #
  def tokenize(text, **opts)
    # No overrides: hand the text straight to the globally configured tokenizer.
    return _tokenize(text) unless opts.any?

    # Overrides: merge them over the global configuration for this call only.
    _tokenize_with_config(text, build_merged_config(opts))
  end

  # Returns the global configuration object for backward compatibility.
  #
  # @deprecated Use {#config_hash} for read-only access or {#configure} to modify
  # @return [Config] The global configuration singleton
  #
  # @example
  #   TokenKit.config.strategy = :unicode  # Deprecated
  #
  def config
    Config.instance
  end

  # Returns the current global configuration as an immutable object.
  #
  # The default configuration is built on first access.
  #
  # @return [Configuration] The current configuration with accessor methods
  #
  # @example Get current configuration
  #   config = TokenKit.config_hash
  #   config.strategy           # => :unicode
  #   config.lowercase          # => true
  #   config.preserve_patterns  # => []
  #
  # @example Check strategy type
  #   config = TokenKit.config_hash
  #   config.unicode?     # => true
  #   config.edge_ngram?  # => false
  #
  def config_hash
    @config_mutex.synchronize do
      @current_config ||= ConfigBuilder.new.build
    end
  end

  # Configures the global tokenizer settings.
  #
  # @yield [Config] Yields the configuration object for modification
  # @return [Configuration] The new configuration
  #
  # @raise [ArgumentError] If invalid configuration is provided
  # @raise [RegexpError] If invalid regex pattern is provided
  #
  # @example Basic configuration
  #   TokenKit.configure do |config|
  #     config.strategy = :unicode
  #     config.lowercase = true
  #   end
  #
  # @example With pattern preservation
  #   TokenKit.configure do |config|
  #     config.strategy = :unicode
  #     config.preserve_patterns = [
  #       /\d+mg/i,           # Measurements
  #       /[A-Z]{2,}/,        # Acronyms
  #       /\w+@\w+\.\w+/      # Emails
  #     ]
  #   end
  #
  # @example Edge n-gram configuration
  #   TokenKit.configure do |config|
  #     config.strategy = :edge_ngram
  #     config.min_gram = 2
  #     config.max_gram = 10
  #   end
  #
  def configure
    compat = Config.instance

    begin
      # Use the compatibility wrapper to support the old block-style API.
      yield compat if block_given?

      # Build and validate the new configuration from the wrapper's builder.
      new_config = compat.build_config.build

      # Apply to the Rust tokenizer before publishing it on the Ruby side.
      _configure(new_config.to_rust_config)

      @config_mutex.synchronize do
        @current_config = new_config
      end

      new_config
    ensure
      # Reset the compatibility wrapper on every exit path. This includes a
      # caller block that raises, which previously skipped the reset.
      # NOTE(review): assumes reset_temp discards the wrapper's pending
      # changes; confirm against Config.
      compat.reset_temp
    end
  end

  # Resets the tokenizer to default configuration.
  #
  # @return [void]
  #
  # @example
  #   TokenKit.reset
  #   # Configuration is now:
  #   # - strategy: :unicode
  #   # - lowercase: true
  #   # - remove_punctuation: false
  #   # - preserve_patterns: []
  #
  def reset
    # Create default configuration
    new_config = ConfigBuilder.new.build

    # Reset Rust tokenizer, then load the defaults into it
    _reset
    _configure(new_config.to_rust_config)

    # Store the new configuration
    @config_mutex.synchronize do
      @current_config = new_config
    end

    # Reset the compatibility wrapper
    compat = Config.instance
    compat.reset_temp

    # Reset Config singleton instance variables for backward compatibility
    compat.instance_variable_set(:@strategy, :unicode)
    compat.instance_variable_set(:@lowercase, true)
    compat.instance_variable_set(:@remove_punctuation, false)
    compat.instance_variable_set(:@preserve_patterns, [])
    compat.instance_variable_set(:@grapheme_extended, true)
    compat.instance_variable_set(:@min_gram, 2)
    compat.instance_variable_set(:@max_gram, 10)
    compat.instance_variable_set(:@delimiter, "/")
    compat.instance_variable_set(:@split_on_chars, " \t\n\r")
  end

  private

  # Merges per-call options over the global configuration and returns the
  # result in the form passed to _tokenize_with_config. Option names missing
  # from OVERRIDE_SETTERS are ignored.
  def build_merged_config(opts)
    builder = config_hash.to_builder

    opts.each do |key, value|
      setter = OVERRIDE_SETTERS[key]
      next unless setter

      # A single pattern is accepted and wrapped into a one-element list.
      value = Array(value) if %i[preserve preserve_patterns].include?(key)
      builder.public_send(setter, value)
    end

    builder.build.to_rust_config
  end

  # Fallback stubs for the native entry points: each raises when reached.
  # NOTE(review): the native extension is expected to supply the real
  # implementations; confirm how those take precedence over these definitions.

  def _tokenize(text)
    raise NotImplementedError, "Native extension not loaded"
  end

  def _tokenize_with_config(text, config_hash)
    raise NotImplementedError, "Native extension not loaded"
  end

  def _configure(hash)
    raise NotImplementedError, "Native extension not loaded"
  end

  def _reset
    raise NotImplementedError, "Native extension not loaded"
  end

  def _config_hash
    raise NotImplementedError, "Native extension not loaded"
  end

  def _load_config(hash)
    raise NotImplementedError, "Native extension not loaded"
  end
end
data/sig/tokenkit.rbs ADDED
@@ -0,0 +1,4 @@
1
module TokenKit
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
end
metadata ADDED
@@ -0,0 +1,175 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tokenkit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.pre.2
5
+ platform: arm64-darwin
6
+ authors:
7
+ - Chris Petersen
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rb_sys
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '0.9'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '0.9'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rake
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '13.0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '13.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: rake-compiler
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.2'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '1.2'
54
+ - !ruby/object:Gem::Dependency
55
+ name: rspec
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '3.0'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: standard
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '1.3'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '1.3'
82
+ - !ruby/object:Gem::Dependency
83
+ name: simplecov
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.22'
89
+ type: :development
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '0.22'
96
+ description: TokenKit provides lightweight, Unicode-aware word-level tokenization
97
+ with pattern preservation, backed by Rust for performance.
98
+ email:
99
+ - chris@petersen.io
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".rspec"
105
+ - ".standard.yml"
106
+ - ".yardopts"
107
+ - CODE_OF_CONDUCT.md
108
+ - LICENSE.txt
109
+ - README.md
110
+ - Rakefile
111
+ - benchmarks/cache_test.rb
112
+ - benchmarks/final_comparison.rb
113
+ - benchmarks/tokenizer_benchmark.rb
114
+ - docs/ARCHITECTURE.md
115
+ - docs/PERFORMANCE.md
116
+ - docs/README.md
117
+ - docs/assets/tokenkit-wide.png
118
+ - docs/assets/tokenkit.png
119
+ - ext/tokenkit/Cargo.toml
120
+ - ext/tokenkit/extconf.rb
121
+ - ext/tokenkit/src/config.rs
122
+ - ext/tokenkit/src/error.rs
123
+ - ext/tokenkit/src/lib.rs
124
+ - ext/tokenkit/src/tokenizer/base.rs
125
+ - ext/tokenkit/src/tokenizer/char_group.rs
126
+ - ext/tokenkit/src/tokenizer/edge_ngram.rs
127
+ - ext/tokenkit/src/tokenizer/grapheme.rs
128
+ - ext/tokenkit/src/tokenizer/keyword.rs
129
+ - ext/tokenkit/src/tokenizer/letter.rs
130
+ - ext/tokenkit/src/tokenizer/lowercase.rs
131
+ - ext/tokenkit/src/tokenizer/mod.rs
132
+ - ext/tokenkit/src/tokenizer/ngram.rs
133
+ - ext/tokenkit/src/tokenizer/path_hierarchy.rs
134
+ - ext/tokenkit/src/tokenizer/pattern.rs
135
+ - ext/tokenkit/src/tokenizer/sentence.rs
136
+ - ext/tokenkit/src/tokenizer/unicode.rs
137
+ - ext/tokenkit/src/tokenizer/url_email.rs
138
+ - ext/tokenkit/src/tokenizer/whitespace.rs
139
+ - lib/tokenkit.rb
140
+ - lib/tokenkit/3.1/tokenkit.bundle
141
+ - lib/tokenkit/3.2/tokenkit.bundle
142
+ - lib/tokenkit/3.3/tokenkit.bundle
143
+ - lib/tokenkit/3.4/tokenkit.bundle
144
+ - lib/tokenkit/config.rb
145
+ - lib/tokenkit/config_builder.rb
146
+ - lib/tokenkit/config_compat.rb
147
+ - lib/tokenkit/configuration.rb
148
+ - lib/tokenkit/regex_converter.rb
149
+ - lib/tokenkit/version.rb
150
+ - sig/tokenkit.rbs
151
+ homepage: https://github.com/scientist-labs/tokenkit
152
+ licenses:
153
+ - MIT
154
+ metadata:
155
+ homepage_uri: https://github.com/scientist-labs/tokenkit
156
+ source_code_uri: https://github.com/scientist-labs/tokenkit
157
+ rdoc_options: []
158
+ require_paths:
159
+ - lib
160
+ required_ruby_version: !ruby/object:Gem::Requirement
161
+ requirements:
162
+ - - ">="
163
+ - !ruby/object:Gem::Version
164
+ version: 3.1.0
165
+ required_rubygems_version: !ruby/object:Gem::Requirement
166
+ requirements:
167
+ - - ">="
168
+ - !ruby/object:Gem::Version
169
+ version: '0'
170
+ requirements:
171
+ - Rust >= 1.85
172
+ rubygems_version: 3.6.9
173
+ specification_version: 4
174
+ summary: Fast, Rust-backed word-level tokenization for Ruby
175
+ test_files: []