tokenkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.standard.yml +3 -0
  4. data/.yardopts +12 -0
  5. data/CODE_OF_CONDUCT.md +132 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +644 -0
  8. data/Rakefile +18 -0
  9. data/benchmarks/cache_test.rb +63 -0
  10. data/benchmarks/final_comparison.rb +83 -0
  11. data/benchmarks/tokenizer_benchmark.rb +250 -0
  12. data/docs/ARCHITECTURE.md +469 -0
  13. data/docs/PERFORMANCE.md +382 -0
  14. data/docs/README.md +118 -0
  15. data/ext/tokenkit/Cargo.toml +21 -0
  16. data/ext/tokenkit/extconf.rb +4 -0
  17. data/ext/tokenkit/src/config.rs +37 -0
  18. data/ext/tokenkit/src/error.rs +67 -0
  19. data/ext/tokenkit/src/lib.rs +346 -0
  20. data/ext/tokenkit/src/tokenizer/base.rs +41 -0
  21. data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
  22. data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
  23. data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
  24. data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
  25. data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
  26. data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
  27. data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
  28. data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
  29. data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
  30. data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
  31. data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
  32. data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
  33. data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
  34. data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
  35. data/lib/tokenkit/config.rb +74 -0
  36. data/lib/tokenkit/config_builder.rb +209 -0
  37. data/lib/tokenkit/config_compat.rb +52 -0
  38. data/lib/tokenkit/configuration.rb +194 -0
  39. data/lib/tokenkit/regex_converter.rb +58 -0
  40. data/lib/tokenkit/version.rb +5 -0
  41. data/lib/tokenkit.rb +336 -0
  42. data/sig/tokenkit.rbs +4 -0
  43. metadata +172 -0
@@ -0,0 +1,58 @@
1
# frozen_string_literal: true

module TokenKit
  # Converts Ruby Regexp objects to Rust-compatible regex strings.
  #
  # Flag translation note: Ruby's MULTILINE flag (/m) means "dot matches
  # newline", which in the Rust regex crate is the "s" flag. Rust's own
  # "m" flag instead switches ^/$ to per-line anchors (which is Ruby's
  # default behavior for ^/$ regardless of flags).
  module RegexConverter
    extend self

    # Convert a Ruby Regexp to Rust regex syntax.
    #
    # Strings are passed through unchanged; Regexps have their inline
    # flags translated and prepended as a Rust "(?flags)" group.
    #
    # @param pattern [Regexp, String] The pattern to convert
    # @return [String] Rust-compatible regex string
    def to_rust(pattern)
      return pattern.to_s unless pattern.is_a?(Regexp)

      flags = extract_flags(pattern)
      source = pattern.source

      flags.empty? ? source : "(?#{flags})#{source}"
    end

    # Convert an array of patterns to Rust regex strings.
    #
    # @param patterns [Array<Regexp, String>, nil] The patterns to convert
    # @return [Array<String>] Rust-compatible regex strings ([] for nil)
    def patterns_to_rust(patterns)
      return [] unless patterns

      patterns.map { |p| to_rust(p) }
    end

    # Validate a regex pattern by compiling it with Ruby's engine first.
    #
    # @param pattern [String] The regex pattern to validate
    # @return [Boolean] true if valid
    # @raise [Error] if invalid
    def validate!(pattern)
      Regexp.new(pattern)
      true
    rescue RegexpError => e
      raise Error, "Invalid regex pattern '#{pattern}': #{e.message}"
    end

    private

    # Extract flags from a Ruby Regexp, translated to Rust flag letters.
    #
    # Ruby IGNORECASE (/i) -> Rust "i"
    # Ruby MULTILINE  (/m) -> Rust "s" (dot matches newline; NOT Rust "m")
    # Ruby EXTENDED   (/x) -> Rust "x"
    #
    # NOTE(review): Ruby's ^/$ are always per-line anchors, while Rust's
    # default anchors the whole text; patterns relying on ^/$ may also
    # need Rust's "m" flag — confirm against how the Rust side uses these.
    #
    # @param regexp [Regexp] The regexp to extract flags from
    # @return [String] Rust-compatible flag string
    def extract_flags(regexp)
      flags = ""
      flags += "i" if (regexp.options & Regexp::IGNORECASE) != 0
      # Ruby /m (dotall) corresponds to Rust's "s" flag.
      flags += "s" if (regexp.options & Regexp::MULTILINE) != 0
      flags += "x" if (regexp.options & Regexp::EXTENDED) != 0
      flags
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

module TokenKit
  # The released version of the tokenkit gem.
  VERSION = "0.1.0.pre.1"
end
data/lib/tokenkit.rb ADDED
@@ -0,0 +1,336 @@
1
# frozen_string_literal: true

require_relative "tokenkit/version"
require_relative "tokenkit/regex_converter"
require_relative "tokenkit/config_builder"
require_relative "tokenkit/config_compat"

# Load the precompiled native extension for the running Ruby series
# (e.g. "3.2"), falling back to the locally built extension.
begin
  ruby_series = RUBY_VERSION[/\d+\.\d+/]
  require_relative "tokenkit/#{ruby_series}/tokenkit"
rescue LoadError
  require_relative "tokenkit/tokenkit"
end
14
+
15
+ # TokenKit provides fast, Rust-backed tokenization for Ruby with pattern preservation.
16
+ #
17
+ # @example Basic usage
18
+ # TokenKit.tokenize("Hello, world!")
19
+ # # => ["hello", "world"]
20
+ #
21
+ # @example Configuration
22
+ # TokenKit.configure do |config|
23
+ # config.strategy = :unicode
24
+ # config.lowercase = true
25
+ # config.preserve_patterns = [/\d+mg/i]
26
+ # end
27
+ #
28
+ # @example Instance-based tokenization
29
+ # tokenizer = TokenKit::Tokenizer.new(strategy: :unicode)
30
+ # tokenizer.tokenize("test text")
31
+ #
32
+ module TokenKit
33
+ # Base error class for TokenKit exceptions
34
+ class Error < StandardError; end
35
+
36
+ # Instance-based tokenizer for thread-safe tokenization with specific configuration.
37
+ #
38
+ # @example Create a tokenizer with custom config
39
+ # tokenizer = TokenKit::Tokenizer.new(
40
+ # strategy: :unicode,
41
+ # lowercase: true,
42
+ # preserve_patterns: [/\d+mg/i]
43
+ # )
44
+ # tokenizer.tokenize("Patient received 100mg")
45
+ # # => ["patient", "received", "100mg"]
46
+ #
47
+ class Tokenizer
48
+ # @return [Configuration] The tokenizer's configuration
49
+ attr_reader :config
50
+
51
+ # Creates a new tokenizer instance with the specified configuration.
52
+ #
53
+ # @param config [Hash, Configuration, ConfigBuilder] The configuration for this tokenizer
54
+ # @option config [Symbol] :strategy (:unicode) The tokenization strategy
55
+ # @option config [Boolean] :lowercase (true) Whether to lowercase tokens
56
+ # @option config [Boolean] :remove_punctuation (false) Whether to remove punctuation
57
+ # @option config [Array<Regexp>] :preserve_patterns ([]) Patterns to preserve
58
+ #
59
+ # @example With hash configuration
60
+ # tokenizer = TokenKit::Tokenizer.new(strategy: :whitespace)
61
+ #
62
+ # @example With existing configuration
63
+ # config = TokenKit.config_hash
64
+ # tokenizer = TokenKit::Tokenizer.new(config)
65
+ #
66
+ def initialize(config = {})
67
+ @config = if config.is_a?(Configuration)
68
+ config
69
+ elsif config.is_a?(ConfigBuilder)
70
+ config.build
71
+ elsif config.is_a?(Hash)
72
+ builder = TokenKit.config_hash.to_builder
73
+ config.each do |key, value|
74
+ builder.send("#{key}=", value) if builder.respond_to?("#{key}=")
75
+ end
76
+ builder.build
77
+ else
78
+ TokenKit.config_hash
79
+ end
80
+ end
81
+
82
+ # Tokenizes the given text using this tokenizer's configuration.
83
+ #
84
+ # @param text [String] The text to tokenize
85
+ # @return [Array<String>] An array of tokens
86
+ #
87
+ # @example
88
+ # tokenizer = TokenKit::Tokenizer.new(strategy: :unicode)
89
+ # tokenizer.tokenize("Hello world")
90
+ # # => ["hello", "world"]
91
+ #
92
+ def tokenize(text)
93
+ TokenKit._tokenize_with_config(text, @config.to_rust_config)
94
+ end
95
+ end
96
+
97
+ extend self
98
+
99
+ # Thread-safe storage for current configuration
100
+ @current_config = nil
101
+ @config_mutex = Mutex.new
102
+
103
+ # Tokenizes text using the global configuration or with temporary overrides.
104
+ #
105
+ # @param text [String] The text to tokenize
106
+ # @param opts [Hash] Optional configuration overrides for this tokenization only
107
+ # @option opts [Symbol] :strategy The tokenization strategy to use
108
+ # @option opts [Boolean] :lowercase Whether to lowercase tokens
109
+ # @option opts [Boolean] :remove_punctuation Whether to remove punctuation
110
+ # @option opts [Array<Regexp>] :preserve_patterns Patterns to preserve
111
+ # @option opts [String, Regexp] :regex Pattern for :pattern strategy
112
+ # @option opts [Integer] :min_gram Minimum n-gram size (for n-gram strategies)
113
+ # @option opts [Integer] :max_gram Maximum n-gram size (for n-gram strategies)
114
+ # @option opts [String] :delimiter Delimiter for :path_hierarchy strategy
115
+ # @option opts [String] :split_on_chars Characters to split on for :char_group strategy
116
+ # @option opts [Boolean] :extended Extended grapheme clusters for :grapheme strategy
117
+ #
118
+ # @return [Array<String>] An array of tokens
119
+ #
120
+ # @example Basic tokenization
121
+ # TokenKit.tokenize("Hello, world!")
122
+ # # => ["hello", "world"]
123
+ #
124
+ # @example With temporary overrides
125
+ # TokenKit.tokenize("Hello World", lowercase: false)
126
+ # # => ["Hello", "World"]
127
+ #
128
+ # @example With strategy override
129
+ # TokenKit.tokenize("test-case", strategy: :char_group, split_on_chars: "-")
130
+ # # => ["test", "case"]
131
+ #
132
+ def tokenize(text, **opts)
133
+ if opts.any?
134
+ # Create a fresh tokenizer with merged config
135
+ merged_config = build_merged_config(opts)
136
+ _tokenize_with_config(text, merged_config)
137
+ else
138
+ # Use default config (creates fresh tokenizer internally)
139
+ _tokenize(text)
140
+ end
141
+ end
142
+
143
+ # Returns the global configuration object for backward compatibility.
144
+ #
145
+ # @deprecated Use {#config_hash} for read-only access or {#configure} to modify
146
+ # @return [Config] The global configuration singleton
147
+ #
148
+ # @example
149
+ # TokenKit.config.strategy = :unicode # Deprecated
150
+ #
151
+ def config
152
+ Config.instance
153
+ end
154
+
155
+ # Returns the current global configuration as an immutable object.
156
+ #
157
+ # @return [Configuration] The current configuration with accessor methods
158
+ #
159
+ # @example Get current configuration
160
+ # config = TokenKit.config_hash
161
+ # config.strategy # => :unicode
162
+ # config.lowercase # => true
163
+ # config.preserve_patterns # => []
164
+ #
165
+ # @example Check strategy type
166
+ # config = TokenKit.config_hash
167
+ # config.unicode? # => true
168
+ # config.edge_ngram? # => false
169
+ #
170
+ def config_hash
171
+ @config_mutex.synchronize do
172
+ @current_config ||= ConfigBuilder.new.build
173
+ end
174
+ end
175
+
176
+ # Configures the global tokenizer settings.
177
+ #
178
+ # @yield [Config] Yields the configuration object for modification
179
+ # @return [Configuration] The new configuration
180
+ #
181
+ # @raise [ArgumentError] If invalid configuration is provided
182
+ # @raise [RegexpError] If invalid regex pattern is provided
183
+ #
184
+ # @example Basic configuration
185
+ # TokenKit.configure do |config|
186
+ # config.strategy = :unicode
187
+ # config.lowercase = true
188
+ # end
189
+ #
190
+ # @example With pattern preservation
191
+ # TokenKit.configure do |config|
192
+ # config.strategy = :unicode
193
+ # config.preserve_patterns = [
194
+ # /\d+mg/i, # Measurements
195
+ # /[A-Z]{2,}/, # Acronyms
196
+ # /\w+@\w+\.\w+/ # Emails
197
+ # ]
198
+ # end
199
+ #
200
+ # @example Edge n-gram configuration
201
+ # TokenKit.configure do |config|
202
+ # config.strategy = :edge_ngram
203
+ # config.min_gram = 2
204
+ # config.max_gram = 10
205
+ # end
206
+ #
207
+ def configure
208
+ # Use the compatibility wrapper to support old API
209
+ yield Config.instance if block_given?
210
+
211
+ # Get the builder from the compatibility wrapper
212
+ builder = Config.instance.build_config
213
+
214
+ begin
215
+ # Build and validate the new configuration
216
+ new_config = builder.build
217
+
218
+ # Apply to Rust tokenizer
219
+ _configure(new_config.to_rust_config)
220
+
221
+ # Store the new configuration
222
+ @config_mutex.synchronize do
223
+ @current_config = new_config
224
+ end
225
+
226
+ # Reset the compatibility wrapper
227
+ Config.instance.reset_temp
228
+
229
+ new_config
230
+ rescue => e
231
+ # Reset the compatibility wrapper on error
232
+ Config.instance.reset_temp
233
+ raise e
234
+ end
235
+ end
236
+
237
+ # Resets the tokenizer to default configuration.
238
+ #
239
+ # @return [void]
240
+ #
241
+ # @example
242
+ # TokenKit.reset
243
+ # # Configuration is now:
244
+ # # - strategy: :unicode
245
+ # # - lowercase: true
246
+ # # - remove_punctuation: false
247
+ # # - preserve_patterns: []
248
+ #
249
+ def reset
250
+ # Create default configuration
251
+ new_config = ConfigBuilder.new.build
252
+
253
+ # Reset Rust tokenizer
254
+ _reset
255
+ _configure(new_config.to_rust_config)
256
+
257
+ # Store the new configuration
258
+ @config_mutex.synchronize do
259
+ @current_config = new_config
260
+ end
261
+
262
+ # Reset the compatibility wrapper
263
+ Config.instance.reset_temp
264
+
265
+ # Reset Config singleton instance variables for backward compatibility
266
+ Config.instance.instance_variable_set(:@strategy, :unicode)
267
+ Config.instance.instance_variable_set(:@lowercase, true)
268
+ Config.instance.instance_variable_set(:@remove_punctuation, false)
269
+ Config.instance.instance_variable_set(:@preserve_patterns, [])
270
+ Config.instance.instance_variable_set(:@grapheme_extended, true)
271
+ Config.instance.instance_variable_set(:@min_gram, 2)
272
+ Config.instance.instance_variable_set(:@max_gram, 10)
273
+ Config.instance.instance_variable_set(:@delimiter, "/")
274
+ Config.instance.instance_variable_set(:@split_on_chars, " \t\n\r")
275
+ end
276
+
277
+ private
278
+
279
+ def build_merged_config(opts)
280
+ # Build config with options merged in
281
+ builder = config_hash.to_builder
282
+
283
+ # Apply options to builder
284
+ opts.each do |key, value|
285
+ case key
286
+ when :strategy
287
+ builder.strategy = value
288
+ when :lowercase
289
+ builder.lowercase = value
290
+ when :remove_punctuation
291
+ builder.remove_punctuation = value
292
+ when :preserve, :preserve_patterns
293
+ patterns = Array(value)
294
+ builder.preserve_patterns = patterns
295
+ when :regex
296
+ builder.regex = value
297
+ when :extended, :grapheme_extended
298
+ builder.grapheme_extended = value
299
+ when :min_gram
300
+ builder.min_gram = value
301
+ when :max_gram
302
+ builder.max_gram = value
303
+ when :delimiter
304
+ builder.delimiter = value
305
+ when :split_on_chars
306
+ builder.split_on_chars = value
307
+ end
308
+ end
309
+
310
+ builder.build.to_rust_config
311
+ end
312
+
313
+ def _tokenize(text)
314
+ raise NotImplementedError, "Native extension not loaded"
315
+ end
316
+
317
+ def _tokenize_with_config(text, config_hash)
318
+ raise NotImplementedError, "Native extension not loaded"
319
+ end
320
+
321
+ def _configure(hash)
322
+ raise NotImplementedError, "Native extension not loaded"
323
+ end
324
+
325
+ def _reset
326
+ raise NotImplementedError, "Native extension not loaded"
327
+ end
328
+
329
+ def _config_hash
330
+ raise NotImplementedError, "Native extension not loaded"
331
+ end
332
+
333
+ def _load_config(hash)
334
+ raise NotImplementedError, "Native extension not loaded"
335
+ end
336
+ end
data/sig/tokenkit.rbs ADDED
@@ -0,0 +1,4 @@
1
# Type signatures for the tokenkit gem. The module must be named
# TokenKit (capital K) to match the constant defined in lib/tokenkit.rb;
# the previous "Tokenkit" spelling attached signatures to a nonexistent
# module.
# See the RBS writing guide: https://github.com/ruby/rbs#guides
module TokenKit
  VERSION: String
end
metadata ADDED
@@ -0,0 +1,172 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tokenkit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.pre.1
5
+ platform: ruby
6
+ authors:
7
+ - Chris Petersen
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-09-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.2'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.2'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: standard
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.3'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.3'
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.22'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.22'
97
+ description: TokenKit provides lightweight, Unicode-aware word-level tokenization
98
+ with pattern preservation, backed by Rust for performance.
99
+ email:
100
+ - chris@petersen.io
101
+ executables: []
102
+ extensions:
103
+ - ext/tokenkit/extconf.rb
104
+ extra_rdoc_files: []
105
+ files:
106
+ - ".rspec"
107
+ - ".standard.yml"
108
+ - ".yardopts"
109
+ - CODE_OF_CONDUCT.md
110
+ - LICENSE.txt
111
+ - README.md
112
+ - Rakefile
113
+ - benchmarks/cache_test.rb
114
+ - benchmarks/final_comparison.rb
115
+ - benchmarks/tokenizer_benchmark.rb
116
+ - docs/ARCHITECTURE.md
117
+ - docs/PERFORMANCE.md
118
+ - docs/README.md
119
+ - ext/tokenkit/Cargo.toml
120
+ - ext/tokenkit/extconf.rb
121
+ - ext/tokenkit/src/config.rs
122
+ - ext/tokenkit/src/error.rs
123
+ - ext/tokenkit/src/lib.rs
124
+ - ext/tokenkit/src/tokenizer/base.rs
125
+ - ext/tokenkit/src/tokenizer/char_group.rs
126
+ - ext/tokenkit/src/tokenizer/edge_ngram.rs
127
+ - ext/tokenkit/src/tokenizer/grapheme.rs
128
+ - ext/tokenkit/src/tokenizer/keyword.rs
129
+ - ext/tokenkit/src/tokenizer/letter.rs
130
+ - ext/tokenkit/src/tokenizer/lowercase.rs
131
+ - ext/tokenkit/src/tokenizer/mod.rs
132
+ - ext/tokenkit/src/tokenizer/ngram.rs
133
+ - ext/tokenkit/src/tokenizer/path_hierarchy.rs
134
+ - ext/tokenkit/src/tokenizer/pattern.rs
135
+ - ext/tokenkit/src/tokenizer/sentence.rs
136
+ - ext/tokenkit/src/tokenizer/unicode.rs
137
+ - ext/tokenkit/src/tokenizer/url_email.rs
138
+ - ext/tokenkit/src/tokenizer/whitespace.rs
139
+ - lib/tokenkit.rb
140
+ - lib/tokenkit/config.rb
141
+ - lib/tokenkit/config_builder.rb
142
+ - lib/tokenkit/config_compat.rb
143
+ - lib/tokenkit/configuration.rb
144
+ - lib/tokenkit/regex_converter.rb
145
+ - lib/tokenkit/version.rb
146
+ - sig/tokenkit.rbs
147
+ homepage: https://github.com/scientist-labs/tokenkit
148
+ licenses:
149
+ - MIT
150
+ metadata:
151
+ homepage_uri: https://github.com/scientist-labs/tokenkit
152
+ source_code_uri: https://github.com/scientist-labs/tokenkit
153
+ post_install_message:
154
+ rdoc_options: []
155
+ require_paths:
156
+ - lib
157
+ required_ruby_version: !ruby/object:Gem::Requirement
158
+ requirements:
159
+ - - ">="
160
+ - !ruby/object:Gem::Version
161
+ version: 3.1.0
162
+ required_rubygems_version: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ requirements: []
168
+ rubygems_version: 3.5.3
169
+ signing_key:
170
+ specification_version: 4
171
+ summary: Fast, Rust-backed word-level tokenization for Ruby
172
+ test_files: []