ruby-fst 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 82d0d6d69fc9c3c580ecd104cd588bf669d2d8d269e6e73ccdb3650e571ece14
4
- data.tar.gz: 300c838ff26a9e6fc790ead10d403cfd19c419a40b329f684221bd29b60e7607
3
+ metadata.gz: f20d41a6b4f23c4d08861ebfa42cf22221428852c1a50c38541c81ba64b425b2
4
+ data.tar.gz: b72b1e9cef426abef16a5296000383b01b15f8d14b9848102b6a7084e0a83df7
5
5
  SHA512:
6
- metadata.gz: b5d38c9b3cb260f1632929bd2adce4d5b6f13e6fb7273d2a8af4e69667fe0a36ddf60a1524e48775f6bad0b5954b2e981d0e3bb6428b014ea2ce187e21d10ffb
7
- data.tar.gz: 6fb9e1d99c4888032eb5f3c0a4657dd0ea5498ad85d6aec28c5bc2da727975006331d7007706a9d3ccfb141b9884545a723ba4e070228ce482eeb5ac3a5abbf6
6
+ metadata.gz: c6a8d55e3569a7cbd2d6e890bc6859f8251e63b53b99f25f70793afcaea435930f1ada3cf8f95bb45f0f9119b95de0c4605e75b7430d2abe09733372bbb84e48
7
+ data.tar.gz: bc65a0d886637ff6ab1a897a3303c9e6f55de379d0a98ce87228bd574634f6a199095921936935ecc110d5b834de152d83487656b903c028c1debb42af35ebc6
data/.rubocop.yml ADDED
@@ -0,0 +1,82 @@
1
+ plugins:
2
+ - rubocop-minitest
3
+ - rubocop-rake
4
+
5
+ AllCops:
6
+ NewCops: enable
7
+ TargetRubyVersion: 3.2
8
+ SuggestExtensions: false
9
+ Exclude:
10
+ - 'target/**/*'
11
+ - 'tmp/**/*'
12
+ - 'pkg/**/*'
13
+ - 'vendor/**/*'
14
+
15
+ Style/StringLiterals:
16
+ EnforcedStyle: single_quotes
17
+
18
+ Style/StringLiteralsInInterpolation:
19
+ EnforcedStyle: single_quotes
20
+
21
+ Style/SymbolArray:
22
+ EnforcedStyle: percent
23
+ MinSize: 3
24
+
25
+ Style/WordArray:
26
+ EnforcedStyle: percent
27
+ MinSize: 3
28
+
29
+ Style/PercentLiteralDelimiters:
30
+ PreferredDelimiters:
31
+ default: '()'
32
+ '%i': '()'
33
+ '%w': '()'
34
+
35
+ Style/Documentation:
36
+ Enabled: false
37
+
38
+ Style/HashEachMethods:
39
+ Exclude:
40
+ - 'test/**/*'
41
+
42
+ Style/MapIntoArray:
43
+ Exclude:
44
+ - 'test/**/*'
45
+
46
+ Naming/MethodParameterName:
47
+ AllowedNames:
48
+ - ge
49
+ - le
50
+ - id
51
+ - to
52
+ - by
53
+ - on
54
+ - in
55
+ - at
56
+ - of
57
+ - or
58
+ - if
59
+ - is
60
+ - as
61
+ - it
62
+
63
+ Naming/VariableNumber:
64
+ CheckMethodNames: false
65
+
66
+ Layout/LineLength:
67
+ Max: 120
68
+
69
+ Metrics/BlockLength:
70
+ Exclude:
71
+ - 'test/**/*'
72
+ - '*.gemspec'
73
+ - 'Rakefile'
74
+
75
+ Metrics/MethodLength:
76
+ Max: 25
77
+
78
+ Metrics/AbcSize:
79
+ Max: 25
80
+
81
+ Minitest/MultipleAssertions:
82
+ Max: 8
data/CHANGELOG.md ADDED
@@ -0,0 +1,34 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format is based on
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and this project
5
+ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.2.0] — 2026-05-13
10
+
11
+ ### Added
12
+ - `Map.from_path_mmap` and `Set.from_path_mmap` for memory-mapped loading of large FSTs.
13
+ - `Map#range(ge:, le:)` and `Set#range(ge:, le:)` streaming range iteration.
14
+ - `Map#starts_with(prefix)` and `Set#starts_with(prefix)` prefix scans.
15
+ - `Map#get_le_value` and `Map#get_ge_value` — return only the value (no key allocation) for floor/ceiling lookups.
16
+ - Precompiled native gems for Linux (x86_64, aarch64), macOS (x86_64, arm64), and Windows (x64) — installs without a Rust toolchain on those platforms.
17
+ - Tests covering binary string encoding, key lifetime after stream drop, builder GC, and 0xFF prefix-scan edge case.
18
+ - RuboCop lint configuration with rubocop-minitest and rubocop-rake plugins; runs as part of `rake` and as a CI gate.
19
+
20
+ ### Changed
21
+ - Minimum Ruby version raised to 3.2.
22
+ - `Map` and `Set` now operate on a generic `Storage` backing (heap or mmap), shared by both types.
23
+ - Upgraded to magnus 0.8 (drops support for Ruby 2.7 and 3.0 in the underlying bindings; we already require 3.2+).
24
+
25
+ ### Documentation
26
+ - README clarifies key encoding, insertion order, mmap vs in-memory loading, and Levenshtein UTF-8 requirement.
27
+
28
+ ## [0.1.0] — 2026-05-13
29
+
30
+ ### Added
31
+ - Initial release: `RubyFst::Map`, `RubyFst::Set`, `MapBuilder`, `SetBuilder`.
32
+ - Floor/ceiling lookups (`get_le`, `get_ge`).
33
+ - Levenshtein automaton search.
34
+ - Bytes/file serialization via `to_bytes` / `save` / `from_path`.
data/Cargo.lock CHANGED
@@ -115,9 +115,9 @@ dependencies = [
115
115
 
116
116
  [[package]]
117
117
  name = "magnus"
118
- version = "0.7.1"
118
+ version = "0.8.2"
119
119
  source = "registry+https://github.com/rust-lang/crates.io-index"
120
- checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
120
+ checksum = "3b36a5b126bbe97eb0d02d07acfeb327036c6319fd816139a49824a83b7f9012"
121
121
  dependencies = [
122
122
  "magnus-macros",
123
123
  "rb-sys",
@@ -127,9 +127,9 @@ dependencies = [
127
127
 
128
128
  [[package]]
129
129
  name = "magnus-macros"
130
- version = "0.6.0"
130
+ version = "0.8.0"
131
131
  source = "registry+https://github.com/rust-lang/crates.io-index"
132
- checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
132
+ checksum = "47607461fd8e1513cb4f2076c197d8092d921a1ea75bd08af97398f593751892"
133
133
  dependencies = [
134
134
  "proc-macro2",
135
135
  "quote",
@@ -142,6 +142,15 @@ version = "2.8.0"
142
142
  source = "registry+https://github.com/rust-lang/crates.io-index"
143
143
  checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
144
144
 
145
+ [[package]]
146
+ name = "memmap2"
147
+ version = "0.9.10"
148
+ source = "registry+https://github.com/rust-lang/crates.io-index"
149
+ checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
150
+ dependencies = [
151
+ "libc",
152
+ ]
153
+
145
154
  [[package]]
146
155
  name = "minimal-lexical"
147
156
  version = "0.2.1"
@@ -202,9 +211,9 @@ dependencies = [
202
211
 
203
212
  [[package]]
204
213
  name = "rb-sys-env"
205
- version = "0.1.2"
214
+ version = "0.2.3"
206
215
  source = "registry+https://github.com/rust-lang/crates.io-index"
207
- checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
216
+ checksum = "cca7ad6a7e21e72151d56fe2495a259b5670e204c3adac41ee7ef676ea08117a"
208
217
 
209
218
  [[package]]
210
219
  name = "regex"
@@ -237,10 +246,11 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
237
246
 
238
247
  [[package]]
239
248
  name = "ruby_fst"
240
- version = "0.1.0"
249
+ version = "0.2.0"
241
250
  dependencies = [
242
251
  "fst",
243
252
  "magnus",
253
+ "memmap2",
244
254
  "rb-sys",
245
255
  ]
246
256
 
data/Gemfile CHANGED
@@ -7,3 +7,6 @@ gemspec
7
7
  gem 'minitest', '~> 5.0'
8
8
  gem 'rake', '~> 13.0'
9
9
  gem 'rake-compiler', '~> 1.2'
10
+ gem 'rubocop', '~> 1.60', require: false
11
+ gem 'rubocop-minitest', '~> 0.34', require: false
12
+ gem 'rubocop-rake', '~> 0.6', require: false
data/README.md CHANGED
@@ -1,23 +1,35 @@
1
1
  # ruby-fst
2
2
 
3
- Ruby bindings for the [fst](https://github.com/BurntSushi/fst) crate by Andrew Gallant. Provides finite state transducer backed ordered maps and sets with fast lookup, range queries, and fuzzy search.
3
+ [![CI](https://github.com/dsablic/ruby-fst/actions/workflows/ci.yml/badge.svg)](https://github.com/dsablic/ruby-fst/actions/workflows/ci.yml) [![Gem Version](https://badge.fury.io/rb/ruby-fst.svg)](https://badge.fury.io/rb/ruby-fst) [![License](https://img.shields.io/badge/license-MIT-blue.svg)](/LICENSE.txt) [![Ruby](https://img.shields.io/badge/ruby-%3E%3D%203.2-red.svg)](https://www.ruby-lang.org/)
4
+
5
+ Ruby bindings for the [fst](https://github.com/BurntSushi/fst) crate by Andrew Gallant. Provides finite state transducer backed ordered maps and sets with fast lookup, prefix and range scans, floor/ceiling lookups, and Levenshtein fuzzy search.
4
6
 
5
7
  ## Requirements
6
8
 
7
- - Ruby >= 3.0
8
- - Rust toolchain (for compilation)
9
+ - Ruby >= 3.2
10
+ - For source installs: a Rust toolchain. Precompiled gems are published for Linux (x86_64, aarch64, glibc and musl), macOS (x86_64, arm64), and Windows (x64) — these install with no Rust toolchain required.
9
11
 
10
12
  ## Installation
11
13
 
14
+ ```
15
+ gem install ruby-fst
16
+ ```
17
+
18
+ Or add to your Gemfile:
19
+
12
20
  ```ruby
13
21
  gem 'ruby-fst'
14
22
  ```
15
23
 
16
- ## Usage
24
+ ## Keys are byte strings
25
+
26
+ All FST keys are arbitrary byte strings (`Encoding::BINARY`). When a key is returned from the gem (e.g. via `each`, `get_le`, `range`) it always carries the `Encoding::BINARY` encoding — be careful when interpolating into UTF-8 strings or `puts`-ing keys that contain non-ASCII bytes.
27
+
28
+ Keys must be inserted into a builder in **strictly ascending lexicographic byte order**, with no duplicates. Out-of-order or duplicate inserts raise.
17
29
 
18
- ### Map
30
+ ## Map
19
31
 
20
- Ordered map from byte string keys to unsigned 64-bit integer values. Keys must be inserted in lexicographic order.
32
+ Ordered map from byte string keys to unsigned 64-bit integer values.
21
33
 
22
34
  ```ruby
23
35
  require 'ruby_fst'
@@ -28,15 +40,15 @@ builder.insert('baz', 3)
28
40
  builder.insert('foo', 1)
29
41
  map = RubyFst::Map.new(builder.finish)
30
42
 
31
- map['foo'] # => 1
32
- map.get('missing') # => nil
33
- map.contains?('bar') # => true
34
- map.length # => 3
43
+ map['foo'] # => 1
44
+ map.get('missing') # => nil
45
+ map.contains?('bar') # => true
46
+ map.length # => 3
35
47
 
36
- map.each { |key, value| puts "#{key}: #{value}" }
48
+ map.each { |key, value| puts("#{key}: #{value}") }
37
49
  ```
38
50
 
39
- ### Set
51
+ ## Set
40
52
 
41
53
  Ordered set of byte string keys.
42
54
 
@@ -47,13 +59,30 @@ builder.insert('baz')
47
59
  builder.insert('foo')
48
60
  set = RubyFst::Set.new(builder.finish)
49
61
 
50
- set.contains?('foo') # => true
51
- set.length # => 3
62
+ set.contains?('foo') # => true
63
+ set.length # => 3
52
64
 
53
- set.each { |key| puts key }
65
+ set.each { |key| puts(key) }
54
66
  ```
55
67
 
56
- ### Floor and ceiling lookups
68
+ ## Range queries
69
+
70
+ `Map#range` and `Set#range` stream every entry whose key falls in `[ge, le]`. Either bound may be omitted.
71
+
72
+ ```ruby
73
+ map.range(ge: 'b', le: 'f') { |key, value| puts("#{key} -> #{value}") }
74
+ map.range(ge: 'b').to_a # Enumerator without a block
75
+ set.range(le: 'baz') { |key| puts(key) }
76
+ ```
77
+
78
+ `Map#starts_with` and `Set#starts_with` stream every entry whose key begins with the given prefix.
79
+
80
+ ```ruby
81
+ map.starts_with('foo') { |key, value| puts("#{key} -> #{value}") }
82
+ set.starts_with('app').to_a
83
+ ```
84
+
85
+ ## Floor and ceiling lookups
57
86
 
58
87
  `get_le` returns the greatest key less than or equal to the query. `get_ge` returns the smallest key greater than or equal to the query. Both return `[key, value]` or `nil`.
59
88
 
@@ -64,61 +93,80 @@ builder.insert('foo', 2)
64
93
  builder.insert('qux', 3)
65
94
  map = RubyFst::Map.new(builder.finish)
66
95
 
67
- map.get_le('dog') # => ["bar", 1]
68
- map.get_ge('dog') # => ["foo", 2]
69
- map.get_le('aaa') # => nil
96
+ map.get_le('dog') # => ["bar", 1]
97
+ map.get_ge('dog') # => ["foo", 2]
98
+ map.get_le('aaa') # => nil
99
+ ```
100
+
101
+ For floor/ceiling lookups where you only need the value, `get_le_value` and `get_ge_value` skip key reconstruction and return only the `Integer` (or `nil`):
102
+
103
+ ```ruby
104
+ map.get_le_value('dog') # => 1
105
+ map.get_ge_value('dog') # => 2
106
+ map.get_le_value('aaa') # => nil
70
107
  ```
71
108
 
72
- This is useful for IP range lookups. Encode range starts as 4-byte big-endian keys and use `get_le` to find which range an IP falls into:
109
+ This is useful for IP range lookups. Encode range starts as 4-byte big-endian keys and use `get_le_value` to find which range an IP falls into:
73
110
 
74
111
  ```ruby
112
+ require 'ipaddr'
113
+
75
114
  builder = RubyFst::MapBuilder.new
76
- builder.insert([167_772_160].pack('N'), 1) # 10.0.0.0
115
+ builder.insert([167_772_160].pack('N'), 1) # 10.0.0.0
77
116
  builder.insert([3_232_235_520].pack('N'), 2) # 192.168.0.0
78
117
  map = RubyFst::Map.new(builder.finish)
79
118
 
80
119
  ip = IPAddr.new('10.0.0.100').to_i
81
- key, label_id = map.get_le([ip].pack('N'))
120
+ label_id = map.get_le_value([ip].pack('N'))
82
121
  ```
83
122
 
84
- ### Levenshtein search
123
+ ## Levenshtein search
85
124
 
86
125
  Find all keys within a given edit distance. The search runs as an automaton intersection with the FST, visiting only reachable states.
87
126
 
127
+ The query must be valid UTF-8 (the Levenshtein automaton is defined over Unicode codepoints), but the keys themselves can be any bytes — the automaton matches against the UTF-8 interpretation of the keys.
128
+
88
129
  ```ruby
89
130
  builder = RubyFst::MapBuilder.new
90
131
  %w(bar baz cat foo fun).each_with_index { |w, i| builder.insert(w, i) }
91
132
  map = RubyFst::Map.new(builder.finish)
92
133
 
93
- map.search_levenshtein('far', 1) { |key, value| puts key }
134
+ map.search_levenshtein('far', 1) { |key, value| puts(key) }
94
135
  # => bar
95
136
  ```
96
137
 
97
138
  Works on sets too:
98
139
 
99
140
  ```ruby
100
- set.search_levenshtein('university', 2) { |key| puts key }
141
+ set.search_levenshtein('university', 2) { |key| puts(key) }
101
142
  ```
102
143
 
103
- ### Serialization
144
+ ## Serialization
104
145
 
105
146
  ```ruby
106
- # To/from bytes
107
- bytes = map.to_bytes
147
+ # Bytes
148
+ bytes = map.to_bytes # binary string
108
149
  map = RubyFst::Map.new(bytes)
109
150
 
110
- # To/from file
151
+ # File: read entire file into memory (good for small/medium FSTs)
111
152
  map.save('/path/to/file.fst')
112
153
  map = RubyFst::Map.from_path('/path/to/file.fst')
154
+
155
+ # File: memory-map the file (good for large FSTs; lookups page in only what they touch)
156
+ map = RubyFst::Map.from_path_mmap('/path/to/file.fst')
113
157
  ```
114
158
 
159
+ When using `from_path_mmap`, the file must remain unchanged on disk for the lifetime of the resulting `Map` or `Set` — modifying or truncating it causes undefined behavior.
160
+
115
161
  ## Development
116
162
 
117
163
  ```
118
164
  bundle install
119
- bundle exec rake compile test
165
+ bundle exec rake # compile + test + rubocop
120
166
  ```
121
167
 
168
+ Individual tasks: `rake compile`, `rake test`, `rake rubocop`. Bump the version (and CHANGELOG) with `rake bump[patch]` / `rake bump[minor]` / `rake bump[major]`. Tagging `vX.Y.Z` on GitHub triggers cross-compile and publish to RubyGems.
169
+
122
170
  ## License
123
171
 
124
172
  MIT. See [LICENSE.txt](LICENSE.txt) for details.
data/Rakefile CHANGED
@@ -2,11 +2,22 @@
2
2
 
3
3
  require 'rake/testtask'
4
4
  require 'rb_sys/extensiontask'
5
+ require 'rubocop/rake_task'
5
6
 
6
7
  GEMSPEC = Gem::Specification.load('ruby_fst.gemspec')
7
8
 
8
9
  RbSys::ExtensionTask.new('ruby_fst', GEMSPEC) do |ext|
9
10
  ext.lib_dir = 'lib/ruby_fst'
11
+ ext.cross_compile = true
12
+ ext.cross_platform = %w(
13
+ aarch64-linux
14
+ aarch64-linux-musl
15
+ arm64-darwin
16
+ x64-mingw-ucrt
17
+ x86_64-darwin
18
+ x86_64-linux
19
+ x86_64-linux-musl
20
+ )
10
21
  end
11
22
 
12
23
  Rake::TestTask.new do |t|
@@ -14,31 +25,55 @@ Rake::TestTask.new do |t|
14
25
  t.test_files = FileList['test/**/*_test.rb']
15
26
  end
16
27
 
17
- task default: %i(compile test)
28
+ RuboCop::RakeTask.new
29
+
30
+ task default: %i(compile test rubocop)
18
31
 
19
32
  desc 'Bump version (rake bump[patch], rake bump[minor], rake bump[major])'
20
33
  task :bump, [:level] do |_, args|
21
34
  level = args[:level] || 'patch'
22
35
  version_file = File.join(__dir__, 'lib', 'ruby_fst', 'version.rb')
23
36
  cargo_file = File.join(__dir__, 'ext', 'ruby_fst', 'Cargo.toml')
37
+ lock_file = File.join(__dir__, 'Cargo.lock')
24
38
 
25
39
  content = File.read(version_file)
26
40
  current = content[/VERSION = '(.+)'/, 1]
27
41
  major, minor, patch = current.split('.').map(&:to_i)
28
42
 
29
- case level
30
- when 'major' then major += 1; minor = 0; patch = 0
31
- when 'minor' then minor += 1; patch = 0
32
- when 'patch' then patch += 1
33
- else abort("Unknown level: #{level}. Use major, minor, or patch.")
34
- end
35
-
36
- new_version = "#{major}.#{minor}.#{patch}"
43
+ new_version =
44
+ case level
45
+ when 'major' then "#{major + 1}.0.0"
46
+ when 'minor' then "#{major}.#{minor + 1}.0"
47
+ when 'patch' then "#{major}.#{minor}.#{patch + 1}"
48
+ else abort("Unknown level: #{level}. Use major, minor, or patch.")
49
+ end
37
50
 
38
51
  File.write(version_file, content.sub(/VERSION = '.+'/, "VERSION = '#{new_version}'"))
39
52
 
40
53
  cargo = File.read(cargo_file)
41
54
  File.write(cargo_file, cargo.sub(/^version = ".+"/, "version = \"#{new_version}\""))
42
55
 
56
+ if File.exist?(lock_file)
57
+ lock = File.read(lock_file)
58
+ File.write(
59
+ lock_file,
60
+ lock.sub(
61
+ /(\[\[package\]\]\nname = "ruby_fst"\nversion = ").+(")/,
62
+ "\\1#{new_version}\\2"
63
+ )
64
+ )
65
+ end
66
+
67
+ changelog_file = File.join(__dir__, 'CHANGELOG.md')
68
+ if File.exist?(changelog_file)
69
+ changelog = File.read(changelog_file)
70
+ today = Time.now.utc.strftime('%Y-%m-%d')
71
+ promoted = changelog.sub(
72
+ '## [Unreleased]',
73
+ "## [Unreleased]\n\n## [#{new_version}] — #{today}"
74
+ )
75
+ File.write(changelog_file, promoted) if promoted != changelog
76
+ end
77
+
43
78
  puts("#{current} -> #{new_version}")
44
79
  end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "ruby_fst"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  edition = "2021"
5
5
  publish = false
6
6
 
@@ -8,6 +8,7 @@ publish = false
8
8
  crate-type = ["cdylib"]
9
9
 
10
10
  [dependencies]
11
- magnus = { version = "0.7", features = ["rb-sys"] }
11
+ magnus = { version = "0.8", features = ["rb-sys"] }
12
12
  rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }
13
13
  fst = { version = "0.4", features = ["levenshtein"] }
14
+ memmap2 = "0.9"
@@ -1,39 +1,105 @@
1
1
  use std::cell::RefCell;
2
- use std::fs;
2
+ use std::fs::{self, File};
3
3
 
4
4
  use fst::automaton::Levenshtein;
5
5
  use fst::raw::{CompiledAddr, Fst, Node, Output};
6
6
  use fst::{IntoStreamer, Streamer};
7
7
  use magnus::prelude::*;
8
- use magnus::{block, exception, function, method, Error, RArray, RString, Ruby, Value};
8
+ use magnus::{function, method, Error, RArray, RString, Ruby, Value};
9
+ use memmap2::Mmap;
9
10
 
10
11
  fn err(msg: impl std::fmt::Display) -> Error {
11
- Error::new(exception::runtime_error(), msg.to_string())
12
+ Error::new(ruby().exception_runtime_error(), msg.to_string())
12
13
  }
13
14
 
15
+ // SAFETY: every method exposed to Ruby is invoked by the VM on the GVL-owning
16
+ // thread, so `Ruby::get_unchecked()` is sound — calling code must not invoke
17
+ // this from a non-Ruby thread.
14
18
  fn ruby() -> Ruby {
15
19
  unsafe { Ruby::get_unchecked() }
16
20
  }
17
21
 
22
+ // ---------------------------------------------------------------------------
23
+ // Storage: backing buffer for an FST. Either an owned heap allocation or a
24
+ // memory map. Both implement AsRef<[u8]> so fst::Map / fst::Set can accept
25
+ // either without monomorphising the wrapper structs.
26
+ // ---------------------------------------------------------------------------
27
+
28
+ enum Storage {
29
+ Mem(Vec<u8>),
30
+ Mmap(Mmap),
31
+ }
32
+
33
+ impl AsRef<[u8]> for Storage {
34
+ fn as_ref(&self) -> &[u8] {
35
+ match self {
36
+ Storage::Mem(v) => v.as_ref(),
37
+ Storage::Mmap(m) => m.as_ref(),
38
+ }
39
+ }
40
+ }
41
+
42
+ fn read_storage(path: &str) -> Result<Storage, Error> {
43
+ let data = fs::read(path).map_err(err)?;
44
+ Ok(Storage::Mem(data))
45
+ }
46
+
47
+ fn mmap_storage(path: &str) -> Result<Storage, Error> {
48
+ let file = File::open(path).map_err(err)?;
49
+ // SAFETY: callers opt in to mmap and accept the contract that the file
50
+ // must not be modified or truncated while the FST is alive. We document
51
+ // this in the Ruby-level docs.
52
+ let mmap = unsafe { Mmap::map(&file) }.map_err(err)?;
53
+ Ok(Storage::Mmap(mmap))
54
+ }
55
+
56
+ // SAFETY for `RString::as_slice()` callers below: between obtaining the slice
57
+ // and dropping it, no Ruby allocation, GC trigger, or string mutation occurs.
58
+ // We either copy into a Vec immediately or pass the slice into pure-Rust fst
59
+ // operations that never re-enter Ruby.
60
+
61
+ fn rstring_to_vec(s: RString) -> Vec<u8> {
62
+ unsafe { s.as_slice() }.to_vec()
63
+ }
64
+
65
+ // Returns the exclusive upper bound for a prefix scan: the smallest byte
66
+ // string strictly greater than `prefix` that does NOT have `prefix` as a
67
+ // prefix. None when no such bound exists (empty prefix, or all-0xFF prefix);
68
+ // the scan is then unbounded above.
69
+ fn prefix_upper_bound(prefix: &[u8]) -> Option<Vec<u8>> {
70
+ let mut upper = prefix.to_vec();
71
+ while let Some(byte) = upper.last_mut() {
72
+ if *byte < 0xFF {
73
+ *byte += 1;
74
+ return Some(upper);
75
+ }
76
+ upper.pop();
77
+ }
78
+ None
79
+ }
80
+
18
81
  // ---------------------------------------------------------------------------
19
82
  // Map
20
83
  // ---------------------------------------------------------------------------
21
84
 
22
85
  #[magnus::wrap(class = "RubyFst::Map", free_immediately, size)]
23
86
  struct FstMap {
24
- inner: fst::Map<Vec<u8>>,
87
+ inner: fst::Map<Storage>,
25
88
  }
26
89
 
27
90
  impl FstMap {
28
91
  fn new(bytes: RString) -> Result<Self, Error> {
29
- let data = unsafe { bytes.as_slice() }.to_vec();
30
- let inner = fst::Map::new(data).map_err(err)?;
92
+ let inner = fst::Map::new(Storage::Mem(rstring_to_vec(bytes))).map_err(err)?;
31
93
  Ok(Self { inner })
32
94
  }
33
95
 
34
96
  fn from_path(path: String) -> Result<Self, Error> {
35
- let data = fs::read(&path).map_err(err)?;
36
- let inner = fst::Map::new(data).map_err(err)?;
97
+ let inner = fst::Map::new(read_storage(&path)?).map_err(err)?;
98
+ Ok(Self { inner })
99
+ }
100
+
101
+ fn from_path_mmap(path: String) -> Result<Self, Error> {
102
+ let inner = fst::Map::new(mmap_storage(&path)?).map_err(err)?;
37
103
  Ok(Self { inner })
38
104
  }
39
105
 
@@ -77,6 +143,11 @@ impl FstMap {
77
143
  }
78
144
  }
79
145
 
146
+ fn get_le_value(&self, key: RString) -> Option<u64> {
147
+ let key = unsafe { key.as_slice() };
148
+ floor_value(self.inner.as_fst(), key)
149
+ }
150
+
80
151
  fn get_ge(&self, key: RString) -> Result<Option<RArray>, Error> {
81
152
  let r = ruby();
82
153
  let key = unsafe { key.as_slice() };
@@ -92,12 +163,53 @@ impl FstMap {
92
163
  }
93
164
  }
94
165
 
166
+ fn get_ge_value(&self, key: RString) -> Option<u64> {
167
+ let key = unsafe { key.as_slice() };
168
+ let mut stream = self.inner.range().ge(key).into_stream();
169
+ stream.next().map(|(_, v)| v)
170
+ }
171
+
95
172
  fn each(&self) -> Result<(), Error> {
96
173
  let r = ruby();
97
174
  let mut stream = (&self.inner).into_stream();
98
175
  while let Some((key, value)) = stream.next() {
99
176
  let rb_key = r.str_from_slice(key);
100
- let _: Value = block::yield_values((rb_key, value))?;
177
+ let _: Value = r.yield_values((rb_key, value))?;
178
+ }
179
+ Ok(())
180
+ }
181
+
182
+ fn range(&self, ge: Option<RString>, le: Option<RString>) -> Result<(), Error> {
183
+ let r = ruby();
184
+ let ge_bytes = ge.map(rstring_to_vec);
185
+ let le_bytes = le.map(rstring_to_vec);
186
+ let mut builder = self.inner.range();
187
+ if let Some(ref b) = ge_bytes {
188
+ builder = builder.ge(b);
189
+ }
190
+ if let Some(ref b) = le_bytes {
191
+ builder = builder.le(b);
192
+ }
193
+ let mut stream = builder.into_stream();
194
+ while let Some((key, value)) = stream.next() {
195
+ let rb_key = r.str_from_slice(key);
196
+ let _: Value = r.yield_values((rb_key, value))?;
197
+ }
198
+ Ok(())
199
+ }
200
+
201
+ fn starts_with(&self, prefix: RString) -> Result<(), Error> {
202
+ let r = ruby();
203
+ let prefix_bytes = rstring_to_vec(prefix);
204
+ let upper = prefix_upper_bound(&prefix_bytes);
205
+ let mut builder = self.inner.range().ge(&prefix_bytes);
206
+ if let Some(ref u) = upper {
207
+ builder = builder.lt(u);
208
+ }
209
+ let mut stream = builder.into_stream();
210
+ while let Some((key, value)) = stream.next() {
211
+ let rb_key = r.str_from_slice(key);
212
+ let _: Value = r.yield_values((rb_key, value))?;
101
213
  }
102
214
  Ok(())
103
215
  }
@@ -108,7 +220,7 @@ impl FstMap {
108
220
  let mut stream = self.inner.search(lev).into_stream();
109
221
  while let Some((key, value)) = stream.next() {
110
222
  let rb_key = r.str_from_slice(key);
111
- let _: Value = block::yield_values((rb_key, value))?;
223
+ let _: Value = r.yield_values((rb_key, value))?;
112
224
  }
113
225
  Ok(())
114
226
  }
@@ -131,7 +243,7 @@ impl FstMapBuilder {
131
243
  }
132
244
 
133
245
  fn insert(&self, key: RString, value: u64) -> Result<(), Error> {
134
- let key = unsafe { key.as_slice() }.to_vec();
246
+ let key = rstring_to_vec(key);
135
247
  let mut guard = self.inner.borrow_mut();
136
248
  let b = guard.as_mut().ok_or_else(|| err("builder already finished"))?;
137
249
  b.insert(&key, value).map_err(err)
@@ -151,19 +263,22 @@ impl FstMapBuilder {
151
263
 
152
264
  #[magnus::wrap(class = "RubyFst::Set", free_immediately, size)]
153
265
  struct FstSet {
154
- inner: fst::Set<Vec<u8>>,
266
+ inner: fst::Set<Storage>,
155
267
  }
156
268
 
157
269
  impl FstSet {
158
270
  fn new(bytes: RString) -> Result<Self, Error> {
159
- let data = unsafe { bytes.as_slice() }.to_vec();
160
- let inner = fst::Set::new(data).map_err(err)?;
271
+ let inner = fst::Set::new(Storage::Mem(rstring_to_vec(bytes))).map_err(err)?;
161
272
  Ok(Self { inner })
162
273
  }
163
274
 
164
275
  fn from_path(path: String) -> Result<Self, Error> {
165
- let data = fs::read(&path).map_err(err)?;
166
- let inner = fst::Set::new(data).map_err(err)?;
276
+ let inner = fst::Set::new(read_storage(&path)?).map_err(err)?;
277
+ Ok(Self { inner })
278
+ }
279
+
280
+ fn from_path_mmap(path: String) -> Result<Self, Error> {
281
+ let inner = fst::Set::new(mmap_storage(&path)?).map_err(err)?;
167
282
  Ok(Self { inner })
168
283
  }
169
284
 
@@ -193,7 +308,42 @@ impl FstSet {
193
308
  let mut stream = (&self.inner).into_stream();
194
309
  while let Some(key) = stream.next() {
195
310
  let rb_key = r.str_from_slice(key);
196
- let _: Value = block::yield_value(rb_key)?;
311
+ let _: Value = r.yield_value(rb_key)?;
312
+ }
313
+ Ok(())
314
+ }
315
+
316
+ fn range(&self, ge: Option<RString>, le: Option<RString>) -> Result<(), Error> {
317
+ let r = ruby();
318
+ let ge_bytes = ge.map(rstring_to_vec);
319
+ let le_bytes = le.map(rstring_to_vec);
320
+ let mut builder = self.inner.range();
321
+ if let Some(ref b) = ge_bytes {
322
+ builder = builder.ge(b);
323
+ }
324
+ if let Some(ref b) = le_bytes {
325
+ builder = builder.le(b);
326
+ }
327
+ let mut stream = builder.into_stream();
328
+ while let Some(key) = stream.next() {
329
+ let rb_key = r.str_from_slice(key);
330
+ let _: Value = r.yield_value(rb_key)?;
331
+ }
332
+ Ok(())
333
+ }
334
+
335
+ fn starts_with(&self, prefix: RString) -> Result<(), Error> {
336
+ let r = ruby();
337
+ let prefix_bytes = rstring_to_vec(prefix);
338
+ let upper = prefix_upper_bound(&prefix_bytes);
339
+ let mut builder = self.inner.range().ge(&prefix_bytes);
340
+ if let Some(ref u) = upper {
341
+ builder = builder.lt(u);
342
+ }
343
+ let mut stream = builder.into_stream();
344
+ while let Some(key) = stream.next() {
345
+ let rb_key = r.str_from_slice(key);
346
+ let _: Value = r.yield_value(rb_key)?;
197
347
  }
198
348
  Ok(())
199
349
  }
@@ -204,7 +354,7 @@ impl FstSet {
204
354
  let mut stream = self.inner.search(lev).into_stream();
205
355
  while let Some(key) = stream.next() {
206
356
  let rb_key = r.str_from_slice(key);
207
- let _: Value = block::yield_value(rb_key)?;
357
+ let _: Value = r.yield_value(rb_key)?;
208
358
  }
209
359
  Ok(())
210
360
  }
@@ -227,7 +377,7 @@ impl FstSetBuilder {
227
377
  }
228
378
 
229
379
  fn insert(&self, key: RString) -> Result<(), Error> {
230
- let key = unsafe { key.as_slice() }.to_vec();
380
+ let key = rstring_to_vec(key);
231
381
  let mut guard = self.inner.borrow_mut();
232
382
  let b = guard.as_mut().ok_or_else(|| err("builder already finished"))?;
233
383
  b.insert(&key).map_err(err)
@@ -242,7 +392,13 @@ impl FstSetBuilder {
242
392
  }
243
393
 
244
394
  // ---------------------------------------------------------------------------
245
- // Floor lookup (get_le): greatest key <= query
395
+ // Floor lookup (get_le): greatest key <= query.
396
+ //
397
+ // The upstream fst crate exposes `range().ge(...)` natively but not a floor
398
+ // operation, so this walks the FST manually: descend matching the query while
399
+ // recording, at each step, the rightmost transition strictly less than the
400
+ // current query byte. If exact match fails we backtrack to the most recent
401
+ // such transition and follow the rightmost path to a leaf, or return a final node on the matched prefix if one comes first.
246
402
  // ---------------------------------------------------------------------------
247
403
 
248
404
  struct Frame {
@@ -281,7 +437,7 @@ fn rightmost_to_leaf<D: AsRef<[u8]>>(
281
437
  let mut out = output;
282
438
  let mut suffix = Vec::new();
283
439
 
284
- while node.len() > 0 {
440
+ while !node.is_empty() {
285
441
  let last = node.len() - 1;
286
442
  let t = node.transition(last);
287
443
  suffix.push(t.inp);
@@ -292,6 +448,20 @@ fn rightmost_to_leaf<D: AsRef<[u8]>>(
292
448
  (suffix, out.cat(node.final_output()).value())
293
449
  }
294
450
 
451
+ fn rightmost_value<D: AsRef<[u8]>>(fst: &Fst<D>, addr: CompiledAddr, output: Output) -> u64 {
452
+ let mut node = fst.node(addr);
453
+ let mut out = output;
454
+
455
+ while !node.is_empty() {
456
+ let last = node.len() - 1;
457
+ let t = node.transition(last);
458
+ out = out.cat(t.out);
459
+ node = fst.node(t.addr);
460
+ }
461
+
462
+ out.cat(node.final_output()).value()
463
+ }
464
+
295
465
  fn floor_lookup<D: AsRef<[u8]>>(fst: &Fst<D>, key: &[u8]) -> Option<(Vec<u8>, u64)> {
296
466
  let root = fst.root();
297
467
 
@@ -355,6 +525,65 @@ fn floor_lookup<D: AsRef<[u8]>>(fst: &Fst<D>, key: &[u8]) -> Option<(Vec<u8>, u6
355
525
  None
356
526
  }
357
527
 
528
+ fn floor_value<D: AsRef<[u8]>>(fst: &Fst<D>, key: &[u8]) -> Option<u64> {
529
+ let root = fst.root();
530
+
531
+ if key.is_empty() {
532
+ return if root.is_final() {
533
+ Some(root.final_output().value())
534
+ } else {
535
+ None
536
+ };
537
+ }
538
+
539
+ let mut node = root;
540
+ let mut output = Output::zero();
541
+ let mut stack: Vec<Frame> = Vec::with_capacity(key.len());
542
+ let mut matched: usize = 0;
543
+
544
+ for &byte in key.iter() {
545
+ let lesser = find_max_lesser(&node, byte);
546
+
547
+ stack.push(Frame {
548
+ node_addr: node.addr(),
549
+ output,
550
+ prefix_len: matched,
551
+ max_lesser_idx: lesser,
552
+ is_final: node.is_final(),
553
+ final_value: output.cat(node.final_output()).value(),
554
+ });
555
+
556
+ match node.find_input(byte) {
557
+ Some(idx) => {
558
+ let t = node.transition(idx);
559
+ output = output.cat(t.out);
560
+ node = fst.node(t.addr);
561
+ matched += 1;
562
+ }
563
+ None => break,
564
+ }
565
+ }
566
+
567
+ if matched == key.len() && node.is_final() {
568
+ return Some(output.cat(node.final_output()).value());
569
+ }
570
+
571
+ while let Some(frame) = stack.pop() {
572
+ if let Some(j) = frame.max_lesser_idx {
573
+ let frame_node = fst.node(frame.node_addr);
574
+ let t = frame_node.transition(j);
575
+ let branch_output = frame.output.cat(t.out);
576
+ return Some(rightmost_value(fst, t.addr, branch_output));
577
+ }
578
+
579
+ if frame.is_final {
580
+ return Some(frame.final_value);
581
+ }
582
+ }
583
+
584
+ None
585
+ }
586
+
358
587
  // ---------------------------------------------------------------------------
359
588
  // Init
360
589
  // ---------------------------------------------------------------------------
@@ -366,6 +595,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
366
595
  let map_class = module.define_class("Map", ruby.class_object())?;
367
596
  map_class.define_singleton_method("new", function!(FstMap::new, 1))?;
368
597
  map_class.define_singleton_method("from_path", function!(FstMap::from_path, 1))?;
598
+ map_class.define_singleton_method("from_path_mmap", function!(FstMap::from_path_mmap, 1))?;
369
599
  map_class.define_method("get", method!(FstMap::get, 1))?;
370
600
  map_class.define_method("[]", method!(FstMap::get, 1))?;
371
601
  map_class.define_method("contains?", method!(FstMap::contains, 1))?;
@@ -375,8 +605,12 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
375
605
  map_class.define_method("to_bytes", method!(FstMap::to_bytes, 0))?;
376
606
  map_class.define_method("save", method!(FstMap::save, 1))?;
377
607
  map_class.define_method("get_le", method!(FstMap::get_le, 1))?;
608
+ map_class.define_method("get_le_value", method!(FstMap::get_le_value, 1))?;
378
609
  map_class.define_method("get_ge", method!(FstMap::get_ge, 1))?;
610
+ map_class.define_method("get_ge_value", method!(FstMap::get_ge_value, 1))?;
379
611
  map_class.define_method("each", method!(FstMap::each, 0))?;
612
+ map_class.define_method("range", method!(FstMap::range, 2))?;
613
+ map_class.define_method("starts_with", method!(FstMap::starts_with, 1))?;
380
614
  map_class.define_method("search_levenshtein", method!(FstMap::search_levenshtein, 2))?;
381
615
 
382
616
  let map_builder = module.define_class("MapBuilder", ruby.class_object())?;
@@ -387,6 +621,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
387
621
  let set_class = module.define_class("Set", ruby.class_object())?;
388
622
  set_class.define_singleton_method("new", function!(FstSet::new, 1))?;
389
623
  set_class.define_singleton_method("from_path", function!(FstSet::from_path, 1))?;
624
+ set_class.define_singleton_method("from_path_mmap", function!(FstSet::from_path_mmap, 1))?;
390
625
  set_class.define_method("contains?", method!(FstSet::contains, 1))?;
391
626
  set_class.define_method("length", method!(FstSet::len, 0))?;
392
627
  set_class.define_method("size", method!(FstSet::len, 0))?;
@@ -394,6 +629,8 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
394
629
  set_class.define_method("to_bytes", method!(FstSet::to_bytes, 0))?;
395
630
  set_class.define_method("save", method!(FstSet::save, 1))?;
396
631
  set_class.define_method("each", method!(FstSet::each, 0))?;
632
+ set_class.define_method("range", method!(FstSet::range, 2))?;
633
+ set_class.define_method("starts_with", method!(FstSet::starts_with, 1))?;
397
634
  set_class.define_method("search_levenshtein", method!(FstSet::search_levenshtein, 2))?;
398
635
 
399
636
  let set_builder = module.define_class("SetBuilder", ruby.class_object())?;
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RubyFst
4
- VERSION = '0.1.0'
4
+ VERSION = '0.2.0'
5
5
  end
data/lib/ruby_fst.rb CHANGED
@@ -4,11 +4,37 @@ require_relative 'ruby_fst/version'
4
4
  require_relative 'ruby_fst/ruby_fst'
5
5
 
6
6
  module RubyFst
7
+ module RangeQuery
8
+ def range(ge: nil, le: nil, &block)
9
+ return enum_for(:range, ge:, le:) unless block
10
+
11
+ _range(ge, le, &block)
12
+ end
13
+
14
+ def starts_with(prefix, &block)
15
+ return enum_for(:starts_with, prefix) unless block
16
+
17
+ _starts_with(prefix, &block)
18
+ end
19
+ end
20
+
7
21
  class Map
8
22
  include Enumerable
23
+
24
+ alias _range range
25
+ alias _starts_with starts_with
26
+ private :_range, :_starts_with
27
+
28
+ prepend RangeQuery
9
29
  end
10
30
 
11
31
  class Set
12
32
  include Enumerable
33
+
34
+ alias _range range
35
+ alias _starts_with starts_with
36
+ private :_range, :_starts_with
37
+
38
+ prepend RangeQuery
13
39
  end
14
40
  end
data/ruby_fst.gemspec CHANGED
@@ -11,11 +11,11 @@ Gem::Specification.new do |spec|
11
11
  spec.description = 'Finite state transducer backed ordered sets and maps via the Rust fst crate by BurntSushi'
12
12
  spec.homepage = 'https://github.com/dsablic/ruby-fst'
13
13
  spec.license = 'MIT'
14
- spec.required_ruby_version = '>= 3.0'
14
+ spec.required_ruby_version = '>= 3.2'
15
15
 
16
16
  spec.metadata['rubygems_mfa_required'] = 'true'
17
17
  spec.metadata['source_code_uri'] = 'https://github.com/dsablic/ruby-fst'
18
- spec.metadata['changelog_uri'] = 'https://github.com/dsablic/ruby-fst/releases'
18
+ spec.metadata['changelog_uri'] = 'https://github.com/dsablic/ruby-fst/blob/main/CHANGELOG.md'
19
19
 
20
20
  spec.files = Dir.chdir(__dir__) do
21
21
  `git ls-files -z`.split("\x0").reject { |f| f.start_with?('test/', '.git') }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-fst
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Denis Sablic
@@ -32,6 +32,8 @@ extensions:
32
32
  - ext/ruby_fst/extconf.rb
33
33
  extra_rdoc_files: []
34
34
  files:
35
+ - ".rubocop.yml"
36
+ - CHANGELOG.md
35
37
  - Cargo.lock
36
38
  - Cargo.toml
37
39
  - Gemfile
@@ -50,7 +52,7 @@ licenses:
50
52
  metadata:
51
53
  rubygems_mfa_required: 'true'
52
54
  source_code_uri: https://github.com/dsablic/ruby-fst
53
- changelog_uri: https://github.com/dsablic/ruby-fst/releases
55
+ changelog_uri: https://github.com/dsablic/ruby-fst/blob/main/CHANGELOG.md
54
56
  rdoc_options: []
55
57
  require_paths:
56
58
  - lib
@@ -58,7 +60,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
58
60
  requirements:
59
61
  - - ">="
60
62
  - !ruby/object:Gem::Version
61
- version: '3.0'
63
+ version: '3.2'
62
64
  required_rubygems_version: !ruby/object:Gem::Requirement
63
65
  requirements:
64
66
  - - ">="