phrasekit 0.2.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ require "shellwords"
2
+
3
module PhraseKit
  # Ruby front end for the external +phrasekit_tag+ executable.
  #
  # The tagger shells out to the binary with three positional arguments
  # (input path, JSON config path, output path) and turns the summary the
  # binary prints into a Hash of counters.
  class Tagger
    # Raised when the binary cannot be found, the artifact paths are
    # incomplete, or the tagging command exits unsuccessfully.
    class Error < StandardError; end

    # File name used for each artifact when it is resolved from +artifacts_dir+.
    ARTIFACT_FILES = {
      automaton_path: "phrases.daac",
      payloads_path: "payloads.bin",
      manifest_path: "manifest.json",
      vocab_path: "vocab.json"
    }.freeze

    # Locations (relative to the gem root) searched for the executable, in order.
    BINARY_CANDIDATES = [
      "ext/phrasekit/target/release/phrasekit_tag",
      "ext/phrasekit/target/debug/phrasekit_tag",
      "bin/phrasekit_tag"
    ].freeze

    # Summary lines recognised in the command output: result key => [pattern, cast].
    STAT_PATTERNS = {
      documents: [/Documents:\s+(\d+)/, :to_i],
      total_spans: [/Total spans:\s+(\d+)/, :to_i],
      docs_with_spans: [/Documents with spans:\s+(\d+)/, :to_i],
      avg_spans_per_doc: [/Avg spans per document:\s+([\d.]+)/, :to_f]
    }.freeze

    private_constant :ARTIFACT_FILES, :BINARY_CANDIDATES, :STAT_PATTERNS

    class << self
      # Runs +phrasekit_tag+ over +input_path+ and writes the result to +output_path+.
      #
      # When +config_path+ is nil a temporary JSON config is generated from the
      # remaining options and removed again before this method returns, whether
      # it returns normally or raises.
      #
      # @param input_path [#to_s] file handed to the binary as input
      # @param output_path [#to_s] file the binary is asked to write
      # @param artifacts_dir [String, nil] directory used to fill in any artifact
      #   path that was not given explicitly
      # @param automaton_path [#to_s, nil] explicit automaton artifact path
      # @param payloads_path [#to_s, nil] explicit payloads artifact path
      # @param manifest_path [#to_s, nil] explicit manifest artifact path
      # @param vocab_path [#to_s, nil] explicit vocabulary artifact path
      # @param policy [#to_s] written to the config as "policy"
      # @param max_spans [Integer] written to the config as "max_spans"
      # @param label [#to_s] written to the config as "label"
      # @param config_path [#to_s, nil] pre-built config file; when given, all
      #   artifact/policy options above are ignored
      # @return [Hash{Symbol => Numeric}] counters parsed from the command output
      #   (only the ones that were present)
      # @raise [Error] if the binary is missing, artifact paths are incomplete,
      #   or the command exits with a non-zero status
      def tag(
        input_path:,
        output_path:,
        artifacts_dir: nil,
        automaton_path: nil,
        payloads_path: nil,
        manifest_path: nil,
        vocab_path: nil,
        policy: :leftmost_longest,
        max_spans: 100,
        label: "PHRASE",
        config_path: nil
      )
        binary_path = find_binary
        config_file = nil

        if config_path.nil?
          explicit = {
            automaton_path: automaton_path,
            payloads_path: payloads_path,
            manifest_path: manifest_path,
            vocab_path: vocab_path
          }
          config_file = write_config(
            resolve_artifacts(artifacts_dir, explicit),
            policy: policy,
            max_spans: max_spans,
            label: label
          )
          config_path = config_file.path
        end

        cmd = [binary_path, input_path.to_s, config_path.to_s, output_path.to_s]
        # stderr is merged into stdout so failures carry the binary's diagnostics.
        output = `#{cmd.shelljoin} 2>&1`
        raise Error, "Tagging failed: #{output}" unless $?.success?

        parse_stats(output)
      ensure
        # Runs on every exit path, including unexpected exceptions, so the
        # generated config never outlives the call.
        config_file&.close!
      end

      private

      # Fills in missing artifact paths from +artifacts_dir+ and verifies that
      # all four are present.
      def resolve_artifacts(artifacts_dir, explicit)
        paths = ARTIFACT_FILES.to_h do |key, file_name|
          [key, explicit[key] || (artifacts_dir && File.join(artifacts_dir, file_name))]
        end
        return paths if paths.values.all?

        raise Error, "Must provide either artifacts_dir or all artifact paths"
      end

      # Writes the JSON config to a Tempfile and returns it still open; the
      # caller owns the file and must +close!+ it.
      def write_config(paths, policy:, max_spans:, label:)
        require "tempfile"
        require "json"

        file = Tempfile.new(["tag_config", ".json"])
        begin
          settings = paths.transform_values(&:to_s).merge(
            policy: policy.to_s,
            max_spans: max_spans,
            label: label.to_s
          )
          file.write(JSON.generate(settings))
          # Flush so the child process sees the full content while we keep the handle.
          file.flush
        rescue StandardError
          file.close!
          raise
        end
        file
      end

      # Returns the first candidate that is an executable regular file.
      def find_binary
        base_dir = File.expand_path("../..", __dir__)

        binary = BINARY_CANDIDATES
          .map { |relative| File.join(base_dir, relative) }
          .find { |path| File.file?(path) && File.executable?(path) }
        return binary if binary

        raise Error, "phrasekit_tag binary not found. Run: cargo build --release --bin phrasekit_tag --manifest-path ext/phrasekit/Cargo.toml"
      end

      # Extracts the known counters from +output+. When a summary line occurs
      # more than once the last occurrence wins; absent lines yield no key.
      def parse_stats(output)
        STAT_PATTERNS.each_with_object({}) do |(key, (pattern, cast)), stats|
          captured = output.scan(pattern).last
          stats[key] = captured.first.public_send(cast) if captured
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module PhraseKit
  # Gem version string. Frozen so the shared constant cannot be mutated in
  # place by callers.
  VERSION = "0.2.0".freeze
end
data/lib/phrasekit.rb ADDED
@@ -0,0 +1,100 @@
1
+ require "phrasekit/version"
2
+
3
+ # Load the compiled Rust extension. Precompiled (platform) gems install it into a
4
+ # Ruby-ABI-versioned subdir (lib/phrasekit/<major.minor>/phrasekit.{so,bundle}) so a
5
+ # single fat gem can carry a binary per Ruby version; source/dev builds place it flat
6
+ # at lib/phrasekit/phrasekit.{so,bundle}. Try the versioned path first, fall back to
7
+ # the flat one. Resolution goes through $LOAD_PATH (`require`, never `require_relative`)
8
+ # because RubyGems installs native extensions outside the gem's lib/ dir.
9
+ begin
10
+ RUBY_VERSION =~ /(\d+\.\d+)/
11
+ require "phrasekit/#{Regexp.last_match(1)}/phrasekit"
12
+ rescue LoadError
13
+ require "phrasekit/phrasekit"
14
+ end
15
+
16
+ require "phrasekit/miner"
17
+ require "phrasekit/scorer"
18
+ require "phrasekit/tagger"
19
+
20
module PhraseKit
  # Raised for every failure reported through this module's public API.
  class Error < StandardError; end

  class << self
    # @return [Hash, nil] the vocabulary loaded by {load!} with keys
    #   +:tokens+, +:special_tokens+ and +:separator_id+, or nil when no
    #   vocabulary was requested
    attr_reader :vocabulary

    # Loads the matcher artifacts and, optionally, a JSON vocabulary.
    #
    # Module state is replaced only after everything loaded successfully, so a
    # failed call neither leaves a half-initialised matcher behind nor discards
    # a previously loaded one.
    #
    # @param automaton_path [#to_s] passed to the native matcher
    # @param payloads_path [#to_s] passed to the native matcher
    # @param manifest_path [#to_s] passed to the native matcher
    # @param vocab_path [String, nil] JSON file read for the vocabulary
    # @return [Hash, nil] the new vocabulary (nil without +vocab_path+)
    # @raise [Error] if the native load raises RuntimeError or the vocabulary
    #   cannot be read or parsed
    def load!(automaton_path:, payloads_path:, manifest_path:, vocab_path: nil)
      matcher = NativeMatcher.new
      wrap_native_errors do
        matcher.load(automaton_path.to_s, payloads_path.to_s, manifest_path.to_s)
      end
      vocab = vocab_path ? read_vocabulary(vocab_path) : nil

      # Commit both pieces together now that nothing can fail any more.
      @matcher = matcher
      @vocabulary = vocab
    end

    # Matches a sequence of token ids.
    #
    # @param token_ids [Array] forwarded unchanged to the native matcher
    # @param policy [#to_s] forwarded as a String
    # @param max [Integer] forwarded unchanged
    # @return [Array<Hash>] native results with keys converted to Symbols
    # @raise [Error] if {load!} has not succeeded yet
    def match_tokens(token_ids:, policy: :leftmost_longest, max: 32)
      loaded_matcher
        .match_tokens(token_ids, policy.to_s, max)
        .map { |match| symbolize(match) }
    end

    # Maps tokens to ids through the loaded vocabulary. Tokens are stringified
    # and downcased before lookup; unknown tokens map to the "<UNK>" id.
    #
    # @param tokens [Enumerable<#to_s>]
    # @return [Array] one id per token
    # @raise [Error] if no vocabulary is loaded
    def encode_tokens(tokens)
      vocab = loaded_vocabulary
      unk_id = vocab[:special_tokens]["<UNK>"]
      ids = vocab[:tokens]

      tokens.map { |token| ids[token.to_s.downcase] || unk_id }
    end

    # Encodes +tokens+ with the vocabulary and matches the resulting ids.
    #
    # @raise [Error] if the matcher or the vocabulary is not loaded
    def match_text_tokens(tokens:, policy: :leftmost_longest, max: 32)
      # Check the matcher first so its error takes precedence over the
      # vocabulary error raised by encode_tokens.
      loaded_matcher
      match_tokens(token_ids: encode_tokens(tokens), policy: policy, max: max)
    end

    # @return [Hash] native statistics with Symbol keys; +:loaded_at+ is
    #   converted from a millisecond value to a Time
    # @raise [Error] if not loaded or the native call raises RuntimeError
    def stats
      matcher = loaded_matcher
      wrap_native_errors do
        info = symbolize(matcher.stats)
        info[:loaded_at] = Time.at(info[:loaded_at] / 1000.0)
        info
      end
    end

    # @return whatever the native matcher's healthcheck returns
    # @raise [Error] if not loaded or the native call raises RuntimeError
    def healthcheck
      matcher = loaded_matcher
      wrap_native_errors { matcher.healthcheck }
    end

    private

    # Returns the loaded matcher or raises the "not loaded" error.
    def loaded_matcher
      @matcher || raise(Error, "PhraseKit not loaded. Call PhraseKit.load! first")
    end

    # Returns the loaded vocabulary or raises the "no vocabulary" error.
    def loaded_vocabulary
      @vocabulary || raise(Error, "Vocabulary not loaded. Call PhraseKit.load! with vocab_path")
    end

    # Re-raises RuntimeError coming out of the block as PhraseKit::Error.
    def wrap_native_errors
      yield
    rescue RuntimeError => e
      raise Error, e.message
    end

    # Reads the vocabulary JSON and keeps only the three sections used here.
    def read_vocabulary(vocab_path)
      require "json"
      data = JSON.parse(File.read(vocab_path))
      {
        tokens: data["tokens"],
        special_tokens: data["special_tokens"],
        separator_id: data["separator_id"]
      }
    rescue StandardError => e
      raise Error, "Failed to load vocabulary: #{e.message}"
    end

    # Copy of +hash+ with keys converted to Symbols where possible; keys that
    # cannot be converted are kept as they are.
    def symbolize(hash)
      hash.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
    end
  end
end
95
+
96
class Hash
  # Fallback used only when nothing else has already defined
  # Hash#symbolize_keys.
  unless method_defined?(:symbolize_keys)
    # Returns a new Hash whose keys are converted with +to_sym+; a key whose
    # conversion raises is kept unchanged.
    def symbolize_keys
      transform_keys do |key|
        key.to_sym
      rescue StandardError
        key
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ # SpellKit stub for integration example
2
+ # This will be replaced by the actual spellkit gem (version 0.1.1+)
3
+
4
# Set is only autoloaded from Ruby 3.2 on; require it explicitly for older Rubies.
require "set"

# Stand-in for the spellkit gem with a small hard-coded dictionary.
module SpellKit
  # Raised by {healthcheck} when the stub has not been loaded.
  class Error < StandardError; end

  # Terms treated as protected when +load!+ is not given its own list.
  DEFAULT_PROTECTED_TERMS = %w[CDK10 IL6 IL-6 BRCA1 BRCA2 TP53 EGFR].freeze

  # The stub "dictionary": lowercase terms considered correctly spelled.
  KNOWN_TERMS = %w[
    hello world sequence gene the with to need i lysis protein oligo rat buffer western blot
  ].freeze

  # Known misspellings (lowercase) with their correction and reported frequency.
  STUB_FIXES = {
    "sequnce" => {term: "sequence", freq: 50000},
    "helllo" => {term: "hello", freq: 100000},
    "lyssis" => {term: "lysis", freq: 12345},
    "protien" => {term: "protein", freq: 54321}
  }.freeze

  private_constant :DEFAULT_PROTECTED_TERMS, :KNOWN_TERMS, :STUB_FIXES

  class << self
    # @return [Hash, nil] static statistics set by {load!}; nil before loading
    attr_reader :stats

    # Marks the stub as loaded and prints a notice to stdout.
    #
    # Only +edit_distance+ and +protected_terms+ are stored; +dictionary+,
    # +frequency_threshold+ and +skip_patterns+ are accepted but not used.
    #
    # @param protected_terms [Enumerable<String>, nil] terms that are never
    #   corrected (case-sensitive); a built-in list is used when nil
    # @return [nil]
    def load!(dictionary:, edit_distance: 1, frequency_threshold: 0, protected_terms: nil, skip_patterns: {})
      @loaded = true
      @edit_distance = edit_distance
      @protected_terms = Set.new(protected_terms || DEFAULT_PROTECTED_TERMS)
      @stats = {
        version: "spellkit-stub-0.1.1",
        loaded_at: Time.now,
        tokens_corrected: 0,
        p50_us: 20,
        p95_us: 60
      }
      puts "SpellKit loaded (stub implementation)"
    end

    # Suggestions for a known misspelling (looked up case-insensitively).
    #
    # @param term [String]
    # @param max [Integer] upper bound on the number of suggestions returned
    # @return [Array<Hash>] hashes with "term", "distance" and "freq" keys;
    #   empty when unloaded, when +term+ is unknown, or when +max+ is not positive
    def suggestions(term, max = 5)
      return [] unless @loaded

      fix = STUB_FIXES[term.downcase]
      return [] unless fix && max.positive?

      # Build fresh objects so callers can mutate the result freely.
      [{"term" => fix[:term].dup, "distance" => 1, "freq" => fix[:freq]}]
    end

    # @return [Boolean] true when +term+ is protected (exact match) or is in
    #   the stub dictionary (case-insensitive); false when unloaded
    def correct?(term)
      return false unless @loaded

      @protected_terms.include?(term) || KNOWN_TERMS.include?(term.downcase)
    end

    # @return [String] the correction for a known misspelling; +term+ itself
    #   when it is protected, unknown, or the stub is not loaded
    def correct(term)
      return term unless @loaded
      return term if @protected_terms.include?(term)

      fix = STUB_FIXES[term.downcase]
      fix ? fix[:term].dup : term
    end

    # Applies {correct} to each token; returns +tokens+ untouched when unloaded.
    def correct_tokens(tokens)
      return tokens unless @loaded

      tokens.map { |token| correct(token) }
    end

    # @return [true]
    # @raise [Error] if {load!} has not been called
    def healthcheck
      raise Error, "SpellKit not loaded. Call SpellKit.load! first" unless @loaded

      true
    end
  end
end
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: phrasekit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: x86_64-linux
6
+ authors:
7
+ - PhraseKit Contributors
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-06-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.2'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.22'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.22'
83
+ - !ruby/object:Gem::Dependency
84
+ name: spellkit
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.1
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.1.1
97
+ description: High-performance phrase matching using Aho-Corasick automaton with Ruby
98
+ bindings via Magnus
99
+ email:
100
+ - ''
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - README.md
106
+ - ext/phrasekit/Cargo.toml
107
+ - ext/phrasekit/extconf.rb
108
+ - ext/phrasekit/src/bin/fixture_builder.rs
109
+ - ext/phrasekit/src/bin/phrasekit_build.rs
110
+ - ext/phrasekit/src/bin/phrasekit_mine.rs
111
+ - ext/phrasekit/src/bin/phrasekit_score.rs
112
+ - ext/phrasekit/src/bin/phrasekit_tag.rs
113
+ - ext/phrasekit/src/lib.rs
114
+ - ext/phrasekit/src/manifest.rs
115
+ - ext/phrasekit/src/matcher.rs
116
+ - ext/phrasekit/src/payload.rs
117
+ - ext/phrasekit/src/policy.rs
118
+ - lib/phrasekit.rb
119
+ - lib/phrasekit/3.1/phrasekit.so
120
+ - lib/phrasekit/3.2/phrasekit.so
121
+ - lib/phrasekit/3.3/phrasekit.so
122
+ - lib/phrasekit/3.4/phrasekit.so
123
+ - lib/phrasekit/miner.rb
124
+ - lib/phrasekit/scorer.rb
125
+ - lib/phrasekit/tagger.rb
126
+ - lib/phrasekit/version.rb
127
+ - lib/spellkit_stub.rb
128
+ homepage: https://github.com/scientist-labs/phrasekit
129
+ licenses:
130
+ - MIT
131
+ metadata:
132
+ homepage_uri: https://github.com/scientist-labs/phrasekit
133
+ source_code_uri: https://github.com/scientist-labs/phrasekit
134
+ post_install_message:
135
+ rdoc_options: []
136
+ require_paths:
137
+ - lib
138
+ required_ruby_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: '3.1'
143
+ - - "<"
144
+ - !ruby/object:Gem::Version
145
+ version: 3.5.dev
146
+ required_rubygems_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: 3.0.0
151
+ requirements: []
152
+ rubygems_version: 3.5.23
153
+ signing_key:
154
+ specification_version: 4
155
+ summary: Ultra-fast deterministic phrase matcher
156
+ test_files: []