phrasekit 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ require "shellwords"
2
+
3
require "json"
require "open3"
require "tempfile"

module PhraseKit
  # Ruby front end for the external `phrasekit_tag` binary.
  #
  # The binary is invoked as `phrasekit_tag INPUT CONFIG OUTPUT`; its combined
  # stdout/stderr is captured and scanned for summary statistics.
  class Tagger
    # Raised when arguments are incomplete, the binary cannot be found or
    # started, or the binary exits with a non-zero status.
    class Error < StandardError; end

    # File names looked up inside `artifacts_dir` for every artifact path
    # that was not passed explicitly. The key order is also the key order of
    # the generated JSON config.
    ARTIFACT_FILENAMES = {
      automaton_path: "phrases.daac",
      payloads_path: "payloads.bin",
      manifest_path: "manifest.json",
      vocab_path: "vocab.json"
    }.freeze

    # Result key => [pattern searched in the binary's output, String
    # conversion applied to the captured number].
    STAT_PATTERNS = {
      documents: [/Documents:\s+(\d+)/, :to_i],
      total_spans: [/Total spans:\s+(\d+)/, :to_i],
      docs_with_spans: [/Documents with spans:\s+(\d+)/, :to_i],
      avg_spans_per_doc: [/Avg spans per document:\s+([\d.]+)/, :to_f]
    }.freeze

    class << self
      # Runs the tagger binary over `input_path`, writing to `output_path`.
      #
      # When `config_path` is nil a temporary JSON config is generated from
      # the artifact paths (explicit ones win over `artifacts_dir` defaults)
      # plus `policy`, `max_spans` and `label`; the temp file is always
      # removed before this method returns or raises.
      #
      # @param input_path [#to_s] passed to the binary as its first argument
      # @param output_path [#to_s] passed to the binary as its third argument
      # @param artifacts_dir [String, nil] directory holding the default-named artifacts
      # @param automaton_path [#to_s, nil]
      # @param payloads_path [#to_s, nil]
      # @param manifest_path [#to_s, nil]
      # @param vocab_path [#to_s, nil]
      # @param policy [#to_s] written to the config as a string
      # @param max_spans [Integer] written to the config unchanged
      # @param label [#to_s] written to the config as a string
      # @param config_path [#to_s, nil] existing config file; skips config generation
      # @return [Hash{Symbol => Numeric}] whichever of :documents, :total_spans,
      #   :docs_with_spans, :avg_spans_per_doc were found in the binary's output
      # @raise [Error] on missing binary, incomplete artifact paths, or a failed run
      def tag(
        input_path:,
        output_path:,
        artifacts_dir: nil,
        automaton_path: nil,
        payloads_path: nil,
        manifest_path: nil,
        vocab_path: nil,
        policy: :leftmost_longest,
        max_spans: 100,
        label: "PHRASE",
        config_path: nil
      )
        binary_path = find_binary
        config_file = nil

        if config_path.nil?
          artifact_paths = resolve_artifact_paths(
            artifacts_dir,
            {
              automaton_path: automaton_path,
              payloads_path: payloads_path,
              manifest_path: manifest_path,
              vocab_path: vocab_path
            }
          )
          config_file = write_config(artifact_paths, policy: policy, max_spans: max_spans, label: label)
          config_path = config_file.path
        end

        output, status = run_binary(binary_path, input_path, config_path, output_path)
        raise Error, "Tagging failed: #{output}" unless status.success?

        parse_stats(output)
      ensure
        # Runs on success and on every error path, so the generated config
        # never outlives the call.
        config_file&.close!
      end

      private

      # Fills in missing artifact paths from `artifacts_dir` and returns them
      # as strings; raises Error if any path is still unresolved.
      def resolve_artifact_paths(artifacts_dir, explicit)
        resolved = ARTIFACT_FILENAMES.to_h do |key, filename|
          [key, explicit[key] || (artifacts_dir && File.join(artifacts_dir, filename))]
        end

        unless resolved.values.all?
          raise Error, "Must provide either artifacts_dir or all artifact paths"
        end

        resolved.transform_values(&:to_s)
      end

      # Writes the JSON config to a new Tempfile and returns it, flushed but
      # still open. The caller is responsible for `close!`.
      def write_config(artifact_paths, policy:, max_spans:, label:)
        file = Tempfile.new(["tag_config", ".json"])
        file.write(JSON.generate(artifact_paths.merge(
          policy: policy.to_s,
          max_spans: max_spans,
          label: label.to_s
        )))
        file.flush
        file
      rescue StandardError
        file&.close!
        raise
      end

      # Executes the binary with an argv list (no shell involved) and returns
      # [combined stdout+stderr, Process::Status].
      def run_binary(binary_path, input_path, config_path, output_path)
        Open3.capture2e(binary_path, input_path.to_s, config_path.to_s, output_path.to_s)
      rescue SystemCallError => e
        raise Error, "Tagging failed: #{e.message}"
      end

      # Returns the first existing, executable candidate binary, looking two
      # directories above this file under release, debug, then bin.
      def find_binary
        base_dir = File.expand_path("../..", __dir__)

        candidates = [
          File.join(base_dir, "ext/phrasekit/target/release/phrasekit_tag"),
          File.join(base_dir, "ext/phrasekit/target/debug/phrasekit_tag"),
          File.join(base_dir, "bin/phrasekit_tag")
        ]

        found = candidates.find { |binary| File.exist?(binary) && File.executable?(binary) }
        return found if found

        raise Error, "phrasekit_tag binary not found. Run: cargo build --release --bin phrasekit_tag --manifest-path ext/phrasekit/Cargo.toml"
      end

      # Extracts the summary numbers from the binary's output. When a line
      # appears more than once the last occurrence wins; absent lines produce
      # no key.
      def parse_stats(output)
        # scrub so stray invalid bytes in the output cannot make the regex scan raise
        text = output.to_s.scrub

        STAT_PATTERNS.each_with_object({}) do |(key, (pattern, converter)), stats|
          captured = text.scan(pattern).last&.first
          stats[key] = captured.public_send(converter) if captured
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module PhraseKit
  # Gem release version string. Frozen so the shared constant cannot be
  # mutated in place.
  VERSION = "0.2.0".freeze
end
data/lib/phrasekit.rb ADDED
@@ -0,0 +1,100 @@
1
+ require "phrasekit/version"
2
+
3
+ # Load the compiled Rust extension. Precompiled (platform) gems install it into a
4
+ # Ruby-ABI-versioned subdir (lib/phrasekit/<major.minor>/phrasekit.{so,bundle}) so a
5
+ # single fat gem can carry a binary per Ruby version; source/dev builds place it flat
6
+ # at lib/phrasekit/phrasekit.{so,bundle}. Try the versioned path first, fall back to
7
+ # the flat one. Resolution goes through $LOAD_PATH (`require`, never `require_relative`)
8
+ # because RubyGems installs native extensions outside the gem's lib/ dir.
9
+ begin
10
+ RUBY_VERSION =~ /(\d+\.\d+)/
11
+ require "phrasekit/#{Regexp.last_match(1)}/phrasekit"
12
+ rescue LoadError
13
+ require "phrasekit/phrasekit"
14
+ end
15
+
16
+ require "phrasekit/miner"
17
+ require "phrasekit/scorer"
18
+ require "phrasekit/tagger"
19
+
20
module PhraseKit
  # Error raised by the module-level API; native RuntimeErrors and vocabulary
  # loading failures are re-raised as this class.
  class Error < StandardError; end

  class << self
    # @return [Hash, nil] { tokens:, special_tokens:, separator_id: } from the
    #   last successful load! that was given a vocab_path, otherwise nil
    attr_reader :vocabulary

    # Loads the matcher artifacts and, optionally, a JSON vocabulary.
    #
    # The new matcher and vocabulary are published together only after both
    # loaded successfully. If anything fails, the module keeps whatever state
    # it had before the call (not loaded, or the previous successful load)
    # instead of exposing a half-initialised matcher.
    #
    # @param automaton_path [#to_s]
    # @param payloads_path [#to_s]
    # @param manifest_path [#to_s]
    # @param vocab_path [String, nil] JSON file with "tokens", "special_tokens"
    #   and "separator_id" keys
    # @return [Hash, nil] the loaded vocabulary, or nil when no vocab_path was given
    # @raise [Error] when the native load raises RuntimeError or the vocabulary
    #   cannot be read or parsed
    def load!(automaton_path:, payloads_path:, manifest_path:, vocab_path: nil)
      matcher = NativeMatcher.new
      begin
        matcher.load(automaton_path.to_s, payloads_path.to_s, manifest_path.to_s)
      rescue RuntimeError => e
        raise Error, e.message
      end

      vocabulary = vocab_path ? read_vocabulary(vocab_path) : nil

      @matcher = matcher
      @vocabulary = vocabulary
    end

    # Matches a sequence of token ids against the loaded automaton.
    #
    # @param token_ids [Array] passed to the native matcher unchanged
    # @param policy [#to_s] passed to the native matcher as a string
    # @param max [Integer] passed to the native matcher unchanged
    # @return [Array<Hash>] native match hashes with symbolized keys
    # @raise [Error] when load! has not succeeded yet
    def match_tokens(token_ids:, policy: :leftmost_longest, max: 32)
      raise Error, "PhraseKit not loaded. Call PhraseKit.load! first" unless @matcher
      @matcher.match_tokens(token_ids, policy.to_s, max).map(&:symbolize_keys)
    end

    # Maps tokens to ids using the loaded vocabulary. Each token is
    # stringified and downcased before lookup; unknown tokens map to the id
    # stored under the "<UNK>" special token.
    #
    # @param tokens [Enumerable<#to_s>]
    # @return [Array] one id per token
    # @raise [Error] when no vocabulary is loaded
    def encode_tokens(tokens)
      raise Error, "Vocabulary not loaded. Call PhraseKit.load! with vocab_path" unless @vocabulary

      unk_id = @vocabulary[:special_tokens]["<UNK>"]
      tokens.map do |token|
        normalized = token.to_s.downcase
        @vocabulary[:tokens][normalized] || unk_id
      end
    end

    # Encodes text tokens with the vocabulary, then matches the resulting ids.
    #
    # @param tokens [Enumerable<#to_s>]
    # @param policy [#to_s] forwarded to match_tokens
    # @param max [Integer] forwarded to match_tokens
    # @return [Array<Hash>] see match_tokens
    # @raise [Error] when the matcher or the vocabulary is not loaded
    def match_text_tokens(tokens:, policy: :leftmost_longest, max: 32)
      raise Error, "PhraseKit not loaded. Call PhraseKit.load! first" unless @matcher
      raise Error, "Vocabulary not loaded. Call PhraseKit.load! with vocab_path" unless @vocabulary

      token_ids = encode_tokens(tokens)
      match_tokens(token_ids: token_ids, policy: policy, max: max)
    end

    # Returns the native matcher's stats with symbolized keys and :loaded_at
    # converted to a Time.
    #
    # @return [Hash{Symbol => Object}]
    # @raise [Error] when not loaded, or when the native call raises RuntimeError
    def stats
      raise Error, "PhraseKit not loaded. Call PhraseKit.load! first" unless @matcher
      begin
        stats_hash = @matcher.stats.symbolize_keys
        # The division by 1000.0 treats the native value as milliseconds;
        # presumably epoch milliseconds -- confirm against the extension.
        stats_hash[:loaded_at] = Time.at(stats_hash[:loaded_at] / 1000.0)
        stats_hash
      rescue RuntimeError => e
        raise Error, e.message
      end
    end

    # Delegates to the native matcher's healthcheck.
    #
    # @return [Object] whatever the native healthcheck returns
    # @raise [Error] when not loaded, or when the native call raises RuntimeError
    def healthcheck
      raise Error, "PhraseKit not loaded. Call PhraseKit.load! first" unless @matcher
      begin
        @matcher.healthcheck
      rescue RuntimeError => e
        raise Error, e.message
      end
    end

    private

    # Reads and parses the vocabulary JSON; any StandardError (missing file,
    # invalid JSON, unexpected top-level type) is re-raised as Error.
    def read_vocabulary(vocab_path)
      require "json" # loaded lazily: only needed when a vocabulary is used
      vocab_data = JSON.parse(File.read(vocab_path))
      {
        tokens: vocab_data["tokens"],
        special_tokens: vocab_data["special_tokens"],
        separator_id: vocab_data["separator_id"]
      }
    rescue StandardError => e
      raise Error, "Failed to load vocabulary: #{e.message}"
    end
  end
end
95
+
96
class Hash
  # Fallback for Hash#symbolize_keys, defined only when no implementation is
  # already present (so an existing one is never overridden).
  #
  # Returns a new hash whose keys are converted with #to_sym; keys that do not
  # respond to #to_sym (e.g. Integers) are kept unchanged. The explicit check
  # replaces an inline `rescue`, which would also hide unrelated errors.
  unless method_defined?(:symbolize_keys)
    def symbolize_keys
      transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
    end
  end
end
@@ -0,0 +1,80 @@
1
+ # SpellKit stub for integration example
2
+ # This will be replaced by the actual spellkit gem (version 0.1.1+)
3
+
4
require "set" # Set is only autoloaded from Ruby 3.2; this stub must also run on 3.0/3.1

module SpellKit
  # Raised by healthcheck when the stub has not been loaded.
  class Error < StandardError; end

  # Protected terms used when load! is not given any.
  DEFAULT_PROTECTED_TERMS = %w[CDK10 IL6 IL-6 BRCA1 BRCA2 TP53 EGFR].freeze

  # Lower-case words the stub treats as correctly spelled.
  KNOWN_TERMS = Set.new(
    %w[hello world sequence gene the with to need i lysis protein oligo rat buffer western blot]
  ).freeze

  # Canned suggestion per known misspelling (lower-case key). Also the source
  # of the corrections returned by .correct.
  STUB_SUGGESTIONS = {
    "sequnce" => {"term" => "sequence", "distance" => 1, "freq" => 50000}.freeze,
    "helllo" => {"term" => "hello", "distance" => 1, "freq" => 100000}.freeze,
    "lyssis" => {"term" => "lysis", "distance" => 1, "freq" => 12345}.freeze,
    "protien" => {"term" => "protein", "distance" => 1, "freq" => 54321}.freeze
  }.freeze

  class << self
    # @return [Hash, nil] static stats hash created by load!, nil before that
    attr_reader :stats

    # Marks the stub as loaded and records the protected terms.
    #
    # `dictionary`, `frequency_threshold` and `skip_patterns` are accepted for
    # signature compatibility only; the stub does not read them.
    #
    # @param dictionary [Object] unused by the stub
    # @param edit_distance [Integer] stored, not otherwise used
    # @param frequency_threshold [Integer] unused by the stub
    # @param protected_terms [Enumerable<String>, nil] defaults to DEFAULT_PROTECTED_TERMS
    # @param skip_patterns [Hash] unused by the stub
    # @return [nil]
    def load!(dictionary:, edit_distance: 1, frequency_threshold: 0, protected_terms: nil, skip_patterns: {})
      @loaded = true
      @edit_distance = edit_distance
      @protected_terms = Set.new(protected_terms || DEFAULT_PROTECTED_TERMS)
      @stats = {
        version: "spellkit-stub-0.1.1",
        loaded_at: Time.now,
        tokens_corrected: 0,
        p50_us: 20,
        p95_us: 60
      }
      puts "SpellKit loaded (stub implementation)"
    end

    # Returns canned suggestions for a known misspelling (case-insensitive).
    #
    # @param term [String]
    # @param max [Integer] upper bound on the number of suggestions returned;
    #   values below zero are treated as zero
    # @return [Array<Hash>] fresh hashes with "term", "distance" and "freq" keys;
    #   empty when not loaded or the term is unknown
    def suggestions(term, max = 5)
      return [] unless @loaded

      entry = STUB_SUGGESTIONS[term.downcase]
      return [] unless entry

      # dup so callers can mutate the result without touching the frozen table
      [entry.dup].first([max.to_i, 0].max)
    end

    # @param term [String]
    # @return [Boolean] true for protected terms (exact, case-sensitive match)
    #   and for known terms (case-insensitive); false when not loaded
    def correct?(term)
      return false unless @loaded

      @protected_terms.include?(term) || KNOWN_TERMS.include?(term.downcase)
    end

    # Returns the canned correction for a known misspelling, otherwise the
    # term itself. Protected terms and calls made before load! are returned
    # unchanged.
    #
    # @param term [String]
    # @return [String]
    def correct(term)
      return term unless @loaded
      return term if @protected_terms.include?(term)

      STUB_SUGGESTIONS.dig(term.downcase, "term") || term
    end

    # @param tokens [Array<String>]
    # @return [Array<String>] each token passed through .correct; the input
    #   itself when not loaded
    def correct_tokens(tokens)
      return tokens unless @loaded
      tokens.map { |t| correct(t) }
    end

    # @return [true]
    # @raise [Error] when load! has not been called
    def healthcheck
      raise Error, "SpellKit not loaded. Call SpellKit.load! first" unless @loaded
      true
    end
  end
end
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: phrasekit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - PhraseKit Contributors
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rb_sys
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '0.9'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '0.9'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rake
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '13.0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '13.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: rspec
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '3.0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '3.0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: rake-compiler
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.2'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.2'
68
+ - !ruby/object:Gem::Dependency
69
+ name: simplecov
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '0.22'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.22'
82
+ - !ruby/object:Gem::Dependency
83
+ name: spellkit
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: 0.1.1
89
+ type: :development
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: 0.1.1
96
+ description: High-performance phrase matching using Aho-Corasick automaton with Ruby
97
+ bindings via Magnus
98
+ email:
99
+ - ''
100
+ executables: []
101
+ extensions:
102
+ - ext/phrasekit/extconf.rb
103
+ extra_rdoc_files: []
104
+ files:
105
+ - README.md
106
+ - ext/phrasekit/Cargo.toml
107
+ - ext/phrasekit/extconf.rb
108
+ - ext/phrasekit/src/bin/fixture_builder.rs
109
+ - ext/phrasekit/src/bin/phrasekit_build.rs
110
+ - ext/phrasekit/src/bin/phrasekit_mine.rs
111
+ - ext/phrasekit/src/bin/phrasekit_score.rs
112
+ - ext/phrasekit/src/bin/phrasekit_tag.rs
113
+ - ext/phrasekit/src/lib.rs
114
+ - ext/phrasekit/src/manifest.rs
115
+ - ext/phrasekit/src/matcher.rs
116
+ - ext/phrasekit/src/payload.rs
117
+ - ext/phrasekit/src/policy.rs
118
+ - lib/phrasekit.rb
119
+ - lib/phrasekit/miner.rb
120
+ - lib/phrasekit/scorer.rb
121
+ - lib/phrasekit/tagger.rb
122
+ - lib/phrasekit/version.rb
123
+ - lib/spellkit_stub.rb
124
+ homepage: https://github.com/scientist-labs/phrasekit
125
+ licenses:
126
+ - MIT
127
+ metadata:
128
+ homepage_uri: https://github.com/scientist-labs/phrasekit
129
+ source_code_uri: https://github.com/scientist-labs/phrasekit
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: 3.0.0
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: 3.0.0
143
+ requirements: []
144
+ rubygems_version: 3.6.9
145
+ specification_version: 4
146
+ summary: Ultra-fast deterministic phrase matcher
147
+ test_files: []