skill-extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ # Keyword matcher — a faithful port of FlashText's extract_keywords.
2
+ # Word boundaries: any char outside [A-Za-z0-9_]. Case-insensitive,
3
+ # longest-match-wins, non-overlapping. Identical spans across all
4
+ # skill-extractor language implementations.
5
module SkillExtractor
  # Trie-backed keyword matcher that follows the control flow of FlashText's
  # `extract_keywords`.
  #
  # Keywords are stored lower-cased in a nested-Hash trie: every edge is a
  # one-character String key and the node where a keyword ends carries that
  # keyword under the KEYWORD symbol. Matching is case-insensitive, prefers
  # the longest keyword starting at a position, and never reports
  # overlapping spans.
  class KeywordMatcher
    # Marker key for "a keyword ends at this node". Being a Symbol, it cannot
    # collide with the single-character String keys used for trie edges.
    KEYWORD = :__keyword__

    # Characters that are part of a word; anything else is a word boundary.
    WORD_CHARS = (("a".."z").to_a + ("A".."Z").to_a + ("0".."9").to_a + ["_"]).freeze

    # Hash form of WORD_CHARS for constant-time membership checks.
    WORD_SET = WORD_CHARS.to_h { |c| [c, true] }.freeze

    # @param keywords [Enumerable<String>] keywords to register up front
    def initialize(keywords = [])
      @trie = {}
      keywords.each { |kw| add(kw) }
    end

    # Registers one keyword (stored lower-cased).
    #
    # nil and empty keywords are skipped. An empty keyword would be stored on
    # the trie root, and because "" is truthy in Ruby, #extract would then
    # report empty-keyword spans at word boundaries. FlashText likewise
    # ignores empty keywords when they are added.
    #
    # @param keyword [String, nil]
    # @return [String, nil] the stored lower-cased keyword, or nil if skipped
    def add(keyword)
      return if keyword.nil? || keyword.empty?

      kw = keyword.downcase
      node = @trie
      kw.each_char { |ch| node = (node[ch] ||= {}) }
      node[KEYWORD] = kw
    end

    # Finds every registered keyword in +sentence+.
    #
    # Offsets are character offsets into the lower-cased sentence; +end+ is
    # exclusive.
    #
    # @param sentence [String, nil]
    # @return [Array<Array(String, Integer, Integer)>] [keyword, start, end]
    #   triples in order of appearance; empty for nil or empty input
    def extract(sentence)
      out = []
      return out if sentence.nil? || sentence.empty?

      chars = sentence.downcase.chars
      n = chars.length
      current = @trie   # trie node reached by the characters since seq_start
      seq_start = 0     # offset where the candidate currently being walked began
      seq_end = 0       # exclusive end of the best match found for that candidate
      reset = false     # when set, seq_start moves to the next character
      idx = 0

      while idx < n
        ch = chars[idx]
        if !WORD_SET[ch]
          # Word boundary: a keyword may end here, or continue through the
          # boundary character itself (multi-word keywords, "c++", ...).
          if current.key?(KEYWORD) || current.key?(ch)
            longest = nil
            longer_found = false
            if current.key?(KEYWORD)
              longest = current[KEYWORD]
              seq_end = idx
            end
            if current.key?(ch)
              # Look ahead through the boundary for a longer keyword, recording
              # every point where one ends on a word boundary.
              cont = current[ch]
              idy = idx + 1
              broke = false
              while idy < n
                inner = chars[idy]
                if !WORD_SET[inner] && cont.key?(KEYWORD)
                  longest = cont[KEYWORD]
                  seq_end = idy
                  longer_found = true
                end
                if cont.key?(inner)
                  cont = cont[inner]
                else
                  broke = true
                  break
                end
                idy += 1
              end
              # Look-ahead ran off the end of the sentence: the end of input
              # counts as a boundary too.
              if !broke && cont.key?(KEYWORD)
                longest = cont[KEYWORD]
                seq_end = idy
                longer_found = true
              end
              # Resume scanning after the longer match so spans never overlap.
              idx = seq_end if longer_found
            end
            current = @trie
            out << [longest, seq_start, idx] if longest
            reset = true
          else
            current = @trie
            reset = true
          end
        elsif current.key?(ch)
          # Word character that extends the current trie path.
          current = current[ch]
        else
          # Word character that matches nothing: drop the candidate and skip to
          # the end of this word so a keyword cannot start mid-word.
          current = @trie
          reset = true
          idy = idx + 1
          while idy < n
            break unless WORD_SET[chars[idy]]
            idy += 1
          end
          idx = idy
        end
        # Sentence ended while sitting on a complete keyword.
        if idx + 1 >= n && current.key?(KEYWORD)
          out << [current[KEYWORD], seq_start, n]
        end
        idx += 1
        if reset
          reset = false
          seq_start = idx
        end
      end
      out
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # MLP classifier forward pass — consumes mlp.json exported from the trained
2
+ # sklearn MLPClassifier (base64 float32, row-major, shape [in, out]).
3
+ require "base64"
4
+ require "json"
5
+
6
module SkillExtractor
  # Forward pass of a small fully connected network loaded from a JSON
  # weights file.
  #
  # Expected file layout (as read by this class):
  #   { "format": "mlp-weights-v1",
  #     "layers": [ { "shape": [rows, cols],
  #                   "weights_b64": <base64 little-endian float32, row-major>,
  #                   "bias_b64":    <base64 little-endian float32> }, ... ] }
  #
  # Hidden layers use ReLU; the first unit of the last layer goes through a
  # logistic sigmoid.
  class MLP
    # @param path [String] path to the JSON weights file
    # @raise [ArgumentError] if the format tag is unknown, there are no
    #   layers, a layer's decoded data does not match its declared shape, or
    #   consecutive layers do not chain
    def initialize(path)
      spec = JSON.parse(File.read(path))
      unless spec["format"] == "mlp-weights-v1"
        raise ArgumentError, "unsupported weights format: #{spec["format"]}"
      end

      layers = spec["layers"]
      unless layers.is_a?(Array) && !layers.empty?
        raise ArgumentError, "weights file contains no layers"
      end

      @layers = layers.each_with_index.map { |layer, index| decode_layer(layer, index) }

      # Each layer must consume exactly what the previous one produces,
      # otherwise the flat weight indexing in #predict_proba reads the wrong
      # cells (or runs past the end of the array).
      @layers.each_cons(2).with_index do |(prev, nxt), index|
        next if prev[:cols] == nxt[:rows]

        raise ArgumentError,
              "layer #{index + 1} expects #{nxt[:rows]} inputs but layer #{index} produces #{prev[:cols]}"
      end
    end

    # Runs the network on a batch of input vectors.
    #
    # @param xs [Array<Array<Numeric>>] one vector per sample; each must have
    #   as many entries as the first layer has rows
    # @return [Array<Float>] sigmoid of the first output unit, per sample
    # @raise [ArgumentError] if a vector has the wrong number of entries
    def predict_proba(xs)
      last = @layers.length - 1
      input_dim = @layers.first[:rows]
      xs.map do |x|
        unless x.length == input_dim
          raise ArgumentError, "expected #{input_dim}-dim input, got #{x.length}"
        end

        h = x
        @layers.each_with_index do |l, i|
          cols = l[:cols]
          w = l[:w]
          nxt = l[:b].dup # start from the bias, then accumulate h * W
          h.each_with_index do |hv, r|
            next if hv.zero? # a zero activation contributes nothing

            off = r * cols # row r of the row-major [rows, cols] matrix
            cols.times { |j| nxt[j] += hv * w[off + j] }
          end
          nxt.map! { |v| v.negative? ? 0.0 : v } if i < last # relu
          h = nxt
        end
        1.0 / (1.0 + Math.exp(-h[0])) # logistic
      end
    end

    private

    # Decodes one layer entry and checks it against its declared shape.
    def decode_layer(layer, index)
      rows, cols = layer["shape"]
      weights = decode_floats(layer["weights_b64"])
      bias = decode_floats(layer["bias_b64"])
      unless weights.length == rows * cols
        raise ArgumentError,
              "layer #{index}: expected #{rows * cols} weights for shape [#{rows}, #{cols}], got #{weights.length}"
      end
      unless bias.length == cols
        raise ArgumentError, "layer #{index}: expected #{cols} biases, got #{bias.length}"
      end

      { rows: rows, cols: cols, w: weights, b: bias }
    end

    # Base64 text -> Array<Float>. `unpack1("m")` is the core-library lenient
    # base64 decoder; "e*" reads little-endian single-precision floats.
    def decode_floats(b64)
      b64.unpack1("m").unpack("e*")
    end
  end
end
@@ -0,0 +1,3 @@
1
module SkillExtractor
  # Gem version string. Frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,70 @@
1
+ # skill-extractor: extract skills from job postings and resumes.
2
+ # 32K-skill gazetteer + MiniLM embeddings + MLP context classifier,
3
+ # trained on 491K labeled samples (73% F1 held-out).
4
+ require "json"
5
+
6
+ require_relative "skill_extractor/matcher"
7
+ require_relative "skill_extractor/mlp"
8
+ require_relative "skill_extractor/version"
9
+
10
module SkillExtractor
  # Embedding model identifier handed to Informers.pipeline.
  MODEL = "Xenova/all-MiniLM-L6-v2"
  # Number of whitespace-separated words kept before the match position.
  CONTEXT_BEFORE = 20
  # Number of words kept from the match position onward.
  CONTEXT_AFTER = 21
  # Minimum probability for a candidate to be reported by #extract.
  DEFAULT_THRESHOLD = 0.5
  # Directory holding skills.json and mlp.json, resolved relative to this file.
  DATA_DIR = File.expand_path("../data", __dir__)

  # Pipeline: gazetteer matches -> context windows -> embeddings -> MLP score.
  class Extractor
    attr_reader :matcher, :mlp

    # Loads the keyword list and MLP weights from DATA_DIR. The embedding
    # pipeline is created lazily on the first non-empty #classify call.
    #
    # @param quantized [Boolean] forwarded to Informers.pipeline; per the
    #   original author's note, fp32 (the default, false) matches the
    #   reference implementation bit-for-bit
    def initialize(quantized: false)
      @quantized = quantized
      @matcher = KeywordMatcher.new(JSON.parse(File.read(File.join(DATA_DIR, "skills.json"))))
      @mlp = MLP.new(File.join(DATA_DIR, "mlp.json"))
      @pipe = nil
    end

    # Gazetteer matches paired with a window of surrounding words.
    #
    # The window is taken over `text.split`. Its anchor is the number of
    # whitespace-separated tokens found in the text before the match start
    # (a token cut off by the match start still counts). The window spans up
    # to CONTEXT_BEFORE words before the anchor and CONTEXT_AFTER words from
    # the anchor onward, clamped to the text.
    #
    # @param text [String, nil]
    # @return [Array<Hash>] one `{ skill:, context: }` hash per match
    def candidates(text)
      matches = @matcher.extract(text)
      return [] if matches.empty?

      words = text.split
      matches.map do |skill, start, _end|
        word_idx = text[0...start].split.length
        lo = [0, word_idx - CONTEXT_BEFORE].max
        hi = [words.length, word_idx + CONTEXT_AFTER].min
        { skill: skill, context: words[lo...hi].join(" ") }
      end
    end

    # Scores each input string with the embedding pipeline followed by the MLP.
    #
    # @param inputs [Array<String>]
    # @return [Array<Float>] one probability per input, in order
    def classify(inputs)
      # Nothing to score: skip loading the model and running an empty batch.
      return [] if inputs.empty?

      @pipe ||= begin
        require "informers" # deliberately lazy: only needed once we embed
        Informers.pipeline("embedding", MODEL, quantized: @quantized)
      end
      @mlp.predict_proba(@pipe.(inputs))
    end

    # Extracts the skills whose candidates score at or above +threshold+.
    #
    # @param text [String, nil] job posting or resume text
    # @param threshold [Numeric] minimum probability to keep a candidate
    # @return [Array<String>] unique skill names, sorted
    def extract(text, threshold: DEFAULT_THRESHOLD)
      cands = candidates(text)
      return [] if cands.empty?

      probs = classify(cands.map { |c| "#{c[:skill]} : #{c[:context]}" })
      probs.each_with_index
           .filter_map { |prob, i| cands[i][:skill] if prob >= threshold }
           .uniq
           .sort
    end
  end

  @default = nil

  # Module-level convenience wrapper around a shared, lazily built Extractor.
  #
  # @param text [String, nil]
  # @param threshold [Numeric]
  # @return [Array<String>] see Extractor#extract
  def self.extract_skills(text, threshold: DEFAULT_THRESHOLD)
    @default ||= Extractor.new
    @default.extract(text, threshold: threshold)
  end
end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: skill-extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Qarera
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-07-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: informers
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ description: Gazetteer candidates filtered by a MiniLM + MLP context classifier trained
28
+ on 491K labeled samples, so prose like 'can-do attitude' doesn't become a skill.
29
+ Runs on ONNX Runtime via the informers gem.
30
+ email:
31
+ - yashthenuan21@gmail.com
32
+ executables: []
33
+ extensions: []
34
+ extra_rdoc_files: []
35
+ files:
36
+ - LICENSE
37
+ - README.md
38
+ - data/meta.json
39
+ - data/mlp.json
40
+ - data/skills.json
41
+ - lib/skill_extractor.rb
42
+ - lib/skill_extractor/matcher.rb
43
+ - lib/skill_extractor/mlp.rb
44
+ - lib/skill_extractor/version.rb
45
+ homepage: https://github.com/dreamjobs-tech/skill-extractor
46
+ licenses:
47
+ - MIT
48
+ metadata:
49
+ source_code_uri: https://github.com/dreamjobs-tech/skill-extractor/tree/main/ruby
50
+ homepage_uri: https://www.qarera.com
51
+ post_install_message:
52
+ rdoc_options: []
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: '3.0'
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: '0'
65
+ requirements: []
66
+ rubygems_version: 3.4.10
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Extract skills from job postings and resumes — 32K-skill gazetteer + MiniLM
70
+ embeddings + MLP context classifier (73% F1).
71
+ test_files: []