skill-extractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +38 -0
- data/data/meta.json +14 -0
- data/data/mlp.json +1 -0
- data/data/skills.json +1 -0
- data/lib/skill_extractor/matcher.rb +105 -0
- data/lib/skill_extractor/mlp.rb +44 -0
- data/lib/skill_extractor/version.rb +3 -0
- data/lib/skill_extractor.rb +70 -0
- metadata +71 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# Keyword matcher — a faithful port of FlashText's extract_keywords.
|
|
2
|
+
# Word boundaries: any char outside [A-Za-z0-9_]. Case-insensitive,
|
|
3
|
+
# longest-match-wins, non-overlapping. Identical spans across all
|
|
4
|
+
# skill-extractor language implementations.
|
|
5
|
+
module SkillExtractor
  # Trie-backed keyword matcher ported from FlashText's `extract_keywords`.
  #
  # Keywords are stored lowercased in a nested-Hash trie keyed by single
  # characters. A node that terminates a keyword holds it under KEYWORD.
  # Matching is case-insensitive, prefers the longest keyword at a position,
  # never returns overlapping spans, and only accepts a keyword that is
  # followed by a non-word character (anything outside WORD_SET) or by the
  # end of the input.
  class KeywordMatcher
    # Trie key under which a node stores the keyword ending at that node.
    # A Symbol cannot collide with the one-character String keys used for
    # ordinary trie edges.
    KEYWORD = :__keyword__

    # Characters considered part of a word: ASCII letters, digits, underscore.
    WORD_CHARS = (("a".."z").to_a + ("A".."Z").to_a + ("0".."9").to_a + ["_"]).freeze

    # Hash lookup table for WORD_CHARS (character => true).
    WORD_SET = WORD_CHARS.to_h { |c| [c, true] }.freeze

    # @param keywords [Enumerable<String>] keywords to register up front
    def initialize(keywords = [])
      @trie = {}
      keywords.each { |kw| add(kw) }
    end

    # Registers one keyword (stored lowercased).
    #
    # nil and empty keywords are ignored. An empty keyword would otherwise be
    # stored on the trie root, and because "" is truthy in Ruby, #extract
    # would then report an empty-string span at every word boundary.
    #
    # @param keyword [String, nil]
    # @return [String, nil] the lowercased keyword, or nil when it was skipped
    def add(keyword)
      return if keyword.nil? || keyword.empty?

      kw = keyword.downcase
      node = @trie
      kw.each_char do |ch|
        node = (node[ch] ||= {})
      end
      node[KEYWORD] = kw
    end

    # Finds every registered keyword occurring in +sentence+.
    #
    # The statement order below deliberately follows FlashText's
    # `extract_keywords`, so results stay aligned with that algorithm.
    #
    # @param sentence [String, nil] text to scan
    # @return [Array<Array(String, Integer, Integer)>] `[keyword, start, end]`
    #   triples in scan order. Offsets are character indices into the
    #   lowercased sentence and +end+ is exclusive. Empty for nil or "".
    def extract(sentence)
      out = []
      return out if sentence.nil? || sentence.empty?

      sentence = sentence.downcase
      chars = sentence.chars
      current = @trie   # trie node reached by the characters since seq_start
      seq_start = 0     # where the candidate currently being tracked began
      seq_end = 0       # exclusive end of the best match found so far
      reset = false     # set when the next iteration must begin a new candidate
      idx = 0
      n = chars.length
      while idx < n
        ch = chars[idx]
        if !WORD_SET[ch]
          # Word boundary: a keyword may end here, or continue through this
          # character (multi-word keywords, or ones containing punctuation).
          if current.key?(KEYWORD) || current.key?(ch)
            longest = nil
            longer_found = false
            if current.key?(KEYWORD)
              longest = current[KEYWORD]
              seq_end = idx
            end
            if current.key?(ch)
              # Look ahead from the boundary for a longer keyword.
              cont = current[ch]
              idy = idx + 1
              broke = false
              while idy < n
                inner = chars[idy]
                if !WORD_SET[inner] && cont.key?(KEYWORD)
                  longest = cont[KEYWORD]
                  seq_end = idy
                  longer_found = true
                end
                if cont.key?(inner)
                  cont = cont[inner]
                else
                  broke = true
                  break
                end
                idy += 1
              end
              # Look-ahead ran to the end of the input: a keyword ending
              # exactly there also counts.
              if !broke && cont.key?(KEYWORD)
                longest = cont[KEYWORD]
                seq_end = idy
                longer_found = true
              end
              # Resume scanning after the longer match so spans never overlap.
              idx = seq_end if longer_found
            end
            current = @trie
            out << [longest, seq_start, idx] if longest
            reset = true
          else
            current = @trie
            reset = true
          end
        elsif current.key?(ch)
          # Word character that extends the current trie path.
          current = current[ch]
        else
          # Word character with no trie edge: no keyword can match inside
          # this word, so skip to its end.
          current = @trie
          reset = true
          idy = idx + 1
          while idy < n
            break unless WORD_SET[chars[idy]]
            idy += 1
          end
          idx = idy
        end
        # End of input reached while sitting on a complete keyword.
        if idx + 1 >= n && current.key?(KEYWORD)
          out << [current[KEYWORD], seq_start, n]
        end
        idx += 1
        if reset
          reset = false
          seq_start = idx
        end
      end
      out
    end
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# MLP classifier forward pass — consumes mlp.json exported from the trained
|
|
2
|
+
# sklearn MLPClassifier (base64 float32, row-major, shape [in, out]).
|
|
3
|
+
require "base64"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
module SkillExtractor
  # Forward pass of a small fully connected network loaded from a JSON
  # weights file in the "mlp-weights-v1" format.
  #
  # Each layer entry carries `shape` ([rows, cols] = [inputs, outputs]),
  # `weights_b64` (row-major little-endian float32, base64) and `bias_b64`
  # (float32, base64). Hidden layers use ReLU; the first output of the last
  # layer is passed through the logistic function.
  class MLP
    # @param path [String] path to the JSON weights file
    # @raise [ArgumentError] if the format tag is unsupported, there are no
    #   layers, a layer's decoded data does not match its declared shape, or
    #   consecutive layers do not chain (cols of one != rows of the next)
    def initialize(path)
      spec = JSON.parse(File.read(path))
      unless spec["format"] == "mlp-weights-v1"
        raise ArgumentError, "unsupported weights format: #{spec["format"]}"
      end

      layers = spec["layers"]
      unless layers.is_a?(Array) && !layers.empty?
        raise ArgumentError, "weights file contains no layers"
      end

      @layers = layers.each_with_index.map { |layer, i| load_layer(layer, i) }

      # Catch inconsistent models here rather than as a TypeError (or a
      # silently wrong number) in the middle of predict_proba.
      @layers.each_cons(2).with_index do |(a, b), i|
        next if a[:cols] == b[:rows]

        raise ArgumentError,
              "layer #{i} outputs #{a[:cols]} values but layer #{i + 1} expects #{b[:rows]}"
      end
    end

    # Scores a batch of input vectors.
    #
    # @param xs [Array<Array<Float>>] input vectors, each as long as the
    #   first layer's row count (384-dim L2-normalized embeddings in this gem)
    # @return [Array<Float>] one probability in [0, 1] per input
    # @raise [ArgumentError] if an input has the wrong number of dimensions
    def predict_proba(xs)
      last = @layers.length - 1
      expected = @layers.first[:rows]
      xs.map do |x|
        unless x.length == expected
          raise ArgumentError, "expected #{expected}-dim input, got #{x.length}"
        end

        h = x
        @layers.each_with_index do |l, i|
          cols = l[:cols]
          w = l[:w]
          nxt = l[:b].dup # start from the bias, then accumulate h * W
          h.each_with_index do |hv, r|
            next if hv.zero? # zero activations contribute nothing

            off = r * cols # start of row r in the row-major weight array
            cols.times { |j| nxt[j] += hv * w[off + j] }
          end
          nxt.map! { |v| v.negative? ? 0.0 : v } if i < last # relu
          h = nxt
        end
        1.0 / (1.0 + Math.exp(-h[0])) # logistic
      end
    end

    private

    # Decodes and validates one layer entry of the weights file.
    def load_layer(layer, index)
      rows, cols = layer["shape"]
      unless rows.is_a?(Integer) && cols.is_a?(Integer) && rows.positive? && cols.positive?
        raise ArgumentError, "layer #{index}: invalid shape #{layer["shape"].inspect}"
      end

      weights = decode_floats(layer["weights_b64"])
      bias = decode_floats(layer["bias_b64"])
      unless weights.length == rows * cols
        raise ArgumentError,
              "layer #{index}: expected #{rows * cols} weights for shape [#{rows}, #{cols}], got #{weights.length}"
      end
      unless bias.length == cols
        raise ArgumentError, "layer #{index}: expected #{cols} biases, got #{bias.length}"
      end

      { rows: rows, cols: cols, w: weights, b: bias }
    end

    # Base64 text -> Array<Float>. `unpack1("m")` is the decoding that
    # Base64.decode64 performs; "e*" reads little-endian float32 values.
    def decode_floats(b64)
      String(b64).unpack1("m").unpack("e*")
    end
  end
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# skill-extractor: extract skills from job postings and resumes.
|
|
2
|
+
# 32K-skill gazetteer + MiniLM embeddings + MLP context classifier,
|
|
3
|
+
# trained on 491K labeled samples (73% F1 held-out).
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
require_relative "skill_extractor/matcher"
|
|
7
|
+
require_relative "skill_extractor/mlp"
|
|
8
|
+
require_relative "skill_extractor/version"
|
|
9
|
+
|
|
10
|
+
module SkillExtractor
  # Embedding model name handed to Informers.pipeline.
  MODEL = "Xenova/all-MiniLM-L6-v2"
  # Number of whitespace-separated words kept before the match position.
  CONTEXT_BEFORE = 20
  # Window size from the match position onward (the word at the match
  # position plus the words after it).
  CONTEXT_AFTER = 21
  # Minimum P(skill) for a candidate to be reported by #extract.
  DEFAULT_THRESHOLD = 0.5
  # Directory holding skills.json and mlp.json.
  DATA_DIR = File.expand_path("../data", __dir__)

  # Pipeline: gazetteer matches -> context windows -> embeddings -> MLP score.
  class Extractor
    # Loads the gazetteer and the MLP weights. The embedding pipeline is
    # created lazily on the first non-empty #classify call.
    #
    # fp32 (default) matches the reference implementation bit-for-bit.
    #
    # @param quantized [Boolean] forwarded to Informers.pipeline
    def initialize(quantized: false)
      @quantized = quantized
      # Read as UTF-8 explicitly so parsing does not depend on the process
      # locale (JSON text is UTF-8).
      skills = JSON.parse(File.read(File.join(DATA_DIR, "skills.json"), encoding: "UTF-8"))
      @matcher = KeywordMatcher.new(skills)
      @mlp = MLP.new(File.join(DATA_DIR, "mlp.json"))
      @pipe = nil
    end

    attr_reader :matcher, :mlp

    # Gazetteer matches with their +/-20-word context windows.
    #
    # @param text [String, nil]
    # @return [Array<Hash>] one `{ skill:, context: }` per matcher span, in
    #   matcher order; empty when nothing matched
    def candidates(text)
      matches = @matcher.extract(text)
      return [] if matches.empty?

      words = text.split
      matches.map do |skill, start, _end|
        # Number of whitespace-separated tokens that begin before the match.
        # NOTE(review): `start` comes from the matcher and is applied to the
        # original text here; this assumes the two share character offsets.
        word_idx = text[0...start].split.length
        lo = [0, word_idx - CONTEXT_BEFORE].max
        hi = [words.length, word_idx + CONTEXT_AFTER].min
        { skill: skill, context: words[lo...hi].join(" ") }
      end
    end

    # P(skill) for each candidate input string.
    #
    # @param inputs [Array<String>]
    # @return [Array<Float>] one probability per input, in input order
    def classify(inputs)
      # Nothing to score: do not load the embedding model just to embed an
      # empty batch.
      return [] if inputs.empty?

      @pipe ||= begin
        require "informers" # deliberate lazy load; only needed for scoring
        Informers.pipeline("embedding", MODEL, quantized: @quantized)
      end
      @mlp.predict_proba(@pipe.(inputs))
    end

    # Extract confirmed skills from a job posting or resume text.
    #
    # @param text [String, nil]
    # @param threshold [Float] minimum P(skill) for a candidate to be kept
    # @return [Array<String>] unique skill names, sorted
    def extract(text, threshold: DEFAULT_THRESHOLD)
      cands = candidates(text)
      return [] if cands.empty?

      probs = classify(cands.map { |c| "#{c[:skill]} : #{c[:context]}" })
      probs.each_with_index
           .filter_map { |p, i| cands[i][:skill] if p >= threshold }
           .uniq
           .sort
    end
  end

  @default = nil

  # Module-level convenience wrapper around a shared Extractor, which is
  # built on first use and reused afterwards.
  #
  # @param text [String, nil]
  # @param threshold [Float]
  # @return [Array<String>] see Extractor#extract
  def self.extract_skills(text, threshold: DEFAULT_THRESHOLD)
    @default ||= Extractor.new
    @default.extract(text, threshold: threshold)
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: skill-extractor
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Qarera
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-07-04 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: informers
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.0'
|
|
27
|
+
description: Gazetteer candidates filtered by a MiniLM + MLP context classifier trained
|
|
28
|
+
on 491K labeled samples, so prose like 'can-do attitude' doesn't become a skill.
|
|
29
|
+
Runs on ONNX Runtime via the informers gem.
|
|
30
|
+
email:
|
|
31
|
+
- yashthenuan21@gmail.com
|
|
32
|
+
executables: []
|
|
33
|
+
extensions: []
|
|
34
|
+
extra_rdoc_files: []
|
|
35
|
+
files:
|
|
36
|
+
- LICENSE
|
|
37
|
+
- README.md
|
|
38
|
+
- data/meta.json
|
|
39
|
+
- data/mlp.json
|
|
40
|
+
- data/skills.json
|
|
41
|
+
- lib/skill_extractor.rb
|
|
42
|
+
- lib/skill_extractor/matcher.rb
|
|
43
|
+
- lib/skill_extractor/mlp.rb
|
|
44
|
+
- lib/skill_extractor/version.rb
|
|
45
|
+
homepage: https://github.com/dreamjobs-tech/skill-extractor
|
|
46
|
+
licenses:
|
|
47
|
+
- MIT
|
|
48
|
+
metadata:
|
|
49
|
+
source_code_uri: https://github.com/dreamjobs-tech/skill-extractor/tree/main/ruby
|
|
50
|
+
homepage_uri: https://www.qarera.com
|
|
51
|
+
post_install_message:
|
|
52
|
+
rdoc_options: []
|
|
53
|
+
require_paths:
|
|
54
|
+
- lib
|
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
56
|
+
requirements:
|
|
57
|
+
- - ">="
|
|
58
|
+
- !ruby/object:Gem::Version
|
|
59
|
+
version: '3.0'
|
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
61
|
+
requirements:
|
|
62
|
+
- - ">="
|
|
63
|
+
- !ruby/object:Gem::Version
|
|
64
|
+
version: '0'
|
|
65
|
+
requirements: []
|
|
66
|
+
rubygems_version: 3.4.10
|
|
67
|
+
signing_key:
|
|
68
|
+
specification_version: 4
|
|
69
|
+
summary: Extract skills from job postings and resumes — 32K-skill gazetteer + MiniLM
|
|
70
|
+
embeddings + MLP context classifier (73% F1).
|
|
71
|
+
test_files: []
|