mesh-medical-subject-headings 1.2.2 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/Gemfile.lock +4 -4
- data/MESH.gemspec +14 -14
- data/lib/MESH.rb +2 -0
- data/lib/MESH/classifier.rb +103 -0
- data/lib/MESH/mesh.rb +148 -55
- data/lib/MESH/translator.rb +25 -17
- data/lib/MESH/version.rb +1 -1
- data/match.rb +20 -4
- data/test/classifier_test.rb +443 -0
- data/test/mesh_test.rb +171 -12
- data/test/test_helper.rb +10 -1
- data/test/translator_test.rb +10 -6
- data/tr_speed.rb +8 -0
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3e0859a3e5c4757eb7babc4072ac97177ae0feee
|
4
|
+
data.tar.gz: 2488bf617fa6761cf5b91c47c7b6a0b0641afca4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2770ad376f99544f8825eaab301ab457ad29d559beeca42cc0e66fee92bc68f70d6eecdf23775c34de57734a31505e2e44e79461fa052ab73e8eba27738c2fad
|
7
|
+
data.tar.gz: 9be16f17ace608d05d8dca68c407f76e7c450ffd1b79907a00fe20d6879aaa21acbfc1c2b0e5820f9ff04a0b52067f9f5e9464b19639d3b2135b0d6eae08d56d
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
mesh-medical-subject-headings (1.
|
4
|
+
mesh-medical-subject-headings (1.3.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
metaclass (0.0.
|
9
|
+
metaclass (0.0.4)
|
10
10
|
minitest (5.0.8)
|
11
11
|
mocha (1.0.0)
|
12
12
|
metaclass (~> 0.0.1)
|
13
|
-
rake (10.
|
13
|
+
rake (10.2.2)
|
14
14
|
ruby-prof (0.14.2)
|
15
|
-
yard (0.8.7.
|
15
|
+
yard (0.8.7.4)
|
16
16
|
|
17
17
|
PLATFORMS
|
18
18
|
ruby
|
data/MESH.gemspec
CHANGED
@@ -4,24 +4,24 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'MESH/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
7
|
+
spec.name = 'mesh-medical-subject-headings'
|
8
8
|
spec.version = Mesh::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
11
|
-
spec.description =
|
12
|
-
spec.summary =
|
13
|
-
spec.homepage =
|
14
|
-
spec.license =
|
9
|
+
spec.authors = ['Rob Styles']
|
10
|
+
spec.email = ['rob@dynamicorange.com']
|
11
|
+
spec.description = 'A ruby gem containing MeSH subject headings (https://www.nlm.nih.gov/mesh/) for use in classifying and entity recognition.'
|
12
|
+
spec.summary = spec.description
|
13
|
+
spec.homepage = ''
|
14
|
+
spec.license = 'AGPL3'
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
17
17
|
spec.executables = nil
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = [
|
19
|
+
spec.require_paths = ['lib']
|
20
20
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
23
|
-
spec.add_development_dependency
|
24
|
-
spec.add_development_dependency
|
25
|
-
spec.add_development_dependency
|
26
|
-
spec.add_development_dependency
|
21
|
+
spec.add_development_dependency 'bundler', '~> 1.3'
|
22
|
+
spec.add_development_dependency 'rake'
|
23
|
+
spec.add_development_dependency 'mocha'
|
24
|
+
spec.add_development_dependency 'yard'
|
25
|
+
spec.add_development_dependency 'minitest', '~> 5.0.8'
|
26
|
+
spec.add_development_dependency 'ruby-prof'
|
27
27
|
end
|
data/lib/MESH.rb
CHANGED
@@ -0,0 +1,103 @@
|
|
1
|
+
module MESH
  # Scores the headings found in a document and groups the scores by the
  # roots those headings belong to.
  class Classifier
    # Each parent receives the child's weight divided by this value, applied
    # again at every further level up the hierarchy.
    PARENT_WEIGHT_DIVISOR = 3.0

    # Builds a per-root score table from weighted match lists.
    #
    # @param weighted_matches [Array<Hash>] each element has a +:weight+
    #   (Numeric) and +:matches+ (an array of hashes carrying a +:heading+).
    #   Headings must respond to +roots+ and +parents+.
    # @return [Hash] maps each root to +[best_score, scored]+ where +scored+
    #   maps heading => score rounded to 3 places (zero scores removed) and
    #   +best_score+ is the highest value in +scored+. Roots for which every
    #   score is zero are omitted instead of raising.
    def classify(weighted_matches)
      weighted_headings = weighted_matches.flat_map do |wm|
        wm[:matches].map { |match| [wm[:weight], match[:heading]] }
      end

      root_groups = {}
      weighted_headings.each do |weight, heading|
        heading.roots.each { |root| (root_groups[root] ||= []) << [weight, heading] }
      end

      chosen = {}
      root_groups.each do |root, group|
        raw_scores = {}
        group.each { |weight, heading| calculate_scores(raw_scores, root, heading, weight) }

        scored = {}
        raw_scores.each do |heading, score|
          rounded = score.round(3)
          scored[heading] = rounded unless rounded == 0
        end

        # Nothing scored above zero under this root: there is no best score to
        # report (previously this path raised NoMethodError on nil).
        next if scored.empty?

        chosen[root] = [scored.values.max, scored]
      end

      chosen
    end

    # Adds +weight+ to +heading+ in +scored+, then recurses into every parent
    # that lists +root+ among its roots, passing on a third of the weight.
    #
    # @param scored [Hash] accumulator of heading => score, mutated in place
    # @param root [Object] the root currently being scored
    # @param heading [#parents] heading receiving the weight
    # @param weight [Numeric] weight to add at this level
    def calculate_scores(scored, root, heading, weight)
      scored[heading] ||= 0
      scored[heading] += weight
      heading.parents.each do |parent|
        next unless parent.roots.include?(root)

        calculate_scores(scored, root, parent, weight / PARENT_WEIGHT_DIVISOR)
      end
    end

    private

    # Builds a heading => connection-weight table for +headings+ under +root+.
    # Not called by #classify at present.
    def calculate_connections(root, headings, weight)
      connections = {}
      headings.each { |heading| add_connection(connections, root, heading, weight) }
      connections
    end

    # Adds +weight+ to +heading+ and to each of its direct parents, provided
    # +heading+ lists +root+ among its roots. Parents are not checked against
    # +root+ and the walk does not recurse further up.
    def add_connection(connections, root, heading, weight)
      return unless heading.roots.include?(root)

      connections[heading] ||= 0
      connections[heading] += weight
      heading.parents.each do |parent|
        connections[parent] ||= 0
        connections[parent] += weight
      end
    end
  end
end
|
103
|
+
|
data/lib/MESH/mesh.rb
CHANGED
@@ -1,73 +1,79 @@
|
|
1
|
-
require_relative 'translator'
|
2
|
-
|
3
1
|
module MESH
|
4
2
|
class Mesh
|
5
3
|
|
6
|
-
|
4
|
+
include Comparable
|
5
|
+
attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class
|
6
|
+
|
7
|
+
# Comparison used by Comparable: two headings compare the same way their
# unique_id values do.
#
# @param other [#unique_id] heading to compare against
# @return [Integer, nil] result of comparing the two unique_id values
def <=>(other)
  unique_id <=> other.unique_id
end
|
7
10
|
|
8
|
-
def original_heading(locale =
|
9
|
-
return @original_heading
|
10
|
-
@@translator.translate(@original_heading)
|
11
|
+
def original_heading(locale = @@default_locale)
|
12
|
+
return @original_heading[locale]
|
11
13
|
end
|
12
14
|
|
13
|
-
def natural_language_name(locale =
|
14
|
-
return @natural_language_name
|
15
|
-
@@translator.translate(@natural_language_name)
|
15
|
+
def natural_language_name(locale = @@default_locale)
|
16
|
+
return @natural_language_name[locale]
|
16
17
|
end
|
17
18
|
|
18
|
-
def summary(locale =
|
19
|
-
return @summary
|
20
|
-
@@translator.translate(@summary)
|
19
|
+
def summary(locale = @@default_locale)
|
20
|
+
return @summary[locale]
|
21
21
|
end
|
22
22
|
|
23
|
-
def entries(locale =
|
24
|
-
|
25
|
-
@entries
|
23
|
+
def entries(locale = @@default_locale)
|
24
|
+
@entries[locale] ||= []
|
25
|
+
return @entries[locale]
|
26
26
|
end
|
27
27
|
|
28
28
|
def self.configure(args)
|
29
29
|
return if @@configured
|
30
30
|
raise ArgumentError.new('MeshHeadingGraph requires a filename in order to configure itself') unless not args[:filename].nil?
|
31
|
+
|
31
32
|
gzipped_file = File.open(args[:filename])
|
32
33
|
file = Zlib::GzipReader.new(gzipped_file)
|
34
|
+
|
33
35
|
current_heading = Mesh.new
|
34
36
|
file.each_line do |line|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
@@
|
37
|
+
|
38
|
+
case
|
39
|
+
|
40
|
+
when matches = line.match(/^\*NEWRECORD$/)
|
41
|
+
unless current_heading.unique_id.nil?
|
42
|
+
current_heading.entries.sort!
|
43
|
+
@@headings << current_heading
|
44
|
+
@@by_unique_id[current_heading.unique_id] = current_heading
|
45
|
+
@@by_original_heading[current_heading.original_heading] = current_heading
|
46
|
+
current_heading.tree_numbers.each do |tree_number|
|
47
|
+
@@by_tree_number[tree_number] = current_heading
|
48
|
+
end
|
43
49
|
end
|
44
|
-
|
45
|
-
current_heading = Mesh.new
|
46
|
-
end
|
50
|
+
current_heading = Mesh.new
|
47
51
|
|
48
|
-
|
49
|
-
|
52
|
+
when matches = line.match(/^UI = (.*)/)
|
53
|
+
current_heading.unique_id = matches[1]
|
50
54
|
|
51
|
-
|
52
|
-
|
55
|
+
when matches = line.match(/^MN = (.*)/)
|
56
|
+
current_heading.tree_numbers << matches[1]
|
57
|
+
current_heading.roots << matches[1][0] unless current_heading.roots.include?(matches[1][0])
|
53
58
|
|
54
|
-
|
55
|
-
|
59
|
+
when matches = line.match(/^MS = (.*)/)
|
60
|
+
current_heading.set_summary(matches[1])
|
56
61
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
62
|
+
when matches = line.match(/^DC = (.*)/)
|
63
|
+
current_heading.descriptor_class = @@descriptor_classes[matches[1].to_i]
|
64
|
+
|
65
|
+
when matches = line.match(/^MH = (.*)/)
|
66
|
+
mh = matches[1]
|
67
|
+
current_heading.set_original_heading(mh)
|
68
|
+
current_heading.entries << mh
|
69
|
+
librarian_parts = mh.match(/(.*), (.*)/)
|
70
|
+
nln = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
|
71
|
+
current_heading.set_natural_language_name(nln)
|
72
|
+
|
73
|
+
when matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/)
|
74
|
+
entry = matches[1].chomp
|
75
|
+
current_heading.entries << entry
|
66
76
|
|
67
|
-
matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/)
|
68
|
-
unless matches.nil?
|
69
|
-
mh = matches[1].chomp
|
70
|
-
current_heading.entries << mh
|
71
77
|
end
|
72
78
|
|
73
79
|
end
|
@@ -88,6 +94,19 @@ module MESH
|
|
88
94
|
@@configured = true
|
89
95
|
end
|
90
96
|
|
97
|
+
# Populates every known heading with text for +locale+ by running its
# default-locale heading, natural language name, summary and entries
# through +tr+. Does nothing when +locale+ is already registered.
#
# @param locale [String] locale key to populate
# @param tr [#translate] object that converts a default-locale string
def self.translate(locale, tr)
  return if @@locales.include?(locale)

  @@headings.each do |heading|
    heading.set_original_heading(tr.translate(heading.original_heading), locale)
    heading.set_natural_language_name(tr.translate(heading.natural_language_name), locale)
    heading.set_summary(tr.translate(heading.summary), locale)

    localized = heading.entries(locale)
    heading.entries.each { |entry| localized << tr.translate(entry) }
    localized.sort!
  end

  # Register the locale only after every heading has been translated.
  @@locales << locale
end
|
109
|
+
|
91
110
|
def self.find(unique_id)
|
92
111
|
raise 'MeshHeadingGraph.configure must be called before use' unless @@configured
|
93
112
|
return @@by_unique_id[unique_id]
|
@@ -118,20 +137,72 @@ module MESH
|
|
118
137
|
end
|
119
138
|
|
120
139
|
def self.match_in_text(text)
|
121
|
-
|
140
|
+
return [] if text.nil?
|
141
|
+
downcased = text.downcase
|
122
142
|
matches = []
|
123
143
|
@@headings.each do |heading|
|
124
144
|
next unless heading.useful
|
125
|
-
|
126
|
-
|
127
|
-
regex
|
128
|
-
|
129
|
-
|
145
|
+
@@locales.each do |locale|
|
146
|
+
heading.entries(locale).each do |entry|
|
147
|
+
if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster
|
148
|
+
if /^[A-Z0-9]+$/ =~ entry
|
149
|
+
regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/
|
150
|
+
else
|
151
|
+
regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i
|
152
|
+
end
|
153
|
+
text.to_enum(:scan, regex).map do |m,|
|
154
|
+
matches << {heading: heading, matched: entry, index: $`.size}
|
155
|
+
end
|
130
156
|
end
|
131
157
|
end
|
132
158
|
end
|
133
159
|
end
|
134
|
-
|
160
|
+
confirmed_matches = []
|
161
|
+
matches.combination(2) do |l, r|
|
162
|
+
if (r[:index] >= l[:index]) && (r[:index] + r[:matched].length <= l[:index] + l[:matched].length)
|
163
|
+
#r is within l
|
164
|
+
r[:delete] = true
|
165
|
+
elsif (l[:index] >= r[:index]) && (l[:index] + l[:matched].length <= r[:index] + r[:matched].length)
|
166
|
+
#l is within r
|
167
|
+
l[:delete] = true
|
168
|
+
end
|
169
|
+
end
|
170
|
+
matches.delete_if { |match| match[:delete] }
|
171
|
+
end
|
172
|
+
|
173
|
+
# Reports whether +heading+ appears anywhere above this heading, checking
# direct parents first and then each parent's own ancestry.
#
# The search stops at the first branch that contains +heading+ rather than
# walking every ancestor.
#
# @param heading [Object] candidate ancestor
# @return [Boolean]
def has_ancestor(heading)
  return true if parents.include?(heading)

  parents.any? { |parent| parent.has_ancestor(heading) }
end
|
179
|
+
|
180
|
+
# Reports whether +heading+ appears anywhere below this heading, checking
# direct children first and then each child's own descendants.
#
# The search stops at the first branch that contains +heading+ rather than
# walking every descendant.
#
# @param heading [Object] candidate descendant
# @return [Boolean]
def has_descendant(heading)
  return true if children.include?(heading)

  children.any? { |child| child.has_descendant(heading) }
end
|
186
|
+
|
187
|
+
# True when this heading and +heading+ have at least one parent in common.
#
# @param heading [#parents] heading to compare with
# @return [Boolean]
def sibling?(heading)
  shared = parents & heading.parents
  shared.size > 0
end
|
191
|
+
|
192
|
+
# Depth of the deepest tree number that starts with +root+, where depth is
# the number of dot-separated segments in the tree number.
#
# Only tree numbers beginning with +root+ are considered; previously, when
# none matched, the first tree number was used regardless of its root.
#
# @param root [String] tree-number prefix to restrict to; the default ''
#   matches every tree number
# @return [Integer, nil] segment count, or nil when there are no tree
#   numbers or none starts with +root+
def deepest_position(root = '')
  tree_numbers
    .select { |tree_number| tree_number.start_with?(root) }
    .map { |tree_number| tree_number.split('.').length }
    .max
end
|
197
|
+
|
198
|
+
# Depth of the shortest tree number: the tree number with the fewest
# characters is chosen and its dot-separated segments are counted.
#
# @return [Integer, nil] segment count, or nil when there are no tree numbers
def shallowest_position
  shortest = tree_numbers.min_by(&:length)
  shortest && shortest.split('.').length
end
|
203
|
+
|
204
|
+
# Currently a pass-through: hands back the given headings untouched.
#
# @param headings [Object] headings to cluster
# @return [Object] the same object that was passed in
def self.cluster(headings)
  headings
end
|
136
207
|
|
137
208
|
def matches(conditions)
|
@@ -141,6 +212,8 @@ module MESH
|
|
141
212
|
return false unless field_content.find { |fc| pattern =~ fc }
|
142
213
|
elsif field_content.is_a?(TrueClass) || field_content.is_a?(FalseClass)
|
143
214
|
return false unless field_content == pattern
|
215
|
+
elsif field_content.is_a? Symbol
|
216
|
+
return field_content == pattern
|
144
217
|
else
|
145
218
|
return false unless pattern =~ field_content
|
146
219
|
end
|
@@ -149,7 +222,19 @@ module MESH
|
|
149
222
|
end
|
150
223
|
|
151
224
|
def inspect
|
152
|
-
return "#{
|
225
|
+
return "#{unique_id}, #{original_heading}, [#{tree_numbers.join(',')}]"
|
226
|
+
end
|
227
|
+
|
228
|
+
# Records the original heading text for a locale.
#
# @param heading [String] heading text to store
# @param locale [String] locale key; defaults to the class default locale
# @return [String] the stored heading
def set_original_heading(heading, locale = @@default_locale)
  @original_heading.store(locale, heading)
end
|
231
|
+
|
232
|
+
# Records the natural language name for a locale.
#
# @param name [String] name to store
# @param locale [String] locale key; defaults to the class default locale
# @return [String] the stored name
def set_natural_language_name(name, locale = @@default_locale)
  @natural_language_name.store(locale, name)
end
|
235
|
+
|
236
|
+
# Records the summary text for a locale.
#
# @param summary [String] summary text to store
# @param locale [String] locale key; defaults to the class default locale
# @return [String] the stored summary
def set_summary(summary, locale = @@default_locale)
  @summary.store(locale, summary)
end
|
154
239
|
|
155
240
|
private
|
@@ -160,16 +245,24 @@ module MESH
|
|
160
245
|
@@by_tree_number = {}
|
161
246
|
@@by_original_heading = {}
|
162
247
|
@@default_locale = 'en-US'
|
163
|
-
@@
|
248
|
+
@@locales = [@@default_locale]
|
249
|
+
@@us_to_gb = Translator.new(Translator.enus_to_engb)
|
250
|
+
@@descriptor_classes = [:make_array_start_at_1, :topical_descriptor, :publication_type, :check_tag, :geographic_descriptor]
|
164
251
|
|
165
252
|
def initialize
|
166
253
|
@useful = true
|
167
254
|
@tree_numbers = []
|
255
|
+
@roots = []
|
168
256
|
@parents = []
|
169
257
|
@children = []
|
170
|
-
@entries =
|
258
|
+
@entries = {}
|
259
|
+
@entries[@@default_locale] = []
|
260
|
+
@original_heading = {}
|
261
|
+
@natural_language_name = {}
|
262
|
+
@summary = {}
|
171
263
|
end
|
172
264
|
|
265
|
+
|
173
266
|
end
|
174
267
|
end
|
175
268
|
|