mesh-medical-subject-headings 1.3.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Gemfile.lock +1 -1
- data/data/mesh_data_2014/d2014.en_gb.bin.gz +0 -0
- data/lib/MESH.rb +2 -1
- data/lib/MESH/classifier.rb +0 -32
- data/lib/MESH/heading.rb +155 -0
- data/lib/MESH/tree.rb +216 -0
- data/lib/MESH/version.rb +1 -1
- data/match.rb +3 -3
- data/test/classifier_test.rb +239 -259
- data/test/{mesh_test.rb → mesh_core_test.rb} +172 -172
- data/test/test_helper.rb +11 -11
- data/tr_speed.rb +2 -2
- metadata +7 -5
- data/lib/MESH/mesh.rb +0 -314
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 246c3f1005660ea5801534254a71eea1b0c1037c
|
4
|
+
data.tar.gz: 793b97acd06ab80e225f20b9d8f753b22b9fb6bb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7f21c94d6daa57f989777800f472fd679571c398fd535445fd8b75aabad1cfef38393cc0947b0545499211e3247b776531c9eab49a080e3d7893c0bd805ebf94
|
7
|
+
data.tar.gz: ab02026193ce2ce3d46099a00c1923bc76490a873eb81410f07c209666d8e34e2ee9f92edcd15bffb42f1391772a413be689e1abd9004123b7f0ac225a778848
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
#2.0.1 / 2014-04-28
|
2
|
+
* [FEATURE] Load translations from static file to improve load times
|
3
|
+
|
4
|
+
#2.0.0 / 2014-04-28
|
5
|
+
* [FEATURE] Refactored to instances for future running of multiple trees
|
6
|
+
|
1
7
|
#1.3.0 / 2014-04-10
|
2
8
|
* [FEATURE] Released classifier to group and score matched headings
|
3
9
|
|
data/Gemfile.lock
CHANGED
Binary file
|
data/lib/MESH.rb
CHANGED
data/lib/MESH/classifier.rb
CHANGED
@@ -23,20 +23,11 @@ module MESH
|
|
23
23
|
scored = {}
|
24
24
|
weighted_headings.each do |weight, heading|
|
25
25
|
calculate_scores(scored, root, heading, weight)
|
26
|
-
#scored[heading] ||= 0
|
27
|
-
#scored[heading] += weight
|
28
|
-
#heading.parents.each do |p|
|
29
|
-
# if p.roots.include? root
|
30
|
-
# scored[p] ||= 0
|
31
|
-
# scored[p] += weight / 3.0
|
32
|
-
# end
|
33
|
-
#end
|
34
26
|
end
|
35
27
|
scored.each { |h,s| scored[h] = s.round(3) }
|
36
28
|
scored.delete_if { |h,s| s == 0 }
|
37
29
|
best_score, best_connected = scored.reduce({}) { |h, (k, v)| (h[v] ||= []) << k; h }.max
|
38
30
|
most_specific = best_connected.max_by { |h| h.deepest_position(root) }
|
39
|
-
#chosen[root] = [most_specific, best_score]
|
40
31
|
chosen[root] = [best_score, scored]
|
41
32
|
end
|
42
33
|
|
@@ -54,25 +45,6 @@ module MESH
|
|
54
45
|
end
|
55
46
|
end
|
56
47
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
#text = "#{document[:title]}\n#{document[:abstract]}\n#{document[:content]}"
|
61
|
-
#matches = MESH::Mesh.match_in_text(text)
|
62
|
-
#headings = matches.map { |m| m[:heading] }
|
63
|
-
#root_groups = headings.reduce({}) do |rg, heading|
|
64
|
-
# heading.roots.each { |root| (rg[root] ||= []) << heading }
|
65
|
-
# rg
|
66
|
-
#end
|
67
|
-
#root_groups.reduce({}) do |chosen, (root, candidates)|
|
68
|
-
# connections = calculate_connections(root, candidates)
|
69
|
-
# best_score, best_connected = connections.reduce({}) { |h, (k, v)| (h[v] ||= []) << k; h }.max
|
70
|
-
# most_specific = best_connected.max_by { |h| h.deepest_position }
|
71
|
-
# chosen[root] = most_specific
|
72
|
-
# chosen
|
73
|
-
#end
|
74
|
-
#end
|
75
|
-
|
76
48
|
private
|
77
49
|
|
78
50
|
def calculate_connections(root, headings, weight)
|
@@ -90,11 +62,7 @@ module MESH
|
|
90
62
|
heading.parents.each do |p|
|
91
63
|
connections[p] ||= 0
|
92
64
|
connections[p] += weight
|
93
|
-
#add_connection(connections, root, p, weight)
|
94
65
|
end
|
95
|
-
#heading.siblings.each do |p|
|
96
|
-
# add_connection(connections, p)
|
97
|
-
#end
|
98
66
|
end
|
99
67
|
|
100
68
|
end
|
data/lib/MESH/heading.rb
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
module MESH
  # A single MeSH descriptor record.
  #
  # Structural data (unique id, tree numbers, roots, parents, children) is
  # locale-independent. Textual data (original heading, natural language name,
  # summary and entry terms) is stored per locale in hashes keyed by locale;
  # every textual reader/writer takes an optional locale that defaults to
  # +default_locale+.
  #
  # Headings are ordered by +unique_id+ (via Comparable).
  class Heading
    include Comparable

    attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class, :default_locale

    # Creates an empty heading that is marked useful and has no structural
    # links or localized text.
    def initialize
      @useful = true
      @tree_numbers = []
      @roots = []
      @parents = []
      @children = []
      @entries = {}
      @original_heading = {}
      @natural_language_name = {}
      @summary = {}
    end

    # Orders headings by unique id.
    #
    # Returns nil when +other+ does not expose a unique id, so that
    # Comparable#== answers false for foreign objects (nil, strings, ...)
    # instead of raising NoMethodError.
    def <=>(other)
      return nil unless other.respond_to?(:unique_id)

      unique_id <=> other.unique_id
    end

    # Returns the heading text stored for +locale+, or nil if none was set.
    def original_heading(locale = default_locale)
      @original_heading[locale]
    end

    # Returns the natural language name stored for +locale+, or nil.
    def natural_language_name(locale = default_locale)
      @natural_language_name[locale]
    end

    # Returns the summary stored for +locale+, or nil.
    def summary(locale = default_locale)
      @summary[locale]
    end

    # Returns the mutable array of entry terms for +locale+, creating an
    # empty one on first access. Callers append to and sort this array in place.
    def entries(locale = default_locale)
      @entries[locale] ||= []
    end

    # True when +heading+ is a parent of this heading or of any of its
    # ancestors (searched recursively through +parents+).
    def has_ancestor(heading)
      return false if parents.empty?
      return true if parents.include? heading

      parents.any? { |parent| parent.has_ancestor(heading) }
    end

    # True when +heading+ is a child of this heading or of any of its
    # descendants (searched recursively through +children+).
    def has_descendant(heading)
      return false if children.empty?
      return true if children.include? heading

      children.any? { |child| child.has_descendant(heading) }
    end

    # True when this heading shares at least one parent with +heading+.
    def sibling?(heading)
      !(parents & heading.parents).empty?
    end

    # Returns the number of dot-separated segments in the longest tree number
    # that starts with +root+, or nil when the heading has no tree numbers.
    #
    # Tree numbers that do not start with +root+ are ranked as length 0, so if
    # none of them match the first tree number is used.
    def deepest_position(root = '')
      return nil if tree_numbers.empty?

      deepest_tree_number = tree_numbers.max_by { |tn| tn.start_with?(root) ? tn.length : 0 }
      deepest_tree_number.split('.').length
    end

    # Returns the number of dot-separated segments in the shortest tree
    # number, or nil when the heading has no tree numbers.
    def shallowest_position
      return nil if tree_numbers.empty?

      shallowest_tree_number = tree_numbers.min_by { |tn| tn.length }
      shallowest_tree_number.split('.').length
    end

    # Tests this heading against a hash of +field => pattern+ conditions.
    # All conditions must hold for the result to be true.
    #
    # +field+ names a public reader on the heading. The comparison depends on
    # the value that reader returns:
    # * Array           - at least one element must match +pattern+ with =~
    # * true / false    - must equal +pattern+
    # * Symbol          - must equal +pattern+
    # * anything else   - must match +pattern+ with =~
    def matches(conditions)
      conditions.each do |field, pattern|
        # public_send keeps caller-supplied field names away from private methods.
        field_content = public_send(field)
        satisfied =
          case field_content
          when Array then field_content.any? { |fc| pattern =~ fc }
          # A Symbol match must not end the loop early: later conditions
          # still have to be checked.
          when true, false, Symbol then field_content == pattern
          else pattern =~ field_content
          end
        return false unless satisfied
      end
      true
    end

    # Compact one-line description: unique id, default-locale heading and
    # the comma-joined tree numbers.
    def inspect
      "#{unique_id}, #{original_heading}, [#{tree_numbers.join(',')}]"
    end

    # Stores +heading+ as the heading text for +locale+.
    def set_original_heading(heading, locale = default_locale)
      @original_heading[locale] = heading
    end

    # Stores +name+ as the natural language name for +locale+.
    def set_natural_language_name(name, locale = default_locale)
      @natural_language_name[locale] = name
    end

    # Stores +summary+ as the summary for +locale+.
    def set_summary(summary, locale = default_locale)
      @summary[locale] = summary
    end
  end
end
|
109
|
+
|
110
|
+
#
|
111
|
+
#*NEWRECORD
|
112
|
+
#RECTYPE = D
|
113
|
+
#MH = Calcimycin
|
114
|
+
#AQ = AA AD AE AG AI AN BI BL CF CH CL CS CT DU EC HI IM IP ME PD PK PO RE SD ST TO TU UR
|
115
|
+
#ENTRY = A-23187|T109|T195|LAB|NRW|NLM (1991)|900308|abbcdef
|
116
|
+
#ENTRY = A23187|T109|T195|LAB|NRW|UNK (19XX)|741111|abbcdef
|
117
|
+
#ENTRY = Antibiotic A23187|T109|T195|NON|NRW|NLM (1991)|900308|abbcdef
|
118
|
+
#ENTRY = A 23187
|
119
|
+
#ENTRY = A23187, Antibiotic
|
120
|
+
#MN = D03.438.221.173
|
121
|
+
#PA = Anti-Bacterial Agents
|
122
|
+
#PA = Calcium Ionophores
|
123
|
+
#MH_TH = FDA SRS (2014)
|
124
|
+
#MH_TH = NLM (1975)
|
125
|
+
#ST = T109
|
126
|
+
#ST = T195
|
127
|
+
#N1 = 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6S-(6alpha(2S*,3S*),8beta(R*),9beta,11alpha))-
|
128
|
+
# RN = 37H9VM9WZL
|
129
|
+
#RR = 52665-69-7 (Calcimycin)
|
130
|
+
#PI = Antibiotics (1973-1974)
|
131
|
+
#PI = Carboxylic Acids (1973-1974)
|
132
|
+
#MS = An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports CALCIUM and other divalent cations across membranes and uncouples oxidative phosphorylation while inhibiting ATPase of rat liver mitochondria. The substance is used mostly as a biochemical tool to study the role of divalent cations in various biological systems.
|
133
|
+
# OL = use CALCIMYCIN to search A 23187 1975-90
|
134
|
+
#PM = 91; was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)
|
135
|
+
#HN = 91(75); was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)
|
136
|
+
#MED = *62
|
137
|
+
#MED = 847
|
138
|
+
#M90 = *299
|
139
|
+
#M90 = 2405
|
140
|
+
#M85 = *454
|
141
|
+
#M85 = 2878
|
142
|
+
#M80 = *316
|
143
|
+
#M80 = 1601
|
144
|
+
#M75 = *300
|
145
|
+
#M75 = 823
|
146
|
+
#M66 = *1
|
147
|
+
#M66 = 3
|
148
|
+
#M94 = *153
|
149
|
+
#M94 = 1606
|
150
|
+
#MR = 20130708
|
151
|
+
#DA = 19741119
|
152
|
+
#DC = 1
|
153
|
+
#DX = 19840101
|
154
|
+
#UI = D000001
|
155
|
+
#
|
data/lib/MESH/tree.rb
ADDED
@@ -0,0 +1,216 @@
|
|
1
|
+
module MESH
  # In-memory index over the MeSH descriptor records shipped with the gem.
  #
  # On construction the gzipped descriptor file is parsed into MESH::Heading
  # objects, indexed by unique id, tree number and original heading, and the
  # parent/child links are derived from the tree numbers. Additional locales
  # can be layered on afterwards with #load_translation.
  class Tree
    # Descriptor classes indexed by the numeric DC field of a record. The
    # first slot is a placeholder so the remaining entries start at index 1.
    DESCRIPTOR_CLASSES = [:make_array_start_at_1, :topical_descriptor, :publication_type, :check_tag, :geographic_descriptor].freeze

    # Locale of the base descriptor file.
    DEFAULT_LOCALE = :en_us

    # Parses the bundled descriptor file and builds all indexes.
    #
    # Raises whatever Zlib::GzipReader.open raises when the data file is
    # missing or is not valid gzip.
    def initialize
      @headings = []
      @by_unique_id = {}
      @by_tree_number = {}
      @by_original_heading = {}
      @locales = [DEFAULT_LOCALE]

      filename = File.expand_path('../../../data/mesh_data_2014/d2014.bin.gz', __FILE__)
      # Block form closes both the gzip stream and the underlying file.
      Zlib::GzipReader.open(filename) do |file|
        current_heading = new_heading
        file.each_line do |line|
          case line
          when /^\*NEWRECORD$/
            register_heading(current_heading)
            current_heading = new_heading
          when /^UI = (.*)/
            current_heading.unique_id = Regexp.last_match(1)
          when /^MN = (.*)/
            tree_number = Regexp.last_match(1)
            current_heading.tree_numbers << tree_number
            # The root is the first character of the tree number.
            root = tree_number[0]
            current_heading.roots << root unless current_heading.roots.include?(root)
          when /^MS = (.*)/
            current_heading.set_summary(Regexp.last_match(1))
          when /^DC = (.*)/
            current_heading.descriptor_class = DESCRIPTOR_CLASSES[Regexp.last_match(1).to_i]
          when /^MH = (.*)/
            mh = Regexp.last_match(1)
            current_heading.set_original_heading(mh)
            current_heading.entries << mh
            current_heading.set_natural_language_name(natural_language_name_for(mh))
          when /^(?:PRINT )?ENTRY = ([^|]+)/
            current_heading.entries << Regexp.last_match(1).chomp
          end
        end
        # Records are committed when the next *NEWRECORD marker is seen, so
        # the final record of the file has to be committed explicitly.
        register_heading(current_heading)
      end

      link_parents_and_children
    end

    # Loads the translated descriptor file for +locale+ and stores its
    # heading, natural language name, summary and entry terms on the
    # matching headings (looked up by unique id). Records whose unique id is
    # unknown to this tree are ignored.
    #
    # Does nothing (returns nil) when +locale+ is already loaded; otherwise
    # returns the updated list of loaded locales.
    def load_translation(locale)
      return if @locales.include? locale

      filename = File.expand_path("../../../data/mesh_data_2014/d2014.#{locale}.bin.gz", __FILE__)
      Zlib::GzipReader.open(filename) do |file|
        record = blank_translation
        file.each_line do |line|
          case line
          when /^\*NEWRECORD$/
            apply_translation(record, locale)
            # Start every record from a clean slate so no field can leak
            # from one record into the next.
            record = blank_translation
          when /^UI = (.*)/
            record[:unique_id] = Regexp.last_match(1)
          when /^MS = (.*)/
            record[:summary] = Regexp.last_match(1)
          when /^MH = (.*)/
            mh = Regexp.last_match(1)
            record[:original_heading] = mh
            record[:entries] << mh
            record[:natural_language_name] = natural_language_name_for(mh)
          when /^(?:PRINT )?ENTRY = ([^|]+)/
            record[:entries] << Regexp.last_match(1).chomp
          end
        end
        # Commit the final record, which has no trailing *NEWRECORD marker.
        apply_translation(record, locale)
      end
      @locales << locale
    end

    # Returns the heading with the given unique id, or nil.
    def find(unique_id)
      @by_unique_id[unique_id]
    end

    # Returns the heading registered under +tree_number+, or nil.
    def find_by_tree_number(tree_number)
      @by_tree_number[tree_number]
    end

    # Returns the heading whose default-locale original heading is
    # +heading+, or nil.
    def find_by_original_heading(heading)
      @by_original_heading[heading]
    end

    # Returns every heading (useful or not) for which
    # Heading#matches(conditions) is true.
    def where(conditions)
      @headings.select { |heading| heading.matches(conditions) }
    end

    # Yields each heading that is flagged useful, in load order.
    # Without a block, returns an Enumerator over the same headings.
    def each
      return enum_for(:each) unless block_given?

      @headings.each { |heading| yield heading if heading.useful }
      self
    end

    # Finds entry terms of useful headings, in every loaded locale, that
    # occur in +text+ as whole terms (not embedded in a longer word).
    #
    # Entries made up solely of upper-case letters and digits are matched
    # case-sensitively; all other entries are matched case-insensitively.
    # A match whose span lies entirely inside another match's span is
    # discarded, so only the longest covering terms survive.
    #
    # Returns an array of hashes with keys :heading, :matched (the entry
    # term) and :index (character offset of the term within +text+).
    # Returns [] when +text+ is nil.
    def match_in_text(text)
      return [] if text.nil?

      downcased = text.downcase
      matches = []
      @headings.each do |heading|
        next unless heading.useful

        @locales.each do |locale|
          heading.entries(locale).each do |entry|
            # This is a looser check than the regex but much, much faster.
            next unless downcased.include? entry.downcase

            text.scan(whole_term_regex(entry)) do
              matches << {heading: heading, matched: entry, index: Regexp.last_match.begin(0)}
            end
          end
        end
      end

      matches.combination(2) do |l, r|
        if (r[:index] >= l[:index]) && (r[:index] + r[:matched].length <= l[:index] + l[:matched].length)
          # r is within l
          r[:delete] = true
        elsif (l[:index] >= r[:index]) && (l[:index] + l[:matched].length <= r[:index] + r[:matched].length)
          # l is within r
          l[:delete] = true
        end
      end
      matches.delete_if { |match| match[:delete] }
    end

    private

    # Builds an empty heading bound to the default locale.
    def new_heading
      heading = MESH::Heading.new
      heading.default_locale = DEFAULT_LOCALE
      heading
    end

    # Adds a fully parsed heading to the list and to every lookup index.
    # Headings without a unique id are ignored.
    def register_heading(heading)
      return if heading.unique_id.nil?

      heading.entries.sort!
      @headings << heading
      @by_unique_id[heading.unique_id] = heading
      @by_original_heading[heading.original_heading] = heading
      heading.tree_numbers.each { |tree_number| @by_tree_number[tree_number] = heading }
    end

    # Derives parent/child links: the parent of a tree number such as
    # D03.438.221.173 is the heading registered under D03.438.221.
    # NOTE(review): a heading reachable from the same parent through two
    # tree numbers is linked twice - confirm whether callers rely on that.
    def link_parents_and_children
      @by_unique_id.each_value do |heading|
        heading.tree_numbers.each do |tree_number|
          parts = tree_number.split('.')
          next unless parts.size > 1

          parent = @by_tree_number[parts[0...-1].join('.')]
          next if parent.nil?

          heading.parents << parent
          parent.children << heading
        end
      end
    end

    # Turns a librarian-style heading ("A23187, Antibiotic") into natural
    # word order ("Antibiotic A23187"); headings without ", " are returned
    # unchanged.
    def natural_language_name_for(mh)
      librarian_parts = mh.match(/(.*), (.*)/)
      librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
    end

    # Fresh accumulator for one translated record.
    def blank_translation
      {unique_id: nil, original_heading: nil, natural_language_name: nil, summary: nil, entries: []}
    end

    # Copies one accumulated translated record onto the heading with the
    # same unique id. Does nothing when the record has no unique id or no
    # such heading exists. Entry terms are added sorted and de-duplicated.
    def apply_translation(record, locale)
      return if record[:unique_id].nil?

      heading = find(record[:unique_id])
      return if heading.nil?

      heading.set_original_heading(record[:original_heading], locale) unless record[:original_heading].nil?
      heading.set_natural_language_name(record[:natural_language_name], locale) unless record[:natural_language_name].nil?
      heading.set_summary(record[:summary], locale) unless record[:summary].nil?
      localized = heading.entries(locale)
      record[:entries].sort.uniq.each { |entry| localized << entry }
    end

    # Regex matching +entry+ only where it is not embedded in a longer word.
    # Lookarounds are used instead of consuming the neighbouring characters
    # so that the match offset is the offset of the term itself and a term
    # repeated after a single separator is found both times.
    def whole_term_regex(entry)
      quoted = Regexp.quote(entry)
      if /\A[A-Z0-9]+\z/ =~ entry
        /(?<!\w)#{quoted}(?!\w)/
      else
        /(?<!\w)#{quoted}(?!\w)/i
      end
    end
  end
end
|