mesh-medical-subject-headings 1.3.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Gemfile.lock +1 -1
- data/data/mesh_data_2014/d2014.en_gb.bin.gz +0 -0
- data/lib/MESH.rb +2 -1
- data/lib/MESH/classifier.rb +0 -32
- data/lib/MESH/heading.rb +155 -0
- data/lib/MESH/tree.rb +216 -0
- data/lib/MESH/version.rb +1 -1
- data/match.rb +3 -3
- data/test/classifier_test.rb +239 -259
- data/test/{mesh_test.rb → mesh_core_test.rb} +172 -172
- data/test/test_helper.rb +11 -11
- data/tr_speed.rb +2 -2
- metadata +7 -5
- data/lib/MESH/mesh.rb +0 -314
data/test/test_helper.rb
CHANGED
@@ -9,17 +9,17 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
9
9
|
|
10
10
|
require "MESH"
|
11
11
|
|
12
|
-
puts 'Configuring MESH::Mesh — this may take up to 10 seconds.'
|
13
|
-
start = Time.now
|
14
|
-
MESH::Mesh.configure(filename: File.expand_path('../../data/mesh_data_2014/d2014.bin.gz', __FILE__))
|
15
|
-
finish = Time.now
|
16
|
-
configuration_time = finish - start
|
12
|
+
# puts 'Configuring MESH::Mesh — this may take up to 10 seconds.'
|
13
|
+
# start = Time.now
|
14
|
+
# MESH::Mesh.configure(filename: File.expand_path('../../data/mesh_data_2014/d2014.bin.gz', __FILE__))
|
15
|
+
# finish = Time.now
|
16
|
+
# configuration_time = finish - start
|
17
17
|
#raise 'MESH::Mesh should configure in less than 10 seconds.' unless configuration_time < 10
|
18
18
|
|
19
|
-
puts 'Translating MESH::Mesh into English ;) — this may take up to 60 seconds.'
|
20
|
-
start = Time.now
|
21
|
-
MESH::Mesh.translate('en-GB', MESH::Translator.new(MESH::Translator.enus_to_engb))
|
22
|
-
finish = Time.now
|
23
|
-
configuration_time = finish - start
|
24
|
-
puts "took #{configuration_time}"
|
19
|
+
# puts 'Translating MESH::Mesh into English ;) — this may take up to 60 seconds.'
|
20
|
+
# start = Time.now
|
21
|
+
# MESH::Mesh.translate('en-GB', MESH::Translator.new(MESH::Translator.enus_to_engb))
|
22
|
+
# finish = Time.now
|
23
|
+
# configuration_time = finish - start
|
24
|
+
# puts "took #{configuration_time}"
|
25
25
|
#raise 'MESH::Mesh should translate in less than 30 seconds.' unless configuration_time < 60
|
data/tr_speed.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require_relative 'lib/MESH'
|
2
2
|
|
3
3
|
puts DateTime.now
|
4
|
-
MESH::Mesh.configure(filename: 'data/mesh_data_2014/d2014.bin.gz')
|
4
|
+
MESH::Heading.configure(filename: 'data/mesh_data_2014/d2014.bin.gz')
|
5
5
|
puts DateTime.now
|
6
|
-
MESH::Mesh.translate('en-GB', MESH::Translator.new(MESH::Translator.enus_to_engb))
|
6
|
+
MESH::Heading.translate('en-GB', MESH::Translator.new(MESH::Translator.enus_to_engb))
|
7
7
|
puts DateTime.now
|
8
8
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mesh-medical-subject-headings
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Styles
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-04-
|
11
|
+
date: 2014-04-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -113,17 +113,19 @@ files:
|
|
113
113
|
- Rakefile
|
114
114
|
- data/mesh_data_2014/c2014.bin.gz
|
115
115
|
- data/mesh_data_2014/d2014.bin.gz
|
116
|
+
- data/mesh_data_2014/d2014.en_gb.bin.gz
|
116
117
|
- data/mesh_data_2014/mtrees2014.bin.gz
|
117
118
|
- data/mesh_data_2014/q2014.bin.gz
|
118
119
|
- data/mesh_data_2014/useful_2014.tsv
|
119
120
|
- lib/MESH.rb
|
120
121
|
- lib/MESH/classifier.rb
|
121
|
-
- lib/MESH/mesh.rb
|
122
|
+
- lib/MESH/heading.rb
|
122
123
|
- lib/MESH/translator.rb
|
124
|
+
- lib/MESH/tree.rb
|
123
125
|
- lib/MESH/version.rb
|
124
126
|
- match.rb
|
125
127
|
- test/classifier_test.rb
|
126
|
-
- test/mesh_test.rb
|
128
|
+
- test/mesh_core_test.rb
|
127
129
|
- test/test_helper.rb
|
128
130
|
- test/translator_test.rb
|
129
131
|
- tr_speed.rb
|
@@ -154,7 +156,7 @@ summary: A ruby gem containing MeSH subject headings (https://www.nlm.nih.gov/me
|
|
154
156
|
for use in classifying and entity recognition.
|
155
157
|
test_files:
|
156
158
|
- test/classifier_test.rb
|
157
|
-
- test/mesh_test.rb
|
159
|
+
- test/mesh_core_test.rb
|
158
160
|
- test/test_helper.rb
|
159
161
|
- test/translator_test.rb
|
160
162
|
has_rdoc:
|
data/lib/MESH/mesh.rb
DELETED
@@ -1,314 +0,0 @@
|
|
1
|
-
module MESH
|
2
|
-
class Mesh
|
3
|
-
|
4
|
-
include Comparable
|
5
|
-
attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class
|
6
|
-
|
7
|
-
def <=> other
|
8
|
-
self.unique_id <=> other.unique_id
|
9
|
-
end
|
10
|
-
|
11
|
-
def original_heading(locale = @@default_locale)
|
12
|
-
return @original_heading[locale]
|
13
|
-
end
|
14
|
-
|
15
|
-
def natural_language_name(locale = @@default_locale)
|
16
|
-
return @natural_language_name[locale]
|
17
|
-
end
|
18
|
-
|
19
|
-
def summary(locale = @@default_locale)
|
20
|
-
return @summary[locale]
|
21
|
-
end
|
22
|
-
|
23
|
-
def entries(locale = @@default_locale)
|
24
|
-
@entries[locale] ||= []
|
25
|
-
return @entries[locale]
|
26
|
-
end
|
27
|
-
|
28
|
-
def self.configure(args)
|
29
|
-
return if @@configured
|
30
|
-
raise ArgumentError.new('MeshHeadingGraph requires a filename in order to configure itself') unless not args[:filename].nil?
|
31
|
-
|
32
|
-
gzipped_file = File.open(args[:filename])
|
33
|
-
file = Zlib::GzipReader.new(gzipped_file)
|
34
|
-
|
35
|
-
current_heading = Mesh.new
|
36
|
-
file.each_line do |line|
|
37
|
-
|
38
|
-
case
|
39
|
-
|
40
|
-
when matches = line.match(/^\*NEWRECORD$/)
|
41
|
-
unless current_heading.unique_id.nil?
|
42
|
-
current_heading.entries.sort!
|
43
|
-
@@headings << current_heading
|
44
|
-
@@by_unique_id[current_heading.unique_id] = current_heading
|
45
|
-
@@by_original_heading[current_heading.original_heading] = current_heading
|
46
|
-
current_heading.tree_numbers.each do |tree_number|
|
47
|
-
@@by_tree_number[tree_number] = current_heading
|
48
|
-
end
|
49
|
-
end
|
50
|
-
current_heading = Mesh.new
|
51
|
-
|
52
|
-
when matches = line.match(/^UI = (.*)/)
|
53
|
-
current_heading.unique_id = matches[1]
|
54
|
-
|
55
|
-
when matches = line.match(/^MN = (.*)/)
|
56
|
-
current_heading.tree_numbers << matches[1]
|
57
|
-
current_heading.roots << matches[1][0] unless current_heading.roots.include?(matches[1][0])
|
58
|
-
|
59
|
-
when matches = line.match(/^MS = (.*)/)
|
60
|
-
current_heading.set_summary(matches[1])
|
61
|
-
|
62
|
-
when matches = line.match(/^DC = (.*)/)
|
63
|
-
current_heading.descriptor_class = @@descriptor_classes[matches[1].to_i]
|
64
|
-
|
65
|
-
when matches = line.match(/^MH = (.*)/)
|
66
|
-
mh = matches[1]
|
67
|
-
current_heading.set_original_heading(mh)
|
68
|
-
current_heading.entries << mh
|
69
|
-
librarian_parts = mh.match(/(.*), (.*)/)
|
70
|
-
nln = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
|
71
|
-
current_heading.set_natural_language_name(nln)
|
72
|
-
|
73
|
-
when matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/)
|
74
|
-
entry = matches[1].chomp
|
75
|
-
current_heading.entries << entry
|
76
|
-
|
77
|
-
end
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
@@by_unique_id.each do |id, heading|
|
82
|
-
heading.tree_numbers.each do |tree_number|
|
83
|
-
#D03.438.221.173
|
84
|
-
parts = tree_number.split('.')
|
85
|
-
if parts.size > 1
|
86
|
-
parts.pop
|
87
|
-
parent_tree_number = parts.join '.'
|
88
|
-
parent = @@by_tree_number[parent_tree_number]
|
89
|
-
heading.parents << parent unless parent.nil?
|
90
|
-
parent.children << heading unless parent.nil?
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
@@configured = true
|
95
|
-
end
|
96
|
-
|
97
|
-
def self.translate(locale, tr)
|
98
|
-
return if @@locales.include? locale
|
99
|
-
@@headings.each_with_index do |h, i|
|
100
|
-
h.set_original_heading(tr.translate(h.original_heading), locale)
|
101
|
-
h.set_natural_language_name(tr.translate(h.natural_language_name), locale)
|
102
|
-
h.set_summary(tr.translate(h.summary), locale)
|
103
|
-
h.entries.each { |entry| h.entries(locale) << tr.translate(entry) }
|
104
|
-
h.entries(locale).sort!
|
105
|
-
end
|
106
|
-
|
107
|
-
@@locales << locale
|
108
|
-
end
|
109
|
-
|
110
|
-
def self.find(unique_id)
|
111
|
-
raise 'MeshHeadingGraph.configure must be called before use' unless @@configured
|
112
|
-
return @@by_unique_id[unique_id]
|
113
|
-
end
|
114
|
-
|
115
|
-
def self.find_by_tree_number(tree_number)
|
116
|
-
raise 'MeshHeadingGraph.configure must be called before use' unless @@configured
|
117
|
-
return @@by_tree_number[tree_number]
|
118
|
-
end
|
119
|
-
|
120
|
-
def self.find_by_original_heading(heading)
|
121
|
-
raise 'MeshHeadingGraph.configure must be called before use' unless @@configured
|
122
|
-
return @@by_original_heading[heading]
|
123
|
-
end
|
124
|
-
|
125
|
-
def self.where(conditions)
|
126
|
-
matches = []
|
127
|
-
@@headings.each do |heading|
|
128
|
-
matches << heading if heading.matches(conditions)
|
129
|
-
end
|
130
|
-
matches
|
131
|
-
end
|
132
|
-
|
133
|
-
def self.each
|
134
|
-
for i in 0 ... @@headings.size
|
135
|
-
yield @@headings[i] if @@headings[i].useful
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
|
-
def self.match_in_text(text)
|
140
|
-
return [] if text.nil?
|
141
|
-
downcased = text.downcase
|
142
|
-
matches = []
|
143
|
-
@@headings.each do |heading|
|
144
|
-
next unless heading.useful
|
145
|
-
@@locales.each do |locale|
|
146
|
-
heading.entries(locale).each do |entry|
|
147
|
-
if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster
|
148
|
-
if /^[A-Z0-9]+$/ =~ entry
|
149
|
-
regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/
|
150
|
-
else
|
151
|
-
regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i
|
152
|
-
end
|
153
|
-
text.to_enum(:scan, regex).map do |m,|
|
154
|
-
matches << {heading: heading, matched: entry, index: $`.size}
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|
158
|
-
end
|
159
|
-
end
|
160
|
-
confirmed_matches = []
|
161
|
-
matches.combination(2) do |l, r|
|
162
|
-
if (r[:index] >= l[:index]) && (r[:index] + r[:matched].length <= l[:index] + l[:matched].length)
|
163
|
-
#r is within l
|
164
|
-
r[:delete] = true
|
165
|
-
elsif (l[:index] >= r[:index]) && (l[:index] + l[:matched].length <= r[:index] + r[:matched].length)
|
166
|
-
#l is within r
|
167
|
-
l[:delete] = true
|
168
|
-
end
|
169
|
-
end
|
170
|
-
matches.delete_if { |match| match[:delete] }
|
171
|
-
end
|
172
|
-
|
173
|
-
def has_ancestor(heading)
|
174
|
-
return false if parents.empty?
|
175
|
-
return true if parents.include? heading
|
176
|
-
in_grandparents = parents.map { |p| p.has_ancestor(heading) }
|
177
|
-
return in_grandparents.include? true
|
178
|
-
end
|
179
|
-
|
180
|
-
def has_descendant(heading)
|
181
|
-
return false if children.empty?
|
182
|
-
return true if children.include? heading
|
183
|
-
in_grandchildren = children.map { |p| p.has_descendant(heading) }
|
184
|
-
return in_grandchildren.include? true
|
185
|
-
end
|
186
|
-
|
187
|
-
def sibling?(heading)
|
188
|
-
common_parents = parents & heading.parents
|
189
|
-
!common_parents.empty?
|
190
|
-
end
|
191
|
-
|
192
|
-
def deepest_position(root = '')
|
193
|
-
return nil if tree_numbers.empty?
|
194
|
-
deepest_tree_number = tree_numbers.max_by { |tn| tn.start_with?(root) ? tn.length : 0 }
|
195
|
-
deepest_tree_number.split('.').length
|
196
|
-
end
|
197
|
-
|
198
|
-
def shallowest_position
|
199
|
-
return nil if tree_numbers.empty?
|
200
|
-
shallowest_tree_number = tree_numbers.min_by { |tn| tn.length }
|
201
|
-
shallowest_tree_number.split('.').length
|
202
|
-
end
|
203
|
-
|
204
|
-
def self.cluster(headings)
|
205
|
-
return headings
|
206
|
-
end
|
207
|
-
|
208
|
-
def matches(conditions)
|
209
|
-
conditions.each do |field, pattern|
|
210
|
-
field_content = self.send(field)
|
211
|
-
if field_content.kind_of?(Array)
|
212
|
-
return false unless field_content.find { |fc| pattern =~ fc }
|
213
|
-
elsif field_content.is_a?(TrueClass) || field_content.is_a?(FalseClass)
|
214
|
-
return false unless field_content == pattern
|
215
|
-
elsif field_content.is_a? Symbol
|
216
|
-
return field_content == pattern
|
217
|
-
else
|
218
|
-
return false unless pattern =~ field_content
|
219
|
-
end
|
220
|
-
end
|
221
|
-
return true
|
222
|
-
end
|
223
|
-
|
224
|
-
def inspect
|
225
|
-
return "#{unique_id}, #{original_heading}, [#{tree_numbers.join(',')}]"
|
226
|
-
end
|
227
|
-
|
228
|
-
def set_original_heading(heading, locale = @@default_locale)
|
229
|
-
@original_heading[locale] = heading
|
230
|
-
end
|
231
|
-
|
232
|
-
def set_natural_language_name(name, locale = @@default_locale)
|
233
|
-
@natural_language_name[locale] = name
|
234
|
-
end
|
235
|
-
|
236
|
-
def set_summary(summary, locale = @@default_locale)
|
237
|
-
@summary[locale] = summary
|
238
|
-
end
|
239
|
-
|
240
|
-
private
|
241
|
-
|
242
|
-
@@configured = false
|
243
|
-
@@headings = []
|
244
|
-
@@by_unique_id = {}
|
245
|
-
@@by_tree_number = {}
|
246
|
-
@@by_original_heading = {}
|
247
|
-
@@default_locale = 'en-US'
|
248
|
-
@@locales = [@@default_locale]
|
249
|
-
@@us_to_gb = Translator.new(Translator.enus_to_engb)
|
250
|
-
@@descriptor_classes = [:make_array_start_at_1, :topical_descriptor, :publication_type, :check_tag, :geographic_descriptor]
|
251
|
-
|
252
|
-
def initialize
|
253
|
-
@useful = true
|
254
|
-
@tree_numbers = []
|
255
|
-
@roots = []
|
256
|
-
@parents = []
|
257
|
-
@children = []
|
258
|
-
@entries = {}
|
259
|
-
@entries[@@default_locale] = []
|
260
|
-
@original_heading = {}
|
261
|
-
@natural_language_name = {}
|
262
|
-
@summary = {}
|
263
|
-
end
|
264
|
-
|
265
|
-
|
266
|
-
end
|
267
|
-
end
|
268
|
-
|
269
|
-
#
|
270
|
-
#*NEWRECORD
|
271
|
-
#RECTYPE = D
|
272
|
-
#MH = Calcimycin
|
273
|
-
#AQ = AA AD AE AG AI AN BI BL CF CH CL CS CT DU EC HI IM IP ME PD PK PO RE SD ST TO TU UR
|
274
|
-
#ENTRY = A-23187|T109|T195|LAB|NRW|NLM (1991)|900308|abbcdef
|
275
|
-
#ENTRY = A23187|T109|T195|LAB|NRW|UNK (19XX)|741111|abbcdef
|
276
|
-
#ENTRY = Antibiotic A23187|T109|T195|NON|NRW|NLM (1991)|900308|abbcdef
|
277
|
-
#ENTRY = A 23187
|
278
|
-
#ENTRY = A23187, Antibiotic
|
279
|
-
#MN = D03.438.221.173
|
280
|
-
#PA = Anti-Bacterial Agents
|
281
|
-
#PA = Calcium Ionophores
|
282
|
-
#MH_TH = FDA SRS (2014)
|
283
|
-
#MH_TH = NLM (1975)
|
284
|
-
#ST = T109
|
285
|
-
#ST = T195
|
286
|
-
#N1 = 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6S-(6alpha(2S*,3S*),8beta(R*),9beta,11alpha))-
|
287
|
-
# RN = 37H9VM9WZL
|
288
|
-
#RR = 52665-69-7 (Calcimycin)
|
289
|
-
#PI = Antibiotics (1973-1974)
|
290
|
-
#PI = Carboxylic Acids (1973-1974)
|
291
|
-
#MS = An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports CALCIUM and other divalent cations across membranes and uncouples oxidative phosphorylation while inhibiting ATPase of rat liver mitochondria. The substance is used mostly as a biochemical tool to study the role of divalent cations in various biological systems.
|
292
|
-
# OL = use CALCIMYCIN to search A 23187 1975-90
|
293
|
-
#PM = 91; was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)
|
294
|
-
#HN = 91(75); was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)
|
295
|
-
#MED = *62
|
296
|
-
#MED = 847
|
297
|
-
#M90 = *299
|
298
|
-
#M90 = 2405
|
299
|
-
#M85 = *454
|
300
|
-
#M85 = 2878
|
301
|
-
#M80 = *316
|
302
|
-
#M80 = 1601
|
303
|
-
#M75 = *300
|
304
|
-
#M75 = 823
|
305
|
-
#M66 = *1
|
306
|
-
#M66 = 3
|
307
|
-
#M94 = *153
|
308
|
-
#M94 = 1606
|
309
|
-
#MR = 20130708
|
310
|
-
#DA = 19741119
|
311
|
-
#DC = 1
|
312
|
-
#DX = 19840101
|
313
|
-
#UI = D000001
|
314
|
-
#
|