mesh-medical-subject-headings 1.3.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3e0859a3e5c4757eb7babc4072ac97177ae0feee
4
- data.tar.gz: 2488bf617fa6761cf5b91c47c7b6a0b0641afca4
3
+ metadata.gz: 246c3f1005660ea5801534254a71eea1b0c1037c
4
+ data.tar.gz: 793b97acd06ab80e225f20b9d8f753b22b9fb6bb
5
5
  SHA512:
6
- metadata.gz: 2770ad376f99544f8825eaab301ab457ad29d559beeca42cc0e66fee92bc68f70d6eecdf23775c34de57734a31505e2e44e79461fa052ab73e8eba27738c2fad
7
- data.tar.gz: 9be16f17ace608d05d8dca68c407f76e7c450ffd1b79907a00fe20d6879aaa21acbfc1c2b0e5820f9ff04a0b52067f9f5e9464b19639d3b2135b0d6eae08d56d
6
+ metadata.gz: 7f21c94d6daa57f989777800f472fd679571c398fd535445fd8b75aabad1cfef38393cc0947b0545499211e3247b776531c9eab49a080e3d7893c0bd805ebf94
7
+ data.tar.gz: ab02026193ce2ce3d46099a00c1923bc76490a873eb81410f07c209666d8e34e2ee9f92edcd15bffb42f1391772a413be689e1abd9004123b7f0ac225a778848
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ #2.0.1 / 2014-04-28
2
+ * [FEATURE] Load translations from static file to improve load times
3
+
4
+ #2.0.0 / 2014-04-28
5
+ * [FEATURE] Refactored to instances for future running of multiple trees
6
+
1
7
  #1.3.0 / 2014-04-10
2
8
  * [FEATURE] Released classifier to group and score matched headings
3
9
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- mesh-medical-subject-headings (1.3.0)
4
+ mesh-medical-subject-headings (2.0.1)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/lib/MESH.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'MESH/version'
2
+ require 'MESH/tree'
3
+ require 'MESH/heading'
2
4
  require 'MESH/translator'
3
- require 'MESH/mesh'
4
5
  require 'MESH/classifier'
@@ -23,20 +23,11 @@ module MESH
23
23
  scored = {}
24
24
  weighted_headings.each do |weight, heading|
25
25
  calculate_scores(scored, root, heading, weight)
26
- #scored[heading] ||= 0
27
- #scored[heading] += weight
28
- #heading.parents.each do |p|
29
- # if p.roots.include? root
30
- # scored[p] ||= 0
31
- # scored[p] += weight / 3.0
32
- # end
33
- #end
34
26
  end
35
27
  scored.each { |h,s| scored[h] = s.round(3) }
36
28
  scored.delete_if { |h,s| s == 0 }
37
29
  best_score, best_connected = scored.reduce({}) { |h, (k, v)| (h[v] ||= []) << k; h }.max
38
30
  most_specific = best_connected.max_by { |h| h.deepest_position(root) }
39
- #chosen[root] = [most_specific, best_score]
40
31
  chosen[root] = [best_score, scored]
41
32
  end
42
33
 
@@ -54,25 +45,6 @@ module MESH
54
45
  end
55
46
  end
56
47
 
57
-
58
-
59
-
60
- #text = "#{document[:title]}\n#{document[:abstract]}\n#{document[:content]}"
61
- #matches = MESH::Mesh.match_in_text(text)
62
- #headings = matches.map { |m| m[:heading] }
63
- #root_groups = headings.reduce({}) do |rg, heading|
64
- # heading.roots.each { |root| (rg[root] ||= []) << heading }
65
- # rg
66
- #end
67
- #root_groups.reduce({}) do |chosen, (root, candidates)|
68
- # connections = calculate_connections(root, candidates)
69
- # best_score, best_connected = connections.reduce({}) { |h, (k, v)| (h[v] ||= []) << k; h }.max
70
- # most_specific = best_connected.max_by { |h| h.deepest_position }
71
- # chosen[root] = most_specific
72
- # chosen
73
- #end
74
- #end
75
-
76
48
  private
77
49
 
78
50
  def calculate_connections(root, headings, weight)
@@ -90,11 +62,7 @@ module MESH
90
62
  heading.parents.each do |p|
91
63
  connections[p] ||= 0
92
64
  connections[p] += weight
93
- #add_connection(connections, root, p, weight)
94
65
  end
95
- #heading.siblings.each do |p|
96
- # add_connection(connections, p)
97
- #end
98
66
  end
99
67
 
100
68
  end
@@ -0,0 +1,155 @@
1
module MESH
  # A single MeSH heading (descriptor record).
  #
  # A heading knows its unique id, its tree numbers, the first characters of
  # those tree numbers (+roots+), and its parent/child headings. Text fields
  # (original heading, natural language name, summary, entries) are stored per
  # locale; readers and writers fall back to +default_locale+ when no locale is
  # given.
  #
  # Headings are ordered (and, through Comparable, compared for equality) by
  # +unique_id+.
  class Heading
    include Comparable

    attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class, :default_locale

    # Creates an empty heading that is marked useful and has no relatives,
    # tree numbers or text yet. +default_locale+ is left unset (nil) until the
    # caller assigns it.
    def initialize
      @useful = true
      @tree_numbers = []
      @roots = []
      @parents = []
      @children = []
      @entries = {}
      @original_heading = {}
      @natural_language_name = {}
      @summary = {}
    end

    # Orders headings by unique id.
    #
    # Returns nil (instead of raising NoMethodError) when +other+ does not
    # respond to +unique_id+, so that <tt>heading == nil</tt> or a comparison
    # with an unrelated object is simply false.
    def <=>(other)
      return nil unless other.respond_to?(:unique_id)

      unique_id <=> other.unique_id
    end

    # Returns the original (librarian form) heading text for +locale+.
    def original_heading(locale = default_locale)
      @original_heading[locale]
    end

    # Returns the natural language name for +locale+.
    def natural_language_name(locale = default_locale)
      @natural_language_name[locale]
    end

    # Returns the summary text for +locale+.
    def summary(locale = default_locale)
      @summary[locale]
    end

    # Returns the mutable list of entry terms for +locale+, creating an empty
    # list the first time a locale is asked for.
    def entries(locale = default_locale)
      @entries[locale] ||= []
    end

    # True when +heading+ is a parent of this heading or of any of its
    # ancestors. Stops at the first ancestor path that contains it.
    def has_ancestor(heading)
      parents.include?(heading) || parents.any? { |parent| parent.has_ancestor(heading) }
    end

    # True when +heading+ is a child of this heading or of any of its
    # descendants. Stops at the first descendant path that contains it.
    def has_descendant(heading)
      children.include?(heading) || children.any? { |child| child.has_descendant(heading) }
    end

    # True when this heading shares at least one parent with +heading+.
    def sibling?(heading)
      !(parents & heading.parents).empty?
    end

    # Returns the number of dot-separated segments in the longest tree number
    # that starts with +root+, or nil when the heading has no tree numbers.
    #
    # When no tree number starts with +root+, every candidate scores 0 and the
    # first tree number is used (this mirrors the long-standing behaviour that
    # callers sorting by this value rely on).
    def deepest_position(root = '')
      return nil if tree_numbers.empty?

      deepest = tree_numbers.max_by { |tn| tn.start_with?(root) ? tn.length : 0 }
      deepest.split('.').length
    end

    # Returns the number of dot-separated segments in the shortest tree
    # number, or nil when the heading has no tree numbers.
    def shallowest_position
      return nil if tree_numbers.empty?

      tree_numbers.min_by(&:length).split('.').length
    end

    # Tests this heading against every +field => pattern+ pair in
    # +conditions+ and returns true only when all of them hold.
    #
    # * Array fields match when any element matches +pattern+ via <tt>=~</tt>.
    # * true/false and Symbol fields match when equal to +pattern+.
    # * Any other field matches when <tt>pattern =~ field</tt> succeeds.
    #
    # A matching Symbol field no longer ends the check early: the remaining
    # conditions are still evaluated. Fields are read with +public_send+, so
    # only public readers can be used as condition keys.
    def matches(conditions)
      conditions.all? do |field, pattern|
        content = public_send(field)
        case content
        when Array
          content.any? { |item| pattern =~ item }
        when true, false, Symbol
          content == pattern
        else
          !(pattern =~ content).nil?
        end
      end
    end

    # Short human-readable form: unique id, default-locale original heading
    # and the comma-joined tree numbers.
    def inspect
      "#{unique_id}, #{original_heading}, [#{tree_numbers.join(',')}]"
    end

    # Stores the original heading text for +locale+.
    def set_original_heading(heading, locale = default_locale)
      @original_heading[locale] = heading
    end

    # Stores the natural language name for +locale+.
    def set_natural_language_name(name, locale = default_locale)
      @natural_language_name[locale] = name
    end

    # Stores the summary text for +locale+.
    def set_summary(summary, locale = default_locale)
      @summary[locale] = summary
    end
  end
end
109
+
110
+ #
111
+ #*NEWRECORD
112
+ #RECTYPE = D
113
+ #MH = Calcimycin
114
+ #AQ = AA AD AE AG AI AN BI BL CF CH CL CS CT DU EC HI IM IP ME PD PK PO RE SD ST TO TU UR
115
+ #ENTRY = A-23187|T109|T195|LAB|NRW|NLM (1991)|900308|abbcdef
116
+ #ENTRY = A23187|T109|T195|LAB|NRW|UNK (19XX)|741111|abbcdef
117
+ #ENTRY = Antibiotic A23187|T109|T195|NON|NRW|NLM (1991)|900308|abbcdef
118
+ #ENTRY = A 23187
119
+ #ENTRY = A23187, Antibiotic
120
+ #MN = D03.438.221.173
121
+ #PA = Anti-Bacterial Agents
122
+ #PA = Calcium Ionophores
123
+ #MH_TH = FDA SRS (2014)
124
+ #MH_TH = NLM (1975)
125
+ #ST = T109
126
+ #ST = T195
127
+ #N1 = 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6S-(6alpha(2S*,3S*),8beta(R*),9beta,11alpha))-
128
+ # RN = 37H9VM9WZL
129
+ #RR = 52665-69-7 (Calcimycin)
130
+ #PI = Antibiotics (1973-1974)
131
+ #PI = Carboxylic Acids (1973-1974)
132
+ #MS = An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports CALCIUM and other divalent cations across membranes and uncouples oxidative phosphorylation while inhibiting ATPase of rat liver mitochondria. The substance is used mostly as a biochemical tool to study the role of divalent cations in various biological systems.
133
+ # OL = use CALCIMYCIN to search A 23187 1975-90
134
+ #PM = 91; was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)
135
+ #HN = 91(75); was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)
136
+ #MED = *62
137
+ #MED = 847
138
+ #M90 = *299
139
+ #M90 = 2405
140
+ #M85 = *454
141
+ #M85 = 2878
142
+ #M80 = *316
143
+ #M80 = 1601
144
+ #M75 = *300
145
+ #M75 = 823
146
+ #M66 = *1
147
+ #M66 = 3
148
+ #M94 = *153
149
+ #M94 = 1606
150
+ #MR = 20130708
151
+ #DA = 19741119
152
+ #DC = 1
153
+ #DX = 19840101
154
+ #UI = D000001
155
+ #
data/lib/MESH/tree.rb ADDED
@@ -0,0 +1,216 @@
1
require 'zlib'

module MESH
  # An in-memory MeSH tree built from the gzipped 2014 descriptor file that
  # ships with the gem.
  #
  # On construction every record is parsed into a MESH::Heading, indexed by
  # unique id, tree number and (default-locale) original heading, and linked
  # to its parent/child headings through the tree numbers. Additional locales
  # can be layered on afterwards with #load_translation.
  class Tree
    # Indexed by the integer in a record's "DC" line; index 0 is a filler so
    # that real classes start at 1.
    @@descriptor_classes = [:make_array_start_at_1, :topical_descriptor, :publication_type, :check_tag, :geographic_descriptor]
    @@default_locale = :en_us

    # Loads the default-locale data file and builds all indexes and
    # parent/child links. The gzip stream is closed once parsing is done.
    def initialize
      @headings = []
      @by_unique_id = {}
      @by_tree_number = {}
      @by_original_heading = {}
      @locales = [@@default_locale]

      filename = File.expand_path('../../../data/mesh_data_2014/d2014.bin.gz', __FILE__)
      Zlib::GzipReader.open(filename) { |file| load_headings(file) }
      link_parents_and_children
    end

    # Loads the translated data file for +locale+ and copies its original
    # heading, natural language name, summary and entries onto the headings
    # that are already in the tree (matched by unique id).
    #
    # Does nothing when +locale+ has already been loaded. The gzip stream is
    # closed once parsing is done.
    def load_translation(locale)
      return if @locales.include? locale

      filename = File.expand_path("../../../data/mesh_data_2014/d2014.#{locale}.bin.gz", __FILE__)
      Zlib::GzipReader.open(filename) { |file| load_translated_records(file, locale) }
      @locales << locale
    end

    # Disabled: no longer covered by tests.
    # def translate(locale, tr)
    #   return if @locales.include? locale
    #   @headings.each_with_index do |h, i|
    #     h.set_original_heading(tr.translate(h.original_heading), locale)
    #     h.set_natural_language_name(tr.translate(h.natural_language_name), locale)
    #     h.set_summary(tr.translate(h.summary), locale)
    #     h.entries.each { |entry| h.entries(locale) << tr.translate(entry) }
    #     h.entries(locale).sort!
    #   end
    #
    #   @locales << locale
    # end

    # Returns the heading with the given unique id, or nil.
    def find(unique_id)
      @by_unique_id[unique_id]
    end

    # Returns the heading registered under +tree_number+, or nil.
    def find_by_tree_number(tree_number)
      @by_tree_number[tree_number]
    end

    # Returns the heading whose default-locale original heading is +heading+,
    # or nil.
    def find_by_original_heading(heading)
      @by_original_heading[heading]
    end

    # Returns every heading (useful or not) for which
    # <tt>heading.matches(conditions)</tt> is true.
    def where(conditions)
      @headings.select { |heading| heading.matches(conditions) }
    end

    # Yields each heading whose +useful+ flag is set, in load order.
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @headings.each { |heading| yield heading if heading.useful }
      self
    end

    # Finds entry terms of useful headings inside +text+, across all loaded
    # locales.
    #
    # Returns an array of hashes with the keys +:heading+, +:matched+ (the
    # entry term) and +:index+ (character offset in +text+ where the term
    # itself starts). A match that lies entirely inside another match is
    # dropped. Returns [] for nil text.
    #
    # Entries made up only of capitals and digits are matched case
    # sensitively; everything else ignores case. Terms must not touch a word
    # character on either side. The boundaries are zero-width, so +:index+
    # points at the term rather than at the character before it, and
    # occurrences separated by a single character are all found.
    def match_in_text(text)
      return [] if text.nil?

      downcased = text.downcase
      matches = []
      @headings.each do |heading|
        next unless heading.useful

        @locales.each do |locale|
          heading.entries(locale).each do |entry|
            # Cheap substring test first: looser than the regex but far faster.
            next unless downcased.include?(entry.downcase)

            text.scan(entry_regex(entry)) do
              matches << { heading: heading, matched: entry, index: Regexp.last_match.begin(0) }
            end
          end
        end
      end
      remove_contained_matches(matches)
    end

    private

    # Builds a fresh heading preset with the tree's default locale.
    def new_heading
      heading = MESH::Heading.new
      heading.default_locale = @@default_locale
      heading
    end

    # Converts librarian form "Last, First" into "First Last"; text without a
    # ", " separator is returned unchanged.
    def natural_language_name_for(mh)
      librarian_parts = mh.match(/(.*), (.*)/)
      librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
    end

    # Parses default-locale records from +io+, registering a heading each
    # time a record ends (at the next *NEWRECORD line or at end of input).
    def load_headings(io)
      current = new_heading
      io.each_line do |line|
        case line
        when /^\*NEWRECORD$/
          register_heading(current)
          current = new_heading
        when /^UI = (.*)/
          current.unique_id = Regexp.last_match(1)
        when /^MN = (.*)/
          tree_number = Regexp.last_match(1)
          current.tree_numbers << tree_number
          current.roots << tree_number[0] unless current.roots.include?(tree_number[0])
        when /^MS = (.*)/
          current.set_summary(Regexp.last_match(1))
        when /^DC = (.*)/
          current.descriptor_class = @@descriptor_classes[Regexp.last_match(1).to_i]
        when /^MH = (.*)/
          mh = Regexp.last_match(1)
          current.set_original_heading(mh)
          current.entries << mh
          current.set_natural_language_name(natural_language_name_for(mh))
        when /^(?:PRINT )?ENTRY = ([^|]+)/
          # The capture can run to the end of the line, newline included.
          current.entries << Regexp.last_match(1).chomp
        end
      end
      # The last record has no following *NEWRECORD line to trigger it.
      register_heading(current)
    end

    # Adds +heading+ to the list and to all lookup indexes. Headings without
    # a unique id (e.g. the placeholder before the first record) are ignored.
    def register_heading(heading)
      return if heading.unique_id.nil?

      heading.entries.sort!
      @headings << heading
      @by_unique_id[heading.unique_id] = heading
      @by_original_heading[heading.original_heading] = heading
      heading.tree_numbers.each { |tree_number| @by_tree_number[tree_number] = heading }
    end

    # Links every heading to the heading registered under each of its tree
    # numbers with the last segment removed (D03.438.221 -> D03.438).
    def link_parents_and_children
      @by_unique_id.each_value do |heading|
        heading.tree_numbers.each do |tree_number|
          segments = tree_number.split('.')
          next if segments.size < 2

          parent = @by_tree_number[segments[0...-1].join('.')]
          next if parent.nil?

          heading.parents << parent
          parent.children << heading
        end
      end
    end

    # Blank accumulator for one translated record.
    def empty_translation
      { unique_id: nil, original_heading: nil, natural_language_name: nil, summary: nil, entries: [] }
    end

    # Parses translated records from +io+ and applies each one when it ends
    # (at the next *NEWRECORD line or at end of input). The accumulator is
    # replaced for every record so nothing leaks from one record to the next.
    def load_translated_records(io, locale)
      record = empty_translation
      io.each_line do |line|
        case line
        when /^\*NEWRECORD$/
          apply_translation(record, locale)
          record = empty_translation
        when /^UI = (.*)/
          record[:unique_id] = Regexp.last_match(1)
        when /^MS = (.*)/
          record[:summary] = Regexp.last_match(1)
        when /^MH = (.*)/
          mh = Regexp.last_match(1)
          record[:original_heading] = mh
          record[:entries] << mh
          record[:natural_language_name] = natural_language_name_for(mh)
        when /^(?:PRINT )?ENTRY = ([^|]+)/
          record[:entries] << Regexp.last_match(1).chomp
        end
      end
      # The last record has no following *NEWRECORD line to trigger it.
      apply_translation(record, locale)
    end

    # Copies one translated record onto the matching heading, if the record
    # has a unique id and the tree knows that id. Entries are added sorted
    # and without duplicates.
    def apply_translation(record, locale)
      return if record[:unique_id].nil?

      heading = find(record[:unique_id])
      return if heading.nil?

      heading.set_original_heading(record[:original_heading], locale) unless record[:original_heading].nil?
      heading.set_natural_language_name(record[:natural_language_name], locale) unless record[:natural_language_name].nil?
      heading.set_summary(record[:summary], locale) unless record[:summary].nil?
      heading.entries(locale).concat(record[:entries].sort.uniq)
    end

    # Regex for one entry term, with zero-width "not next to a word
    # character" boundaries on both sides. Terms consisting only of capitals
    # and digits stay case sensitive.
    def entry_regex(entry)
      if /^[A-Z0-9]+$/ =~ entry
        /(?<!\w)#{Regexp.quote(entry)}(?!\w)/
      else
        /(?<!\w)#{Regexp.quote(entry)}(?!\w)/i
      end
    end

    # Removes every match whose span lies within the span of another match.
    # For two matches with identical spans the later one is removed.
    def remove_contained_matches(matches)
      matches.combination(2) do |l, r|
        l_start = l[:index]
        l_end = l_start + l[:matched].length
        r_start = r[:index]
        r_end = r_start + r[:matched].length
        if r_start >= l_start && r_end <= l_end
          r[:delete] = true # r is within l
        elsif l_start >= r_start && l_end <= r_end
          l[:delete] = true # l is within r
        end
      end
      matches.delete_if { |match| match[:delete] }
    end
  end
end