mesh-medical-subject-headings 1.2.2 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ebe0318253b106903a11e4084cd1e457e136766e
4
- data.tar.gz: d60d3d541858a7dfffa11f4fa96e8e373665e631
3
+ metadata.gz: 3e0859a3e5c4757eb7babc4072ac97177ae0feee
4
+ data.tar.gz: 2488bf617fa6761cf5b91c47c7b6a0b0641afca4
5
5
  SHA512:
6
- metadata.gz: b6ed7232863986263b34bbed703da3f3dba58bda02b68a514af772f847f425b43ea25d757264bc6ee0363edcc33f061262f406defb3b4cd98e4b986da4fdf067
7
- data.tar.gz: 7f99f960a64fd84595f1c46076b588c590fb33689464449e8a176888db4ca91a309c356f98f2f443850d94e8e7812cbc07fd2feee1853fd3e83056f071b291b8
6
+ metadata.gz: 2770ad376f99544f8825eaab301ab457ad29d559beeca42cc0e66fee92bc68f70d6eecdf23775c34de57734a31505e2e44e79461fa052ab73e8eba27738c2fad
7
+ data.tar.gz: 9be16f17ace608d05d8dca68c407f76e7c450ffd1b79907a00fe20d6879aaa21acbfc1c2b0e5820f9ff04a0b52067f9f5e9464b19639d3b2135b0d6eae08d56d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ #1.3.0 / 2014-04-10
2
+ * [FEATURE] Released classifier to group and score matched headings
3
+
1
4
  #1.2.2 / 2014-03-05
2
5
  * [FEATURE] Optimised match_in_text to find headings much, much faster
3
6
 
data/Gemfile.lock CHANGED
@@ -1,18 +1,18 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- mesh-medical-subject-headings (1.2.2)
4
+ mesh-medical-subject-headings (1.3.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
- metaclass (0.0.2)
9
+ metaclass (0.0.4)
10
10
  minitest (5.0.8)
11
11
  mocha (1.0.0)
12
12
  metaclass (~> 0.0.1)
13
- rake (10.1.1)
13
+ rake (10.2.2)
14
14
  ruby-prof (0.14.2)
15
- yard (0.8.7.3)
15
+ yard (0.8.7.4)
16
16
 
17
17
  PLATFORMS
18
18
  ruby
data/MESH.gemspec CHANGED
@@ -4,24 +4,24 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'MESH/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
- spec.name = "mesh-medical-subject-headings"
7
+ spec.name = 'mesh-medical-subject-headings'
8
8
  spec.version = Mesh::VERSION
9
- spec.authors = ["mmmmmrob"]
10
- spec.email = ["rob@dynamicorange.com"]
11
- spec.description = %q{A ruby gem containing MeSH subject headings (https://www.nlm.nih.gov/mesh/) for use in classifying and entity recognition.}
12
- spec.summary = %q{A ruby gem containing MeSH subject headings (https://www.nlm.nih.gov/mesh/) for use in classifying and entity recognition.}
13
- spec.homepage = ""
14
- spec.license = "AGPL3"
9
+ spec.authors = ['Rob Styles']
10
+ spec.email = ['rob@dynamicorange.com']
11
+ spec.description = 'A ruby gem containing MeSH subject headings (https://www.nlm.nih.gov/mesh/) for use in classifying and entity recognition.'
12
+ spec.summary = spec.description
13
+ spec.homepage = ''
14
+ spec.license = 'AGPL3'
15
15
 
16
16
  spec.files = `git ls-files`.split($/)
17
17
  spec.executables = nil
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
- spec.require_paths = ["lib"]
19
+ spec.require_paths = ['lib']
20
20
 
21
- spec.add_development_dependency "bundler", "~> 1.3"
22
- spec.add_development_dependency "rake"
23
- spec.add_development_dependency "mocha"
24
- spec.add_development_dependency "yard"
25
- spec.add_development_dependency "minitest", "~> 5.0.8"
26
- spec.add_development_dependency "ruby-prof"
21
+ spec.add_development_dependency 'bundler', '~> 1.3'
22
+ spec.add_development_dependency 'rake'
23
+ spec.add_development_dependency 'mocha'
24
+ spec.add_development_dependency 'yard'
25
+ spec.add_development_dependency 'minitest', '~> 5.0.8'
26
+ spec.add_development_dependency 'ruby-prof'
27
27
  end
data/lib/MESH.rb CHANGED
@@ -1,2 +1,4 @@
1
1
  require 'MESH/version'
2
+ require 'MESH/translator'
2
3
  require 'MESH/mesh'
4
+ require 'MESH/classifier'
@@ -0,0 +1,103 @@
1
module MESH
  # Scores matched headings and groups the results by tree root.
  class Classifier
    # A heading's weight is divided by this amount each time it is passed
    # one level up to a parent.
    PARENT_WEIGHT_DIVISOR = 3.0

    # Scores every matched heading, and the parents reachable from it,
    # separately for each root the heading belongs to.
    #
    # @param weighted_matches [Array<Hash>] each element has a :weight and a
    #   :matches array; each match carries the matched heading under :heading.
    #   Headings must respond to #roots and #parents.
    # @return [Hash] root => [best_score, scored], where scored maps each
    #   heading to its score rounded to 3 places with zero scores removed.
    #   A root for which nothing scored above zero is left out.
    def classify(weighted_matches)
      chosen = {}

      group_by_root(weighted_matches).each do |root, candidates|
        scored = {}
        candidates.each do |weight, heading|
          calculate_scores(scored, root, heading, weight)
        end

        scored.each_key { |heading| scored[heading] = scored[heading].round(3) }
        scored.delete_if { |_heading, score| score == 0 }

        # With nothing left to rank there is no best score to report.
        next if scored.empty?

        chosen[root] = [scored.values.max, scored]
      end

      chosen
    end

    # Adds +weight+ to +heading+ in +scored+, then recurses into each parent
    # that sits under +root+ with the weight reduced by PARENT_WEIGHT_DIVISOR.
    #
    # @param scored [Hash] heading => accumulated score, updated in place
    # @param root [Object] the root whose branch is being scored
    # @param heading [#parents, #roots] the heading receiving the weight
    # @param weight [Numeric] the weight to add at this level
    def calculate_scores(scored, root, heading, weight)
      scored[heading] = (scored[heading] || 0) + weight

      heading.parents.each do |parent|
        next unless parent.roots.include?(root)
        calculate_scores(scored, root, parent, weight / PARENT_WEIGHT_DIVISOR)
      end
    end

    private

    # Builds root => [[weight, heading], ...] from the weighted matches,
    # listing a heading once under every root it reports.
    def group_by_root(weighted_matches)
      groups = {}
      weighted_matches.each do |weighted_match|
        weighted_match[:matches].each do |match|
          heading = match[:heading]
          heading.roots.each do |root|
            (groups[root] ||= []) << [weighted_match[:weight], heading]
          end
        end
      end
      groups
    end

    # Tallies connections for each of +headings+ under +root+.
    # Not called by #classify.
    def calculate_connections(root, headings, weight)
      connections = {}
      headings.each do |heading|
        add_connection(connections, root, heading, weight)
      end
      connections
    end

    # Adds +weight+ to +heading+ and to each of its direct parents, provided
    # the heading itself sits under +root+.
    def add_connection(connections, root, heading, weight)
      return unless heading.roots.include?(root)

      connections[heading] = (connections[heading] || 0) + weight
      heading.parents.each do |parent|
        connections[parent] = (connections[parent] || 0) + weight
      end
    end
  end
end
103
+
data/lib/MESH/mesh.rb CHANGED
@@ -1,73 +1,79 @@
1
- require_relative 'translator'
2
-
3
1
  module MESH
4
2
  class Mesh
5
3
 
6
- attr_accessor :unique_id, :original_heading, :tree_numbers, :parents, :children, :natural_language_name, :summary, :entries, :useful
4
+ include Comparable
5
+ attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class
6
+
7
# Orders headings by their unique_id.
#
# @param other [Object] the object to compare against
# @return [Integer, nil] the result of comparing the two unique_ids, or nil
#   when +other+ has no unique_id and so cannot be compared
def <=>(other)
  return nil unless other.respond_to?(:unique_id)
  unique_id <=> other.unique_id
end
7
10
 
8
- def original_heading(locale = nil)
9
- return @original_heading if locale.nil?
10
- @@translator.translate(@original_heading)
11
+ def original_heading(locale = @@default_locale)
12
+ return @original_heading[locale]
11
13
  end
12
14
 
13
- def natural_language_name(locale = nil)
14
- return @natural_language_name if locale.nil?
15
- @@translator.translate(@natural_language_name)
15
+ def natural_language_name(locale = @@default_locale)
16
+ return @natural_language_name[locale]
16
17
  end
17
18
 
18
- def summary(locale = nil)
19
- return @summary if locale.nil?
20
- @@translator.translate(@summary)
19
+ def summary(locale = @@default_locale)
20
+ return @summary[locale]
21
21
  end
22
22
 
23
- def entries(locale = nil)
24
- return @entries if locale.nil?
25
- @entries.map { |entry| @@translator.translate(entry) }.sort
23
+ def entries(locale = @@default_locale)
24
+ @entries[locale] ||= []
25
+ return @entries[locale]
26
26
  end
27
27
 
28
28
  def self.configure(args)
29
29
  return if @@configured
30
30
  raise ArgumentError.new('MeshHeadingGraph requires a filename in order to configure itself') unless not args[:filename].nil?
31
+
31
32
  gzipped_file = File.open(args[:filename])
32
33
  file = Zlib::GzipReader.new(gzipped_file)
34
+
33
35
  current_heading = Mesh.new
34
36
  file.each_line do |line|
35
- if line.match(/^\*NEWRECORD$/) #Then store the previous record before continuing
36
- unless current_heading.unique_id.nil?
37
- current_heading.entries.sort!
38
- @@headings << current_heading
39
- @@by_unique_id[current_heading.unique_id] = current_heading
40
- @@by_original_heading[current_heading.original_heading] = current_heading
41
- current_heading.tree_numbers.each do |tree_number|
42
- @@by_tree_number[tree_number] = current_heading
37
+
38
+ case
39
+
40
+ when matches = line.match(/^\*NEWRECORD$/)
41
+ unless current_heading.unique_id.nil?
42
+ current_heading.entries.sort!
43
+ @@headings << current_heading
44
+ @@by_unique_id[current_heading.unique_id] = current_heading
45
+ @@by_original_heading[current_heading.original_heading] = current_heading
46
+ current_heading.tree_numbers.each do |tree_number|
47
+ @@by_tree_number[tree_number] = current_heading
48
+ end
43
49
  end
44
- end
45
- current_heading = Mesh.new
46
- end
50
+ current_heading = Mesh.new
47
51
 
48
- matches = line.match(/^UI = (.*)/)
49
- current_heading.unique_id = matches[1] unless matches.nil?
52
+ when matches = line.match(/^UI = (.*)/)
53
+ current_heading.unique_id = matches[1]
50
54
 
51
- matches = line.match(/^MN = (.*)/)
52
- current_heading.tree_numbers << matches[1] unless matches.nil?
55
+ when matches = line.match(/^MN = (.*)/)
56
+ current_heading.tree_numbers << matches[1]
57
+ current_heading.roots << matches[1][0] unless current_heading.roots.include?(matches[1][0])
53
58
 
54
- matches = line.match(/^MS = (.*)/)
55
- current_heading.summary = matches[1] unless matches.nil?
59
+ when matches = line.match(/^MS = (.*)/)
60
+ current_heading.set_summary(matches[1])
56
61
 
57
- matches = line.match(/^MH = (.*)/)
58
- unless matches.nil?
59
- mh = matches[1]
60
- current_heading.original_heading = mh
61
- current_heading.natural_language_name = mh
62
- current_heading.entries << mh
63
- librarian_parts = mh.match(/(.*), (.*)/)
64
- current_heading.natural_language_name = "#{librarian_parts[2]} #{librarian_parts[1]}" unless librarian_parts.nil?
65
- end
62
+ when matches = line.match(/^DC = (.*)/)
63
+ current_heading.descriptor_class = @@descriptor_classes[matches[1].to_i]
64
+
65
+ when matches = line.match(/^MH = (.*)/)
66
+ mh = matches[1]
67
+ current_heading.set_original_heading(mh)
68
+ current_heading.entries << mh
69
+ librarian_parts = mh.match(/(.*), (.*)/)
70
+ nln = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
71
+ current_heading.set_natural_language_name(nln)
72
+
73
+ when matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/)
74
+ entry = matches[1].chomp
75
+ current_heading.entries << entry
66
76
 
67
- matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/)
68
- unless matches.nil?
69
- mh = matches[1].chomp
70
- current_heading.entries << mh
71
77
  end
72
78
 
73
79
  end
@@ -88,6 +94,19 @@ module MESH
88
94
  @@configured = true
89
95
  end
90
96
 
97
# Adds a translated copy of every heading's text under +locale+.
# Does nothing when the locale has already been added.
#
# @param locale [String] the locale key the translations are stored under
# @param tr [#translate] translator applied to each default-locale string
def self.translate(locale, tr)
  return if @@locales.include?(locale)

  @@headings.each do |heading|
    heading.set_original_heading(tr.translate(heading.original_heading), locale)
    heading.set_natural_language_name(tr.translate(heading.natural_language_name), locale)
    heading.set_summary(tr.translate(heading.summary), locale)

    # entries(locale) hands back the stored array, so appending to it
    # and sorting it in place updates the heading directly.
    localized = heading.entries(locale)
    heading.entries.each { |entry| localized << tr.translate(entry) }
    localized.sort!
  end

  @@locales << locale
end
109
+
91
110
  def self.find(unique_id)
92
111
  raise 'MeshHeadingGraph.configure must be called before use' unless @@configured
93
112
  return @@by_unique_id[unique_id]
@@ -118,20 +137,72 @@ module MESH
118
137
  end
119
138
 
120
139
  def self.match_in_text(text)
121
- text = text.downcase
140
+ return [] if text.nil?
141
+ downcased = text.downcase
122
142
  matches = []
123
143
  @@headings.each do |heading|
124
144
  next unless heading.useful
125
- heading.entries.each do |entry|
126
- if text.include? entry.downcase #This is a looser check than the regex but much, much faster
127
- regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i
128
- if regex =~ text
129
- matches << {heading: heading, matched: entry}
145
+ @@locales.each do |locale|
146
+ heading.entries(locale).each do |entry|
147
+ if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster
148
+ if /^[A-Z0-9]+$/ =~ entry
149
+ regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/
150
+ else
151
+ regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i
152
+ end
153
+ text.to_enum(:scan, regex).map do |m,|
154
+ matches << {heading: heading, matched: entry, index: $`.size}
155
+ end
130
156
  end
131
157
  end
132
158
  end
133
159
  end
134
- matches
160
+ confirmed_matches = []
161
+ matches.combination(2) do |l, r|
162
+ if (r[:index] >= l[:index]) && (r[:index] + r[:matched].length <= l[:index] + l[:matched].length)
163
+ #r is within l
164
+ r[:delete] = true
165
+ elsif (l[:index] >= r[:index]) && (l[:index] + l[:matched].length <= r[:index] + r[:matched].length)
166
+ #l is within r
167
+ l[:delete] = true
168
+ end
169
+ end
170
+ matches.delete_if { |match| match[:delete] }
171
+ end
172
+
173
# Reports whether +heading+ appears anywhere above this heading, following
# parents recursively.
#
# @param heading [Object] the candidate ancestor
# @return [Boolean] true when +heading+ is a parent or a parent's ancestor
def has_ancestor(heading)
  return true if parents.include?(heading)
  # any? stops at the first branch that contains the heading instead of
  # walking every branch first.
  parents.any? { |parent| parent.has_ancestor(heading) }
end
179
+
180
# Reports whether +heading+ appears anywhere below this heading, following
# children recursively.
#
# @param heading [Object] the candidate descendant
# @return [Boolean] true when +heading+ is a child or a child's descendant
def has_descendant(heading)
  return true if children.include?(heading)
  # any? stops at the first branch that contains the heading instead of
  # walking every branch first.
  children.any? { |child| child.has_descendant(heading) }
end
186
+
187
# Reports whether this heading and +heading+ have at least one parent in
# common.
#
# @param heading [#parents] the heading to compare against
# @return [Boolean]
def sibling?(heading)
  !(parents & heading.parents).empty?
end
191
+
192
# Depth, counted in dot-separated segments, of the deepest tree number that
# starts with +root+.
#
# @param root [String] tree-number prefix to restrict the search to; the
#   default empty string matches every tree number
# @return [Integer, nil] the segment count, or nil when no tree number
#   starts with +root+ (including when there are no tree numbers at all)
def deepest_position(root = '')
  # Only tree numbers under the requested root may contribute a depth.
  under_root = tree_numbers.select { |tn| tn.start_with?(root) }
  under_root.map { |tn| tn.split('.').length }.max
end
197
+
198
# Depth, counted in dot-separated segments, of the shallowest tree number
# that starts with +root+.
#
# @param root [String] tree-number prefix to restrict the search to; the
#   default empty string matches every tree number
# @return [Integer, nil] the segment count, or nil when no tree number
#   starts with +root+ (including when there are no tree numbers at all)
def shallowest_position(root = '')
  under_root = tree_numbers.select { |tn| tn.start_with?(root) }
  under_root.map { |tn| tn.split('.').length }.min
end
203
+
204
# Placeholder: hands the given headings back unchanged.
#
# @param headings [Object] the headings to cluster
# @return [Object] the same object that was passed in
def self.cluster(headings)
  headings
end
136
207
 
137
208
  def matches(conditions)
@@ -141,6 +212,8 @@ module MESH
141
212
  return false unless field_content.find { |fc| pattern =~ fc }
142
213
  elsif field_content.is_a?(TrueClass) || field_content.is_a?(FalseClass)
143
214
  return false unless field_content == pattern
215
+ elsif field_content.is_a? Symbol
216
+ return field_content == pattern
144
217
  else
145
218
  return false unless pattern =~ field_content
146
219
  end
@@ -149,7 +222,19 @@ module MESH
149
222
  end
150
223
 
151
224
  def inspect
152
- return "#{@unique_id}, #{@original_heading}"
225
+ return "#{unique_id}, #{original_heading}, [#{tree_numbers.join(',')}]"
226
+ end
227
+
228
# Records the original heading text for a locale.
#
# @param heading [Object] the heading text to store
# @param locale [Object] key to store it under; defaults to the class-wide
#   default locale
# @return [Object] the stored heading
def set_original_heading(heading, locale = @@default_locale)
  @original_heading.store(locale, heading)
end
231
+
232
# Records the natural-language name for a locale.
#
# @param name [Object] the name to store
# @param locale [Object] key to store it under; defaults to the class-wide
#   default locale
# @return [Object] the stored name
def set_natural_language_name(name, locale = @@default_locale)
  @natural_language_name.store(locale, name)
end
235
+
236
# Records the summary text for a locale.
#
# @param summary [Object] the summary to store
# @param locale [Object] key to store it under; defaults to the class-wide
#   default locale
# @return [Object] the stored summary
def set_summary(summary, locale = @@default_locale)
  @summary.store(locale, summary)
end
154
239
 
155
240
  private
@@ -160,16 +245,24 @@ module MESH
160
245
  @@by_tree_number = {}
161
246
  @@by_original_heading = {}
162
247
  @@default_locale = 'en-US'
163
- @@translator = Translator.new
248
+ @@locales = [@@default_locale]
249
+ @@us_to_gb = Translator.new(Translator.enus_to_engb)
250
+ @@descriptor_classes = [:make_array_start_at_1, :topical_descriptor, :publication_type, :check_tag, :geographic_descriptor]
164
251
 
165
252
  def initialize
166
253
  @useful = true
167
254
  @tree_numbers = []
255
+ @roots = []
168
256
  @parents = []
169
257
  @children = []
170
- @entries = []
258
+ @entries = {}
259
+ @entries[@@default_locale] = []
260
+ @original_heading = {}
261
+ @natural_language_name = {}
262
+ @summary = {}
171
263
  end
172
264
 
265
+
173
266
  end
174
267
  end
175
268