mesh-medical-subject-headings 2.3.0 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 05dc9df68ff77e4f8a3b89e41f4b67a3da727332
4
- data.tar.gz: 8f57de7c848aa7e112ae9b3ecf0d34db871eca28
3
+ metadata.gz: 9c3b985c4316ac7e79bfcb96be4f6c1e96bf880a
4
+ data.tar.gz: 1929dd7755fa64eefdc9d806d4169cb8eb0a3537
5
5
  SHA512:
6
- metadata.gz: 345a7a0f904a25a0518a53f7babf30f13a1e4cb05e0862fe79eb1ae1a054b4224803bb8ba4d55dbe1f49575f0fa2f6ed6c5d32ec9163c7f195b025e59fa3ee6d
7
- data.tar.gz: 6090811709b5570512708496c2bf468f6ae62ae4b551f95df54e3101b94dc93ac763133d8b5e6b91bf6e0ac5b1978206b32162ab7638b47a57b8481b4ebba77b
6
+ metadata.gz: 0a2b332c6768577055fb64a0b9db08a3ed13c0d16d87b9f542d556ed97ad33716edc43d41e2de2c6c4571301de4561433f899cb32c406e37bc6b85a3c53ddebd
7
+ data.tar.gz: 0b1b8ad7dc99b21b4e911760e16c15c44612cabd7f6850959f313d2cecf552f9c2a8fe5bfb8fa410a26b5b9a3952dd1d85d54d5df92f41074339de6bcd2eb73c
@@ -1,3 +1,11 @@
1
+ #3.0.0 / 2014-10-06
2
+ * [FEATURE] Finder methods on MESH::Tree renamed for clarity
3
+ * [FEATURE] Significant performance improvements to Tree.match_in_text
4
+ * [FEATURE] Using google_hash to improve GC performance (best in 2.1.3+)
5
+ * [FEATURE] Heading.entries replaced by Heading.structured_entries (which was first introduced in 2.2.0)
6
+ * [FEATURE] Matching in text now returns the structured entry used for the match, as well as the heading
7
+ * [FEATURE] Entries are now able to find themselves within a given text (Entry.match_in_text)
8
+
1
9
  #2.3.0 / 2014-09-30
2
10
  * [FEATURE] Significant performance improvements to entity recognition in text
3
11
 
@@ -1,11 +1,16 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- mesh-medical-subject-headings (2.3.0)
4
+ mesh-medical-subject-headings (3.0.0)
5
+ clarifier
6
+ google_hash
5
7
 
6
8
  GEM
7
9
  remote: https://rubygems.org/
8
10
  specs:
11
+ clarifier (0.0.3)
12
+ google_hash (0.8.4)
13
+ sane (~> 0)
9
14
  metaclass (0.0.4)
10
15
  mini_portile (0.6.0)
11
16
  minitest (5.0.8)
@@ -13,8 +18,11 @@ GEM
13
18
  metaclass (~> 0.0.1)
14
19
  nokogiri (1.6.2.1)
15
20
  mini_portile (= 0.6.0)
21
+ os (0.9.6)
16
22
  rake (10.2.2)
17
23
  ruby-prof (0.14.2)
24
+ sane (0.25.8)
25
+ os (~> 0)
18
26
  yard (0.8.7.4)
19
27
 
20
28
  PLATFORMS
@@ -17,6 +17,8 @@ Gem::Specification.new do |spec|
17
17
  spec.executables = nil
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ['lib']
20
+ spec.add_dependency 'google_hash'
21
+ spec.add_dependency 'clarifier'
20
22
 
21
23
  spec.add_development_dependency 'bundler', '~> 1.3'
22
24
  spec.add_development_dependency 'rake'
@@ -23,29 +23,52 @@ class Numeric
23
23
  end
24
24
 
25
25
  def time_this(name, &block)
26
- print "#{name}"
27
- STDOUT.flush
26
+ # print "#{name}"
27
+ # STDOUT.flush
28
28
  start = Time.now.to_f
29
29
  result = yield
30
30
  finish = Time.now.to_f
31
- puts "\t#{(finish - start).duration}"
32
- result
31
+ # puts "\t#{(finish - start).duration}"
32
+ finish - start
33
33
  end
34
34
 
35
- mesh_tree = time_this('Loading MeSH Tree') { MESH::Tree.new }
36
- time_this('Loading en_gb translation') { mesh_tree.load_translation(:en_gb) }
37
- time_this('Loading wikipedia') { mesh_tree.load_wikipedia }
35
+ mesh_tree = MESH::Tree.new
36
+ mesh_tree.load_translation(:en_gb)
37
+ mesh_tree.load_wikipedia
38
38
 
39
39
  json_str = File.new('./example.json').read
40
40
  extracted = JSON.parse(json_str)
41
41
 
42
- title_headings = time_this('Matching in title') { mesh_tree.match_in_text(extracted['title']) }
43
- description_headings = time_this('Matching in description') { mesh_tree.match_in_text(extracted['description']) }
44
- content_headings = time_this('Matching in content') { mesh_tree.match_in_text(extracted['content']) }
42
+ timings = Hash.new { |h, k| h[k] = [] }
45
43
 
46
- classifier = MESH::Classifier.new()
47
- classification = time_this('Classifying from matches') { classifier.classify([
48
- {weight: 10.0, matches: title_headings},
49
- {weight: 5.0, matches: description_headings},
50
- {weight: 1.0, matches: content_headings}
51
- ]) }
44
+ (0..10).each do |i|
45
+ time = time_this('Matching in title') { mesh_tree.match_in_text(extracted['title']) }
46
+ timings[:title_headings] << time
47
+ time = time_this('Matching in description') { mesh_tree.match_in_text(extracted['description']) }
48
+ timings[:description_headings] << time
49
+ time = time_this('Matching in content') { mesh_tree.match_in_text(extracted['content']) }
50
+ timings[:content_headings] << time
51
+ end
52
+
53
+ # (0..10).each do |i|
54
+ # time = time_this('Matching in title') { mesh_tree.match_in_text_2(extracted['title']) }
55
+ # timings[:title_entries_2] << time
56
+ # time = time_this('Matching in description') { mesh_tree.match_in_text_2(extracted['description']) }
57
+ # timings[:description_entries_2] << time
58
+ # time = time_this('Matching in content') { mesh_tree.match_in_text_2(extracted['content']) }
59
+ # timings[:content_entries_2] << time
60
+ # end
61
+
62
+ timings.each do |k, v|
63
+ t = v.map { |e| e.round(3) }.join("\t")
64
+ puts "#{k}\t#{t}"
65
+ avg = v.inject { |sum, el| sum + el }.to_f / v.size
66
+ puts "#{k}\t#{avg.round(3)}"
67
+ end
68
+
69
+ # classifier = MESH::Classifier.new()
70
+ # classification = time_this('Classifying from matches') { classifier.classify([
71
+ # {weight: 10.0, matches: title_headings},
72
+ # {weight: 5.0, matches: description_headings},
73
+ # {weight: 1.0, matches: content_headings}
74
+ # ]) }
@@ -42,9 +42,9 @@ extracted = JSON.parse(json_str)
42
42
 
43
43
  result = RubyProf.profile do
44
44
 
45
- title_headings = time_this('Matching in title') { mesh_tree.match_in_text(extracted['title']) }
45
+ # title_headings = time_this('Matching in title') { mesh_tree.match_in_text(extracted['title']) }
46
46
  # description_headings = time_this('Matching in description') { mesh_tree.match_in_text(extracted['description']) }
47
- # content_headings = time_this('Matching in content') { mesh_tree.match_in_text(extracted['content']) }
47
+ content_headings = time_this('Matching in content') { mesh_tree.match_in_text(extracted['content']) }
48
48
 
49
49
  # classifier = MESH::Classifier.new()
50
50
  # classification = time_this('Classifying from matches') { classifier.classify([
@@ -1,15 +1,25 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative '../lib/MESH'
4
-
5
- mesh_tree = MESH::Tree.new
6
4
  tr = MESH::Translator.new(MESH::Translator.enus_to_engb)
7
5
 
8
- mesh_tree.each do |h|
9
- puts "*NEWRECORD"
10
- puts "MH = #{tr.translate(h.original_heading)}"
11
- puts "MS = #{tr.translate(h.summary)}"
12
- h.entries.each { |e| puts "ENTRY = #{tr.translate(e)}" }
13
- puts "UI = #{h.unique_id}"
14
- puts ''
6
+ filename = File.expand_path('../../data/mesh_data_2014/d2014.bin.gz', __FILE__)
7
+ gzipped_file = File.open(filename)
8
+ file = Zlib::GzipReader.new(gzipped_file)
9
+
10
+ file.each_line do |line|
11
+ case
12
+ when line.start_with?('*NEWRECORD')
13
+ puts "\n*NEWRECORD"
14
+ when line.start_with?('UI = ')
15
+ puts line
16
+ when line.start_with?('MH = ')
17
+ puts tr.translate(line)
18
+ when line.start_with?('MS = ')
19
+ puts tr.translate(line)
20
+ when line.start_with?('PRINT ENTRY = ')
21
+ puts tr.translate(line)
22
+ when line.start_with?('ENTRY = ')
23
+ puts tr.translate(line)
24
+ end
15
25
  end
@@ -1,3 +1,5 @@
1
+ require 'clarifier'
2
+ require 'google_hash'
1
3
  require 'json'
2
4
  require 'set'
3
5
  require 'zlib'
@@ -2,10 +2,20 @@ module MESH
2
2
 
3
3
  class Entry
4
4
 
5
- attr_accessor :heading, :term, :semantic_types, :semantic_relationship, :lexical_type
5
+ include Comparable
6
+ attr_accessor :heading, :term, :semantic_types, :semantic_relationship, :lexical_type, :regex, :case_sensitive,
7
+ :downcased, :locales, :loose_match_term
6
8
 
7
- def initialize(heading, entry_text)
9
+ @@wordy_characters = ('a'..'z').to_a + ('A'..'Z').to_a + ('0'..'9').to_a
10
+
11
+ def <=> other
12
+ self.term <=> other.term
13
+ end
14
+
15
+ def initialize(heading, entry_text, locale)
8
16
  @heading = heading
17
+ @locales = Set.new
18
+ @locales << locale
9
19
  @semantic_types = []
10
20
  parts = entry_text.split('|')
11
21
  if entry_text.include? '|'
@@ -29,6 +39,37 @@ module MESH
29
39
  else
30
40
  @term = entry_text
31
41
  end
42
+ if /^[A-Z0-9]+$/ =~ @term
43
+ @regex = /(^|\W)#{Regexp.quote(@term)}(\W|$)/
44
+ @case_sensitive = true
45
+ else
46
+ @regex = /(^|\W)#{Regexp.quote(@term)}(\W|$)/i
47
+ @case_sensitive = false
48
+ end
49
+
50
+ @downcased = @term.downcase
51
+ @loose_match_term = Entry.loose_match(@term)
52
+
53
+ end
54
+
55
+ def self.loose_match(term)
56
+ term.gsub(/\W+/, ' ').upcase
57
+ end
58
+
59
+ def match_in_text(text, downcased)
60
+ matches = []
61
+ return matches if text.nil? || text.empty?
62
+
63
+ loose_match = @case_sensitive ? (text.include? @term) : (downcased.include? @downcased)
64
+ if loose_match
65
+ text.to_enum(:scan, @regex).map do |m,|
66
+ match = Regexp.last_match
67
+ matches << {heading: @heading, matched: self, index: match.offset(0)}
68
+ end
69
+ end
70
+
71
+ matches
72
+
32
73
  end
33
74
 
34
75
  end
@@ -4,7 +4,8 @@ module MESH
4
4
  @@descriptor_classes = [:make_array_start_at_1, :topical_descriptor, :publication_type, :check_tag, :geographic_descriptor]
5
5
 
6
6
  include Comparable
7
- attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class, :default_locale, :semantic_types, :wikipedia_links, :structured_entries, :forward_references
7
+ attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class, :default_locale,
8
+ :semantic_types, :wikipedia_links, :structured_entries, :forward_references
8
9
  attr_reader :linkified_summary
9
10
 
10
11
  def <=> other
@@ -23,18 +24,18 @@ module MESH
23
24
  @summary[locale]
24
25
  end
25
26
 
27
+ def entries
28
+ @structured_entries
29
+ end
30
+
26
31
  def linkify_summary
27
32
  return if summary.nil?
28
33
  @linkified_summary = summary.gsub(/[A-Z]+[A-Z,\s-]+[A-Z]+/).each do |text|
29
- heading = @tree.find_by_entry(text)
30
- heading ? yield(text, heading) : text
34
+ entry = @tree.find_entry_by_loose_match(text)
35
+ entry ? yield(text, entry) : text
31
36
  end
32
37
  end
33
38
 
34
- def entries(locale = default_locale)
35
- @entries[locale] ||= []
36
- end
37
-
38
39
  def has_ancestor(heading)
39
40
  return false if parents.empty?
40
41
  return true if parents.include? heading
@@ -68,6 +69,11 @@ module MESH
68
69
 
69
70
  def matches(conditions)
70
71
  conditions.each do |field, pattern|
72
+ if field == :entries
73
+ entries = @structured_entries.select { |entry| pattern =~ entry.term }
74
+ return !entries.nil? && !entries.empty?
75
+ end
76
+
71
77
  field_content = self.send(field)
72
78
  if field_content.kind_of?(Array)
73
79
  return false unless field_content.find { |fc| pattern =~ fc }
@@ -110,7 +116,7 @@ module MESH
110
116
  if parts.size > 1
111
117
  parts.pop
112
118
  parent_tree_number = parts.join '.'
113
- parent = @tree.find_by_tree_number(parent_tree_number)
119
+ parent = @tree.find_heading_by_tree_number(parent_tree_number)
114
120
  @parents << parent unless parent.nil? || @parents.include?(parent)
115
121
  parent.children << self unless parent.nil? || parent.children.include?(self)
116
122
  end
@@ -122,13 +128,15 @@ module MESH
122
128
  def connect_to_forward_references
123
129
  if !@connected_to_forward_references
124
130
  @forward_references = @forward_reference_terms.map do |term|
125
- @tree.find_by_original_heading(term)
131
+ @tree.find_heading_by_main_heading(term)
126
132
  end
127
133
  @connected_to_forward_references = true
128
134
  end
129
135
  end
130
136
 
131
- private
137
+ def entries_by_term
138
+ Hash[@structured_entries.map { |entry| [entry.term, entry] }]
139
+ end
132
140
 
133
141
  def initialize(tree, default_locale, lines)
134
142
  @tree = tree
@@ -141,7 +149,6 @@ module MESH
141
149
  @children = []
142
150
  @forward_references = []
143
151
  @forward_reference_terms = []
144
- @entries = {@default_locale => []}
145
152
  @structured_entries = []
146
153
  @original_heading = {}
147
154
  @natural_language_name = {}
@@ -170,16 +177,14 @@ module MESH
170
177
  when matches = line.match(/^MH = (.*)/)
171
178
  mh = matches[1]
172
179
  set_original_heading(mh)
173
- @entries[@default_locale] << mh unless @entries.include? mh
180
+ @structured_entries << MESH::Entry.new(self, mh, default_locale)
174
181
  librarian_parts = mh.match(/(.*), (.*)/)
175
182
  nln = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
176
183
  set_natural_language_name(nln)
177
184
 
178
185
  when matches = line.match(/^(?:PRINT )?ENTRY = (.*)/)
179
186
  entry = matches[1]
180
- term = entry.match(/([^|]+)/)
181
- @entries[@default_locale] << term[1] unless @entries.include? term[1]
182
- @structured_entries << MESH::Entry.new(self, entry)
187
+ @structured_entries << MESH::Entry.new(self, entry, default_locale)
183
188
 
184
189
  when matches = line.match(/^FX = (.*)/)
185
190
  @forward_reference_terms << matches[1]
@@ -187,9 +192,46 @@ module MESH
187
192
  end
188
193
 
189
194
  end
190
- @entries[@default_locale].sort!
191
195
 
192
196
  end
197
+
198
+
199
+ def load_translation(lines, locale)
200
+ new_entries = []
201
+ lines.each do |line|
202
+ case
203
+
204
+ when matches = line.match(/^MS = (.*)/)
205
+ set_summary(matches[1], locale)
206
+
207
+ when matches = line.match(/^MH = (.*)/)
208
+ set_original_heading(matches[1], locale)
209
+ librarian_parts = matches[1].match(/(.*), (.*)/)
210
+ natural_language_name = librarian_parts.nil? ? matches[1] : "#{librarian_parts[2]} #{librarian_parts[1]}"
211
+ set_natural_language_name(natural_language_name, locale)
212
+ entry = new_or_existing_entry(matches[1], locale)
213
+ new_entries << entry
214
+
215
+ when matches = line.match(/^(?:PRINT )?ENTRY = (.*)/)
216
+ entry = new_or_existing_entry(matches[1], locale)
217
+ new_entries << entry
218
+
219
+ end
220
+ end
221
+ new_entries
222
+ end
223
+
224
+ def new_or_existing_entry(term, locale)
225
+ existing_entries = @structured_entries.select { |entry| entry.term == term }
226
+ if existing_entries.empty?
227
+ new_entry = MESH::Entry.new(self, term, locale)
228
+ @structured_entries << new_entry
229
+ else
230
+ new_entry = existing_entries[0]
231
+ new_entry.locales << locale
232
+ end
233
+ new_entry
234
+ end
193
235
  end
194
236
  end
195
237
 
@@ -3,15 +3,20 @@ module MESH
3
3
  class Tree
4
4
 
5
5
  @@default_locale = :en_us
6
+ @@sw = Clarifier::StopWords.new()
6
7
 
7
8
  def initialize
8
9
 
9
- @headings = []
10
- @by_unique_id = {}
11
- @by_tree_number = {}
12
- @by_original_heading = {}
13
- @by_entry = {}
14
- @by_entry_word = Hash.new { |h, k| h[k] = Set.new }
10
+ @headings_last_position = -1
11
+ @headings = GoogleHashDenseLongToRuby.new
12
+ @headings_by_unique_id = GoogleHashDenseLongToRuby.new
13
+ @headings_by_tree_number = GoogleHashDenseLongToRuby.new
14
+ @headings_by_original_heading = GoogleHashDenseLongToRuby.new
15
+ @entries_by_term = GoogleHashDenseLongToRuby.new
16
+ @entries_by_loose_match_term = GoogleHashDenseLongToRuby.new #case insensitive, no punctuation, normalised whitespace
17
+ # @entries_by_word = Hash.new { |h, k| h[k] = Set.new }
18
+ @entries_by_first_word = GoogleHashDenseLongToRuby.new
19
+ # @entries_by_first_word = Hash.new { |h, k| h[k] = Set.new }
15
20
  @locales = [@@default_locale]
16
21
 
17
22
  filename = File.expand_path('../../../data/mesh_data_2014/d2014.bin.gz', __FILE__)
@@ -21,10 +26,26 @@ module MESH
21
26
  lines = []
22
27
  file.each_line do |line|
23
28
  case
24
- when line.match(/^\*NEWRECORD$/)
29
+ when line.start_with?('*NEWRECORD')
25
30
  unless lines.empty?
26
31
  mh = MESH::Heading.new(self, @@default_locale, lines)
27
- add_heading_to_hashes(mh)
32
+ @headings_last_position += 1
33
+ @headings[@headings_last_position] = mh
34
+ @headings_by_unique_id[mh.unique_id.hash] = mh
35
+ @headings_by_original_heading[mh.original_heading.hash] = mh
36
+ mh.tree_numbers.each do |tree_number|
37
+ hash = tree_number.hash
38
+ raise if @headings_by_tree_number[hash]
39
+ @headings_by_tree_number[hash] = mh
40
+ end
41
+ mh.structured_entries.each do |entry|
42
+ @entries_by_term[entry.term.hash] = entry
43
+ @entries_by_loose_match_term[entry.loose_match_term.hash] = entry
44
+ entry_words = entry.term.downcase.split(/\W+/)
45
+ hash = entry_words[0].hash
46
+ @entries_by_first_word[hash] ||= Set.new
47
+ @entries_by_first_word[hash] << entry
48
+ end
28
49
  lines = [line]
29
50
  end
30
51
  else
@@ -32,39 +53,12 @@ module MESH
32
53
  end
33
54
  end
34
55
 
35
- @headings.each do |heading|
36
- heading.connect_to_parents
37
- heading.connect_to_forward_references
38
- end
39
-
40
- end
41
-
42
- def add_heading_to_hashes(mh)
43
- @headings << mh
44
- @by_unique_id[mh.unique_id] = mh
45
- @by_original_heading[mh.original_heading] = mh
46
- add_heading_by_entry_word(mh, mh.original_heading)
47
- mh.tree_numbers.each do |tree_number|
48
- raise if @by_tree_number[tree_number]
49
- @by_tree_number[tree_number] = mh
50
- end
51
- match_headings = mh.entries.map { |e| entry_match_key(e) }.uniq
52
- match_headings.each do |entry|
53
- raise if @by_entry[entry]
54
- @by_entry[entry] = mh
55
- add_heading_by_entry_word(mh, entry)
56
+ (0..@headings_last_position).each do |i|
57
+ # @headings.each do |heading|
58
+ @headings[i].connect_to_parents
59
+ @headings[i].connect_to_forward_references
56
60
  end
57
- end
58
61
 
59
- def add_heading_by_entry_word(mh, entry)
60
- entry.split.each do |word|
61
- word.downcase!
62
- @by_entry_word[word] << mh
63
- end
64
- end
65
-
66
- def entry_match_key(e)
67
- e.strip.upcase
68
62
  end
69
63
 
70
64
  def load_translation(locale)
@@ -73,54 +67,39 @@ module MESH
73
67
  gzipped_file = File.open(filename)
74
68
  file = Zlib::GzipReader.new(gzipped_file)
75
69
 
76
- entries = []
77
- original_heading = nil
78
- natural_language_name = nil
79
- summary = nil
80
70
  unique_id = nil
71
+ lines = []
81
72
  file.each_line do |line|
82
73
 
83
74
  case
84
75
 
85
- when line.match(/^\*NEWRECORD$/)
86
- unless unique_id.nil?
87
- entries.sort!
88
- entries.uniq!
89
- if heading = find(unique_id)
90
- heading.set_original_heading(original_heading, locale) unless original_heading.nil?
91
- heading.set_natural_language_name(natural_language_name, locale) unless natural_language_name.nil?
92
- heading.set_summary(summary, locale) unless summary.nil?
93
- entries.each do |entry|
94
- heading.entries(locale) << entry
95
- add_heading_by_entry_word(heading, entry)
76
+ when line.start_with?('*NEWRECORD')
77
+ unless unique_id.nil? || lines.empty?
78
+ if heading = find_heading_by_unique_id(unique_id)
79
+ new_entries = heading.load_translation(lines, locale)
80
+ new_entries.each do |entry|
81
+ @entries_by_term[entry.term.hash] = entry
82
+ @entries_by_loose_match_term[entry.loose_match_term.hash] = entry
83
+ entry_words = entry.term.downcase.split(/\W+/)
84
+ hash = entry_words[0].hash
85
+ @entries_by_first_word[hash] ||= Set.new
86
+ @entries_by_first_word[hash] << entry
96
87
  end
88
+ else
89
+ raise 'Translation provided for missing header'
97
90
  end
98
91
 
99
- entries = []
100
- original_heading = nil
101
- summary = nil
102
92
  unique_id = nil
93
+ lines = []
103
94
  end
104
95
 
105
96
  when matches = line.match(/^UI = (.*)/)
106
97
  unique_id = matches[1]
107
98
 
108
- when matches = line.match(/^MS = (.*)/)
109
- summary = matches[1]
110
-
111
- when matches = line.match(/^MH = (.*)/)
112
- mh = matches[1]
113
- original_heading = mh
114
- entries << mh
115
- librarian_parts = mh.match(/(.*), (.*)/)
116
- natural_language_name = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
117
-
118
- when matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/)
119
- entry = matches[1].chomp
120
- entries << entry
121
-
122
99
  end
123
100
 
101
+ lines << line
102
+
124
103
  end
125
104
  @locales << locale
126
105
  end
@@ -139,9 +118,9 @@ module MESH
139
118
 
140
119
  when line.match(/^\*NEWRECORD$/)
141
120
  unless unique_id.nil?
142
- if heading = find(unique_id)
121
+ if heading = find_heading_by_unique_id(unique_id)
143
122
  wikipedia_links.each do |wl|
144
- wl[:score] = (wl[:score].to_f / heading.entries.length.to_f).round(2)
123
+ wl[:score] = (wl[:score].to_f / heading.structured_entries.length.to_f).round(2)
145
124
  end
146
125
  heading.wikipedia_links = wikipedia_links
147
126
  end
@@ -165,86 +144,78 @@ module MESH
165
144
 
166
145
 
167
146
  def linkify_summaries &block
168
- @headings.each do |h|
147
+ (0..@headings_last_position).each do |i|
148
+ h = @headings[i]
149
+ # @headings.each do |h|
169
150
  h.linkify_summary &block
170
151
  end
171
152
  end
172
153
 
173
- # NO LONGER COVERED BY TESTS
174
- # def translate(locale, tr)
175
- # return if @locales.include? locale
176
- # @headings.each_with_index do |h, i|
177
- # h.set_original_heading(tr.translate(h.original_heading), locale)
178
- # h.set_natural_language_name(tr.translate(h.natural_language_name), locale)
179
- # h.set_summary(tr.translate(h.summary), locale)
180
- # h.entries.each { |entry| h.entries(locale) << tr.translate(entry) }
181
- # h.entries(locale).sort!
182
- # end
183
- #
184
- # @locales << locale
185
- # end
186
-
187
- def find(unique_id)
188
- return @by_unique_id[unique_id]
154
+ def find_heading_by_unique_id(unique_id)
155
+ return @headings_by_unique_id[unique_id.hash]
156
+ end
157
+
158
+ def find_heading_by_tree_number(tree_number)
159
+ return @headings_by_tree_number[tree_number.hash]
189
160
  end
190
161
 
191
- def find_by_tree_number(tree_number)
192
- return @by_tree_number[tree_number]
162
+ def find_heading_by_main_heading(heading)
163
+ return @headings_by_original_heading[heading.hash]
193
164
  end
194
165
 
195
- def find_by_original_heading(heading)
196
- return @by_original_heading[heading]
166
+ def find_entry_by_term(term)
167
+ return @entries_by_term[term.hash]
197
168
  end
198
169
 
199
- def find_by_entry(entry)
200
- return @by_entry[entry_match_key(entry)]
170
+ def find_entry_by_loose_match(term)
171
+ return @entries_by_loose_match_term[Entry.loose_match(term).hash]
201
172
  end
202
173
 
203
- def find_by_entry_word(word)
204
- return @by_entry_word[word]
174
+ def find_entries_by_word(word)
175
+ return @entries_by_first_word[word.hash]
205
176
  end
206
177
 
207
178
  def where(conditions)
208
179
  matches = []
209
- @headings.each do |heading|
180
+ (0..@headings_last_position).each do |i|
181
+ # @headings.each do |heading|
182
+ heading = @headings[i]
210
183
  matches << heading if heading.matches(conditions)
211
184
  end
212
185
  matches
213
186
  end
214
187
 
215
188
  def each
216
- for i in 0 ... @headings.size
189
+ (0..@headings_last_position).each do |i|
190
+ # for i in 0 ... @headings.size
217
191
  yield @headings[i] if @headings[i].useful
218
192
  end
219
193
  end
220
194
 
221
- def match_in_text(text)
195
+ def match_in_text (text)
222
196
  return [] if text.nil?
223
197
  downcased = text.downcase
224
- candidate_headings = Set.new
225
- downcased.split(/\W+/).uniq.each do |word|
226
- candidate_headings.merge(find_by_entry_word(word))
198
+ candidate_entries = []
199
+ text_words = @@sw.clarify(downcased).split(/\W+/)
200
+ text_words.uniq!
201
+ text_words.each do |word|
202
+ entries_by_word = find_entries_by_word(word)
203
+ candidate_entries << entries_by_word.to_a
227
204
  end
205
+ candidate_entries.compact!
206
+ candidate_entries.flatten!
207
+ # candidate_entries.uniq! #30% in this uniq
208
+ candidate_entries.keep_if { |entry| entry.heading.useful }
209
+ # puts "\n\n****\n#{candidate_entries.length}\n*****\n\n"
228
210
  matches = []
229
- candidate_headings.each do |heading|
230
- next unless heading.useful
231
- @locales.each do |locale|
232
- heading.entries(locale).each do |entry|
233
- if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster
234
- if /^[A-Z0-9]+$/ =~ entry
235
- regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/
236
- else
237
- regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i
238
- end
239
- text.to_enum(:scan, regex).map do |m,|
240
- match = Regexp.last_match
241
- matches << {heading: heading, matched: entry, index: match.offset(0)}
242
- end
243
- end
244
- end
245
- end
211
+ candidate_entries.each do |entry|
212
+ entry_matches = entry.match_in_text(text, downcased)
213
+ matches << entry_matches
246
214
  end
247
- confirmed_matches = []
215
+
216
+ matches.compact!
217
+ matches.flatten!
218
+
248
219
  matches.combination(2) do |l, r|
249
220
  if (r[:index][0] >= l[:index][0]) && (r[:index][1] <= l[:index][1])
250
221
  #r is within l
@@ -257,6 +228,13 @@ module MESH
257
228
  matches.delete_if { |match| match[:delete] }
258
229
  end
259
230
 
231
+ private
232
+
233
+
234
+ def entry_match_key(e)
235
+ e.strip.upcase
236
+ end
237
+
260
238
 
261
239
  end
262
240