mesh-medical-subject-headings 2.3.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Gemfile.lock +9 -1
- data/MESH.gemspec +2 -0
- data/bin/benchmark_match_in_text +39 -16
- data/bin/profile_in_text +2 -2
- data/bin/translate +19 -9
- data/lib/MESH.rb +2 -0
- data/lib/MESH/entry.rb +43 -2
- data/lib/MESH/heading.rb +58 -16
- data/lib/MESH/tree.rb +100 -122
- data/lib/MESH/version.rb +1 -1
- data/test/classifier_test.rb +223 -223
- data/test/entry_test.rb +147 -9
- data/test/heading_test.rb +173 -167
- data/test/test_base.rb +177 -3
- data/test/tree_test.rb +81 -212
- metadata +48 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c3b985c4316ac7e79bfcb96be4f6c1e96bf880a
|
4
|
+
data.tar.gz: 1929dd7755fa64eefdc9d806d4169cb8eb0a3537
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a2b332c6768577055fb64a0b9db08a3ed13c0d16d87b9f542d556ed97ad33716edc43d41e2de2c6c4571301de4561433f899cb32c406e37bc6b85a3c53ddebd
|
7
|
+
data.tar.gz: 0b1b8ad7dc99b21b4e911760e16c15c44612cabd7f6850959f313d2cecf552f9c2a8fe5bfb8fa410a26b5b9a3952dd1d85d54d5df92f41074339de6bcd2eb73c
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
#3.0.0 / 2014-10-06
|
2
|
+
* [FEATURE] Finder methods on MESH::Tree renamed for clarity
|
3
|
+
* [FEATURE] Significant performance improvements to Tree.match_in_text
|
4
|
+
* [FEATURE] Using google_hash to improve GC performance (best in 2.1.3+)
|
5
|
+
* [FEATURE] Heading.entries replaced by heading.structured_entries (first introduced in 2.2.0)
|
6
|
+
* [FEATURE] Matching in text now returns structured entry used for match, as well as heading
|
7
|
+
* [FEATURE] Entries are now able to find themselves within given text (Entry.match_in_text)
|
8
|
+
|
1
9
|
#2.3.0 / 2014-09-30
|
2
10
|
* [FEATURE] Significant performance improvements to entity recognition in text
|
3
11
|
|
data/Gemfile.lock
CHANGED
@@ -1,11 +1,16 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
mesh-medical-subject-headings (
|
4
|
+
mesh-medical-subject-headings (3.0.0)
|
5
|
+
clarifier
|
6
|
+
google_hash
|
5
7
|
|
6
8
|
GEM
|
7
9
|
remote: https://rubygems.org/
|
8
10
|
specs:
|
11
|
+
clarifier (0.0.3)
|
12
|
+
google_hash (0.8.4)
|
13
|
+
sane (~> 0)
|
9
14
|
metaclass (0.0.4)
|
10
15
|
mini_portile (0.6.0)
|
11
16
|
minitest (5.0.8)
|
@@ -13,8 +18,11 @@ GEM
|
|
13
18
|
metaclass (~> 0.0.1)
|
14
19
|
nokogiri (1.6.2.1)
|
15
20
|
mini_portile (= 0.6.0)
|
21
|
+
os (0.9.6)
|
16
22
|
rake (10.2.2)
|
17
23
|
ruby-prof (0.14.2)
|
24
|
+
sane (0.25.8)
|
25
|
+
os (~> 0)
|
18
26
|
yard (0.8.7.4)
|
19
27
|
|
20
28
|
PLATFORMS
|
data/MESH.gemspec
CHANGED
@@ -17,6 +17,8 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.executables = nil
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ['lib']
|
20
|
+
spec.add_dependency 'google_hash'
|
21
|
+
spec.add_dependency 'clarifier'
|
20
22
|
|
21
23
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
22
24
|
spec.add_development_dependency 'rake'
|
data/bin/benchmark_match_in_text
CHANGED
@@ -23,29 +23,52 @@ class Numeric
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def time_this(name, &block)
|
26
|
-
print "#{name}"
|
27
|
-
STDOUT.flush
|
26
|
+
# print "#{name}"
|
27
|
+
# STDOUT.flush
|
28
28
|
start = Time.now.to_f
|
29
29
|
result = yield
|
30
30
|
finish = Time.now.to_f
|
31
|
-
puts "\t#{(finish - start).duration}"
|
32
|
-
|
31
|
+
# puts "\t#{(finish - start).duration}"
|
32
|
+
finish - start
|
33
33
|
end
|
34
34
|
|
35
|
-
mesh_tree =
|
36
|
-
|
37
|
-
|
35
|
+
mesh_tree = MESH::Tree.new
|
36
|
+
mesh_tree.load_translation(:en_gb)
|
37
|
+
mesh_tree.load_wikipedia
|
38
38
|
|
39
39
|
json_str = File.new('./example.json').read
|
40
40
|
extracted = JSON.parse(json_str)
|
41
41
|
|
42
|
-
|
43
|
-
description_headings = time_this('Matching in description') { mesh_tree.match_in_text(extracted['description']) }
|
44
|
-
content_headings = time_this('Matching in content') { mesh_tree.match_in_text(extracted['content']) }
|
42
|
+
timings = Hash.new { |h, k| h[k] = [] }
|
45
43
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
44
|
+
(0..10).each do |i|
|
45
|
+
time = time_this('Matching in title') { mesh_tree.match_in_text(extracted['title']) }
|
46
|
+
timings[:title_headings] << time
|
47
|
+
time = time_this('Matching in description') { mesh_tree.match_in_text(extracted['description']) }
|
48
|
+
timings[:description_headings] << time
|
49
|
+
time = time_this('Matching in content') { mesh_tree.match_in_text(extracted['content']) }
|
50
|
+
timings[:content_headings] << time
|
51
|
+
end
|
52
|
+
|
53
|
+
# (0..10).each do |i|
|
54
|
+
# time = time_this('Matching in title') { mesh_tree.match_in_text_2(extracted['title']) }
|
55
|
+
# timings[:title_entries_2] << time
|
56
|
+
# time = time_this('Matching in description') { mesh_tree.match_in_text_2(extracted['description']) }
|
57
|
+
# timings[:description_entries_2] << time
|
58
|
+
# time = time_this('Matching in content') { mesh_tree.match_in_text_2(extracted['content']) }
|
59
|
+
# timings[:content_entries_2] << time
|
60
|
+
# end
|
61
|
+
|
62
|
+
timings.each do |k, v|
|
63
|
+
t = v.map { |e| e.round(3) }.join("\t")
|
64
|
+
puts "#{k}\t#{t}"
|
65
|
+
avg = v.inject { |sum, el| sum + el }.to_f / v.size
|
66
|
+
puts "#{k}\t#{avg.round(3)}"
|
67
|
+
end
|
68
|
+
|
69
|
+
# classifier = MESH::Classifier.new()
|
70
|
+
# classification = time_this('Classifying from matches') { classifier.classify([
|
71
|
+
# {weight: 10.0, matches: title_headings},
|
72
|
+
# {weight: 5.0, matches: description_headings},
|
73
|
+
# {weight: 1.0, matches: content_headings}
|
74
|
+
# ]) }
|
data/bin/profile_in_text
CHANGED
@@ -42,9 +42,9 @@ extracted = JSON.parse(json_str)
|
|
42
42
|
|
43
43
|
result = RubyProf.profile do
|
44
44
|
|
45
|
-
title_headings = time_this('Matching in title') { mesh_tree.match_in_text(extracted['title']) }
|
45
|
+
# title_headings = time_this('Matching in title') { mesh_tree.match_in_text(extracted['title']) }
|
46
46
|
# description_headings = time_this('Matching in description') { mesh_tree.match_in_text(extracted['description']) }
|
47
|
-
|
47
|
+
content_headings = time_this('Matching in content') { mesh_tree.match_in_text(extracted['content']) }
|
48
48
|
|
49
49
|
# classifier = MESH::Classifier.new()
|
50
50
|
# classification = time_this('Classifying from matches') { classifier.classify([
|
data/bin/translate
CHANGED
@@ -1,15 +1,25 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require_relative '../lib/MESH'
|
4
|
-
|
5
|
-
mesh_tree = MESH::Tree.new
|
6
4
|
tr = MESH::Translator.new(MESH::Translator.enus_to_engb)
|
7
5
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
6
|
+
filename = File.expand_path('../../data/mesh_data_2014/d2014.bin.gz', __FILE__)
|
7
|
+
gzipped_file = File.open(filename)
|
8
|
+
file = Zlib::GzipReader.new(gzipped_file)
|
9
|
+
|
10
|
+
file.each_line do |line|
|
11
|
+
case
|
12
|
+
when line.start_with?('*NEWRECORD')
|
13
|
+
puts "\n*NEWRECORD"
|
14
|
+
when line.start_with?('UI = ')
|
15
|
+
puts line
|
16
|
+
when line.start_with?('MH = ')
|
17
|
+
puts tr.translate(line)
|
18
|
+
when line.start_with?('MS = ')
|
19
|
+
puts tr.translate(line)
|
20
|
+
when line.start_with?('PRINT ENTRY = ')
|
21
|
+
puts tr.translate(line)
|
22
|
+
when line.start_with?('ENTRY = ')
|
23
|
+
puts tr.translate(line)
|
24
|
+
end
|
15
25
|
end
|
data/lib/MESH.rb
CHANGED
data/lib/MESH/entry.rb
CHANGED
@@ -2,10 +2,20 @@ module MESH
|
|
2
2
|
|
3
3
|
class Entry
|
4
4
|
|
5
|
-
|
5
|
+
include Comparable
|
6
|
+
attr_accessor :heading, :term, :semantic_types, :semantic_relationship, :lexical_type, :regex, :case_sensitive,
|
7
|
+
:downcased, :locales, :loose_match_term
|
6
8
|
|
7
|
-
|
9
|
+
@@wordy_characters = ('a'..'z').to_a + ('A'..'Z').to_a + ('0'..'9').to_a
|
10
|
+
|
11
|
+
def <=> other
|
12
|
+
self.term <=> other.term
|
13
|
+
end
|
14
|
+
|
15
|
+
def initialize(heading, entry_text, locale)
|
8
16
|
@heading = heading
|
17
|
+
@locales = Set.new
|
18
|
+
@locales << locale
|
9
19
|
@semantic_types = []
|
10
20
|
parts = entry_text.split('|')
|
11
21
|
if entry_text.include? '|'
|
@@ -29,6 +39,37 @@ module MESH
|
|
29
39
|
else
|
30
40
|
@term = entry_text
|
31
41
|
end
|
42
|
+
if /^[A-Z0-9]+$/ =~ @term
|
43
|
+
@regex = /(^|\W)#{Regexp.quote(@term)}(\W|$)/
|
44
|
+
@case_sensitive = true
|
45
|
+
else
|
46
|
+
@regex = /(^|\W)#{Regexp.quote(@term)}(\W|$)/i
|
47
|
+
@case_sensitive = false
|
48
|
+
end
|
49
|
+
|
50
|
+
@downcased = @term.downcase
|
51
|
+
@loose_match_term = Entry.loose_match(@term)
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.loose_match(term)
|
56
|
+
term.gsub(/\W+/, ' ').upcase
|
57
|
+
end
|
58
|
+
|
59
|
+
def match_in_text(text, downcased)
|
60
|
+
matches = []
|
61
|
+
return matches if text.nil? || text.empty?
|
62
|
+
|
63
|
+
loose_match = @case_sensitive ? (text.include? @term) : (downcased.include? @downcased)
|
64
|
+
if loose_match
|
65
|
+
text.to_enum(:scan, @regex).map do |m,|
|
66
|
+
match = Regexp.last_match
|
67
|
+
matches << {heading: @heading, matched: self, index: match.offset(0)}
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
matches
|
72
|
+
|
32
73
|
end
|
33
74
|
|
34
75
|
end
|
data/lib/MESH/heading.rb
CHANGED
@@ -4,7 +4,8 @@ module MESH
|
|
4
4
|
@@descriptor_classes = [:make_array_start_at_1, :topical_descriptor, :publication_type, :check_tag, :geographic_descriptor]
|
5
5
|
|
6
6
|
include Comparable
|
7
|
-
attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class, :default_locale,
|
7
|
+
attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class, :default_locale,
|
8
|
+
:semantic_types, :wikipedia_links, :structured_entries, :forward_references
|
8
9
|
attr_reader :linkified_summary
|
9
10
|
|
10
11
|
def <=> other
|
@@ -23,18 +24,18 @@ module MESH
|
|
23
24
|
@summary[locale]
|
24
25
|
end
|
25
26
|
|
27
|
+
def entries
|
28
|
+
@structured_entries
|
29
|
+
end
|
30
|
+
|
26
31
|
def linkify_summary
|
27
32
|
return if summary.nil?
|
28
33
|
@linkified_summary = summary.gsub(/[A-Z]+[A-Z,\s-]+[A-Z]+/).each do |text|
|
29
|
-
|
30
|
-
|
34
|
+
entry = @tree.find_entry_by_loose_match(text)
|
35
|
+
entry ? yield(text, entry) : text
|
31
36
|
end
|
32
37
|
end
|
33
38
|
|
34
|
-
def entries(locale = default_locale)
|
35
|
-
@entries[locale] ||= []
|
36
|
-
end
|
37
|
-
|
38
39
|
def has_ancestor(heading)
|
39
40
|
return false if parents.empty?
|
40
41
|
return true if parents.include? heading
|
@@ -68,6 +69,11 @@ module MESH
|
|
68
69
|
|
69
70
|
def matches(conditions)
|
70
71
|
conditions.each do |field, pattern|
|
72
|
+
if field == :entries
|
73
|
+
entries = @structured_entries.select { |entry| pattern =~ entry.term }
|
74
|
+
return !entries.nil? && !entries.empty?
|
75
|
+
end
|
76
|
+
|
71
77
|
field_content = self.send(field)
|
72
78
|
if field_content.kind_of?(Array)
|
73
79
|
return false unless field_content.find { |fc| pattern =~ fc }
|
@@ -110,7 +116,7 @@ module MESH
|
|
110
116
|
if parts.size > 1
|
111
117
|
parts.pop
|
112
118
|
parent_tree_number = parts.join '.'
|
113
|
-
parent = @tree.
|
119
|
+
parent = @tree.find_heading_by_tree_number(parent_tree_number)
|
114
120
|
@parents << parent unless parent.nil? || @parents.include?(parent)
|
115
121
|
parent.children << self unless parent.nil? || parent.children.include?(self)
|
116
122
|
end
|
@@ -122,13 +128,15 @@ module MESH
|
|
122
128
|
def connect_to_forward_references
|
123
129
|
if !@connected_to_forward_references
|
124
130
|
@forward_references = @forward_reference_terms.map do |term|
|
125
|
-
@tree.
|
131
|
+
@tree.find_heading_by_main_heading(term)
|
126
132
|
end
|
127
133
|
@connected_to_forward_references = true
|
128
134
|
end
|
129
135
|
end
|
130
136
|
|
131
|
-
|
137
|
+
def entries_by_term
|
138
|
+
Hash[@structured_entries.map { |entry| [entry.term, entry] }]
|
139
|
+
end
|
132
140
|
|
133
141
|
def initialize(tree, default_locale, lines)
|
134
142
|
@tree = tree
|
@@ -141,7 +149,6 @@ module MESH
|
|
141
149
|
@children = []
|
142
150
|
@forward_references = []
|
143
151
|
@forward_reference_terms = []
|
144
|
-
@entries = {@default_locale => []}
|
145
152
|
@structured_entries = []
|
146
153
|
@original_heading = {}
|
147
154
|
@natural_language_name = {}
|
@@ -170,16 +177,14 @@ module MESH
|
|
170
177
|
when matches = line.match(/^MH = (.*)/)
|
171
178
|
mh = matches[1]
|
172
179
|
set_original_heading(mh)
|
173
|
-
@
|
180
|
+
@structured_entries << MESH::Entry.new(self, mh, default_locale)
|
174
181
|
librarian_parts = mh.match(/(.*), (.*)/)
|
175
182
|
nln = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
|
176
183
|
set_natural_language_name(nln)
|
177
184
|
|
178
185
|
when matches = line.match(/^(?:PRINT )?ENTRY = (.*)/)
|
179
186
|
entry = matches[1]
|
180
|
-
|
181
|
-
@entries[@default_locale] << term[1] unless @entries.include? term[1]
|
182
|
-
@structured_entries << MESH::Entry.new(self, entry)
|
187
|
+
@structured_entries << MESH::Entry.new(self, entry, default_locale)
|
183
188
|
|
184
189
|
when matches = line.match(/^FX = (.*)/)
|
185
190
|
@forward_reference_terms << matches[1]
|
@@ -187,9 +192,46 @@ module MESH
|
|
187
192
|
end
|
188
193
|
|
189
194
|
end
|
190
|
-
@entries[@default_locale].sort!
|
191
195
|
|
192
196
|
end
|
197
|
+
|
198
|
+
|
199
|
+
def load_translation(lines, locale)
|
200
|
+
new_entries = []
|
201
|
+
lines.each do |line|
|
202
|
+
case
|
203
|
+
|
204
|
+
when matches = line.match(/^MS = (.*)/)
|
205
|
+
set_summary(matches[1], locale)
|
206
|
+
|
207
|
+
when matches = line.match(/^MH = (.*)/)
|
208
|
+
set_original_heading(matches[1], locale)
|
209
|
+
librarian_parts = matches[1].match(/(.*), (.*)/)
|
210
|
+
natural_language_name = librarian_parts.nil? ? matches[1] : "#{librarian_parts[2]} #{librarian_parts[1]}"
|
211
|
+
set_natural_language_name(natural_language_name, locale)
|
212
|
+
entry = new_or_existing_entry(matches[1], locale)
|
213
|
+
new_entries << entry
|
214
|
+
|
215
|
+
when matches = line.match(/^(?:PRINT )?ENTRY = (.*)/)
|
216
|
+
entry = new_or_existing_entry(matches[1], locale)
|
217
|
+
new_entries << entry
|
218
|
+
|
219
|
+
end
|
220
|
+
end
|
221
|
+
new_entries
|
222
|
+
end
|
223
|
+
|
224
|
+
def new_or_existing_entry(term, locale)
|
225
|
+
existing_entries = @structured_entries.select { |entry| entry.term == term }
|
226
|
+
if existing_entries.empty?
|
227
|
+
new_entry = MESH::Entry.new(self, term, locale)
|
228
|
+
@structured_entries << new_entry
|
229
|
+
else
|
230
|
+
new_entry = existing_entries[0]
|
231
|
+
new_entry.locales << locale
|
232
|
+
end
|
233
|
+
new_entry
|
234
|
+
end
|
193
235
|
end
|
194
236
|
end
|
195
237
|
|
data/lib/MESH/tree.rb
CHANGED
@@ -3,15 +3,20 @@ module MESH
|
|
3
3
|
class Tree
|
4
4
|
|
5
5
|
@@default_locale = :en_us
|
6
|
+
@@sw = Clarifier::StopWords.new()
|
6
7
|
|
7
8
|
def initialize
|
8
9
|
|
9
|
-
@
|
10
|
-
@
|
11
|
-
@
|
12
|
-
@
|
13
|
-
@
|
14
|
-
@
|
10
|
+
@headings_last_position = -1
|
11
|
+
@headings = GoogleHashDenseLongToRuby.new
|
12
|
+
@headings_by_unique_id = GoogleHashDenseLongToRuby.new
|
13
|
+
@headings_by_tree_number = GoogleHashDenseLongToRuby.new
|
14
|
+
@headings_by_original_heading = GoogleHashDenseLongToRuby.new
|
15
|
+
@entries_by_term = GoogleHashDenseLongToRuby.new
|
16
|
+
@entries_by_loose_match_term = GoogleHashDenseLongToRuby.new #case insensitive, no punctuation, normalised whitespace
|
17
|
+
# @entries_by_word = Hash.new { |h, k| h[k] = Set.new }
|
18
|
+
@entries_by_first_word = GoogleHashDenseLongToRuby.new
|
19
|
+
# @entries_by_first_word = Hash.new { |h, k| h[k] = Set.new }
|
15
20
|
@locales = [@@default_locale]
|
16
21
|
|
17
22
|
filename = File.expand_path('../../../data/mesh_data_2014/d2014.bin.gz', __FILE__)
|
@@ -21,10 +26,26 @@ module MESH
|
|
21
26
|
lines = []
|
22
27
|
file.each_line do |line|
|
23
28
|
case
|
24
|
-
when line.
|
29
|
+
when line.start_with?('*NEWRECORD')
|
25
30
|
unless lines.empty?
|
26
31
|
mh = MESH::Heading.new(self, @@default_locale, lines)
|
27
|
-
|
32
|
+
@headings_last_position += 1
|
33
|
+
@headings[@headings_last_position] = mh
|
34
|
+
@headings_by_unique_id[mh.unique_id.hash] = mh
|
35
|
+
@headings_by_original_heading[mh.original_heading.hash] = mh
|
36
|
+
mh.tree_numbers.each do |tree_number|
|
37
|
+
hash = tree_number.hash
|
38
|
+
raise if @headings_by_tree_number[hash]
|
39
|
+
@headings_by_tree_number[hash] = mh
|
40
|
+
end
|
41
|
+
mh.structured_entries.each do |entry|
|
42
|
+
@entries_by_term[entry.term.hash] = entry
|
43
|
+
@entries_by_loose_match_term[entry.loose_match_term.hash] = entry
|
44
|
+
entry_words = entry.term.downcase.split(/\W+/)
|
45
|
+
hash = entry_words[0].hash
|
46
|
+
@entries_by_first_word[hash] ||= Set.new
|
47
|
+
@entries_by_first_word[hash] << entry
|
48
|
+
end
|
28
49
|
lines = [line]
|
29
50
|
end
|
30
51
|
else
|
@@ -32,39 +53,12 @@ module MESH
|
|
32
53
|
end
|
33
54
|
end
|
34
55
|
|
35
|
-
|
36
|
-
heading
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
41
|
-
|
42
|
-
def add_heading_to_hashes(mh)
|
43
|
-
@headings << mh
|
44
|
-
@by_unique_id[mh.unique_id] = mh
|
45
|
-
@by_original_heading[mh.original_heading] = mh
|
46
|
-
add_heading_by_entry_word(mh, mh.original_heading)
|
47
|
-
mh.tree_numbers.each do |tree_number|
|
48
|
-
raise if @by_tree_number[tree_number]
|
49
|
-
@by_tree_number[tree_number] = mh
|
50
|
-
end
|
51
|
-
match_headings = mh.entries.map { |e| entry_match_key(e) }.uniq
|
52
|
-
match_headings.each do |entry|
|
53
|
-
raise if @by_entry[entry]
|
54
|
-
@by_entry[entry] = mh
|
55
|
-
add_heading_by_entry_word(mh, entry)
|
56
|
+
(0..@headings_last_position).each do |i|
|
57
|
+
# @headings.each do |heading|
|
58
|
+
@headings[i].connect_to_parents
|
59
|
+
@headings[i].connect_to_forward_references
|
56
60
|
end
|
57
|
-
end
|
58
61
|
|
59
|
-
def add_heading_by_entry_word(mh, entry)
|
60
|
-
entry.split.each do |word|
|
61
|
-
word.downcase!
|
62
|
-
@by_entry_word[word] << mh
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def entry_match_key(e)
|
67
|
-
e.strip.upcase
|
68
62
|
end
|
69
63
|
|
70
64
|
def load_translation(locale)
|
@@ -73,54 +67,39 @@ module MESH
|
|
73
67
|
gzipped_file = File.open(filename)
|
74
68
|
file = Zlib::GzipReader.new(gzipped_file)
|
75
69
|
|
76
|
-
entries = []
|
77
|
-
original_heading = nil
|
78
|
-
natural_language_name = nil
|
79
|
-
summary = nil
|
80
70
|
unique_id = nil
|
71
|
+
lines = []
|
81
72
|
file.each_line do |line|
|
82
73
|
|
83
74
|
case
|
84
75
|
|
85
|
-
when line.
|
86
|
-
unless unique_id.nil?
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
76
|
+
when line.start_with?('*NEWRECORD')
|
77
|
+
unless unique_id.nil? || lines.empty?
|
78
|
+
if heading = find_heading_by_unique_id(unique_id)
|
79
|
+
new_entries = heading.load_translation(lines, locale)
|
80
|
+
new_entries.each do |entry|
|
81
|
+
@entries_by_term[entry.term.hash] = entry
|
82
|
+
@entries_by_loose_match_term[entry.loose_match_term.hash] = entry
|
83
|
+
entry_words = entry.term.downcase.split(/\W+/)
|
84
|
+
hash = entry_words[0].hash
|
85
|
+
@entries_by_first_word[hash] ||= Set.new
|
86
|
+
@entries_by_first_word[hash] << entry
|
96
87
|
end
|
88
|
+
else
|
89
|
+
raise 'Translation provided for missing header'
|
97
90
|
end
|
98
91
|
|
99
|
-
entries = []
|
100
|
-
original_heading = nil
|
101
|
-
summary = nil
|
102
92
|
unique_id = nil
|
93
|
+
lines = []
|
103
94
|
end
|
104
95
|
|
105
96
|
when matches = line.match(/^UI = (.*)/)
|
106
97
|
unique_id = matches[1]
|
107
98
|
|
108
|
-
when matches = line.match(/^MS = (.*)/)
|
109
|
-
summary = matches[1]
|
110
|
-
|
111
|
-
when matches = line.match(/^MH = (.*)/)
|
112
|
-
mh = matches[1]
|
113
|
-
original_heading = mh
|
114
|
-
entries << mh
|
115
|
-
librarian_parts = mh.match(/(.*), (.*)/)
|
116
|
-
natural_language_name = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}"
|
117
|
-
|
118
|
-
when matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/)
|
119
|
-
entry = matches[1].chomp
|
120
|
-
entries << entry
|
121
|
-
|
122
99
|
end
|
123
100
|
|
101
|
+
lines << line
|
102
|
+
|
124
103
|
end
|
125
104
|
@locales << locale
|
126
105
|
end
|
@@ -139,9 +118,9 @@ module MESH
|
|
139
118
|
|
140
119
|
when line.match(/^\*NEWRECORD$/)
|
141
120
|
unless unique_id.nil?
|
142
|
-
if heading =
|
121
|
+
if heading = find_heading_by_unique_id(unique_id)
|
143
122
|
wikipedia_links.each do |wl|
|
144
|
-
wl[:score] = (wl[:score].to_f / heading.
|
123
|
+
wl[:score] = (wl[:score].to_f / heading.structured_entries.length.to_f).round(2)
|
145
124
|
end
|
146
125
|
heading.wikipedia_links = wikipedia_links
|
147
126
|
end
|
@@ -165,86 +144,78 @@ module MESH
|
|
165
144
|
|
166
145
|
|
167
146
|
def linkify_summaries &block
|
168
|
-
|
147
|
+
(0..@headings_last_position).each do |i|
|
148
|
+
h = @headings[i]
|
149
|
+
# @headings.each do |h|
|
169
150
|
h.linkify_summary &block
|
170
151
|
end
|
171
152
|
end
|
172
153
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
# h.set_summary(tr.translate(h.summary), locale)
|
180
|
-
# h.entries.each { |entry| h.entries(locale) << tr.translate(entry) }
|
181
|
-
# h.entries(locale).sort!
|
182
|
-
# end
|
183
|
-
#
|
184
|
-
# @locales << locale
|
185
|
-
# end
|
186
|
-
|
187
|
-
def find(unique_id)
|
188
|
-
return @by_unique_id[unique_id]
|
154
|
+
def find_heading_by_unique_id(unique_id)
|
155
|
+
return @headings_by_unique_id[unique_id.hash]
|
156
|
+
end
|
157
|
+
|
158
|
+
def find_heading_by_tree_number(tree_number)
|
159
|
+
return @headings_by_tree_number[tree_number.hash]
|
189
160
|
end
|
190
161
|
|
191
|
-
def
|
192
|
-
return @
|
162
|
+
def find_heading_by_main_heading(heading)
|
163
|
+
return @headings_by_original_heading[heading.hash]
|
193
164
|
end
|
194
165
|
|
195
|
-
def
|
196
|
-
return @
|
166
|
+
def find_entry_by_term(term)
|
167
|
+
return @entries_by_term[term.hash]
|
197
168
|
end
|
198
169
|
|
199
|
-
def
|
200
|
-
return @
|
170
|
+
def find_entry_by_loose_match(term)
|
171
|
+
return @entries_by_loose_match_term[Entry.loose_match(term).hash]
|
201
172
|
end
|
202
173
|
|
203
|
-
def
|
204
|
-
return @
|
174
|
+
def find_entries_by_word(word)
|
175
|
+
return @entries_by_first_word[word.hash]
|
205
176
|
end
|
206
177
|
|
207
178
|
def where(conditions)
|
208
179
|
matches = []
|
209
|
-
|
180
|
+
(0..@headings_last_position).each do |i|
|
181
|
+
# @headings.each do |heading|
|
182
|
+
heading = @headings[i]
|
210
183
|
matches << heading if heading.matches(conditions)
|
211
184
|
end
|
212
185
|
matches
|
213
186
|
end
|
214
187
|
|
215
188
|
def each
|
216
|
-
|
189
|
+
(0..@headings_last_position).each do |i|
|
190
|
+
# for i in 0 ... @headings.size
|
217
191
|
yield @headings[i] if @headings[i].useful
|
218
192
|
end
|
219
193
|
end
|
220
194
|
|
221
|
-
def match_in_text(text)
|
195
|
+
def match_in_text (text)
|
222
196
|
return [] if text.nil?
|
223
197
|
downcased = text.downcase
|
224
|
-
|
225
|
-
downcased.split(/\W+/)
|
226
|
-
|
198
|
+
candidate_entries = []
|
199
|
+
text_words = @@sw.clarify(downcased).split(/\W+/)
|
200
|
+
text_words.uniq!
|
201
|
+
text_words.each do |word|
|
202
|
+
entries_by_word = find_entries_by_word(word)
|
203
|
+
candidate_entries << entries_by_word.to_a
|
227
204
|
end
|
205
|
+
candidate_entries.compact!
|
206
|
+
candidate_entries.flatten!
|
207
|
+
# candidate_entries.uniq! #30% in this uniq
|
208
|
+
candidate_entries.keep_if { |entry| entry.heading.useful }
|
209
|
+
# puts "\n\n****\n#{candidate_entries.length}\n*****\n\n"
|
228
210
|
matches = []
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
heading.entries(locale).each do |entry|
|
233
|
-
if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster
|
234
|
-
if /^[A-Z0-9]+$/ =~ entry
|
235
|
-
regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/
|
236
|
-
else
|
237
|
-
regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i
|
238
|
-
end
|
239
|
-
text.to_enum(:scan, regex).map do |m,|
|
240
|
-
match = Regexp.last_match
|
241
|
-
matches << {heading: heading, matched: entry, index: match.offset(0)}
|
242
|
-
end
|
243
|
-
end
|
244
|
-
end
|
245
|
-
end
|
211
|
+
candidate_entries.each do |entry|
|
212
|
+
entry_matches = entry.match_in_text(text, downcased)
|
213
|
+
matches << entry_matches
|
246
214
|
end
|
247
|
-
|
215
|
+
|
216
|
+
matches.compact!
|
217
|
+
matches.flatten!
|
218
|
+
|
248
219
|
matches.combination(2) do |l, r|
|
249
220
|
if (r[:index][0] >= l[:index][0]) && (r[:index][1] <= l[:index][1])
|
250
221
|
#r is within l
|
@@ -257,6 +228,13 @@ module MESH
|
|
257
228
|
matches.delete_if { |match| match[:delete] }
|
258
229
|
end
|
259
230
|
|
231
|
+
private
|
232
|
+
|
233
|
+
|
234
|
+
def entry_match_key(e)
|
235
|
+
e.strip.upcase
|
236
|
+
end
|
237
|
+
|
260
238
|
|
261
239
|
end
|
262
240
|
|