vaextractor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/vaextractor.rb +263 -0
  3. metadata +74 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 99733aa02a6893e211ea821ee43ad63b558b161a
4
+ data.tar.gz: 21f5b410828fe8e1e1f66dd72308f726ce4a54e8
5
+ SHA512:
6
+ metadata.gz: 30a3983c99e90003bcfffc1b85d98162402d3d298ec33af539cdf076d7fdfda744a7766c0bcdd3b09117a375d0da014f7dd76a9406ba0a2ecc72e1f959324f44
7
+ data.tar.gz: ebe76eb202d0ff2819f2a66d7f25006cc35d3bf6f21ae89dd9a76a7f178e71418194705cf2e1becb79b846d7507656d563314ddaef97cdeff1c029c290f8a2c2
@@ -0,0 +1,263 @@
1
+ #!/usr/bin/env ruby
2
+ # Copyright 2016 Aaron Y. Lee MD MSCI
3
+ # University of Washington, Seattle WA
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+ #
18
+ #
19
+ require 'textoken'
20
+
21
# Rule-based extractor that pulls Snellen-style visual acuities (e.g. "20/40-1",
# "CF @ 3 ft") out of free text, assigns each one to the right (OD) or left (OS)
# eye, and reports the best value per eye together with its logMAR equivalent.
#
# Tokenisation is delegated to the Textoken gem (Textoken(text).tokens).
class VAExtractor
  # Raised by #extract when an acuity is found but no eye can be assigned to it.
  class ErrorLateralityNotFound < StandardError; end

  # Matches either a fraction acuity (20/x, 3E/x, E/x with optional +/- letters)
  # or a low-vision keyword (HM, CF, LP, NLP) with an optional distance.
  # Capture 1 is the leading delimiter; callers drop it before using a row.
  VA_REGEX = /(\s|^|~|:)(20|3E|E)\/\s*(\d+)\s*([+|-])*\s*(\d)*|(HM|CF|LP|NLP)(\W+(@|at|x)*\s*((\d+)(\s*'|\s*"|\s*in|\s*ft|\s*feet)*|face)*|$)/

  # Denominators of the chart lines used to interpolate "+n" / "-n" modifiers.
  SNELLEN_LEVELS = [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100, 125, 150, 200, 250, 300, 400, 600, 800].freeze

  # Upper-cased token => canonical eye code.
  VALID_TOKENS = {
    "OD" => "OD", "RE" => "OD", "RIGHT" => "OD", "R" => "OD",
    "L" => "OS", "OS" => "OS", "LE" => "OS", "LEFT" => "OS",
    "BOTH" => "OU", "BE" => "OU", "OU" => "OU", "BILATERAL" => "OU"
  }.freeze

  # Fixed logMAR values assigned to the low-vision keywords.
  LOW_VISION_LOGMAR = { "CF" => 2.0, "HM" => 2.4, "LP" => 2.7, "NLP" => 3.0 }.freeze

  # Penalty added for each separator token lying between an eye token and an acuity.
  WALLS = { "." => 10, "!" => 10, "?" => 10, "," => 5, "and" => 5 }.freeze

  # Tokenises +s+ and pairs every token with its character offset in +s+.
  #
  # The search cursor is moved past each located token so that a repeated token
  # is reported at its own offset instead of the offset of the previous copy.
  # Tokens that cannot be located in +s+ are skipped.
  #
  # @param s [String] a line (or whole note) of text
  # @return [Array<Array(String, Integer)>] [token, offset] pairs in text order
  def aligntokens(s)
    cursor = 0
    Textoken(s).tokens.each_with_object([]) do |token, aligned|
      start = s.index(token, cursor)
      next if start.nil?
      aligned << [token, start]
      cursor = start + token.size
    end
  end

  # Last-resort laterality guess: the eye (OD or OS) mentioned most often in the
  # whole text. Tokens meaning "both eyes" are ignored.
  #
  # @param rawtext [String]
  # @return [String, nil] "OD", "OS", or nil when no single-eye token occurs
  def runentirefreq(rawtext)
    counts = Hash.new(0)
    aligntokens(rawtext).each do |token, _offset|
      eye = VALID_TOKENS[token.upcase]
      counts[eye] += 1 if eye && eye != "OU"
    end
    return nil if counts.empty?
    counts.sort_by { |_eye, n| n }.last.first
  end

  # Converts one regex row (leading delimiter already removed) to logMAR.
  #
  # Row layout: [numerator, denominator, sign, letter count, low-vision keyword,
  # _, _, _, distance, distance unit].
  #
  # "+n" / "-n" shifts the value n fifths of the way towards the neighbouring
  # chart line (smaller denominator for "+", larger for "-"); a missing count
  # means 1. When there is no neighbouring line the base value is returned
  # unchanged. Any positive denominator is accepted, not only chart lines.
  # +va+ is not modified.
  #
  # @param va [Array<String, nil>]
  # @return [Array] [logMAR Float, 4-element description], or [nil, nil] when
  #   the row holds no usable acuity
  def logmar(va)
    numerator = va[0]
    if numerator == "20"
      denom = va[1].to_i
      return nil, nil unless denom > 0
      value = snellen_logmar(denom)
      sign = va[2]
      count = va[3]
      if sign == "+" || sign == "-"
        count ||= "1"
        neighbour = neighbour_level(denom, sign)
        value = 1.0 * count.to_i * (snellen_logmar(neighbour) - value) / 5.0 + value if neighbour
      end
      return value, [numerator, va[1], sign, count]
    elsif numerator == "3E" || numerator == "3" || numerator == "E"
      denom = va[1].to_i
      return nil, nil unless denom > 0
      return -Math.log(3.0 / denom) / Math.log(10.0), va[0...4]
    elsif LOW_VISION_LOGMAR.key?(va[4])
      return LOW_VISION_LOGMAR[va[4]], [va[4], va[8], va[9], nil]
    end
    return nil, nil
  end

  # Scans +lines+ in the given order for the first single-eye token.
  # The search stops at the first blank line.
  #
  # @param lines [Array<String>] candidate lines, nearest first
  # @return [String, nil] "OD", "OS", or nil
  def searchpriorlines(lines)
    lines.each do |line|
      return nil if line.strip == ""
      Textoken(line).tokens.each do |token|
        eye = VALID_TOKENS[token.upcase]
        return eye if eye && eye != "OU"
      end
    end
    nil
  end

  # Chooses the eye token on a line that best explains the acuity at +pos+.
  #
  # Every eye token is scored by the summed WALLS penalties of the tokens lying
  # between it and +pos+; among the lowest-scoring tokens the closest one wins.
  # Wall tokens are matched case-insensitively, like the eye tokens.
  #
  # @param pos [Integer] character offset of the acuity match
  # @param tokens [Array<Array(String, Integer)>] output of #aligntokens
  # @param linestr [String] the line itself (currently unused)
  # @return [Array(String, Hash), nil] [eye, {score => [[eye, distance], ...]}],
  #   or nil when the line has no eye token
  def findlaterality(pos, tokens, linestr)
    answers = {}
    tokens.each do |token, start|
      eye = VALID_TOKENS[token.upcase]
      next unless eye
      lo, hi = start > pos ? [pos, start] : [start, pos]
      score = tokens.inject(0) do |sum, (other, at)|
        at >= lo && at < hi ? sum + WALLS.fetch(other.downcase, 0) : sum
      end
      (answers[score] ||= []) << [eye, (start - pos).abs]
    end
    return nil if answers.empty?

    least_walled = answers.min_by { |score, _candidates| score }.last
    return least_walled.min_by { |_eye, distance| distance }.first, answers
  end

  # Extracts the best acuity per eye from a free-text note.
  #
  # Laterality is resolved, in decreasing order of trust:
  # 1. an eye token on the same line (priority 5);
  # 2. an eye token in the up-to-three preceding lines (priority 3);
  # 3. position: this line = OD and the next line = OS, or two acuities on one
  #    line = OD then OS (priority 0);
  # 4. the eye mentioned most often in the whole note (priority 0).
  # Steps 2-4 are only attempted while no acuity has been assigned yet.
  #
  # @param rawtext [String]
  # @return [Hash] :RE / :LE hold the 4-element description of the chosen
  #   acuity and :RElogmar / :LElogmar its logMAR rounded to 4 decimals;
  #   all four values are nil when the note contains no acuity
  # @raise [ErrorLateralityNotFound] when an acuity cannot be assigned to an eye
  def extract(rawtext)
    lines = rawtext.split("\n")
    vas = { "OD" => [], "OS" => [] }
    alreadychecked = {}

    lines.each_with_index do |line, i|
      next if ignored_line?(line) || alreadychecked.key?(i)

      same_line_rows = scan_rows(line)
      line.enum_for(:scan, VA_REGEX).each do |val|
        # Must be read before any other regex runs in this method.
        pos = Regexp.last_match.begin(0)
        val.shift
        next if implausible?(val)

        lat, = findlaterality(pos, aligntokens(line), line)
        if lat == "OU"
          vas["OD"] << [val, i, 5]
          vas["OS"] << [val, i, 5]
        elsif lat
          vas[lat] << [val, i, 5]
        elsif vas["OD"].empty? && vas["OS"].empty?
          # Clamp at the start of the note so the window never wraps to its end.
          lat = searchpriorlines(lines[[i - 3, 0].max...i].reverse)
          if lat
            vas[lat] << [val, i, 3]
            next
          end

          # most likely the VAs are either two in one line OD/OS or on two consecutive lines
          next_line_rows = scan_rows(lines[i + 1].to_s)
          if !next_line_rows.empty?
            same_line_rows.each { |row| vas["OD"] << [row, i, 0] unless implausible?(row) }
            next_line_rows.each { |row| vas["OS"] << [row, i + 1, 0] unless implausible?(row) }
            alreadychecked[i + 1] = 1
          elsif same_line_rows.count == 2
            next if same_line_rows.any? { |row| implausible?(row) }
            vas["OD"] << [same_line_rows[0], i, 0]
            vas["OS"] << [same_line_rows[1], i, 0]
          else
            # worst case scenario, count up all the occurrences of r/l and take the most frequent
            lat = runentirefreq(rawtext)
            raise ErrorLateralityNotFound, "laterality not found for #{val.inspect}" if lat.nil?
            vas[lat] << [val, i, 0]
          end
        end
      end
    end

    if vas["OD"].empty? && vas["OS"].empty?
      return { :RE => nil, :LE => nil, :RElogmar => nil, :LElogmar => nil }
    end

    od = best_acuity(vas["OD"])
    os = best_acuity(vas["OS"])
    { :RE => od[2], :LE => os[2], :RElogmar => od[1], :LElogmar => os[1] }
  end

  private

  # logMAR of a 20/denom acuity.
  def snellen_logmar(denom)
    -Math.log(20.0 / denom) / Math.log(10.0)
  end

  # Chart line next to +denom+ in the direction of +sign+, or nil at the chart edge.
  def neighbour_level(denom, sign)
    if sign == "+"
      SNELLEN_LEVELS.select { |level| level < denom }.max
    else
      SNELLEN_LEVELS.select { |level| level > denom }.min
    end
  end

  # True for lines starting with IOP / Ta / Tp, which are never scanned
  # (presumably pressure readings that resemble acuities -- TODO confirm).
  def ignored_line?(line)
    stripped = line.strip
    stripped =~ /^IOP/ || stripped =~ /^Ta\s/ || stripped =~ /^Tp/i
  end

  # All regex rows of +text+ with the leading delimiter capture removed.
  def scan_rows(text)
    text.scan(VA_REGEX).map { |row| row.drop(1) }
  end

  # A "+n" / "-n" count of five or more is rejected as a false match.
  def implausible?(row)
    row[3].to_i >= 5
  end

  # Picks the winning candidate for one eye: a candidate replaces the current
  # best when its priority is not lower and its logMAR is strictly smaller.
  # Candidates without a usable logMAR are skipped.
  #
  # @param candidates [Array<Array(Array, Integer, Integer)>] [row, line, priority]
  # @return [Array] [priority, rounded logMAR, description]; all-nil when empty
  def best_acuity(candidates)
    best = nil
    candidates.each do |row, _line, priority|
      value, parts = logmar(row)
      next if value.nil?
      if best.nil? || (best[0] <= priority && value < best[1])
        best = [priority, value.round(4), parts]
      end
    end
    best || [nil, nil, [nil, nil, nil, nil]]
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vaextractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Aaron Y. Lee MD MSCI
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-10-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: textoken
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 1.1.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 1.1.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 5.9.1
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 5.9.1
41
+ description: |
42
+ vaextractor uses rule-based NLP strategy to extract Snellen visual acuities
43
+ from unstructured ophthalmology clinical notes.
44
+ email: aaronylee@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - lib/vaextractor.rb
50
+ homepage: http://github.org/ayl/vaextractor
51
+ licenses:
52
+ - GNU GPLv3
53
+ metadata: {}
54
+ post_install_message:
55
+ rdoc_options: []
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ requirements: []
69
+ rubyforge_project:
70
+ rubygems_version: 2.4.8
71
+ signing_key:
72
+ specification_version: 4
73
+ summary: Rule based NLP library to extract visual acuities
74
+ test_files: []