excite 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
@@ -0,0 +1,306 @@
|
|
1
|
+
require 'excite/crfparser'
|
2
|
+
|
3
|
+
# Filesystem layout for the model evaluation script.
#
# Everything is derived from the directory holding this file, so the script
# can be launched from any working directory. The strings are frozen because
# they are shared, process-wide constants.

# Directory containing this script; scratch files and reports are written here.
DIR = File.dirname(__FILE__).freeze
# Repository root, two levels above this script.
ROOT_DIR = "#{DIR}/../..".freeze
RESOURCES_DIR = "#{ROOT_DIR}/lib/excite/resources".freeze

# Tagged corpora the folds are drawn from (plain-string and HTML modes).
TAGGED_REFERENCES = "#{RESOURCES_DIR}/trainingdata/tagged_references.txt".freeze
TAGGED_HTML_REFERENCES = "#{RESOURCES_DIR}/trainingdata/tagged_html_references.txt".freeze

# Scratch files rewritten on every fold and removed by ModelTest#cleanup.
TRAINING_DATA = "#{DIR}/training_data.txt".freeze
TESTING_DATA = "#{DIR}/testing_data.txt".freeze
TRAINING_REFS = "#{DIR}/training_refs.txt".freeze
TESTING_REFS = "#{DIR}/testing_refs.txt".freeze
MODEL_FILE = "#{DIR}/model".freeze

# Feature templates handed to the trainer, one per mode.
TEMPLATE_FILE = "#{RESOURCES_DIR}/parsCit.template".freeze
HTML_TEMPLATE_FILE = "#{RESOURCES_DIR}/html.template".freeze

# Raw tagger output and the CSV report derived from it, one pair per mode.
OUTPUT_FILE = "#{DIR}/output.txt".freeze
HTML_OUTPUT_FILE = "#{DIR}/html-output.txt".freeze
ANALYSIS_FILE = "#{DIR}/analysis.csv".freeze
HTML_ANALYSIS_FILE = "#{DIR}/html-analysis.csv".freeze

# File-name prefixes of the per-fold files ("<prefix><fold index>.txt").
REFS_PREFIX = "training_refs_".freeze
DATA_PREFIX = "training_data_".freeze

# Prefix of the git tags created by ModelTest#run_test.
TAG = "model_test".freeze
|
22
|
+
|
23
|
+
require "#{ROOT_DIR}/model/test/array_helpers"
|
24
|
+
|
25
|
+
# Mixes the project's ArrayHelpers module into every Array.
# NOTE(review): ModelTest#analyze calls #mean and #stddev on plain arrays;
# presumably this mixin is what supplies them -- confirm against
# model/test/array_helpers.rb.
class Array
  include ArrayHelpers
end
|
28
|
+
|
29
|
+
module Excite
|
30
|
+
|
31
|
+
class ModelTest
|
32
|
+
|
33
|
+
# Path of the CSV report for the current mode.
#
# @return [String] HTML_ANALYSIS_FILE when the mode is :html, otherwise
#   ANALYSIS_FILE
def analysis_file
  @mode == :html ? HTML_ANALYSIS_FILE : ANALYSIS_FILE
end
|
40
|
+
|
41
|
+
# Path of the file that collects the raw tagger output for the current mode.
#
# @return [String] HTML_OUTPUT_FILE when the mode is :html, otherwise
#   OUTPUT_FILE
def output_file
  @mode == :html ? HTML_OUTPUT_FILE : OUTPUT_FILE
end
|
48
|
+
|
49
|
+
# Path of the feature template used to train the model for the current mode.
#
# @return [String] HTML_TEMPLATE_FILE when the mode is :html, otherwise
#   TEMPLATE_FILE
def template_file
  @mode == :html ? HTML_TEMPLATE_FILE : TEMPLATE_FILE
end
|
56
|
+
|
57
|
+
# Path of the hand-tagged corpus the folds are drawn from for the current mode.
#
# @return [String] TAGGED_HTML_REFERENCES when the mode is :html, otherwise
#   TAGGED_REFERENCES
def tagged_references
  @mode == :html ? TAGGED_HTML_REFERENCES : TAGGED_REFERENCES
end
|
64
|
+
|
65
|
+
# Builds a tester for one parsing mode.
#
# @param mode [Symbol] :html selects the HTML corpus, template and report
#   files; any other value selects the plain-string ones. The same value is
#   handed to CRFParser.new.
def initialize(mode = :string)
  @mode = mode
  @crf = CRFParser.new(mode)
end
|
69
|
+
|
70
|
+
# One-line description ("<sha> <subject>") of the commit checked out in
# ROOT_DIR, memoised after the first call.
#
# git is invoked with an argument vector and `-C`, so no shell is involved:
# a path containing spaces cannot be mis-split, and git never runs in the
# wrong directory because a preceding `cd` failed.
#
# @return [String] the commit line, or "" when git is unavailable or
#   ROOT_DIR is not inside a repository
def version
  @version ||= IO.popen(['git', '-C', ROOT_DIR, 'log', '-1', '--pretty=oneline'], &:read).strip
rescue Errno::ENOENT
  # git is not installed; behave like the shell pipeline did and report nothing.
  @version = ''
end
|
73
|
+
|
74
|
+
# Name of the branch checked out in ROOT_DIR, memoised after the first call.
#
# Uses `git rev-parse --abbrev-ref HEAD` rather than scraping `git branch`:
# on a detached HEAD the latter prints "* (HEAD detached at ...)", from which
# the old pattern extracted the fragment "(HEAD".
#
# @return [String, nil] the branch name ("HEAD" when detached), or nil when
#   git is unavailable or ROOT_DIR is not inside a repository
def branch
  # defined? (not nil?) so that a nil result is remembered as well.
  return @branch if defined?(@branch)

  name = IO.popen(['git', '-C', ROOT_DIR, 'rev-parse', '--abbrev-ref', 'HEAD'], &:read).strip
  @branch = name.empty? ? nil : name
rescue Errno::ENOENT
  @branch = nil
end
|
82
|
+
|
83
|
+
# Visits every local branch and lists the model-test tags (TAG*).
#
# FIXME: the tag list is computed for each branch but nothing consumes it
# yet, so this method is effectively unfinished. As before, it returns the
# list of branch names.
#
# Differences from the shell-string version:
# * the TAG* pattern is passed to git verbatim; previously the shell could
#   expand it against files in the working directory (e.g. model_test.rb)
#   before git ever saw it;
# * the branch that was checked out on entry is restored afterwards instead
#   of leaving the repository on whichever branch was visited last.
#
# @return [Array<String>] names of the local branches that were visited
def aggregate_tags
  starting_point = `git rev-parse --abbrev-ref HEAD`.strip
  branches = `git branch`.gsub(/\*/, '').strip.split(/\s+/)

  begin
    branches.each do |name|
      system('git', 'checkout', name, out: File::NULL)
      IO.popen(['git', 'tag', '-l', "#{TAG}*"], &:read).strip.split(/\s+/)
    end
  ensure
    # "HEAD" means we started detached; there is no branch name to return to.
    unless starting_point.empty? || starting_point == 'HEAD'
      system('git', 'checkout', starting_point, out: File::NULL)
    end
  end

  branches
end
|
90
|
+
|
91
|
+
# def benchmark
|
92
|
+
# refs = []
|
93
|
+
# f = File.open(TRAINING_REFS, 'r')
|
94
|
+
# while line = f.gets
|
95
|
+
# refs << line.strip
|
96
|
+
# end
|
97
|
+
# # strip out tags
|
98
|
+
# refs.map! {|s| s.gsub(/<[^>]*>/, '')}
|
99
|
+
# # parse one string, since the lexicon is lazily evaluated
|
100
|
+
# Citation.create_from_string(refs.first)
|
101
|
+
# time = Benchmark.measure {
|
102
|
+
# refs.each {|ref| Citation.create_from_string(ref) }
|
103
|
+
# }
|
104
|
+
# return (time.real / refs.length.to_f)
|
105
|
+
# end
|
106
|
+
|
107
|
+
# Runs a full k-fold evaluation and optionally records it in git.
#
# The tag name is validated before any work starts, so a missing tag is
# reported immediately rather than after the whole cross-validation has run.
# git is invoked with argument vectors, so a commit message containing quotes
# or other shell metacharacters is passed through untouched.
#
# @param commit [Boolean] when true, add/commit the report files and tag the
#   commit as "<TAG>_<tag_name>_<accuracy>"
# @param commit_message [String] message for that commit
# @param tag_name [String] middle part of the tag; required when committing
# @param k [Integer] number of folds
# @return [Float] the overall accuracy reported by #analyze
# @raise [RuntimeError] if commit is requested without a tag name, or if one
#   of the git commands fails
def run_test(commit = false, commit_message = "evaluating model", tag_name = '', k = 10)
  if commit && tag_name.to_s.strip.empty?
    raise "You must supply a tag name if you want to commit and tag this test"
  end

  cross_validate(k)
  accuracy = analyze(k)
  return accuracy unless commit

  files = [analysis_file, output_file]

  puts "Adding test files to index \ngit add #{files.join(' ')}"
  system('git', 'add', *files) || raise("git add failed")

  puts "Committing files to source control \ngit commit --message '#{commit_message}' #{files.join(' ')}"
  system('git', 'commit', '--message', commit_message, *files) || raise("git commit failed")

  tag = "#{TAG}_#{tag_name.strip}_#{accuracy}"
  puts "Tagging: \ngit tag #{tag}"
  system('git', 'tag', tag) || raise("git tag failed")

  accuracy
end
|
131
|
+
|
132
|
+
# Deletes the scratch files produced by a run: the combined training/testing
# files, the model, and every per-fold data/refs file in DIR.
#
# Done with Dir.glob/File.delete instead of `rm -f`, so paths containing
# spaces or shell metacharacters are handled correctly. Files that do not
# exist are skipped silently, matching `rm -f`.
#
# @return [Array<String>] the candidate paths that were considered
def cleanup
  candidates = [TRAINING_DATA, TESTING_DATA, TRAINING_REFS, TESTING_REFS, MODEL_FILE]
  candidates += Dir.glob("#{DIR}/#{DATA_PREFIX}*txt")
  candidates += Dir.glob("#{DIR}/#{REFS_PREFIX}*txt")
  candidates.each { |path| File.delete(path) if File.file?(path) }
end
|
136
|
+
|
137
|
+
# Runs k-fold cross-validation.
#
# After #generate_data has produced one data file per fold, each fold in turn
# is held out: the remaining folds are concatenated into TRAINING_DATA, a
# model is trained, the held-out fold is copied to TESTING_DATA and tagged.
# The tagger output of every fold accumulates in #output_file, which is
# emptied first.
#
# Files are assembled with Ruby I/O rather than shelling out to
# rm/touch/cat, so paths are never re-parsed by a shell and a missing fold
# file raises instead of being silently skipped.
#
# @param k [Integer] number of folds
# @return [Integer] k
def cross_validate(k = 10)
  generate_data(k)
  File.write(output_file, '') # start from an empty output file

  k.times do |held_out|
    puts "Performing iteration #{held_out + 1} of #{k}-fold cross validation"

    File.open(TRAINING_DATA, 'wb') do |training|
      k.times do |fold|
        next if fold == held_out

        IO.copy_stream("#{DIR}/#{DATA_PREFIX}#{fold}.txt", training)
      end
    end

    puts "Training model"
    train

    IO.copy_stream("#{DIR}/#{DATA_PREFIX}#{held_out}.txt", TESTING_DATA)
    puts "Testing model"
    test
  end
end
|
157
|
+
|
158
|
+
# Splits the tagged corpus into k folds and writes, for each fold i,
#   DIR/<REFS_PREFIX>i.txt  - the raw tagged references, one per line
#   DIR/<DATA_PREFIX>i.txt  - the feature file produced from them by
#                             @crf.write_training_file
#
# The references are shuffled and then dealt out round-robin, so fold sizes
# differ by at most one. (Assigning each line to an independently random fold
# could leave folds empty or badly unbalanced on small corpora.)
#
# @param k [Integer] number of folds
# @return [Array<Array<String>>] the references assigned to each fold
def generate_data(k = 10)
  references = File.readlines(tagged_references).map(&:strip)

  folds = Array.new(k) { [] }
  references.shuffle.each_with_index { |reference, idx| folds[idx % k] << reference }

  folds.each_with_index do |fold, i|
    refs_path = "#{DIR}/#{REFS_PREFIX}#{i}.txt"
    File.open(refs_path, 'w') { |f| f.write(fold.join("\n")) }
    @crf.write_training_file(refs_path, "#{DIR}/#{DATA_PREFIX}#{i}.txt")
  end
end
|
178
|
+
|
179
|
+
# Trains a model on the current TRAINING_DATA file and stores it in
# MODEL_FILE, using the feature template for the current mode.
#
# NOTE(review): TRAINING_REFS is passed but nothing in this file ever writes
# it; presumably the parser ignores it when the pre-built TRAINING_DATA
# argument is supplied -- confirm against CRFParser#train.
#
# @return whatever CRFParser#train returns
def train
  @crf.train(TRAINING_REFS, MODEL_FILE, template_file, TRAINING_DATA)
end
|
182
|
+
|
183
|
+
# Tags TESTING_DATA with the model in MODEL_FILE by running the external
# `crf_test` program, appending its standard output to #output_file.
#
# The program is started with an argument vector and an explicit append
# redirect instead of a shell string, so paths with spaces or shell
# metacharacters work. A failed or missing crf_test is reported on stderr,
# because the analysis that follows would otherwise silently score an
# incomplete output file.
#
# @return [Boolean, nil] true on success, false on a non-zero exit status,
#   nil if crf_test could not be started
def test
  str = "crf_test -m #{MODEL_FILE} #{TESTING_DATA} >> #{output_file}"
  puts str

  succeeded = system('crf_test', '-m', MODEL_FILE, TESTING_DATA, out: [output_file, 'a'])
  warn "crf_test did not complete successfully; #{output_file} may be incomplete" unless succeeded
  succeeded
end
|
188
|
+
|
189
|
+
# Scores the accumulated tagger output and writes a CSV report to
# #analysis_file.
#
# Each non-blank line of #output_file is expected to end with two columns,
# "<true label> <predicted label>"; blank lines separate references. The
# report contains run metadata, the confusion matrix (counts, then row
# fractions), per-label precision/recall/F-measure, per-reference accuracy
# statistics, the number of perfectly parsed references and the overall
# token accuracy.
#
# Fixes relative to the previous implementation:
# * runs of blank lines no longer create empty "references" (these were
#   counted as perfect parses and injected NaN into the averages);
# * a final reference that is not followed by a blank line is no longer lost;
# * ratios with a zero denominator are reported as 0.0 instead of NaN;
# * files are closed even when an error interrupts processing.
#
# @param k [Integer] number of folds, recorded in the report only
# @return [Float] overall token accuracy (0.0 when there is no output)
# @raise [ArgumentError] if the output contains a label that is absent from
#   the training/testing data
def analyze(k)
  corpus_size = File.foreach(tagged_references).count
  labels = collect_labels([TRAINING_DATA, TESTING_DATA])
  references = tally_references(output_file, labels)

  # Sum the per-reference matrices into one: total[truth][predicted].
  total = {}
  labels.each do |truth|
    total[truth] = {}
    labels.each do |guess|
      total[truth][guess] = references.map { |r| r[truth][guess] }.sum
    end
  end

  correct = labels.map { |label| total[label][label] }.sum
  tokens = labels.map { |label| total[label].values.sum }.sum
  accuracy = safe_ratio(correct, tokens)

  File.open(analysis_file, 'w') do |f|
    f.write "Results for model\n branch: #{branch}\n version: #{version}\n"
    f.write "Test run on:,#{Time.now}\n"
    f.write "K-fold x-validation:,#{k}\n"
    f.write "Corpus size:,#{corpus_size}\n\n"

    # Confusion matrix: rows are true labels, columns are predicted labels.
    f.write 'truth\test,'
    f.write labels.join(',')
    f.write "\n"
    # first, by counts
    labels.each do |truth|
      f.write "#{truth},"
      f.write labels.map { |guess| total[truth][guess] }.join(',')
      f.write "\n"
    end
    # then as a fraction of each row
    labels.each do |truth|
      row_sum = total[truth].values.sum
      f.write "#{truth},"
      f.write labels.map { |guess| safe_ratio(total[truth][guess], row_sum) }.join(',')
      f.write "\n"
    end

    # precision and recall by label
    f.write "\n"
    f.write "Label,Precision,Recall,F-measure\n"
    labels.each do |label|
      predicted = labels.map { |truth| total[truth][label] }.sum
      precision = safe_ratio(total[label][label], predicted)
      recall = safe_ratio(total[label][label], total[label].values.sum)
      f_measure = safe_ratio(2 * precision * recall, precision + recall)
      f.write "#{label},#{precision},#{recall},#{f_measure}\n"
    end

    # accuracy of each individual reference
    perfect = 0
    avgs = references.map do |r|
      hits = labels.map { |label| r[label][label] }.sum
      size = labels.map { |label| r[label].values.sum }.sum
      perfect += 1 if hits == size
      hits.to_f / size # size > 0: empty references are never recorded
    end
    f.write "\nAverage accuracy by reference:,#{avgs.mean}\n"
    f.write "STD of Average accuracy by reference:,#{avgs.stddev}\n"

    # number of perfectly parsed references
    f.write "Perfect parses:,#{perfect},#{safe_ratio(perfect, references.length)}\n"

    # Total accuracy
    f.write "Accuracy:, #{accuracy}\n"
  end

  accuracy
end

# Sorted list of the distinct labels found in the last column of the given
# whitespace-separated data files; blank lines are ignored.
def collect_labels(paths)
  found = {}
  paths.each do |path|
    File.foreach(path) do |line|
      label = line.split.last
      found[label] = true if label
    end
  end
  found.keys.sort
end

# Reads tagger output and returns one confusion matrix (see #new_hash) per
# reference, counting matrix[truth][predicted] for every token line.
def tally_references(path, labels)
  references = []
  current = new_hash(labels)
  pending = false # true once `current` holds at least one token

  File.foreach(path) do |line|
    fields = line.split
    if fields.empty?
      if pending
        references << current
        current = new_hash(labels)
        pending = false
      end
      next
    end

    truth = fields[-2]
    guess = fields[-1]
    row = current[truth]
    unless row && row.key?(guess)
      raise ArgumentError, "unexpected labels #{truth.inspect}/#{guess.inspect} in #{path}"
    end

    row[guess] += 1
    pending = true
  end

  references << current if pending
  references
end

# numerator / denominator as a Float, or 0.0 when the denominator is zero.
def safe_ratio(numerator, denominator)
  denominator.zero? ? 0.0 : numerator.to_f / denominator
end

private :collect_labels, :tally_references, :safe_ratio
|
292
|
+
|
293
|
+
private
|
294
|
+
# Builds an empty confusion matrix: a hash of hashes in which every
# (label, label) pair maps to 0. Each row is a separate Hash, so incrementing
# one cell never affects another row.
#
# @param labels [Array<String>] the label set
# @return [Hash{String => Hash{String => Integer}}]
def new_hash(labels)
  labels.each_with_object({}) do |truth, matrix|
    matrix[truth] = labels.each_with_object({}) { |guess, row| row[guess] = 0 }
  end
end
|
304
|
+
end
|
305
|
+
|
306
|
+
end
|