excite 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
@@ -0,0 +1,306 @@
|
|
1
|
+
require 'excite/crfparser'
|
2
|
+
|
3
|
+
# --- Directory layout, resolved relative to this script ---------------------
DIR           = File.dirname(__FILE__)
ROOT_DIR      = "#{DIR}/../.."
RESOURCES_DIR = "#{ROOT_DIR}/lib/excite/resources"

# --- Hand-tagged corpora shipped with the gem (plain-text and HTML) ---------
TAGGED_REFERENCES      = "#{RESOURCES_DIR}/trainingdata/tagged_references.txt"
TAGGED_HTML_REFERENCES = "#{RESOURCES_DIR}/trainingdata/tagged_html_references.txt"

# --- Scratch files created next to this script during a run -----------------
TRAINING_DATA = "#{DIR}/training_data.txt"
TESTING_DATA  = "#{DIR}/testing_data.txt"
TRAINING_REFS = "#{DIR}/training_refs.txt"
TESTING_REFS  = "#{DIR}/testing_refs.txt"
MODEL_FILE    = "#{DIR}/model"

# --- CRF feature templates, one per parsing mode ----------------------------
TEMPLATE_FILE      = "#{RESOURCES_DIR}/parsCit.template"
HTML_TEMPLATE_FILE = "#{RESOURCES_DIR}/html.template"

# --- Raw tagger output and the CSV report derived from it -------------------
OUTPUT_FILE        = "#{DIR}/output.txt"
HTML_OUTPUT_FILE   = "#{DIR}/html-output.txt"
ANALYSIS_FILE      = "#{DIR}/analysis.csv"
HTML_ANALYSIS_FILE = "#{DIR}/html-analysis.csv"

# --- Per-fold file name prefixes (fold index and ".txt" are appended) -------
REFS_PREFIX = "training_refs_"
DATA_PREFIX = "training_data_"

# Prefix of the git tags created by ModelTest#run_test.
TAG = "model_test"
|
22
|
+
|
23
|
+
require "#{ROOT_DIR}/model/test/array_helpers"
|
24
|
+
|
25
|
+
# Mix the project's ArrayHelpers module into every Array.
# NOTE(review): ModelTest#analyze calls #mean and #stddev on arrays; presumably
# those come from ArrayHelpers -- confirm against model/test/array_helpers.rb.
Array.send(:include, ArrayHelpers)
|
28
|
+
|
29
|
+
module Excite
|
30
|
+
|
31
|
+
class ModelTest
|
32
|
+
|
33
|
+
# Path of the CSV report for the current mode: HTML_ANALYSIS_FILE when the
# tester was created with mode :html, ANALYSIS_FILE for any other mode.
def analysis_file
  @mode == :html ? HTML_ANALYSIS_FILE : ANALYSIS_FILE
end
|
40
|
+
|
41
|
+
# Path of the raw tagger output for the current mode: HTML_OUTPUT_FILE when
# the tester was created with mode :html, OUTPUT_FILE for any other mode.
def output_file
  @mode == :html ? HTML_OUTPUT_FILE : OUTPUT_FILE
end
|
48
|
+
|
49
|
+
# Path of the CRF feature template for the current mode: HTML_TEMPLATE_FILE
# when the tester was created with mode :html, TEMPLATE_FILE otherwise.
def template_file
  @mode == :html ? HTML_TEMPLATE_FILE : TEMPLATE_FILE
end
|
56
|
+
|
57
|
+
# Path of the hand-tagged corpus for the current mode: TAGGED_HTML_REFERENCES
# when the tester was created with mode :html, TAGGED_REFERENCES otherwise.
def tagged_references
  @mode == :html ? TAGGED_HTML_REFERENCES : TAGGED_REFERENCES
end
|
64
|
+
|
65
|
+
# Builds a tester for one parsing mode.
#
# mode - passed straight to CRFParser.new and remembered so the *_file
#        helpers can pick the matching paths; :html selects the HTML files,
#        every other value (default :string) selects the plain-text ones.
def initialize(mode = :string)
  @mode = mode
  @crf = CRFParser.new(@mode)
end
|
69
|
+
|
70
|
+
# One-line description ("<sha> <subject>") of the commit checked out in
# ROOT_DIR, memoized after the first call.
#
# git is started directly with ROOT_DIR as its working directory instead of
# going through `cd ...; git ...` in a shell: the path is never re-parsed by
# the shell (spaces are safe) and git can no longer run in the wrong
# directory when the `cd` fails.
#
# Returns a String; empty when git cannot be started or prints nothing.
def version
  @version ||= begin
    shown = IO.popen(%w[git show -s --pretty=oneline HEAD], :chdir => ROOT_DIR) { |io| io.read }
    shown.lines.first.to_s.strip
  rescue SystemCallError
    # git missing or ROOT_DIR unreachable: keep the old best-effort result.
    ''
  end
end
|
73
|
+
|
74
|
+
# Name of the branch currently checked out in ROOT_DIR (the entry that
# `git branch` marks with "*"), memoized once a name has been found.
#
# git is started directly with ROOT_DIR as its working directory rather than
# through `cd ...; git ...` in a shell, so paths containing spaces work and
# git never runs in the wrong directory.
#
# Returns a String, or nil when no starred entry is found (the lookup is then
# retried on the next call, as before).
def branch
  return @branch unless @branch.nil?

  listing = begin
    IO.popen(%w[git branch], :chdir => ROOT_DIR) { |io| io.read }
  rescue SystemCallError
    # git missing or ROOT_DIR unreachable: behave as if nothing was listed.
    ''
  end
  # First non-space token after the "*" marker.
  @branch = listing[/\*\s+(\S+)/, 1]
end
|
82
|
+
|
83
|
+
# Walks every local branch of the repository in the current working
# directory, checking each one out and listing its tags that start with TAG.
#
# NOTE(review): the tag list is computed and then discarded, and the working
# tree is left on the last branch visited -- this looks unfinished.
#
# Returns the Array of branch names that were visited.
def aggregate_tags
  branch_names = `git branch`.delete('*').strip.split(/\s+/)
  branch_names.each do |name|
    `git checkout #{name}`
    `git tag -l #{TAG}\*`.strip.split(/\s+/)
  end
end
|
90
|
+
|
91
|
+
# def benchmark
|
92
|
+
# refs = []
|
93
|
+
# f = File.open(TRAINING_REFS, 'r')
|
94
|
+
# while line = f.gets
|
95
|
+
# refs << line.strip
|
96
|
+
# end
|
97
|
+
# # strip out tags
|
98
|
+
# refs.map! {|s| s.gsub(/<[^>]*>/, '')}
|
99
|
+
# # parse one string, since the lexicon is lazily evaluated
|
100
|
+
# Citation.create_from_string(refs.first)
|
101
|
+
# time = Benchmark.measure {
|
102
|
+
# refs.each {|ref| Citation.create_from_string(ref) }
|
103
|
+
# }
|
104
|
+
# return (time.real / refs.length.to_f)
|
105
|
+
# end
|
106
|
+
|
107
|
+
# Runs a full k-fold evaluation and optionally records it in git.
#
# commit         - when true, add + commit the analysis and output files and
#                  tag the commit as "<TAG>_<tag_name>_<accuracy>".
# commit_message - message for that commit.
# tag_name       - middle part of the tag; required when commit is true.
# k              - number of cross-validation folds.
#
# Raises RuntimeError when commit is requested without a tag name. The check
# now happens BEFORE the (slow) cross validation instead of after it.
#
# Returns nil when commit is false, otherwise the output of `git tag`.
def run_test(commit=false, commit_message="evaluating model", tag_name='', k=10)
  if commit && tag_name.to_s.strip.empty?
    raise "You must supply a tag name if you want to commit and tag this test"
  end

  cross_validate(k)
  accuracy = analyze(k)
  #time = benchmark
  #`echo "Average time per parse:,#{time}\n" >> #{analysis_file}`

  return unless commit

  str = "git add #{analysis_file} #{output_file}"
  puts "Adding test files to index \n#{str}"
  `#{str}`

  # Close the quote, emit an escaped apostrophe, reopen the quote, so that a
  # message containing ' cannot terminate the shell string early.
  quoted_message = commit_message.gsub("'") { "'\\''" }
  str = "git commit --message '#{quoted_message}' #{analysis_file} #{output_file}"
  puts "Committing files to source control \n#{str}"
  `#{str}`

  str = "git tag #{TAG}_#{tag_name}_#{accuracy}"
  puts "Tagging: \n#{str}"
  `#{str}`
end
|
131
|
+
|
132
|
+
# Deletes the scratch files a run leaves in DIR: the combined training and
# testing data/refs files, the model, and every per-fold file whose name
# starts with DATA_PREFIX or REFS_PREFIX and ends in "txt".
#
# Done with Ruby file calls instead of a shell `rm -f`, so paths containing
# spaces or shell metacharacters are handled correctly. Like `rm -f`, missing
# files are ignored; any other failure is reported on stderr and skipped.
#
# Returns nil.
def cleanup
  fixed = [TRAINING_DATA, TESTING_DATA, TRAINING_REFS, TESTING_REFS, MODEL_FILE]
  per_fold = Dir.entries(DIR).select { |name|
    name.end_with?('txt') && name.start_with?(DATA_PREFIX, REFS_PREFIX)
  }.map { |name| "#{DIR}/#{name}" }

  (fixed + per_fold).each do |path|
    begin
      File.delete(path)
    rescue Errno::ENOENT
      # already gone -- nothing to do
    rescue SystemCallError => e
      warn "cleanup: could not remove #{path}: #{e.message}"
    end
  end
  nil
end
|
136
|
+
|
137
|
+
# Runs k-fold cross validation over the tagged corpus.
#
# The corpus is split into k folds by #generate_data. For each fold i the
# data files of all other folds are concatenated into TRAINING_DATA, a model
# is trained (#train), fold i's data file is copied to TESTING_DATA and
# tagged (#test). #test appends to #output_file, which is emptied up front.
#
# Files are assembled with Ruby I/O instead of shelling out to
# rm/touch/cat, so paths with spaces work and a missing fold file raises
# Errno::ENOENT instead of being silently skipped.
#
# k - number of folds (default 10).
#
# Returns k.
def cross_validate(k=10)
  generate_data(k)

  # clear the output file
  File.open(output_file, 'w') { }

  k.times do |i|
    puts "Performing #{i+1}th iteration of #{k}-fold cross validation"

    # training data = every fold except the held-out one
    File.open(TRAINING_DATA, 'wb') do |training|
      k.times do |j|
        next if j == i
        IO.copy_stream("#{DIR}/#{DATA_PREFIX}#{j}.txt", training)
      end
    end
    puts "Training model"
    train

    # testing data = the held-out fold (copy_stream truncates the target)
    IO.copy_stream("#{DIR}/#{DATA_PREFIX}#{i}.txt", TESTING_DATA)
    puts "Testing model"
    test
  end
end
|
157
|
+
|
158
|
+
# Randomly partitions the tagged references into k folds, writing one refs file and one CRF training-data file per fold
|
159
|
+
# Splits the tagged corpus into k random folds and prepares their files.
#
# Every line of #tagged_references is stripped and assigned to a uniformly
# random fold. For fold i the lines are written (newline-joined, no trailing
# newline) to "<DIR>/<REFS_PREFIX>i.txt", and @crf.write_training_file is
# asked to turn that file into "<DIR>/<DATA_PREFIX>i.txt".
#
# Files are opened in block form so handles are closed even if a write fails.
#
# k - number of folds (default 10).
#
# Returns the Array of k folds (each an Array of reference strings).
def generate_data(k=10)
  folds = Array.new(k) { [] }
  File.foreach(tagged_references) { |line| folds[rand(k)] << line.strip }

  folds.each_with_index do |refs, i|
    refs_path = "#{DIR}/#{REFS_PREFIX}#{i}.txt"
    File.open(refs_path, 'w') { |f| f.write(refs.join("\n")) }
    @crf.write_training_file(refs_path, "#{DIR}/#{DATA_PREFIX}#{i}.txt")
  end
end
|
178
|
+
|
179
|
+
# Asks the CRF parser to train, handing it the combined refs file, the model
# path, the mode's feature template and the combined training-data file.
def train
  refs, model, data = TRAINING_REFS, MODEL_FILE, TRAINING_DATA
  @crf.train(refs, model, template_file, data)
end
|
182
|
+
|
183
|
+
# Tags TESTING_DATA with the model in MODEL_FILE via the external `crf_test`
# program, appending its output to #output_file. The command line is echoed
# before it runs.
def test
  command = "crf_test -m #{MODEL_FILE} #{TESTING_DATA} >> #{output_file}"
  puts(command)
  %x{#{command}}
end
|
188
|
+
|
189
|
+
# Scores the tagger output in #output_file and writes a CSV report to
# #analysis_file.
#
# The output is read as whitespace-separated token lines whose last field is
# the predicted label and whose second-to-last field is the true label;
# a blank line ends a reference. The report holds a header (branch, version,
# timestamp, k, corpus size), the confusion matrix as counts and as row
# fractions, per-label precision/recall/F-measure, mean and standard
# deviation of per-reference accuracy, the number of perfectly tagged
# references, and overall token accuracy.
#
# Changes from the previous version:
# * corpus size is counted in Ruby (newline count, i.e. what the first
#   field of `wc` reports) instead of shelling out with an unquoted path;
# * all files are opened in block form so handles are always closed;
# * blank-line detection no longer needs ActiveSupport's String#blank?;
# * a final reference not followed by a blank line is no longer dropped.
#
# k - fold count, only echoed into the report.
#
# Returns the overall token accuracy as a Float (NaN when nothing was scored).
def analyze(k)
  # get the size of the corpus
  corpus_size = File.read(tagged_references).count("\n")

  # go through all training/testing data to get complete list of output tags
  labels = {}
  [TRAINING_DATA, TESTING_DATA].each do |fn|
    File.foreach(fn) do |line|
      fields = line.split
      labels[fields.last] = true unless fields.empty?
    end
  end
  labels = labels.keys.sort

  # one confusion matrix (truth => predicted => count) per reference
  references = []
  ref = new_hash(labels)
  pending = false # true while ref holds tokens not yet pushed
  File.foreach(output_file) do |line|
    fields = line.split
    if fields.empty?
      references << ref
      ref = new_hash(labels)
      pending = false
      next
    end
    predicted = fields[-1]
    truth = fields[-2]
    ref[truth][predicted] += 1
    pending = true
  end
  references << ref if pending

  # aggregate results in total hash
  total = {}
  labels.each do |trl|
    total[trl] = {}
    labels.each { |tel| total[trl][tel] = references.map { |r| r[trl][tel] }.sum }
  end

  accuracy = nil
  File.open(analysis_file, 'w') do |f|
    f.write "Results for model\n branch: #{branch}\n version: #{version}\n"
    f.write "Test run on:,#{Time.now}\n"
    f.write "K-fold x-validation:,#{k}\n"
    f.write "Corpus size:,#{corpus_size}\n\n"

    # print a confusion matrix
    f.write 'truth\test,'
    f.write labels.join(',')
    f.write "\n"
    # first, by counts
    labels.each do |trl|
      f.write "#{trl},"
      f.write labels.map { |tel| total[trl][tel] }.join(',')
      f.write "\n"
    end
    # then as a fraction of each truth row
    labels.each do |trl|
      row_total = total[trl].values.sum.to_f
      f.write "#{trl},"
      f.write labels.map { |tel| total[trl][tel] / row_total }.join(',')
      f.write "\n"
    end

    # precision and recall by label
    f.write "\n"
    f.write "Label,Precision,Recall,F-measure\n"
    labels.each do |trl|
      hits = total[trl][trl].to_f
      precision = hits / labels.map { |l| total[l][trl] }.sum # column sum
      recall = hits / total[trl].values.sum                   # row sum
      f_measure = (2 * precision * recall) / (precision + recall)
      f.write "#{trl},#{precision},#{recall},#{f_measure}\n"
    end

    # get the average accuracy-per-reference
    perfect = 0
    avgs = references.map do |r|
      n = labels.map { |label| r[label][label] }.sum
      d = labels.map { |lab| r[lab].values.sum }.sum
      perfect += 1 if n == d
      n.to_f / d
    end
    f.write "\nAverage accuracy by reference:,#{avgs.mean}\n"
    f.write "STD of Average accuracy by reference:,#{avgs.stddev}\n"

    # number of perfectly parsed references
    f.write "Perfect parses:,#{perfect},#{perfect.to_f/references.length}\n"

    # Total accuracy
    n = labels.map { |lab| total[lab][lab] }.sum
    d = labels.map { |lab1| labels.map { |lab2| total[lab1][lab2] }.sum }.sum
    accuracy = n / d.to_f
    f.write "Accuracy:, #{accuracy}\n"
  end

  accuracy
end
|
292
|
+
|
293
|
+
private
|
294
|
+
# Builds an empty confusion matrix: a Hash keyed by each label whose value
# is a fresh Hash mapping every label to 0 (rows are independent objects).
def new_hash(labels)
  labels.each_with_object({}) do |row_label, matrix|
    matrix[row_label] = labels.each_with_object({}) { |col_label, row| row[col_label] = 0 }
  end
end
|
304
|
+
end
|
305
|
+
|
306
|
+
end
|