excite 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
@@ -0,0 +1,306 @@
1
+ require 'excite/crfparser'
2
+
3
# Filesystem locations and naming conventions used by the model test harness.
# Every value is frozen so these shared constants cannot be mutated in place.

# Directory containing this script; scratch files and reports are written here.
DIR = File.dirname(__FILE__).freeze
# Repository root, two levels above this script.
ROOT_DIR = "#{DIR}/../..".freeze
RESOURCES_DIR = "#{ROOT_DIR}/lib/excite/resources".freeze

# Tagged reference corpora (plain-string and HTML variants).
TAGGED_REFERENCES = "#{RESOURCES_DIR}/trainingdata/tagged_references.txt".freeze
TAGGED_HTML_REFERENCES = "#{RESOURCES_DIR}/trainingdata/tagged_html_references.txt".freeze

# Scratch files produced while cross validating.
TRAINING_DATA = "#{DIR}/training_data.txt".freeze
TESTING_DATA = "#{DIR}/testing_data.txt".freeze
TRAINING_REFS = "#{DIR}/training_refs.txt".freeze
TESTING_REFS = "#{DIR}/testing_refs.txt".freeze
MODEL_FILE = "#{DIR}/model".freeze

# Template files, one per mode.
TEMPLATE_FILE = "#{RESOURCES_DIR}/parsCit.template".freeze
HTML_TEMPLATE_FILE = "#{RESOURCES_DIR}/html.template".freeze

# Result files, one pair per mode.
OUTPUT_FILE = "#{DIR}/output.txt".freeze
HTML_OUTPUT_FILE = "#{DIR}/html-output.txt".freeze
ANALYSIS_FILE = "#{DIR}/analysis.csv".freeze
HTML_ANALYSIS_FILE = "#{DIR}/html-analysis.csv".freeze

# File name prefixes of the per-fold reference and feature files.
REFS_PREFIX = "training_refs_".freeze
DATA_PREFIX = "training_data_".freeze

# Prefix of the git tags created for committed test runs.
TAG = "model_test".freeze
22
+
23
+ require "#{ROOT_DIR}/model/test/array_helpers"
24
+
25
# Mix the project's ArrayHelpers module into the core Array class.
Array.send(:include, ArrayHelpers)
28
+
29
+ module Excite
30
+
31
+ class ModelTest
32
+
33
# Path of the CSV analysis report for the current mode: the HTML report
# when @mode is :html, the plain one otherwise.
def analysis_file
  @mode == :html ? HTML_ANALYSIS_FILE : ANALYSIS_FILE
end
40
+
41
# Path of the file that collects crf_test output for the current mode:
# the HTML output file when @mode is :html, the plain one otherwise.
def output_file
  return HTML_OUTPUT_FILE if @mode == :html

  OUTPUT_FILE
end
48
+
49
# Path of the template file for the current mode: the HTML template when
# @mode is :html, the parsCit template otherwise.
def template_file
  case @mode
  when :html then HTML_TEMPLATE_FILE
  else TEMPLATE_FILE
  end
end
56
+
57
# Path of the tagged reference corpus for the current mode: the HTML
# corpus when @mode is :html, the plain one otherwise.
def tagged_references
  html_mode = (@mode == :html)
  html_mode ? TAGGED_HTML_REFERENCES : TAGGED_REFERENCES
end
64
+
65
# Builds a test harness for the given mode.
#
# mode - passed through to CRFParser.new and remembered in @mode, where
#        :html selects the HTML variants of the template, corpus, output
#        and analysis files (default :string).
def initialize(mode = :string)
  @mode = mode
  @crf = CRFParser.new(mode)
end
69
+
70
# First line of `git show --pretty=oneline HEAD`, run from ROOT_DIR and
# stripped of surrounding whitespace. The value is computed once and cached
# in @version.
def version
  return @version if @version

  head_line = `cd #{ROOT_DIR}; git show --pretty=oneline HEAD | head -1`
  @version = head_line.strip
end
73
+
74
# Name of the branch that `git branch` (run from ROOT_DIR) marks with an
# asterisk. A successful lookup is cached in @branch; when nothing matches
# the result is nil and the lookup is repeated on the next call.
def branch
  return @branch unless @branch.nil?

  listing = `cd #{ROOT_DIR}; git branch`
  @branch = listing[/\*\s+(\S+)/, 1]
end
82
+
83
# Checks out every local branch in turn and lists the tags whose names
# start with TAG on each one.
#
# NOTE(review): the tag list is computed and then discarded, and the
# originally checked-out branch is not restored; this looks unfinished.
#
# Returns the array of branch names.
def aggregate_tags
  names = `git branch`.gsub(/\*/, '').strip.split(/\s+/)
  names.each do |name|
    `git checkout #{name}`
    `git tag -l #{TAG}\*`.strip.split(/\s+/)
  end
end
90
+
91
+ # def benchmark
92
+ # refs = []
93
+ # f = File.open(TRAINING_REFS, 'r')
94
+ # while line = f.gets
95
+ # refs << line.strip
96
+ # end
97
+ # # strip out tags
98
+ # refs.map! {|s| s.gsub(/<[^>]*>/, '')}
99
+ # # parse one string, since the lexicon is lazily evaluated
100
+ # Citation.create_from_string(refs.first)
101
+ # time = Benchmark.measure {
102
+ # refs.each {|ref| Citation.create_from_string(ref) }
103
+ # }
104
+ # return (time.real / refs.length.to_f)
105
+ # end
106
+
107
# Runs a k-fold cross validation, writes the analysis report and, when
# requested, commits and tags the result files in git.
#
# commit         - when true, add and commit analysis_file and output_file
#                  and tag the result (default false).
# commit_message - message used for that commit.
# tag_name       - middle part of the tag name; required when commit is true.
# k              - number of folds (default 10).
#
# Returns nil when commit is false, otherwise the output of `git tag`.
# Raises RuntimeError, before any work is done, when commit is requested
# without a tag name.
def run_test(commit=false, commit_message="evaluating model", tag_name='', k=10)
  # Validate up front: cross validation is expensive, so a missing tag name
  # must fail before the run rather than after it.
  if commit && tag_name.to_s.strip.empty?
    raise "You must supply a tag name if you want to commit and tag this test"
  end

  cross_validate(k)
  accuracy = analyze(k)
  #time = benchmark
  #`echo "Average time per parse:,#{time}\n" >> #{analysis_file}`

  return unless commit

  files = [analysis_file, output_file]
  # Run git with an argument vector instead of a shell string so that quotes
  # or spaces in the message, tag or paths cannot break the command.
  git = lambda { |*args| IO.popen(['git', *args], &:read) }

  puts "Adding test files to index \ngit add #{files.join(' ')}"
  git.call('add', *files)

  puts "Committing files to source control \ngit commit --message '#{commit_message}' #{files.join(' ')}"
  git.call('commit', '--message', commit_message, *files)

  tag = "#{TAG}_#{tag_name.to_s.strip}_#{accuracy}"
  puts "Tagging: \ngit tag #{tag}"
  git.call('tag', tag)
end
131
+
132
# Deletes the scratch files created by a test run: the combined
# training/testing data and reference files, the trained model, and every
# per-fold file in DIR whose name starts with DATA_PREFIX or REFS_PREFIX
# and ends with "txt".
#
# Files are removed directly rather than through a shell `rm`, so paths
# containing spaces are handled. Missing files are skipped silently.
#
# Returns the array of paths that were deleted.
def cleanup
  candidates = [TRAINING_DATA, TESTING_DATA, TRAINING_REFS, TESTING_REFS, MODEL_FILE]
  candidates += Dir.glob("#{DIR}/#{DATA_PREFIX}*txt")
  candidates += Dir.glob("#{DIR}/#{REFS_PREFIX}*txt")

  candidates.select do |path|
    # Only regular files and symlinks are removed; absent paths are ignored.
    next false unless File.file?(path) || File.symlink?(path)

    File.delete(path)
    true
  end
end
136
+
137
# Performs k-fold cross validation.
#
# After generate_data has produced the k per-fold data files, each
# iteration concatenates every fold except the i-th into TRAINING_DATA,
# trains a model, copies the i-th fold to TESTING_DATA and tests the model.
# The output file is emptied first.
#
# Files are copied in-process rather than through `rm`/`touch`/`cat`, so
# paths containing spaces work and a missing fold file raises
# Errno::ENOENT instead of being skipped silently.
#
# k - number of folds (default 10).
#
# Returns k.
def cross_validate(k=10)
  generate_data(k)
  # clear the output file
  File.open(output_file, 'w') {}
  k.times do |i|
    puts "Performing #{i+1}th iteration of #{k}-fold cross validation"
    # Training data is every fold except the held-out one.
    File.open(TRAINING_DATA, 'wb') do |training|
      k.times do |j|
        next if j == i
        IO.copy_stream("#{DIR}/#{DATA_PREFIX}#{j}.txt", training)
      end
    end
    puts "Training model"
    train
    # The held-out fold becomes the testing data, replacing any previous one.
    IO.copy_stream("#{DIR}/#{DATA_PREFIX}#{i}.txt", TESTING_DATA)
    puts "Testing model"
    test
  end
end
157
+
158
# Randomly partitions the tagged references into k folds and writes the
# files for each fold.
#
# Every line of the tagged reference file is stripped and assigned to a
# uniformly random fold. Fold i is written, newline-joined, to
# "#{DIR}/#{REFS_PREFIX}#{i}.txt", and that file is then handed to
# @crf.write_training_file together with the target path
# "#{DIR}/#{DATA_PREFIX}#{i}.txt".
#
# k - number of folds (default 10).
#
# Returns the array of folds, each an array of reference lines.
def generate_data(k=10)
  folds = Array.new(k) { [] }
  File.foreach(tagged_references) do |line|
    folds[rand(k)] << line.strip
  end

  folds.each_with_index do |refs, i|
    refs_path = "#{DIR}/#{REFS_PREFIX}#{i}.txt"
    # The block form closes the file before the parser reads it back.
    File.open(refs_path, 'w') { |f| f.write(refs.join("\n")) }
    @crf.write_training_file(refs_path, "#{DIR}/#{DATA_PREFIX}#{i}.txt")
  end
end
178
+
179
# Delegates to @crf.train, passing the training references path, the model
# path, the template for the current mode and the training data path.
def train
  template = template_file
  @crf.train(TRAINING_REFS, MODEL_FILE, template, TRAINING_DATA)
end
182
+
183
# Runs the external crf_test command with MODEL_FILE on TESTING_DATA and
# appends its output to the output file. The command line is echoed first.
#
# Returns whatever the command printed to standard output (redirected to
# the output file, so normally an empty string).
def test
  command = "crf_test -m #{MODEL_FILE} #{TESTING_DATA} >> #{output_file}"
  puts command
  %x(#{command})
end
188
+
189
# Scores the accumulated crf_test output and writes a CSV report to
# analysis_file.
#
# The output file is read as blank-line-separated references, one token per
# line, where the last field is the predicted label and the second to last
# is the true label. The report contains run metadata, a confusion matrix
# (counts, then row fractions), per-label precision/recall/F-measure,
# per-reference accuracy statistics, the number of perfectly labelled
# references and the overall token accuracy.
#
# A final reference that is not followed by a blank line is still counted,
# and runs of blank lines do not create empty references.
#
# k - number of folds; only recorded in the report.
#
# Returns the overall token accuracy as a Float (NaN when there are no
# tokens).
def analyze(k)
  # get the size of the corpus (one tagged reference per line)
  corpus_size = File.foreach(tagged_references).count

  # go through all training/testing data to get complete list of output tags;
  # the tag is the last whitespace-separated field of every non-blank line
  seen = {}
  [TRAINING_DATA, TESTING_DATA].each do |path|
    File.foreach(path) do |line|
      label = line.split.last
      seen[label] = true if label
    end
  end
  labels = seen.keys.sort

  # for each reference, populate a confusion matrix hash
  # (true label => predicted label => count)
  references = []
  ref = new_hash(labels)
  ref_has_tokens = false
  File.foreach(output_file) do |line|
    fields = line.split
    if fields.empty?
      # A blank line closes the current reference, if it has any tokens.
      if ref_has_tokens
        references << ref
        ref = new_hash(labels)
        ref_has_tokens = false
      end
      next
    end
    predicted = fields[-1]
    truth = fields[-2]
    ref[truth][predicted] += 1
    ref_has_tokens = true
  end
  # Keep the last reference when the file does not end with a blank line.
  references << ref if ref_has_tokens

  # aggregate results in total hash
  total = {}
  labels.each do |truth|
    total[truth] = {}
    labels.each do |predicted|
      total[truth][predicted] = references.map { |r| r[truth][predicted] }.sum
    end
  end

  # get the average accuracy-per-reference and count the perfect parses
  perfect = 0
  avgs = references.map do |r|
    hits = labels.map { |label| r[label][label] }.sum
    tokens = labels.map { |label| r[label].values.sum }.sum
    perfect += 1 if hits == tokens
    hits.to_f / tokens
  end

  # total accuracy over all tokens
  correct = labels.map { |label| total[label][label] }.sum
  overall = labels.map { |truth| labels.map { |predicted| total[truth][predicted] }.sum }.sum
  accuracy = correct / overall.to_f

  # print results to a file
  File.open(analysis_file, 'w') do |f|
    f.write "Results for model\n branch: #{branch}\n version: #{version}\n"
    f.write "Test run on:,#{Time.now}\n"
    f.write "K-fold x-validation:,#{k}\n"
    f.write "Corpus size:,#{corpus_size}\n\n"

    # print a confusion matrix
    f.write 'truth\test,'
    f.write labels.join(',')
    f.write "\n"
    # first, by counts
    labels.each do |truth|
      f.write "#{truth},"
      f.write(labels.map { |predicted| total[truth][predicted] }.join(','))
      f.write "\n"
    end
    # then by fraction of each row
    labels.each do |truth|
      row_total = total[truth].values.sum.to_f
      f.write "#{truth},"
      f.write(labels.map { |predicted| total[truth][predicted] / row_total }.join(','))
      f.write "\n"
    end

    # precision and recall by label
    f.write "\n"
    f.write "Label,Precision,Recall,F-measure\n"
    labels.each do |label|
      precision = total[label][label].to_f / labels.map { |other| total[other][label] }.sum
      recall = total[label][label].to_f / total[label].values.sum
      f_measure = (2 * precision * recall) / (precision + recall)
      f.write "#{label},#{precision},#{recall},#{f_measure}\n"
    end

    f.write "\nAverage accuracy by reference:,#{avgs.mean}\n"
    f.write "STD of Average accuracy by reference:,#{avgs.stddev}\n"

    # number of perfectly parsed references
    f.write "Perfect parses:,#{perfect},#{perfect.to_f/references.length}\n"

    f.write "Accuracy:, #{accuracy}\n"
  end

  accuracy
end
292
+
293
+ private
294
# Builds an empty confusion matrix for the given labels.
#
# labels - array of label names.
#
# Returns a hash mapping every label to its own hash, which in turn maps
# every label to the count 0.
def new_hash(labels)
  labels.each_with_object({}) do |truth, matrix|
    # Each row gets a fresh hash so counts are never shared between rows.
    matrix[truth] = labels.each_with_object({}) { |predicted, row| row[predicted] = 0 }
  end
end
304
+ end
305
+
306
+ end