excite 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
@@ -0,0 +1,306 @@
1
+ require 'excite/crfparser'
2
+
3
# Locations of every file the model test reads or writes.
#
# DIR and ROOT_DIR are expanded to absolute paths so that the `require` of
# array_helpers below never falls back to a $LOAD_PATH search (which happens
# when __FILE__ is a bare relative path such as "model/test/model_test.rb"),
# and so that the paths stay valid whatever the current working directory is.
DIR = File.expand_path(File.dirname(__FILE__))
ROOT_DIR = File.expand_path("../..", DIR)
RESOURCES_DIR = "#{ROOT_DIR}/lib/excite/resources"

# Tagged corpora, one per parser mode.
TAGGED_REFERENCES = "#{RESOURCES_DIR}/trainingdata/tagged_references.txt"
TAGGED_HTML_REFERENCES = "#{RESOURCES_DIR}/trainingdata/tagged_html_references.txt"

# Scratch files produced while cross-validating (removed by ModelTest#cleanup).
TRAINING_DATA = "#{DIR}/training_data.txt"
TESTING_DATA = "#{DIR}/testing_data.txt"
TRAINING_REFS = "#{DIR}/training_refs.txt"
TESTING_REFS = "#{DIR}/testing_refs.txt"
MODEL_FILE = "#{DIR}/model"

# Feature templates, one per parser mode.
TEMPLATE_FILE = "#{RESOURCES_DIR}/parsCit.template"
HTML_TEMPLATE_FILE = "#{RESOURCES_DIR}/html.template"

# Tagger output and CSV reports, one pair per parser mode.
OUTPUT_FILE = "#{DIR}/output.txt"
HTML_OUTPUT_FILE = "#{DIR}/html-output.txt"
ANALYSIS_FILE = "#{DIR}/analysis.csv"
HTML_ANALYSIS_FILE = "#{DIR}/html-analysis.csv"

# File-name prefixes of the per-fold reference and feature files.
REFS_PREFIX = "training_refs_"
DATA_PREFIX = "training_data_"

# Prefix of the git tags created by ModelTest#run_test.
TAG = "model_test"
22
+
23
+ require "#{ROOT_DIR}/model/test/array_helpers"
24
+
25
# Mix the ArrayHelpers methods into every Array.
# `send` is used because Module#include is private on older Rubies.
Array.send(:include, ArrayHelpers)
28
+
29
+ module Excite
30
+
31
+ class ModelTest
32
+
33
# CSV report path for the current mode: the HTML report when the mode is
# :html, the plain report for every other mode.
def analysis_file
  @mode == :html ? HTML_ANALYSIS_FILE : ANALYSIS_FILE
end
40
+
41
# Tagger output path for the current mode: the HTML output file when the mode
# is :html, the plain output file for every other mode.
def output_file
  @mode == :html ? HTML_OUTPUT_FILE : OUTPUT_FILE
end
48
+
49
# Feature template path for the current mode: the HTML template when the mode
# is :html, the parsCit template for every other mode.
def template_file
  @mode == :html ? HTML_TEMPLATE_FILE : TEMPLATE_FILE
end
56
+
57
# Tagged corpus path for the current mode: the HTML corpus when the mode is
# :html, the plain-text corpus for every other mode.
def tagged_references
  @mode == :html ? TAGGED_HTML_REFERENCES : TAGGED_REFERENCES
end
64
+
65
# mode:: parser mode, :string by default; :html switches every file selector
#        (corpus, template, output, report) to its HTML variant. The same mode
#        is handed to the CRFParser this test drives.
def initialize(mode = :string)
  @mode = mode
  @crf = CRFParser.new(@mode)
end
69
+
70
# First line of `git show --pretty=oneline HEAD` run from ROOT_DIR, stripped.
# The value is memoised, so git is consulted at most once per instance.
def version
  return @version if @version

  @version = `cd #{ROOT_DIR}; git show --pretty=oneline HEAD | head -1`.strip
end
73
+
74
# Name of the branch that `git branch` (run from ROOT_DIR) marks with "*".
# Memoised once a name has been found; nil when no line carries the marker.
def branch
  @branch ||= begin
    listing = `cd #{ROOT_DIR}; git branch`
    listing[/\*\s+(\S+)/, 1]
  end
end
82
+
83
# Checks out every local branch in turn and lists the tags starting with TAG.
#
# NOTE(review): the tag list is computed and then discarded, the original
# branch is never checked out again, and git runs in the current working
# directory rather than ROOT_DIR -- this looks unfinished; confirm intent
# before relying on it.
#
# Returns the array of branch names.
def aggregate_tags
  branch_names = `git branch`.delete('*').strip.split(/\s+/)
  branch_names.each do |name|
    `git checkout #{name}`
    `git tag -l #{TAG}\*`.strip.split(/\s+/)
  end
end
90
+
91
+ # def benchmark
92
+ # refs = []
93
+ # f = File.open(TRAINING_REFS, 'r')
94
+ # while line = f.gets
95
+ # refs << line.strip
96
+ # end
97
+ # # strip out tags
98
+ # refs.map! {|s| s.gsub(/<[^>]*>/, '')}
99
+ # # parse one string, since the lexicon is lazily evaluated
100
+ # Citation.create_from_string(refs.first)
101
+ # time = Benchmark.measure {
102
+ # refs.each {|ref| Citation.create_from_string(ref) }
103
+ # }
104
+ # return (time.real / refs.length.to_f)
105
+ # end
106
+
107
# Runs the full evaluation: k-fold cross validation followed by the analysis
# report, optionally committing and tagging the results in git.
#
# commit::         when true, add + commit the report and output files and tag
#                  the commit as "<TAG>_<tag_name>_<accuracy>"
# commit_message:: message used for the commit
# tag_name::       middle part of the tag; required when +commit+ is true
# k::              number of cross-validation folds
#
# Raises RuntimeError when +commit+ is requested without a tag name. The check
# happens before any training so a missing tag does not waste a full run.
#
# Returns the overall accuracy reported by #analyze.
def run_test(commit=false, commit_message="evaluating model", tag_name='', k=10)
  if commit && tag_name.to_s.strip.empty?
    raise "You must supply a tag name if you want to commit and tag this test"
  end

  cross_validate(k)
  accuracy = analyze(k)

  if commit
    files = [analysis_file, output_file]

    # git is invoked with an argument vector (no shell), so quotes or other
    # shell metacharacters in the message, tag or paths are passed verbatim.
    puts "Adding test files to index \ngit add #{files.join(' ')}"
    system("git", "add", *files)

    puts "Committing files to source control \n" \
         "git commit --message '#{commit_message}' #{files.join(' ')}"
    system("git", "commit", "--message", commit_message, *files)

    tag = "#{TAG}_#{tag_name}_#{accuracy}"
    puts "Tagging: \ngit tag #{tag}"
    system("git", "tag", tag)
  end

  accuracy
end
131
+
132
# Deletes the scratch files of a run: the training/testing data and refs, the
# model, and every per-fold "<DATA_PREFIX>*txt" / "<REFS_PREFIX>*txt" file in
# DIR. Missing files are ignored (rm -f).
def cleanup
  fixed_targets = [TRAINING_DATA, TESTING_DATA, TRAINING_REFS, TESTING_REFS, MODEL_FILE].join(" ")
  `rm -f #{fixed_targets} #{DIR}/#{DATA_PREFIX}*txt #{DIR}/#{REFS_PREFIX}*txt`
end
136
+
137
# Performs k-fold cross validation.
#
# After #generate_data has written the per-fold feature files, each fold in
# turn is held out: the remaining folds are concatenated into TRAINING_DATA,
# a model is trained, the held-out fold is copied to TESTING_DATA and tagged.
# The tagger output of all folds accumulates in +output_file+, which is
# emptied first.
#
# k:: number of folds
def cross_validate(k=10)
  generate_data(k)

  # Opening for writing truncates whatever a previous run left behind.
  File.open(output_file, 'w').close

  k.times do |held_out|
    puts "Performing #{held_out+1}th iteration of #{k}-fold cross validation"

    # Rebuild the training file from every fold except the held-out one.
    `rm #{TRAINING_DATA}; touch #{TRAINING_DATA};`
    k.times do |fold|
      `cat #{DIR}/#{DATA_PREFIX}#{fold}.txt >> #{TRAINING_DATA}` unless fold == held_out
    end

    puts "Training model"
    train

    `cat #{DIR}/#{DATA_PREFIX}#{held_out}.txt > #{TESTING_DATA}`
    puts "Testing model"
    test
  end
end
157
+
158
# Randomly distributes the tagged references over k folds and writes two files
# per fold into DIR: "<REFS_PREFIX><i>.txt" with the raw tagged references
# (one per line, stripped) and "<DATA_PREFIX><i>.txt", which the CRF parser
# derives from it via write_training_file.
#
# k:: number of folds
#
# Returns the array of folds (each an array of reference strings).
def generate_data(k=10)
  folds = Array.new(k) { [] }

  # Block-form I/O closes the handles even if something raises midway.
  File.foreach(tagged_references) do |line|
    # The modulo keeps the index in range even if rand * k rounds up to k.
    folds[((rand * k) % k).floor] << line.strip
  end

  folds.each_with_index do |refs, i|
    refs_path = "#{DIR}/#{REFS_PREFIX}#{i}.txt"
    File.open(refs_path, 'w') { |f| f.write(refs.join("\n")) }
    @crf.write_training_file(refs_path, "#{DIR}/#{DATA_PREFIX}#{i}.txt")
  end
end
178
+
179
# Asks the CRF parser to train a model, passing the training refs path, the
# model path, the mode's template and the training data path, in that order.
def train
  arguments = [TRAINING_REFS, MODEL_FILE, template_file, TRAINING_DATA]
  @crf.train(*arguments)
end
182
+
183
# Tags TESTING_DATA with the trained model by shelling out to `crf_test` and
# appends the result to +output_file+. The command line is echoed first.
def test
  command = ["crf_test", "-m", MODEL_FILE, TESTING_DATA, ">>", output_file].join(" ")
  puts command
  `#{command}`
end
188
+
189
# Turns the accumulated tagger output into a CSV report and returns the
# overall accuracy.
#
# Each non-blank line of +output_file+ is expected to end with the true label
# followed by the predicted label; blank lines separate references. The report
# written to +analysis_file+ contains a header (branch, version, time, k,
# corpus size), the confusion matrix by count and by row fraction, per-label
# precision / recall / F-measure, the mean and standard deviation of the
# per-reference accuracy, the number of perfectly tagged references and the
# overall accuracy.
#
# k:: number of folds used; only echoed into the report header
#
# Returns the overall token accuracy as a Float (NaN when nothing was tagged).
def analyze(k)
  # `wc` prints the line count first.
  corpus_size = `wc #{tagged_references}`.split.first

  # Every distinct value in the last column of the training/testing data is
  # an output label.
  label_set = {}
  [TRAINING_DATA, TESTING_DATA].each do |path|
    File.foreach(path) do |line|
      row = line.strip
      next if row.blank?
      label_set[row.split.last] = true
    end
  end
  labels = label_set.keys.sort

  # One confusion matrix (truth => predicted => count) per reference.
  references = []
  current = new_hash(labels)
  tokens = 0
  File.foreach(output_file) do |line|
    row = line.strip
    if row.blank?
      # Only flush references that actually contain tokens, so repeated blank
      # lines cannot create empty references (which would count as "perfect"
      # and contribute NaN to the averages).
      if tokens > 0
        references << current
        current = new_hash(labels)
        tokens = 0
      end
      next
    end
    columns = row.split
    predicted = columns[-1]
    truth = columns[-2]
    current[truth][predicted] += 1
    tokens += 1
  end
  # Keep the last reference even when the file does not end with a blank line.
  references << current if tokens > 0

  # Aggregate the per-reference matrices into one corpus-wide matrix.
  total = {}
  labels.each do |truth|
    total[truth] = {}
    labels.each do |predicted|
      total[truth][predicted] = references.map { |r| r[truth][predicted] }.sum
    end
  end

  correct = labels.map { |label| total[label][label] }.sum
  tagged = labels.map { |truth| labels.map { |predicted| total[truth][predicted] }.sum }.sum
  accuracy = correct / tagged.to_f

  File.open(analysis_file, 'w') do |f|
    f.write "Results for model\n branch: #{branch}\n version: #{version}\n"
    f.write "Test run on:,#{Time.now}\n"
    f.write "K-fold x-validation:,#{k}\n"
    f.write "Corpus size:,#{corpus_size}\n\n"

    # Confusion matrix: rows are true labels, columns are predicted labels.
    f.write 'truth\test,'
    f.write labels.join(',')
    f.write "\n"
    # First by counts ...
    labels.each do |truth|
      f.write "#{truth},"
      f.write(labels.map { |predicted| total[truth][predicted] }.join(','))
      f.write "\n"
    end
    # ... then as a fraction of each row.
    labels.each do |truth|
      row_total = total[truth].values.sum.to_f
      f.write "#{truth},"
      f.write(labels.map { |predicted| total[truth][predicted] / row_total }.join(','))
      f.write "\n"
    end

    # Precision and recall by label.
    f.write "\n"
    f.write "Label,Precision,Recall,F-measure\n"
    labels.each do |label|
      precision = total[label][label].to_f / labels.map { |truth| total[truth][label] }.sum
      recall = total[label][label].to_f / total[label].values.sum
      f_measure = (2 * precision * recall) / (precision + recall)
      f.write "#{label},#{precision},#{recall},#{f_measure}\n"
    end

    # Accuracy of each reference on its own, plus the count of flawless ones.
    perfect = 0
    per_reference = references.map do |matrix|
      hits = labels.map { |label| matrix[label][label] }.sum
      size = labels.map { |label| matrix[label].values.sum }.sum
      perfect += 1 if hits == size
      hits.to_f / size
    end
    f.write "\nAverage accuracy by reference:,#{per_reference.mean}\n"
    f.write "STD of Average accuracy by reference:,#{per_reference.stddev}\n"

    f.write "Perfect parses:,#{perfect},#{perfect.to_f/references.length}\n"

    f.write "Accuracy:, #{accuracy}\n"
  end

  accuracy
end
292
+
293
+ private
294
# Builds an empty confusion matrix: a hash keyed by label whose values are
# independent hashes, again keyed by label, with every count set to 0.
def new_hash(labels)
  labels.each_with_object({}) do |row_label, matrix|
    matrix[row_label] = labels.each_with_object({}) do |column_label, row|
      row[column_label] = 0
    end
  end
end
304
+ end
305
+
306
+ end