excite 2.1.1

Files changed (44)
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
data/lib/excite/resources/trainingdata/verify.rb ADDED
@@ -0,0 +1,97 @@
+ # encoding: UTF-8
+ #
+ # Script to assist in verifying tagged references
+
+ f = ARGV[0]
+
+ cleaned = IO.readlines(f).map(&:strip)
+
+ tags = []
+ tag_contents = {}
+
+ annotation_tags = %w{ author title date booktitle journal volume pages editor workid link publisher location institution bullet tech note }.map { |t| "<#{t}>" }
+
+ cleaned.each_with_index do |line, index|
+   open_tags = line.scan(/<\s*\w+\s*>/).map(&:downcase)
+   for tag in open_tags
+     i = line.downcase.index(tag)
+     j = line.downcase.index(tag.sub('<','</'))
+     if tag != "<br>" && (j.nil? || j <= i)
+       puts "Missing close tag for #{tag} on line #{index+1}: #{line}"
+     end
+   end
+
+   close_tags = line.scan(/<\/\s*\w+\s*>/).map(&:downcase)
+   for tag in close_tags
+     i = line.downcase.index(tag)
+     j = line.downcase.index(tag.sub(/<\/\s*/, ''))
+     if j.nil? || j >= i
+       puts "Missing open tag for #{tag} on line #{index+1}: #{line}"
+     end
+   end
+
+   tags += open_tags
+
+   toks = line.split(/(\s+)|(?=<)|(?<=>)/)
+
+   start_tag = nil
+   tag_content = ''
+
+   for tok in toks
+     if annotation_tags.include?(tok)
+       if !start_tag.nil?
+         puts "Started #{tok} within #{start_tag} on line #{index+1}: #{line}"
+         start_tag = nil
+       else
+         start_tag = tok
+       end
+     elsif annotation_tags.include?(tok.sub(/<\/\s*/, '<'))
+       if start_tag.nil?
+         puts "End tag #{tok} without a corresponding start tag on line #{index+1}: #{line}"
+       elsif start_tag != tok.sub(/<\/s*/, '<')
+         puts "End tag #{tok} doesn't match start tag #{start_tag} on line #{index+1}: #{line}"
+         start_tag = nil
+       else
+         tag_contents[start_tag] ||= []
+         tag_contents[start_tag] << tag_content.strip
+
+         tag_content = ''
+         start_tag = nil
+       end
+     elsif start_tag.nil?
+       puts "Token '#{tok}' is not tagged in line #{index+1}: #{line}" unless tok.strip.empty?
+     else
+       tag_content += tok
+     end
+   end
+
+   for tag in annotation_tags
+     if open_tags.count(tag) > 1
+       puts "(Might be ok but...) More than one #{tag} in line #{index+1}: #{line}"
+     end
+   end
+
+   for tag in open_tags
+     if open_tags.count(tag) != close_tags.count(tag.sub('<','</'))
+       puts "Unequal numbers of open and close tags for #{tag} in line #{index+1}: #{line}" unless tag.match(/<\/?br>/)
+     end
+   end
+
+   if !open_tags.include?("<title>")
+     puts "Missing title on line #{index+1}: #{line}"
+   end
+ end
+
+ tag_counts = tags.inject({}) do |counts, tag|
+   counts[tag] ||= 0
+   counts[tag] += 1
+   counts
+ end
+
+ puts "\n\nAnnotation tags used: #{tag_counts.select { |t,c| annotation_tags.include?(t) } }"
+ puts "Other tags: #{tag_counts.reject { |t,c| annotation_tags.include?(t) }}\n\n\n"
+
+ tag_contents.each do |tag, contents|
+   puts "#{tag}s:"
+   contents.each { |c| puts "\t\t#{c}" }
+ end
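
Note: the training files this script checks (data/lib/excite/resources/trainingdata/tagged_references.txt and tagged_html_references.txt) hold one reference per line, with every token wrapped in one of the annotation tags listed at the top of the script. The line below is a fabricated illustration of that format, not a line from the corpus; it is the shape that passes every check above (one matching open/close pair per field, no nesting, no untagged tokens, and a <title> present):

    <author> A. Smith and B. Jones </author> <title> A Hypothetical Study of Citation Parsing </title> <journal> Journal of Examples </journal> <volume> 12 </volume> <pages> 34-56 </pages> <date> 1999 </date>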
data/lib/excite/token_features.rb ADDED
@@ -0,0 +1,313 @@
+ # encoding: UTF-8
+
+ module Excite
+
+   module TokenFeatures
+
+     module DictFlags
+       PUBLISHER_NAME = 32
+       PLACE_NAME = 16
+       MONTH_NAME = 8
+       LAST_NAME = 4
+       FIRST_NAME = 1
+     end
+
+     def TokenFeatures.read_dict_files(dir_name)
+       dict = {}
+       [
+         ['first-names',DictFlags::FIRST_NAME],
+         ['surnames',DictFlags::LAST_NAME],
+         ['months',DictFlags::MONTH_NAME],
+         ['places',DictFlags::PLACE_NAME],
+         ['publishers',DictFlags::PUBLISHER_NAME],
+       ].each do |file_name, flag|
+         filename = File.join(dir_name, file_name)
+         f = File.open(filename, 'r')
+
+         while l = f.gets
+           l.strip!
+           if !l.match(/^\#/)
+             dict[l] ||= 0
+             unless dict[l] & flag > 0
+               dict[l] += flag
+             end
+           end
+         end
+
+         f.close
+       end
+       dict
+     end
+
+     DIR = File.dirname(__FILE__)
+     DICT = TokenFeatures.read_dict_files("#{DIR}/resources/dicts")
+
+     private_class_method :read_dict_files
+
+     def clear
+       @possible_editor = nil
+       @possible_chapter = nil
+       @dict_status = nil
+       @is_proceeding = nil
+     end
+
+     def last_char(toks, idx, author_names=nil)
+       case toks[idx].raw[-1,1]
+       when /[a-z]/
+         'a'
+       when /[A-Z]/
+         'A'
+       when /[0-9]/
+         0
+       else
+         toks[idx].raw[-1,1]
+       end
+     end
+
+     def first_1_char(toks, idx, author_names=nil); toks[idx].raw[0,1]; end
+     def first_2_chars(toks, idx, author_names=nil); toks[idx].raw[0,2]; end
+     def first_3_chars(toks, idx, author_names=nil); toks[idx].raw[0,3]; end
+     def first_4_chars(toks, idx, author_names=nil); toks[idx].raw[0,4]; end
+     def first_5_chars(toks, idx, author_names=nil); toks[idx].raw[0,5]; end
+
+     def last_1_char(toks, idx, author_names=nil); toks[idx].raw[-1,1]; end
+     def last_2_chars(toks, idx, author_names=nil); toks[idx].raw[-2,2] || toks[idx].raw; end
+     def last_3_chars(toks, idx, author_names=nil); toks[idx].raw[-3,3] || toks[idx].raw; end
+     def last_4_chars(toks, idx, author_names=nil); toks[idx].raw[-4,4] || toks[idx].raw; end
+
+     def toklcnp(toks, idx, author_names=nil); toks[idx].lcnp; end
+
+     def capitalization(toks, idx, author_names=nil)
+       case toks[idx].np
+       when "EMPTY"
+         "others"
+       when /^[[:upper:]]$/
+         "singleCap"
+       when /^[[:upper:]][[:lower:]]+/
+         "InitCap"
+       when /^[[:upper:]]+$/
+         "AllCap"
+       else
+         "others"
+       end
+     end
+
+     def numbers(toks, idx, author_names=nil)
+       (toks[idx].raw =~ /[0-9]\-[0-9]/) ? "possiblePage" :
+         (toks[idx].raw =~ /^\D*(19|20)[0-9][0-9]\D*$/) ? "year" :
+         (toks[idx].np =~ /^(19|20)[0-9][0-9]$/) ? "year" :
+         (toks[idx].np =~ /^[0-9]$/) ? "1dig" :
+         (toks[idx].np =~ /^[0-9][0-9]$/) ? "2dig" :
+         (toks[idx].np =~ /^[0-9][0-9][0-9]$/) ? "3dig" :
+         (toks[idx].np =~ /^[0-9]+$/) ? "4+dig" :
+         (toks[idx].np =~ /^[0-9]+(th|st|nd|rd)$/) ? "ordinal" :
+         (toks[idx].np =~ /[0-9]/) ? "hasDig" : "nonNum"
+     end
+
+     # ignores idx
+     def possible_editor(toks, idx=nil, author_names=nil)
+       if !@possible_editor.nil?
+         @possible_editor
+       else
+         @possible_editor =
+           (toks.any? { |t| %w(ed editor editors eds edited).include?(t.lcnp) } ?
+             "possibleEditors" : "noEditors")
+       end
+     end
+
+     # if there is possible editor entry and "IN" preceeded by punctuation
+     # this citation may be a book chapter
+     #
+     # ignores idx
+     def possible_chapter(toks, idx=nil, author_names=nil)
+       if !@possible_chapter.nil?
+         @possible_chapter
+       else
+         has_editor = possible_editor(toks) == 'possibleEditors'
+         has_chapter = toks.each_with_index.any? do |t, i|
+           if i > 0 && i < (toks.length-1) && t.lcnp == 'in'
+             prev_is_separator = ['pp','ppr','ppc','pps'].include?(toks[i-1].part_of_speech)
+             next_is_separator = ['ppl','ppc','pps'].include?(toks[i+1].part_of_speech)
+             prev_is_separator && (has_editor || next_is_separator)
+           end
+         end
+         has_chapter ? "possibleChapter" : "noChapter"
+       end
+     end
+
+     # ignores idx
+     def is_proceeding(toks, idx=nil, author_names=nil)
+       if !@is_proceeding.nil?
+         @is_proceeding
+       else
+         @is_proceeding =
+           (toks.any? { |t|
+             %w( proc proceeding proceedings ).include?(t.lcnp.strip)
+           } ? 'isProc' : 'noProc')
+       end
+     end
+
+     # TODO remove duplication with possible_chapter
+     def is_in(toks, idx, author_names=nil)
+       is_in = if idx > 0 && idx < (toks.length-1) && toks[idx].lcnp == 'in'
+         prev_is_separator = ['pp','ppr','ppc','pps'].include?(toks[idx-1].part_of_speech)
+         next_is_separator = ['ppl','ppc','pps'].include?(toks[idx+1].part_of_speech)
+         prev_is_separator && (next_is_separator || toks[idx+1].np =~ /^[A-Z]/)
+       end
+       is_in ? "inBook" : "notInBook"
+     end
+
+     def location(toks, idx, author_names=nil)
+       r = ((idx.to_f / toks.length) * 10).round
+     end
+
+     def punct(toks, idx, author_names=nil)
+       (toks[idx].raw =~ /\-.*\-/) ? "multiHyphen" :
+         (toks[idx].raw =~ /[[:alpha:]].*\-$/) ? "truncated" :
+         (toks[idx].raw =~ /[[:alpha:]].*\.$/) ? "abbrev" :
+         (toks[idx].np != toks[idx].raw) ? "hasPunct" : "others"
+     end
+
+     def possible_volume(toks, idx, author_names=nil)
+       if possible_vol_with_str(toks, idx)
+         'volume'
+       elsif possible_vol_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
+         'issue'
+       elsif possible_vol_with_str(toks, idx-2) && possible_issue_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
+         'issue'
+       elsif possible_vol_with_parens(toks, idx)
+         'volume'
+       elsif (1..3).any? { |i| possible_vol_with_parens(toks, idx-i) }
+         'issue'
+       elsif possible_vol_with_colon(toks, idx)
+         'volume'
+       else
+         'noVolume'
+       end
+     end
+
+     # TODO this method is weirdly named b/c of alphabetical ordering hack: remove that
+     def a_is_in_dict(toks, idx, author_names=nil)
+       dict_status(toks, idx)
+     end
+
+     def publisherName(toks, idx, author_names=nil)
+       (dict_status(toks, idx) & DictFlags::PUBLISHER_NAME) > 0 ? 'publisherName' : 'noPublisherName'
+     end
+
+     def placeName(toks, idx, author_names=nil)
+       (dict_status(toks, idx) & DictFlags::PLACE_NAME) > 0 ? 'placeName' : 'noPlaceName'
+     end
+
+     def monthName(toks, idx, author_names=nil)
+       (dict_status(toks, idx) & DictFlags::MONTH_NAME) > 0 ? 'monthName' : 'noMonthName'
+     end
+
+     def lastName(toks, idx, author_names=nil)
+       return 'lastName' if author_names && author_names.last == toks[idx].lcnp
+       (dict_status(toks, idx) & DictFlags::LAST_NAME) > 0 ? 'lastName' : 'noLastName'
+     end
+
+     def firstName(toks, idx, author_names=nil)
+       return 'firstName' if author_names && author_names.first == toks[idx].lcnp
+       (dict_status(toks, idx) & DictFlags::FIRST_NAME) > 0 ? 'firstName' : 'noFirstName'
+     end
+
+     def dict_status(toks, idx)
+       @dict_status ||= [nil]*toks.length
+       @dict_status[idx] ||= (DICT[toks[idx].lcnp] || 0)
+     end
+
+     NODE_TYPES_BY_NAME = {
+       'div'=>'div',
+       'p'=>'p',
+       'ul'=>'div', # lump with div - higher-level structure
+       'li'=>'li',
+       'tr'=>'div', # lump with div - higher-level structure
+       'td'=>'td',
+       'span'=>'span',
+       'font'=>'span',
+       'em'=>'em',
+       'i'=>'em',
+       'strong'=>'strong',
+       'b'=>'strong',
+       'u'=>'u',
+       'h1'=>'h',
+       'h2'=>'h',
+       'h3'=>'h',
+       'h4'=>'h',
+       'h5'=>'h',
+       'h6'=>'h',
+       'a'=>'a',
+       '#document-fragment'=>'unknown' # the actual tag wasn't captured in the fragment we're parsing
+     }
+
+     def tag_name(toks, idx, author_names=nil)
+       name = toks[idx].node.parent.name # node is always a text node; the informative one is the parent
+       NODE_TYPES_BY_NAME[name.downcase] || 'other'
+     end
+
+     def location_in_node(toks, idx, author_names=nil)
+       ((toks[idx].idx_in_node.to_f / toks[idx].node_token_count) * 10).round
+     end
+
+     def part_of_speech(toks, idx, author_names=nil)
+       toks[idx].part_of_speech
+     end
+
+     private
+
+     def possible_issue_with_str(toks, idx)
+       return unless toks[idx]
+
+       possible_issue_str(toks, idx) ||
+         (possible_issue_str(toks, idx-1) && toks[idx].raw =~ /^\d+$/)
+     end
+
+     def possible_issue_str(toks, idx)
+       if toks[idx]
+         if toks[idx].raw =~ /^(no)|(issue)?\.?\d+.?$/i
+           return true
+         elsif toks[idx+1]
+           return ['no','issue'].include?(toks[idx].lcnp) && toks[idx+1].raw =~ /^\d+$/
+         end
+       end
+     end
+
+     def possible_vol_with_str(toks, idx)
+       return unless toks[idx]
+
+       possible_vol_str(toks, idx) ||
+         (possible_vol_str(toks, idx-1) && (toks[idx].raw =~ /^\d+$/ || toks[idx].raw == ',')) ||
+         (possible_vol_str(toks, idx-2) && toks[idx-1].raw =~ /^\d+$/ && toks[idx].raw == ',')
+     end
+
+     def possible_vol_str(toks, idx)
+       if toks[idx]
+         if toks[idx].raw =~ /^vol(ume)?\.?\d+.?$/i
+           return true
+         elsif toks[idx+1]
+           return ['vol','volume'].include?(toks[idx].lcnp) && toks[idx+1].raw =~ /^\d+$/
+         end
+       end
+     end
+
+     def possible_vol_with_parens(toks, idx)
+       if toks[idx] && toks[idx+3]
+         toks[idx].raw =~ /^\d+$/ && toks[idx+1].raw == '(' && toks[idx+2].raw =~ /^\d+$/ && toks[idx+3].raw == ')'
+       end
+     end
+
+     def possible_vol_with_colon(toks, idx)
+       if toks[idx] && toks[idx+1]
+         # case of <year>: something is common so make sure we exclude it
+         if toks[idx].np =~ /^\d{1,3}$/ && toks[idx+1].raw =~ /^:/
+           # at this point it's likely a volume, but exclude it if it's not followed by an apparent page or issue
+           toks[idx+1].np =~ /^\d+$/ || (toks[idx+1].raw == ':' && toks[idx+2] && toks[idx+2].np =~ /^\d+/)
+         end
+       end
+     end
+
+   end
+
+ end
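
Note on the DictFlags bitmask above: each dictionary file contributes one bit, so a single DICT entry can record membership in several word lists at once, and each *Name feature only tests its own bit. A minimal sketch, assuming the token "georgia" appears in both the first-names and places dictionaries (an illustrative assumption, not a checked fact about the shipped word lists):

    # FIRST_NAME (1) and PLACE_NAME (16) accumulate in read_dict_files
    status = DICT['georgia']                     # => 17 under the assumption above
    (status & DictFlags::PLACE_NAME) > 0         # => true   -> 'placeName'
    (status & DictFlags::FIRST_NAME) > 0         # => true   -> 'firstName'
    (status & DictFlags::PUBLISHER_NAME) > 0     # => false  -> 'noPublisherName'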
data/lib/excite/version.rb ADDED
@@ -0,0 +1,7 @@
+ # encoding: UTF-8
+
+ module Excite
+
+   VERSION = '2.1.1'
+
+ end
data/lib/excite.rb ADDED
@@ -0,0 +1,13 @@
+ # encoding: UTF-8
+
+ module Excite
+ end
+
+ require 'crfpp'
+
+ require 'excite/array_helpers'
+ require 'excite/citation'
+ require 'excite/preprocessor'
+ require 'excite/postprocessor'
+ require 'excite/token_features'
+ require 'excite/crfparser'
data/model/test/analysis.csv ADDED
@@ -0,0 +1,54 @@
+ Results for model
+ branch: add_parts_of_speech
+ version: fe881b78325f48e511a2d015ba80edb55ba9a64c Rename module to Excite
+ Test run on:,2013-03-06 15:15:20 -0800
+ K-fold x-validation:,10
+ Corpus size:,500
+
+ truth\test,author,booktitle,date,editor,institution,journal,location,note,pages,publisher,tech,title,volume
+ author,3894,6,0,7,0,0,0,3,0,0,0,1,0
+ booktitle,0,2053,0,8,3,52,8,9,0,0,3,31,6
+ date,1,0,1547,1,0,0,6,2,4,0,0,1,3
+ editor,35,5,0,442,0,4,0,0,0,0,0,1,0
+ institution,3,8,0,4,368,0,6,0,0,1,1,8,0
+ journal,0,35,0,5,0,731,0,0,0,3,0,26,4
+ location,0,21,2,0,14,0,479,0,0,0,0,3,3
+ note,0,32,1,5,2,6,0,99,0,0,7,3,9
+ pages,0,0,2,0,0,0,0,0,772,1,2,0,6
+ publisher,0,10,0,0,11,2,6,0,0,271,0,4,0
+ tech,0,0,0,0,2,0,4,0,4,0,199,30,5
+ title,1,32,0,4,0,4,2,0,0,0,0,4215,2
+ volume,0,3,1,0,0,3,0,0,7,0,3,0,609
+ author,0.9956532856047047,0.001534134492457172,0.0,0.0017898235745333673,0.0,0.0,0.0,0.000767067246228586,0.0,0.0,0.0,0.00025568908207619537,0.0
+ booktitle,0.0,0.9447768062586286,0.0,0.0036815462494247586,0.0013805798435342844,0.02393005062126093,0.0036815462494247586,0.0041417395306028535,0.0,0.0,0.0013805798435342844,0.014265991716520939,0.0027611596870685687
+ date,0.0006389776357827476,0.0,0.9884984025559106,0.0006389776357827476,0.0,0.0,0.0038338658146964857,0.0012779552715654952,0.0025559105431309905,0.0,0.0,0.0006389776357827476,0.0019169329073482429
+ editor,0.07186858316221766,0.01026694045174538,0.0,0.9075975359342916,0.0,0.008213552361396304,0.0,0.0,0.0,0.0,0.0,0.002053388090349076,0.0
+ institution,0.007518796992481203,0.020050125313283207,0.0,0.010025062656641603,0.9223057644110275,0.0,0.015037593984962405,0.0,0.0,0.002506265664160401,0.002506265664160401,0.020050125313283207,0.0
+ journal,0.0,0.043532338308457715,0.0,0.006218905472636816,0.0,0.9092039800995025,0.0,0.0,0.0,0.0037313432835820895,0.0,0.03233830845771144,0.004975124378109453
+ location,0.0,0.040229885057471264,0.0038314176245210726,0.0,0.02681992337164751,0.0,0.9176245210727969,0.0,0.0,0.0,0.0,0.005747126436781609,0.005747126436781609
+ note,0.0,0.1951219512195122,0.006097560975609756,0.03048780487804878,0.012195121951219513,0.036585365853658534,0.0,0.6036585365853658,0.0,0.0,0.042682926829268296,0.018292682926829267,0.054878048780487805
+ pages,0.0,0.0,0.002554278416347382,0.0,0.0,0.0,0.0,0.0,0.9859514687100894,0.001277139208173691,0.002554278416347382,0.0,0.007662835249042145
+ publisher,0.0,0.03289473684210526,0.0,0.0,0.03618421052631579,0.006578947368421052,0.019736842105263157,0.0,0.0,0.8914473684210527,0.0,0.013157894736842105,0.0
+ tech,0.0,0.0,0.0,0.0,0.00819672131147541,0.0,0.01639344262295082,0.0,0.01639344262295082,0.0,0.8155737704918032,0.12295081967213115,0.020491803278688523
+ title,0.00023474178403755868,0.007511737089201878,0.0,0.0009389671361502347,0.0,0.0009389671361502347,0.00046948356807511736,0.0,0.0,0.0,0.0,0.9894366197183099,0.00046948356807511736
+ volume,0.0,0.004792332268370607,0.001597444089456869,0.0,0.0,0.004792332268370607,0.0,0.0,0.011182108626198083,0.0,0.004792332268370607,0.0,0.9728434504792333
+
+ Label,Precision,Recall,F-measure
+ author,0.9898322318251144,0.9956532856047047,0.9927342256214148
+ booktitle,0.9310657596371882,0.9447768062586286,0.9378711740520785
+ date,0.9961365099806826,0.9884984025559106,0.9923027581783195
+ editor,0.9285714285714286,0.9075975359342916,0.9179646936656283
+ institution,0.92,0.9223057644110275,0.9211514392991239
+ journal,0.9114713216957606,0.9092039800995025,0.9103362391033625
+ location,0.9373776908023483,0.9176245210727969,0.9273959341723136
+ note,0.8761061946902655,0.6036585365853658,0.7148014440433211
+ pages,0.9809402795425667,0.9859514687100894,0.9834394904458599
+ publisher,0.9818840579710145,0.8914473684210527,0.9344827586206896
+ tech,0.9255813953488372,0.8155737704918032,0.8671023965141613
+ title,0.9750173490631506,0.9894366197183099,0.9821740650122335
+ volume,0.9412673879443586,0.9728434504792333,0.9567949725058915
+
+ Average accuracy by reference:,0.9621987290789997
+ STD of Average accuracy by reference:,0.09112313872147432
+ Perfect parses:,383,0.766
+ Accuracy:, 0.9653367811845832
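
Reading note on analysis.csv: the first block is the raw confusion matrix (rows are true labels, columns are predicted labels), the second block repeats it with each row normalized to proportions, and the Label table is derived from the counts, with precision dividing the diagonal cell by its column total and recall dividing it by its row total. A quick check against the author row and column, using the counts above:

    predicted_author = 3894 + 1 + 35 + 3 + 1                    # column total => 3934
    true_author      = 3894 + 6 + 7 + 3 + 1                     # row total    => 3911
    precision = 3894.0 / predicted_author                       # => 0.98983...
    recall    = 3894.0 / true_author                            # => 0.99565...
    f_measure = 2 * precision * recall / (precision + recall)   # => 0.99273...

These match the first row of the Label,Precision,Recall,F-measure table.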
data/model/test/array_helpers.rb ADDED
@@ -0,0 +1,30 @@
+ module ArrayHelpers
+
+   def sum
+     inject(0, :+)
+   end
+
+   def mean
+     (size > 0) ? sum.to_f / size : 0
+   end
+
+   def stddev
+     m = mean
+     devsum = inject( 0 ) { |ds,x| ds += (x - m)**2 }
+     (size > 0) ? (devsum.to_f / size) ** 0.5 : 0
+   end
+
+   def cov(other)
+     zip(other).map {|a,b| a*b }.mean - (mean * other.mean)
+   end
+
+   def pearson_r(other)
+     unless size == other.size
+       raise "Vectors must be of same length to calculate pearson_r"
+     end
+     devp = stddev * other.stddev
+     (devp > 0) ? cov(other) / devp : 0.0
+   end
+
+ end
+
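
These helpers are written to be mixed into Array itself (cov calls .mean on the intermediate array built by zip, and pearson_r calls stddev on its argument, so extending a single instance would not be enough). A minimal usage sketch under that assumption:

    class Array; include ArrayHelpers; end

    [1, 2, 3].mean                   # => 2.0
    [1, 2, 3].stddev                 # => 0.816... (population standard deviation)
    [1, 2, 3].pearson_r([2, 4, 6])   # => 1.0, up to floating-point error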
data/model/test/html-analysis.csv ADDED
@@ -0,0 +1,60 @@
+ Results for model
+ branch: add_parts_of_speech
+ version: 19c853489cbc259793dd24d89e1e16c78222a14b Add possible_volume based on regex (not actually usable unfortunately)
+ Test run on:,2013-03-06 14:52:26 -0800
+ K-fold x-validation:,10
+ Corpus size:,500
+
+ truth\test,author,booktitle,bullet,date,editor,institution,journal,link,location,note,pages,publisher,title,volume,workid
+ author,3503,8,4,2,32,0,2,3,0,7,2,0,72,0,0
+ booktitle,3,1271,0,20,27,15,82,0,13,21,4,13,171,2,0
+ bullet,3,1,31,1,0,0,1,0,0,3,0,0,1,2,0
+ date,1,24,0,1274,5,0,10,0,4,8,5,8,3,8,0
+ editor,35,19,0,0,1012,0,0,4,5,0,0,9,23,0,0
+ institution,0,34,0,2,11,0,18,0,0,10,0,36,29,0,0
+ journal,3,45,0,3,2,4,1130,1,0,8,0,11,62,2,0
+ link,3,16,0,1,0,0,0,326,0,48,7,2,2,1,0
+ location,9,13,0,3,4,0,5,2,441,7,0,34,9,0,0
+ note,9,61,0,13,21,0,16,19,0,175,3,20,107,2,0
+ pages,0,1,0,14,0,0,0,4,0,9,693,3,4,16,0
+ publisher,6,31,0,4,16,0,15,1,12,13,2,532,14,0,0
+ title,54,159,0,12,4,0,20,0,10,34,2,8,6129,0,0
+ volume,0,6,0,18,0,0,8,0,3,2,7,0,0,861,0
+ workid,0,0,0,7,0,0,0,2,4,7,8,0,3,4,64
+ author,0.9636863823933975,0.002200825309491059,0.0011004126547455295,0.0005502063273727648,0.008803301237964236,0.0,0.0005502063273727648,0.0008253094910591472,0.0,0.0019257221458046766,0.0005502063273727648,0.0,0.019807427785419534,0.0,0.0
+ booktitle,0.0018270401948842874,0.7740560292326432,0.0,0.012180267965895249,0.016443361753958587,0.009135200974421437,0.049939098660170524,0.0,0.007917174177831911,0.012789281364190013,0.00243605359317905,0.007917174177831911,0.10414129110840438,0.001218026796589525,0.0
+ bullet,0.06976744186046512,0.023255813953488372,0.7209302325581395,0.023255813953488372,0.0,0.0,0.023255813953488372,0.0,0.0,0.06976744186046512,0.0,0.0,0.023255813953488372,0.046511627906976744,0.0
+ date,0.0007407407407407407,0.017777777777777778,0.0,0.9437037037037037,0.003703703703703704,0.0,0.007407407407407408,0.0,0.002962962962962963,0.005925925925925926,0.003703703703703704,0.005925925925925926,0.0022222222222222222,0.005925925925925926,0.0
+ editor,0.031616982836495035,0.017163504968383016,0.0,0.0,0.9141824751580849,0.0,0.0,0.0036133694670280035,0.004516711833785004,0.0,0.0,0.008130081300813009,0.02077687443541102,0.0,0.0
+ institution,0.0,0.24285714285714285,0.0,0.014285714285714285,0.07857142857142857,0.0,0.12857142857142856,0.0,0.0,0.07142857142857142,0.0,0.2571428571428571,0.20714285714285716,0.0,0.0
+ journal,0.0023603461841070024,0.03540519276160504,0.0,0.0023603461841070024,0.0015735641227380016,0.003147128245476003,0.8890637293469709,0.0007867820613690008,0.0,0.006294256490952006,0.0,0.00865460267505901,0.04878048780487805,0.0015735641227380016,0.0
+ link,0.007389162561576354,0.03940886699507389,0.0,0.0024630541871921183,0.0,0.0,0.0,0.8029556650246306,0.0,0.11822660098522167,0.017241379310344827,0.0049261083743842365,0.0049261083743842365,0.0024630541871921183,0.0
+ location,0.017077798861480076,0.024667931688804556,0.0,0.0056925996204933585,0.007590132827324478,0.0,0.009487666034155597,0.003795066413662239,0.8368121442125237,0.013282732447817837,0.0,0.06451612903225806,0.017077798861480076,0.0,0.0
+ note,0.020179372197309416,0.1367713004484305,0.0,0.02914798206278027,0.04708520179372197,0.0,0.03587443946188341,0.042600896860986545,0.0,0.3923766816143498,0.006726457399103139,0.04484304932735426,0.2399103139013453,0.004484304932735426,0.0
+ pages,0.0,0.0013440860215053765,0.0,0.01881720430107527,0.0,0.0,0.0,0.005376344086021506,0.0,0.012096774193548387,0.9314516129032258,0.004032258064516129,0.005376344086021506,0.021505376344086023,0.0
+ publisher,0.009287925696594427,0.047987616099071206,0.0,0.006191950464396285,0.02476780185758514,0.0,0.02321981424148607,0.0015479876160990713,0.018575851393188854,0.020123839009287926,0.0030959752321981426,0.8235294117647058,0.021671826625386997,0.0,0.0
+ title,0.008395522388059701,0.024720149253731342,0.0,0.0018656716417910447,0.0006218905472636816,0.0,0.003109452736318408,0.0,0.001554726368159204,0.005286069651741294,0.0003109452736318408,0.0012437810945273632,0.9528917910447762,0.0,0.0
+ volume,0.0,0.0066298342541436465,0.0,0.019889502762430938,0.0,0.0,0.008839779005524863,0.0,0.0033149171270718232,0.0022099447513812156,0.0077348066298342545,0.0,0.0,0.9513812154696133,0.0
+ workid,0.0,0.0,0.0,0.0707070707070707,0.0,0.0,0.0,0.020202020202020204,0.04040404040404041,0.0707070707070707,0.08080808080808081,0.0,0.030303030303030304,0.04040404040404041,0.6464646464646465
+
+ Label,Precision,Recall,F-measure
+ author,0.9652796913750344,0.9636863823933975,0.9644823788546256
+ booktitle,0.7525162818235642,0.7740560292326432,0.7631341939357552
+ bullet,0.8857142857142857,0.7209302325581395,0.7948717948717948
+ date,0.9272197962154294,0.9437037037037037,0.9353891336270191
+ editor,0.892416225749559,0.9141824751580849,0.9031682284694332
+ institution,0.0,0.0,NaN
+ journal,0.864575363427697,0.8890637293469709,0.8766485647788983
+ link,0.9005524861878453,0.8029556650246306,0.8489583333333334
+ location,0.8963414634146342,0.8368121442125237,0.8655544651619235
+ note,0.4971590909090909,0.3923766816143498,0.43859649122807015
+ pages,0.9454297407912687,0.9314516129032258,0.938388625592417
+ publisher,0.7869822485207101,0.8235294117647058,0.8048411497730711
+ title,0.9245738422084779,0.9528917910447762,0.9385192557997091
+ volume,0.9587973273942093,0.9513812154696133,0.9550748752079867
+ workid,1.0,0.6464646464646465,0.7852760736196319
+
+ Average accuracy by reference:,0.8980503362135145
+ STD of Average accuracy by reference:,0.150249693017486
+ Perfect parses:,239,0.478
+ Accuracy:, 0.899396689527149
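
One reading note on the HTML run above: the NaN F-measure for institution is simply 0/0. The diagonal institution cell is 0 (none of the 140 true institution tokens was labelled institution, and the 19 tokens predicted as institution were all wrong), so precision and recall are both 0 and 2PR/(P+R) is undefined.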