excite 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
data/lib/excite/resources/trainingdata/verify.rb ADDED
@@ -0,0 +1,97 @@
+ # encoding: UTF-8
+ #
+ # Script to assist in verifying tagged references
+
+ f = ARGV[0]
+
+ cleaned = IO.readlines(f).map(&:strip)
+
+ tags = []
+ tag_contents = {}
+
+ annotation_tags = %w{ author title date booktitle journal volume pages editor workid link publisher location institution bullet tech note }.map { |t| "<#{t}>" }
+
+ cleaned.each_with_index do |line, index|
+   open_tags = line.scan(/<\s*\w+\s*>/).map(&:downcase)
+   for tag in open_tags
+     i = line.downcase.index(tag)
+     j = line.downcase.index(tag.sub('<','</'))
+     if tag != "<br>" && (j.nil? || j <= i)
+       puts "Missing close tag for #{tag} on line #{index+1}: #{line}"
+     end
+   end
+
+   close_tags = line.scan(/<\/\s*\w+\s*>/).map(&:downcase)
+   for tag in close_tags
+     i = line.downcase.index(tag)
+     j = line.downcase.index(tag.sub(/<\/\s*/, ''))
+     if j.nil? || j >= i
+       puts "Missing open tag for #{tag} on line #{index+1}: #{line}"
+     end
+   end
+
+   tags += open_tags
+
+   toks = line.split(/(\s+)|(?=<)|(?<=>)/)
+
+   start_tag = nil
+   tag_content = ''
+
+   for tok in toks
+     if annotation_tags.include?(tok)
+       if !start_tag.nil?
+         puts "Started #{tok} within #{start_tag} on line #{index+1}: #{line}"
+         start_tag = nil
+       else
+         start_tag = tok
+       end
+     elsif annotation_tags.include?(tok.sub(/<\/\s*/, '<'))
+       if start_tag.nil?
+         puts "End tag #{tok} without a corresponding start tag on line #{index+1}: #{line}"
+       elsif start_tag != tok.sub(/<\/s*/, '<')
+         puts "End tag #{tok} doesn't match start tag #{start_tag} on line #{index+1}: #{line}"
+         start_tag = nil
+       else
+         tag_contents[start_tag] ||= []
+         tag_contents[start_tag] << tag_content.strip
+
+         tag_content = ''
+         start_tag = nil
+       end
+     elsif start_tag.nil?
+       puts "Token '#{tok}' is not tagged in line #{index+1}: #{line}" unless tok.strip.empty?
+     else
+       tag_content += tok
+     end
+   end
+
+   for tag in annotation_tags
+     if open_tags.count(tag) > 1
+       puts "(Might be ok but...) More than one #{tag} in line #{index+1}: #{line}"
+     end
+   end
+
+   for tag in open_tags
+     if open_tags.count(tag) != close_tags.count(tag.sub('<','</'))
+       puts "Unequal numbers of open and close tags for #{tag} in line #{index+1}: #{line}" unless tag.match(/<\/?br>/)
+     end
+   end
+
+   if !open_tags.include?("<title>")
+     puts "Missing title on line #{index+1}: #{line}"
+   end
+ end
+
+ tag_counts = tags.inject({}) do |counts, tag|
+   counts[tag] ||= 0
+   counts[tag] += 1
+   counts
+ end
+
+ puts "\n\nAnnotation tags used: #{tag_counts.select { |t,c| annotation_tags.include?(t) } }"
+ puts "Other tags: #{tag_counts.reject { |t,c| annotation_tags.include?(t) }}\n\n\n"
+
+ tag_contents.each do |tag, contents|
+   puts "#{tag}s:"
+   contents.each { |c| puts "\t\t#{c}" }
+ end
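The script above takes the path to a tagged-reference file as its single argument (ARGV[0]) and prints one warning per problem it finds; it would typically be run against the bundled trainingdata files. As a hedged illustration of the checks, the two lines below are hypothetical input, not part of the package:

    # Passes all checks: every annotation tag is closed and a <title> is present.
    good = "<author> J. Doe </author> <title> A Study of Citations </title> <date> 2001 </date>"
    # Triggers warnings: unclosed <author>, a tag opened inside another, and no <title>.
    bad  = "<author> J. Doe <journal> Nature </journal>"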
data/lib/excite/token_features.rb ADDED
@@ -0,0 +1,313 @@
+ # encoding: UTF-8
+
+ module Excite
+
+   module TokenFeatures
+
+     module DictFlags
+       PUBLISHER_NAME = 32
+       PLACE_NAME = 16
+       MONTH_NAME = 8
+       LAST_NAME = 4
+       FIRST_NAME = 1
+     end
+
+     def TokenFeatures.read_dict_files(dir_name)
+       dict = {}
+       [
+         ['first-names',DictFlags::FIRST_NAME],
+         ['surnames',DictFlags::LAST_NAME],
+         ['months',DictFlags::MONTH_NAME],
+         ['places',DictFlags::PLACE_NAME],
+         ['publishers',DictFlags::PUBLISHER_NAME],
+       ].each do |file_name, flag|
+         filename = File.join(dir_name, file_name)
+         f = File.open(filename, 'r')
+
+         while l = f.gets
+           l.strip!
+           if !l.match(/^\#/)
+             dict[l] ||= 0
+             unless dict[l] & flag > 0
+               dict[l] += flag
+             end
+           end
+         end
+
+         f.close
+       end
+       dict
+     end
+
+     DIR = File.dirname(__FILE__)
+     DICT = TokenFeatures.read_dict_files("#{DIR}/resources/dicts")
+
+     private_class_method :read_dict_files
+
+     def clear
+       @possible_editor = nil
+       @possible_chapter = nil
+       @dict_status = nil
+       @is_proceeding = nil
+     end
+
+     def last_char(toks, idx, author_names=nil)
+       case toks[idx].raw[-1,1]
+       when /[a-z]/
+         'a'
+       when /[A-Z]/
+         'A'
+       when /[0-9]/
+         0
+       else
+         toks[idx].raw[-1,1]
+       end
+     end
+
+     def first_1_char(toks, idx, author_names=nil); toks[idx].raw[0,1]; end
+     def first_2_chars(toks, idx, author_names=nil); toks[idx].raw[0,2]; end
+     def first_3_chars(toks, idx, author_names=nil); toks[idx].raw[0,3]; end
+     def first_4_chars(toks, idx, author_names=nil); toks[idx].raw[0,4]; end
+     def first_5_chars(toks, idx, author_names=nil); toks[idx].raw[0,5]; end
+
+     def last_1_char(toks, idx, author_names=nil); toks[idx].raw[-1,1]; end
+     def last_2_chars(toks, idx, author_names=nil); toks[idx].raw[-2,2] || toks[idx].raw; end
+     def last_3_chars(toks, idx, author_names=nil); toks[idx].raw[-3,3] || toks[idx].raw; end
+     def last_4_chars(toks, idx, author_names=nil); toks[idx].raw[-4,4] || toks[idx].raw; end
+
+     def toklcnp(toks, idx, author_names=nil); toks[idx].lcnp; end
+
+     def capitalization(toks, idx, author_names=nil)
+       case toks[idx].np
+       when "EMPTY"
+         "others"
+       when /^[[:upper:]]$/
+         "singleCap"
+       when /^[[:upper:]][[:lower:]]+/
+         "InitCap"
+       when /^[[:upper:]]+$/
+         "AllCap"
+       else
+         "others"
+       end
+     end
+
+     def numbers(toks, idx, author_names=nil)
+       (toks[idx].raw =~ /[0-9]\-[0-9]/) ? "possiblePage" :
+         (toks[idx].raw =~ /^\D*(19|20)[0-9][0-9]\D*$/) ? "year" :
+         (toks[idx].np =~ /^(19|20)[0-9][0-9]$/) ? "year" :
+         (toks[idx].np =~ /^[0-9]$/) ? "1dig" :
+         (toks[idx].np =~ /^[0-9][0-9]$/) ? "2dig" :
+         (toks[idx].np =~ /^[0-9][0-9][0-9]$/) ? "3dig" :
+         (toks[idx].np =~ /^[0-9]+$/) ? "4+dig" :
+         (toks[idx].np =~ /^[0-9]+(th|st|nd|rd)$/) ? "ordinal" :
+         (toks[idx].np =~ /[0-9]/) ? "hasDig" : "nonNum"
+     end
+
+     # ignores idx
+     def possible_editor(toks, idx=nil, author_names=nil)
+       if !@possible_editor.nil?
+         @possible_editor
+       else
+         @possible_editor =
+           (toks.any? { |t| %w(ed editor editors eds edited).include?(t.lcnp) } ?
+             "possibleEditors" : "noEditors")
+       end
+     end
+
+     # if there is possible editor entry and "IN" preceeded by punctuation
+     # this citation may be a book chapter
+     #
+     # ignores idx
+     def possible_chapter(toks, idx=nil, author_names=nil)
+       if !@possible_chapter.nil?
+         @possible_chapter
+       else
+         has_editor = possible_editor(toks) == 'possibleEditors'
+         has_chapter = toks.each_with_index.any? do |t, i|
+           if i > 0 && i < (toks.length-1) && t.lcnp == 'in'
+             prev_is_separator = ['pp','ppr','ppc','pps'].include?(toks[i-1].part_of_speech)
+             next_is_separator = ['ppl','ppc','pps'].include?(toks[i+1].part_of_speech)
+             prev_is_separator && (has_editor || next_is_separator)
+           end
+         end
+         has_chapter ? "possibleChapter" : "noChapter"
+       end
+     end
+
+     # ignores idx
+     def is_proceeding(toks, idx=nil, author_names=nil)
+       if !@is_proceeding.nil?
+         @is_proceeding
+       else
+         @is_proceeding =
+           (toks.any? { |t|
+             %w( proc proceeding proceedings ).include?(t.lcnp.strip)
+           } ? 'isProc' : 'noProc')
+       end
+     end
+
+     # TODO remove duplication with possible_chapter
+     def is_in(toks, idx, author_names=nil)
+       is_in = if idx > 0 && idx < (toks.length-1) && toks[idx].lcnp == 'in'
+         prev_is_separator = ['pp','ppr','ppc','pps'].include?(toks[idx-1].part_of_speech)
+         next_is_separator = ['ppl','ppc','pps'].include?(toks[idx+1].part_of_speech)
+         prev_is_separator && (next_is_separator || toks[idx+1].np =~ /^[A-Z]/)
+       end
+       is_in ? "inBook" : "notInBook"
+     end
+
+     def location(toks, idx, author_names=nil)
+       r = ((idx.to_f / toks.length) * 10).round
+     end
+
+     def punct(toks, idx, author_names=nil)
+       (toks[idx].raw =~ /\-.*\-/) ? "multiHyphen" :
+         (toks[idx].raw =~ /[[:alpha:]].*\-$/) ? "truncated" :
+         (toks[idx].raw =~ /[[:alpha:]].*\.$/) ? "abbrev" :
+         (toks[idx].np != toks[idx].raw) ? "hasPunct" : "others"
+     end
+
+     def possible_volume(toks, idx, author_names=nil)
+       if possible_vol_with_str(toks, idx)
+         'volume'
+       elsif possible_vol_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
+         'issue'
+       elsif possible_vol_with_str(toks, idx-2) && possible_issue_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
+         'issue'
+       elsif possible_vol_with_parens(toks, idx)
+         'volume'
+       elsif (1..3).any? { |i| possible_vol_with_parens(toks, idx-i) }
+         'issue'
+       elsif possible_vol_with_colon(toks, idx)
+         'volume'
+       else
+         'noVolume'
+       end
+     end
+
+     # TODO this method is weirdly named b/c of alphabetical ordering hack: remove that
+     def a_is_in_dict(toks, idx, author_names=nil)
+       dict_status(toks, idx)
+     end
+
+     def publisherName(toks, idx, author_names=nil)
+       (dict_status(toks, idx) & DictFlags::PUBLISHER_NAME) > 0 ? 'publisherName' : 'noPublisherName'
+     end
+
+     def placeName(toks, idx, author_names=nil)
+       (dict_status(toks, idx) & DictFlags::PLACE_NAME) > 0 ? 'placeName' : 'noPlaceName'
+     end
+
+     def monthName(toks, idx, author_names=nil)
+       (dict_status(toks, idx) & DictFlags::MONTH_NAME) > 0 ? 'monthName' : 'noMonthName'
+     end
+
+     def lastName(toks, idx, author_names=nil)
+       return 'lastName' if author_names && author_names.last == toks[idx].lcnp
+       (dict_status(toks, idx) & DictFlags::LAST_NAME) > 0 ? 'lastName' : 'noLastName'
+     end
+
+     def firstName(toks, idx, author_names=nil)
+       return 'firstName' if author_names && author_names.first == toks[idx].lcnp
+       (dict_status(toks, idx) & DictFlags::FIRST_NAME) > 0 ? 'firstName' : 'noFirstName'
+     end
+
+     def dict_status(toks, idx)
+       @dict_status ||= [nil]*toks.length
+       @dict_status[idx] ||= (DICT[toks[idx].lcnp] || 0)
+     end
+
+     NODE_TYPES_BY_NAME = {
+       'div'=>'div',
+       'p'=>'p',
+       'ul'=>'div', # lump with div - higher-level structure
+       'li'=>'li',
+       'tr'=>'div', # lump with div - higher-level structure
+       'td'=>'td',
+       'span'=>'span',
+       'font'=>'span',
+       'em'=>'em',
+       'i'=>'em',
+       'strong'=>'strong',
+       'b'=>'strong',
+       'u'=>'u',
+       'h1'=>'h',
+       'h2'=>'h',
+       'h3'=>'h',
+       'h4'=>'h',
+       'h5'=>'h',
+       'h6'=>'h',
+       'a'=>'a',
+       '#document-fragment'=>'unknown' # the actual tag wasn't captured in the fragment we're parsing
+     }
+
+     def tag_name(toks, idx, author_names=nil)
+       name = toks[idx].node.parent.name # node is always a text node; the informative one is the parent
+       NODE_TYPES_BY_NAME[name.downcase] || 'other'
+     end
+
+     def location_in_node(toks, idx, author_names=nil)
+       ((toks[idx].idx_in_node.to_f / toks[idx].node_token_count) * 10).round
+     end
+
+     def part_of_speech(toks, idx, author_names=nil)
+       toks[idx].part_of_speech
+     end
+
+     private
+
+     def possible_issue_with_str(toks, idx)
+       return unless toks[idx]
+
+       possible_issue_str(toks, idx) ||
+         (possible_issue_str(toks, idx-1) && toks[idx].raw =~ /^\d+$/)
+     end
+
+     def possible_issue_str(toks, idx)
+       if toks[idx]
+         if toks[idx].raw =~ /^(no)|(issue)?\.?\d+.?$/i
+           return true
+         elsif toks[idx+1]
+           return ['no','issue'].include?(toks[idx].lcnp) && toks[idx+1].raw =~ /^\d+$/
+         end
+       end
+     end
+
+     def possible_vol_with_str(toks, idx)
+       return unless toks[idx]
+
+       possible_vol_str(toks, idx) ||
+         (possible_vol_str(toks, idx-1) && (toks[idx].raw =~ /^\d+$/ || toks[idx].raw == ',')) ||
+         (possible_vol_str(toks, idx-2) && toks[idx-1].raw =~ /^\d+$/ && toks[idx].raw == ',')
+     end
+
+     def possible_vol_str(toks, idx)
+       if toks[idx]
+         if toks[idx].raw =~ /^vol(ume)?\.?\d+.?$/i
+           return true
+         elsif toks[idx+1]
+           return ['vol','volume'].include?(toks[idx].lcnp) && toks[idx+1].raw =~ /^\d+$/
+         end
+       end
+     end
+
+     def possible_vol_with_parens(toks, idx)
+       if toks[idx] && toks[idx+3]
+         toks[idx].raw =~ /^\d+$/ && toks[idx+1].raw == '(' && toks[idx+2].raw =~ /^\d+$/ && toks[idx+3].raw == ')'
+       end
+     end
+
+     def possible_vol_with_colon(toks, idx)
+       if toks[idx] && toks[idx+1]
+         # case of <year>: something is common so make sure we exclude it
+         if toks[idx].np =~ /^\d{1,3}$/ && toks[idx+1].raw =~ /^:/
+           # at this point it's likely a volume, but exclude it if it's not followed by an apparent page or issue
+           toks[idx+1].np =~ /^\d+$/ || (toks[idx+1].raw == ':' && toks[idx+2] && toks[idx+2].np =~ /^\d+/)
+         end
+       end
+     end
+
+   end
+
+ end
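The dictionary features in this file are driven by the bit flags in DictFlags: read_dict_files sets a flag in DICT for each word list a term appears on, and the *Name feature methods test membership with a bitwise AND. A minimal sketch of that arithmetic (only the constants shown above are used; the example token situation is assumed):

    # A term listed in both the first-names and surnames dictionaries ends up with
    # FIRST_NAME + LAST_NAME = 1 + 4 = 5 in DICT, so both name features fire.
    flags = Excite::TokenFeatures::DictFlags::FIRST_NAME |
            Excite::TokenFeatures::DictFlags::LAST_NAME         # => 5
    (flags & Excite::TokenFeatures::DictFlags::LAST_NAME)  > 0  # => true  -> 'lastName'
    (flags & Excite::TokenFeatures::DictFlags::MONTH_NAME) > 0  # => false -> 'noMonthName'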
data/lib/excite/version.rb ADDED
@@ -0,0 +1,7 @@
+ # encoding: UTF-8
+
+ module Excite
+
+   VERSION = '2.1.1'
+
+ end
data/lib/excite.rb ADDED
@@ -0,0 +1,13 @@
+ # encoding: UTF-8
+
+ module Excite
+ end
+
+ require 'crfpp'
+
+ require 'excite/array_helpers'
+ require 'excite/citation'
+ require 'excite/preprocessor'
+ require 'excite/postprocessor'
+ require 'excite/token_features'
+ require 'excite/crfparser'
data/model/test/analysis.csv ADDED
@@ -0,0 +1,54 @@
+ Results for model
+ branch: add_parts_of_speech
+ version: fe881b78325f48e511a2d015ba80edb55ba9a64c Rename module to Excite
+ Test run on:,2013-03-06 15:15:20 -0800
+ K-fold x-validation:,10
+ Corpus size:,500
+
+ truth\test,author,booktitle,date,editor,institution,journal,location,note,pages,publisher,tech,title,volume
+ author,3894,6,0,7,0,0,0,3,0,0,0,1,0
+ booktitle,0,2053,0,8,3,52,8,9,0,0,3,31,6
+ date,1,0,1547,1,0,0,6,2,4,0,0,1,3
+ editor,35,5,0,442,0,4,0,0,0,0,0,1,0
+ institution,3,8,0,4,368,0,6,0,0,1,1,8,0
+ journal,0,35,0,5,0,731,0,0,0,3,0,26,4
+ location,0,21,2,0,14,0,479,0,0,0,0,3,3
+ note,0,32,1,5,2,6,0,99,0,0,7,3,9
+ pages,0,0,2,0,0,0,0,0,772,1,2,0,6
+ publisher,0,10,0,0,11,2,6,0,0,271,0,4,0
+ tech,0,0,0,0,2,0,4,0,4,0,199,30,5
+ title,1,32,0,4,0,4,2,0,0,0,0,4215,2
+ volume,0,3,1,0,0,3,0,0,7,0,3,0,609
+ author,0.9956532856047047,0.001534134492457172,0.0,0.0017898235745333673,0.0,0.0,0.0,0.000767067246228586,0.0,0.0,0.0,0.00025568908207619537,0.0
+ booktitle,0.0,0.9447768062586286,0.0,0.0036815462494247586,0.0013805798435342844,0.02393005062126093,0.0036815462494247586,0.0041417395306028535,0.0,0.0,0.0013805798435342844,0.014265991716520939,0.0027611596870685687
+ date,0.0006389776357827476,0.0,0.9884984025559106,0.0006389776357827476,0.0,0.0,0.0038338658146964857,0.0012779552715654952,0.0025559105431309905,0.0,0.0,0.0006389776357827476,0.0019169329073482429
+ editor,0.07186858316221766,0.01026694045174538,0.0,0.9075975359342916,0.0,0.008213552361396304,0.0,0.0,0.0,0.0,0.0,0.002053388090349076,0.0
+ institution,0.007518796992481203,0.020050125313283207,0.0,0.010025062656641603,0.9223057644110275,0.0,0.015037593984962405,0.0,0.0,0.002506265664160401,0.002506265664160401,0.020050125313283207,0.0
+ journal,0.0,0.043532338308457715,0.0,0.006218905472636816,0.0,0.9092039800995025,0.0,0.0,0.0,0.0037313432835820895,0.0,0.03233830845771144,0.004975124378109453
+ location,0.0,0.040229885057471264,0.0038314176245210726,0.0,0.02681992337164751,0.0,0.9176245210727969,0.0,0.0,0.0,0.0,0.005747126436781609,0.005747126436781609
+ note,0.0,0.1951219512195122,0.006097560975609756,0.03048780487804878,0.012195121951219513,0.036585365853658534,0.0,0.6036585365853658,0.0,0.0,0.042682926829268296,0.018292682926829267,0.054878048780487805
+ pages,0.0,0.0,0.002554278416347382,0.0,0.0,0.0,0.0,0.0,0.9859514687100894,0.001277139208173691,0.002554278416347382,0.0,0.007662835249042145
+ publisher,0.0,0.03289473684210526,0.0,0.0,0.03618421052631579,0.006578947368421052,0.019736842105263157,0.0,0.0,0.8914473684210527,0.0,0.013157894736842105,0.0
+ tech,0.0,0.0,0.0,0.0,0.00819672131147541,0.0,0.01639344262295082,0.0,0.01639344262295082,0.0,0.8155737704918032,0.12295081967213115,0.020491803278688523
+ title,0.00023474178403755868,0.007511737089201878,0.0,0.0009389671361502347,0.0,0.0009389671361502347,0.00046948356807511736,0.0,0.0,0.0,0.0,0.9894366197183099,0.00046948356807511736
+ volume,0.0,0.004792332268370607,0.001597444089456869,0.0,0.0,0.004792332268370607,0.0,0.0,0.011182108626198083,0.0,0.004792332268370607,0.0,0.9728434504792333
+
+ Label,Precision,Recall,F-measure
+ author,0.9898322318251144,0.9956532856047047,0.9927342256214148
+ booktitle,0.9310657596371882,0.9447768062586286,0.9378711740520785
+ date,0.9961365099806826,0.9884984025559106,0.9923027581783195
+ editor,0.9285714285714286,0.9075975359342916,0.9179646936656283
+ institution,0.92,0.9223057644110275,0.9211514392991239
+ journal,0.9114713216957606,0.9092039800995025,0.9103362391033625
+ location,0.9373776908023483,0.9176245210727969,0.9273959341723136
+ note,0.8761061946902655,0.6036585365853658,0.7148014440433211
+ pages,0.9809402795425667,0.9859514687100894,0.9834394904458599
+ publisher,0.9818840579710145,0.8914473684210527,0.9344827586206896
+ tech,0.9255813953488372,0.8155737704918032,0.8671023965141613
+ title,0.9750173490631506,0.9894366197183099,0.9821740650122335
+ volume,0.9412673879443586,0.9728434504792333,0.9567949725058915
+
+ Average accuracy by reference:,0.9621987290789997
+ STD of Average accuracy by reference:,0.09112313872147432
+ Perfect parses:,383,0.766
+ Accuracy:, 0.9653367811845832
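For readers of the test CSVs above and below: the first matrix holds raw confusion counts (truth rows vs. predicted columns), the second appears to repeat it with each row normalized by its total, and the F-measure column is the harmonic mean of precision and recall. A quick sanity check in Ruby against the author row, with values copied from above:

    # F-measure = 2PR / (P + R); author row of analysis.csv
    p = 0.9898322318251144
    r = 0.9956532856047047
    2 * p * r / (p + r)   # => 0.99273..., matching the reported 0.9927342256214148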
data/model/test/array_helpers.rb ADDED
@@ -0,0 +1,30 @@
+ module ArrayHelpers
+
+   def sum
+     inject(0, :+)
+   end
+
+   def mean
+     (size > 0) ? sum.to_f / size : 0
+   end
+
+   def stddev
+     m = mean
+     devsum = inject( 0 ) { |ds,x| ds += (x - m)**2 }
+     (size > 0) ? (devsum.to_f / size) ** 0.5 : 0
+   end
+
+   def cov(other)
+     zip(other).map {|a,b| a*b }.mean - (mean * other.mean)
+   end
+
+   def pearson_r(other)
+     unless size == other.size
+       raise "Vectors must be of same length to calculate pearson_r"
+     end
+     devp = stddev * other.stddev
+     (devp > 0) ? cov(other) / devp : 0.0
+   end
+
+ end
+
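ArrayHelpers defines population statistics as instance methods on the receiver, so it is meant to be mixed into Array (or another collection responding to size and zip). A minimal usage sketch, assuming a plain include into Array rather than whatever the model test harness actually does:

    Array.send(:include, ArrayHelpers)   # assumption: mixed straight into Array
    xs = [1.0, 2.0, 3.0, 4.0]
    ys = [2.0, 4.0, 6.0, 8.0]
    xs.mean            # => 2.5
    xs.stddev          # => ~1.118 (population standard deviation)
    xs.pearson_r(ys)   # => 1.0, since ys is a linear function of xs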
data/model/test/html-analysis.csv ADDED
@@ -0,0 +1,60 @@
+ Results for model
+ branch: add_parts_of_speech
+ version: 19c853489cbc259793dd24d89e1e16c78222a14b Add possible_volume based on regex (not actually usable unfortunately)
+ Test run on:,2013-03-06 14:52:26 -0800
+ K-fold x-validation:,10
+ Corpus size:,500
+
+ truth\test,author,booktitle,bullet,date,editor,institution,journal,link,location,note,pages,publisher,title,volume,workid
+ author,3503,8,4,2,32,0,2,3,0,7,2,0,72,0,0
+ booktitle,3,1271,0,20,27,15,82,0,13,21,4,13,171,2,0
+ bullet,3,1,31,1,0,0,1,0,0,3,0,0,1,2,0
+ date,1,24,0,1274,5,0,10,0,4,8,5,8,3,8,0
+ editor,35,19,0,0,1012,0,0,4,5,0,0,9,23,0,0
+ institution,0,34,0,2,11,0,18,0,0,10,0,36,29,0,0
+ journal,3,45,0,3,2,4,1130,1,0,8,0,11,62,2,0
+ link,3,16,0,1,0,0,0,326,0,48,7,2,2,1,0
+ location,9,13,0,3,4,0,5,2,441,7,0,34,9,0,0
+ note,9,61,0,13,21,0,16,19,0,175,3,20,107,2,0
+ pages,0,1,0,14,0,0,0,4,0,9,693,3,4,16,0
+ publisher,6,31,0,4,16,0,15,1,12,13,2,532,14,0,0
+ title,54,159,0,12,4,0,20,0,10,34,2,8,6129,0,0
+ volume,0,6,0,18,0,0,8,0,3,2,7,0,0,861,0
+ workid,0,0,0,7,0,0,0,2,4,7,8,0,3,4,64
+ author,0.9636863823933975,0.002200825309491059,0.0011004126547455295,0.0005502063273727648,0.008803301237964236,0.0,0.0005502063273727648,0.0008253094910591472,0.0,0.0019257221458046766,0.0005502063273727648,0.0,0.019807427785419534,0.0,0.0
+ booktitle,0.0018270401948842874,0.7740560292326432,0.0,0.012180267965895249,0.016443361753958587,0.009135200974421437,0.049939098660170524,0.0,0.007917174177831911,0.012789281364190013,0.00243605359317905,0.007917174177831911,0.10414129110840438,0.001218026796589525,0.0
+ bullet,0.06976744186046512,0.023255813953488372,0.7209302325581395,0.023255813953488372,0.0,0.0,0.023255813953488372,0.0,0.0,0.06976744186046512,0.0,0.0,0.023255813953488372,0.046511627906976744,0.0
+ date,0.0007407407407407407,0.017777777777777778,0.0,0.9437037037037037,0.003703703703703704,0.0,0.007407407407407408,0.0,0.002962962962962963,0.005925925925925926,0.003703703703703704,0.005925925925925926,0.0022222222222222222,0.005925925925925926,0.0
+ editor,0.031616982836495035,0.017163504968383016,0.0,0.0,0.9141824751580849,0.0,0.0,0.0036133694670280035,0.004516711833785004,0.0,0.0,0.008130081300813009,0.02077687443541102,0.0,0.0
+ institution,0.0,0.24285714285714285,0.0,0.014285714285714285,0.07857142857142857,0.0,0.12857142857142856,0.0,0.0,0.07142857142857142,0.0,0.2571428571428571,0.20714285714285716,0.0,0.0
+ journal,0.0023603461841070024,0.03540519276160504,0.0,0.0023603461841070024,0.0015735641227380016,0.003147128245476003,0.8890637293469709,0.0007867820613690008,0.0,0.006294256490952006,0.0,0.00865460267505901,0.04878048780487805,0.0015735641227380016,0.0
+ link,0.007389162561576354,0.03940886699507389,0.0,0.0024630541871921183,0.0,0.0,0.0,0.8029556650246306,0.0,0.11822660098522167,0.017241379310344827,0.0049261083743842365,0.0049261083743842365,0.0024630541871921183,0.0
+ location,0.017077798861480076,0.024667931688804556,0.0,0.0056925996204933585,0.007590132827324478,0.0,0.009487666034155597,0.003795066413662239,0.8368121442125237,0.013282732447817837,0.0,0.06451612903225806,0.017077798861480076,0.0,0.0
+ note,0.020179372197309416,0.1367713004484305,0.0,0.02914798206278027,0.04708520179372197,0.0,0.03587443946188341,0.042600896860986545,0.0,0.3923766816143498,0.006726457399103139,0.04484304932735426,0.2399103139013453,0.004484304932735426,0.0
+ pages,0.0,0.0013440860215053765,0.0,0.01881720430107527,0.0,0.0,0.0,0.005376344086021506,0.0,0.012096774193548387,0.9314516129032258,0.004032258064516129,0.005376344086021506,0.021505376344086023,0.0
+ publisher,0.009287925696594427,0.047987616099071206,0.0,0.006191950464396285,0.02476780185758514,0.0,0.02321981424148607,0.0015479876160990713,0.018575851393188854,0.020123839009287926,0.0030959752321981426,0.8235294117647058,0.021671826625386997,0.0,0.0
+ title,0.008395522388059701,0.024720149253731342,0.0,0.0018656716417910447,0.0006218905472636816,0.0,0.003109452736318408,0.0,0.001554726368159204,0.005286069651741294,0.0003109452736318408,0.0012437810945273632,0.9528917910447762,0.0,0.0
+ volume,0.0,0.0066298342541436465,0.0,0.019889502762430938,0.0,0.0,0.008839779005524863,0.0,0.0033149171270718232,0.0022099447513812156,0.0077348066298342545,0.0,0.0,0.9513812154696133,0.0
+ workid,0.0,0.0,0.0,0.0707070707070707,0.0,0.0,0.0,0.020202020202020204,0.04040404040404041,0.0707070707070707,0.08080808080808081,0.0,0.030303030303030304,0.04040404040404041,0.6464646464646465
+
+ Label,Precision,Recall,F-measure
+ author,0.9652796913750344,0.9636863823933975,0.9644823788546256
+ booktitle,0.7525162818235642,0.7740560292326432,0.7631341939357552
+ bullet,0.8857142857142857,0.7209302325581395,0.7948717948717948
+ date,0.9272197962154294,0.9437037037037037,0.9353891336270191
+ editor,0.892416225749559,0.9141824751580849,0.9031682284694332
+ institution,0.0,0.0,NaN
+ journal,0.864575363427697,0.8890637293469709,0.8766485647788983
+ link,0.9005524861878453,0.8029556650246306,0.8489583333333334
+ location,0.8963414634146342,0.8368121442125237,0.8655544651619235
+ note,0.4971590909090909,0.3923766816143498,0.43859649122807015
+ pages,0.9454297407912687,0.9314516129032258,0.938388625592417
+ publisher,0.7869822485207101,0.8235294117647058,0.8048411497730711
+ title,0.9245738422084779,0.9528917910447762,0.9385192557997091
+ volume,0.9587973273942093,0.9513812154696133,0.9550748752079867
+ workid,1.0,0.6464646464646465,0.7852760736196319
+
+ Average accuracy by reference:,0.8980503362135145
+ STD of Average accuracy by reference:,0.150249693017486
+ Perfect parses:,239,0.478
+ Accuracy:, 0.899396689527149