excite 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
@@ -0,0 +1,97 @@
|
|
1
|
+
# encoding: UTF-8
#
# Script to assist in verifying tagged references
#
# Usage: ruby verify.rb <tagged-references-file>
#
# Scans each line of a tagged training file and reports likely annotation
# mistakes: missing open/close tags, annotation tags nested inside each other,
# untagged tokens, duplicated tags, unbalanced tag counts, and lines missing a
# <title>. Finishes with a summary of tag usage and the text captured inside
# each annotation tag.

f = ARGV[0]

cleaned = IO.readlines(f).map(&:strip)

tags = []          # every open tag seen, accumulated across all lines
tag_contents = {}  # annotation tag => list of text spans captured inside it

annotation_tags = %w{ author title date booktitle journal volume pages editor workid link publisher location institution bullet tech note }.map { |t| "<#{t}>" }

cleaned.each_with_index do |line, index|
  # Every open tag should have its close tag appearing after it (<br> exempt).
  open_tags = line.scan(/<\s*\w+\s*>/).map(&:downcase)
  open_tags.each do |tag|
    i = line.downcase.index(tag)
    j = line.downcase.index(tag.sub('<','</'))
    if tag != "<br>" && (j.nil? || j <= i)
      puts "Missing close tag for #{tag} on line #{index+1}: #{line}"
    end
  end

  # Every close tag should have its open tag appearing before it.
  close_tags = line.scan(/<\/\s*\w+\s*>/).map(&:downcase)
  close_tags.each do |tag|
    i = line.downcase.index(tag)
    j = line.downcase.index(tag.sub(/<\/\s*/, ''))
    if j.nil? || j >= i
      puts "Missing open tag for #{tag} on line #{index+1}: #{line}"
    end
  end

  tags += open_tags

  # Split the line into whitespace-separated tokens while keeping tag markers
  # (<...> / </...>) as their own tokens, so we can walk the line in order.
  toks = line.split(/(\s+)|(?=<)|(?<=>)/)

  start_tag = nil
  tag_content = ''

  toks.each do |tok|
    if annotation_tags.include?(tok)
      if !start_tag.nil?
        puts "Started #{tok} within #{start_tag} on line #{index+1}: #{line}"
        start_tag = nil
      else
        start_tag = tok
      end
    elsif annotation_tags.include?(tok.sub(/<\/\s*/, '<'))  # tok is a close tag
      if start_tag.nil?
        puts "End tag #{tok} without a corresponding start tag on line #{index+1}: #{line}"
      # FIX: regex was /<\/s*/ (missing backslash before 's'), so close tags
      # containing whitespace like "</ author>" never matched their start tag
      # and were always reported as mismatched. Now consistent with the
      # close-tag substitution above.
      elsif start_tag != tok.sub(/<\/\s*/, '<')
        puts "End tag #{tok} doesn't match start tag #{start_tag} on line #{index+1}: #{line}"
        start_tag = nil
      else
        tag_contents[start_tag] ||= []
        tag_contents[start_tag] << tag_content.strip

        tag_content = ''
        start_tag = nil
      end
    elsif start_tag.nil?
      puts "Token '#{tok}' is not tagged in line #{index+1}: #{line}" unless tok.strip.empty?
    else
      tag_content += tok
    end
  end

  # Duplicate annotation tags are suspicious but not necessarily wrong.
  annotation_tags.each do |tag|
    if open_tags.count(tag) > 1
      puts "(Might be ok but...) More than one #{tag} in line #{index+1}: #{line}"
    end
  end

  open_tags.each do |tag|
    if open_tags.count(tag) != close_tags.count(tag.sub('<','</'))
      puts "Unequal numbers of open and close tags for #{tag} in line #{index+1}: #{line}" unless tag.match(/<\/?br>/)
    end
  end

  if !open_tags.include?("<title>")
    puts "Missing title on line #{index+1}: #{line}"
  end
end

# Tally how often each tag appeared across the whole corpus.
tag_counts = tags.inject({}) do |counts, tag|
  counts[tag] ||= 0
  counts[tag] += 1
  counts
end

puts "\n\nAnnotation tags used: #{tag_counts.select { |t,c| annotation_tags.include?(t) } }"
puts "Other tags: #{tag_counts.reject { |t,c| annotation_tags.include?(t) }}\n\n\n"

tag_contents.each do |tag, contents|
  puts "#{tag}s:"
  contents.each { |c| puts "\t\t#{c}" }
end
|
@@ -0,0 +1,313 @@
|
|
1
|
+
# encoding: UTF-8

module Excite

  # Feature functions for the CRF citation tagger. Each public feature method
  # takes (toks, idx, author_names): toks is the token list for one citation,
  # idx the index of the token being featurized, author_names an optional
  # known-author hint. Tokens are expected to respond to #raw (original text),
  # #np (punctuation-stripped), #lcnp (lowercased, punctuation-stripped) and
  # #part_of_speech. Per-citation memoized state is reset via #clear.
  module TokenFeatures

    # Bit flags marking which dictionaries a normalized token appears in.
    module DictFlags
      PUBLISHER_NAME = 32
      PLACE_NAME = 16
      MONTH_NAME = 8
      LAST_NAME = 4
      FIRST_NAME = 1
    end

    # Load the dictionary files into a single { token => flag bits } hash.
    # Lines starting with '#' are comments and skipped.
    def TokenFeatures.read_dict_files(dir_name)
      dict = {}
      [
        ['first-names',DictFlags::FIRST_NAME],
        ['surnames',DictFlags::LAST_NAME],
        ['months',DictFlags::MONTH_NAME],
        ['places',DictFlags::PLACE_NAME],
        ['publishers',DictFlags::PUBLISHER_NAME],
      ].each do |file_name, flag|
        filename = File.join(dir_name, file_name)
        # File.foreach closes the file even if an exception is raised mid-read
        # (the original File.open / gets / close leaked the handle on error).
        File.foreach(filename) do |l|
          l = l.strip
          next if l.match(/^\#/)
          # OR-ing the flag is idempotent, so duplicate dictionary entries
          # cannot double-count a flag.
          dict[l] = (dict[l] || 0) | flag
        end
      end
      dict
    end

    DIR = File.dirname(__FILE__)
    DICT = TokenFeatures.read_dict_files("#{DIR}/resources/dicts")

    private_class_method :read_dict_files

    # Reset all per-citation memoized state; call between citations.
    def clear
      @possible_editor = nil
      @possible_chapter = nil
      @dict_status = nil
      @is_proceeding = nil
    end

    # Character class of the token's final character:
    # 'a' lowercase, 'A' uppercase, 0 digit, else the character itself.
    def last_char(toks, idx, author_names=nil)
      case toks[idx].raw[-1,1]
      when /[a-z]/
        'a'
      when /[A-Z]/
        'A'
      when /[0-9]/
        0
      else
        toks[idx].raw[-1,1]
      end
    end

    # Leading character n-grams of the raw token (shorter tokens yield what
    # they have).
    def first_1_char(toks, idx, author_names=nil); toks[idx].raw[0,1]; end
    def first_2_chars(toks, idx, author_names=nil); toks[idx].raw[0,2]; end
    def first_3_chars(toks, idx, author_names=nil); toks[idx].raw[0,3]; end
    def first_4_chars(toks, idx, author_names=nil); toks[idx].raw[0,4]; end
    def first_5_chars(toks, idx, author_names=nil); toks[idx].raw[0,5]; end

    # Trailing character n-grams; raw[-n,n] is nil when the token is shorter
    # than n, in which case the whole token is used.
    def last_1_char(toks, idx, author_names=nil); toks[idx].raw[-1,1]; end
    def last_2_chars(toks, idx, author_names=nil); toks[idx].raw[-2,2] || toks[idx].raw; end
    def last_3_chars(toks, idx, author_names=nil); toks[idx].raw[-3,3] || toks[idx].raw; end
    def last_4_chars(toks, idx, author_names=nil); toks[idx].raw[-4,4] || toks[idx].raw; end

    # Lowercased, punctuation-stripped form of the token.
    def toklcnp(toks, idx, author_names=nil); toks[idx].lcnp; end

    # Capitalization shape of the punctuation-stripped token.
    def capitalization(toks, idx, author_names=nil)
      case toks[idx].np
      when "EMPTY"
        "others"
      when /^[[:upper:]]$/
        "singleCap"
      when /^[[:upper:]][[:lower:]]+/
        "InitCap"
      when /^[[:upper:]]+$/
        "AllCap"
      else
        "others"
      end
    end

    # Classify the token's numeric shape (page range, year, digit count,
    # ordinal, contains-digit, or non-numeric). First match wins.
    def numbers(toks, idx, author_names=nil)
      (toks[idx].raw =~ /[0-9]\-[0-9]/) ? "possiblePage" :
        (toks[idx].raw =~ /^\D*(19|20)[0-9][0-9]\D*$/) ? "year" :
        (toks[idx].np =~ /^(19|20)[0-9][0-9]$/) ? "year" :
        (toks[idx].np =~ /^[0-9]$/) ? "1dig" :
        (toks[idx].np =~ /^[0-9][0-9]$/) ? "2dig" :
        (toks[idx].np =~ /^[0-9][0-9][0-9]$/) ? "3dig" :
        (toks[idx].np =~ /^[0-9]+$/) ? "4+dig" :
        (toks[idx].np =~ /^[0-9]+(th|st|nd|rd)$/) ? "ordinal" :
        (toks[idx].np =~ /[0-9]/) ? "hasDig" : "nonNum"
    end

    # Whole-citation feature: does any token look like an editor marker?
    # Memoized per citation; ignores idx.
    def possible_editor(toks, idx=nil, author_names=nil)
      if !@possible_editor.nil?
        @possible_editor
      else
        @possible_editor =
          (toks.any? { |t| %w(ed editor editors eds edited).include?(t.lcnp) } ?
            "possibleEditors" : "noEditors")
      end
    end

    # if there is possible editor entry and "IN" preceeded by punctuation
    # this citation may be a book chapter
    #
    # Whole-citation feature; ignores idx.
    def possible_chapter(toks, idx=nil, author_names=nil)
      # FIX: the result was never assigned to @possible_chapter, so the
      # memoization (and its reset in #clear, matching possible_editor and
      # is_proceeding) never took effect and this was recomputed per token.
      @possible_chapter ||= begin
        has_editor = possible_editor(toks) == 'possibleEditors'
        has_chapter = toks.each_with_index.any? do |t, i|
          if i > 0 && i < (toks.length-1) && t.lcnp == 'in'
            prev_is_separator = ['pp','ppr','ppc','pps'].include?(toks[i-1].part_of_speech)
            next_is_separator = ['ppl','ppc','pps'].include?(toks[i+1].part_of_speech)
            prev_is_separator && (has_editor || next_is_separator)
          end
        end
        has_chapter ? "possibleChapter" : "noChapter"
      end
    end

    # Whole-citation feature: does any token look like "proceedings"?
    # Memoized per citation; ignores idx.
    def is_proceeding(toks, idx=nil, author_names=nil)
      if !@is_proceeding.nil?
        @is_proceeding
      else
        @is_proceeding =
          (toks.any? { |t|
            %w( proc proceeding proceedings ).include?(t.lcnp.strip)
          } ? 'isProc' : 'noProc')
      end
    end

    # TODO remove duplication with possible_chapter
    # Per-token variant: is this token an "in" between separators (or before
    # a capitalized word), as in "In Proceedings of ..."?
    def is_in(toks, idx, author_names=nil)
      is_in = if idx > 0 && idx < (toks.length-1) && toks[idx].lcnp == 'in'
        prev_is_separator = ['pp','ppr','ppc','pps'].include?(toks[idx-1].part_of_speech)
        next_is_separator = ['ppl','ppc','pps'].include?(toks[idx+1].part_of_speech)
        prev_is_separator && (next_is_separator || toks[idx+1].np =~ /^[A-Z]/)
      end
      is_in ? "inBook" : "notInBook"
    end

    # Decile position (0..10) of the token within the citation.
    def location(toks, idx, author_names=nil)
      # (dropped a useless local that the original assigned and never read)
      ((idx.to_f / toks.length) * 10).round
    end

    # Punctuation shape of the raw token. First match wins.
    def punct(toks, idx, author_names=nil)
      (toks[idx].raw =~ /\-.*\-/) ? "multiHyphen" :
        (toks[idx].raw =~ /[[:alpha:]].*\-$/) ? "truncated" :
        (toks[idx].raw =~ /[[:alpha:]].*\.$/) ? "abbrev" :
        (toks[idx].np != toks[idx].raw) ? "hasPunct" : "others"
    end

    # Classify the token as part of a volume designation, an issue
    # designation, or neither, using several surface patterns
    # ("vol. 12", "12(3)", "45:123").
    def possible_volume(toks, idx, author_names=nil)
      if possible_vol_with_str(toks, idx)
        'volume'
      elsif possible_vol_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
        'issue'
      elsif possible_vol_with_str(toks, idx-2) && possible_issue_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
        'issue'
      elsif possible_vol_with_parens(toks, idx)
        'volume'
      elsif (1..3).any? { |i| possible_vol_with_parens(toks, idx-i) }
        'issue'
      elsif possible_vol_with_colon(toks, idx)
        'volume'
      else
        'noVolume'
      end
    end

    # TODO this method is weirdly named b/c of alphabetical ordering hack: remove that
    def a_is_in_dict(toks, idx, author_names=nil)
      dict_status(toks, idx)
    end

    def publisherName(toks, idx, author_names=nil)
      (dict_status(toks, idx) & DictFlags::PUBLISHER_NAME) > 0 ? 'publisherName' : 'noPublisherName'
    end

    def placeName(toks, idx, author_names=nil)
      (dict_status(toks, idx) & DictFlags::PLACE_NAME) > 0 ? 'placeName' : 'noPlaceName'
    end

    def monthName(toks, idx, author_names=nil)
      (dict_status(toks, idx) & DictFlags::MONTH_NAME) > 0 ? 'monthName' : 'noMonthName'
    end

    def lastName(toks, idx, author_names=nil)
      return 'lastName' if author_names && author_names.last == toks[idx].lcnp
      (dict_status(toks, idx) & DictFlags::LAST_NAME) > 0 ? 'lastName' : 'noLastName'
    end

    def firstName(toks, idx, author_names=nil)
      return 'firstName' if author_names && author_names.first == toks[idx].lcnp
      (dict_status(toks, idx) & DictFlags::FIRST_NAME) > 0 ? 'firstName' : 'noFirstName'
    end

    # Memoized per-token dictionary flag lookup (0 when not in any dict).
    def dict_status(toks, idx)
      @dict_status ||= [nil]*toks.length
      @dict_status[idx] ||= (DICT[toks[idx].lcnp] || 0)
    end

    # Canonicalized HTML tag names for the HTML model's tag_name feature.
    NODE_TYPES_BY_NAME = {
      'div'=>'div',
      'p'=>'p',
      'ul'=>'div', # lump with div - higher-level structure
      'li'=>'li',
      'tr'=>'div', # lump with div - higher-level structure
      'td'=>'td',
      'span'=>'span',
      'font'=>'span',
      'em'=>'em',
      'i'=>'em',
      'strong'=>'strong',
      'b'=>'strong',
      'u'=>'u',
      'h1'=>'h',
      'h2'=>'h',
      'h3'=>'h',
      'h4'=>'h',
      'h5'=>'h',
      'h6'=>'h',
      'a'=>'a',
      '#document-fragment'=>'unknown' # the actual tag wasn't captured in the fragment we're parsing
    }

    def tag_name(toks, idx, author_names=nil)
      name = toks[idx].node.parent.name # node is always a text node; the informative one is the parent
      NODE_TYPES_BY_NAME[name.downcase] || 'other'
    end

    # Decile position (0..10) of the token within its HTML node.
    def location_in_node(toks, idx, author_names=nil)
      ((toks[idx].idx_in_node.to_f / toks[idx].node_token_count) * 10).round
    end

    def part_of_speech(toks, idx, author_names=nil)
      toks[idx].part_of_speech
    end

    private

    # True-ish when the token at idx (or the pair ending at idx) looks like an
    # issue designation; nil when toks[idx] is out of range.
    def possible_issue_with_str(toks, idx)
      return unless toks[idx]

      possible_issue_str(toks, idx) ||
        (possible_issue_str(toks, idx-1) && toks[idx].raw =~ /^\d+$/)
    end

    def possible_issue_str(toks, idx)
      if toks[idx]
        # FIX: the original /^(no)|(issue)?\.?\d+.?$/i had an alternation-
        # precedence bug: it matched ANY token starting with "no" and ANY
        # token ending in digits. Grouped to parallel possible_vol_str.
        # NOTE: this changes feature output; a model trained with the old
        # regex may need retraining.
        if toks[idx].raw =~ /^(no|issue)\.?\d+.?$/i
          return true
        elsif toks[idx+1]
          return ['no','issue'].include?(toks[idx].lcnp) && toks[idx+1].raw =~ /^\d+$/
        end
      end
    end

    # True-ish for "vol. 12", "volume 12," style sequences ending at idx.
    def possible_vol_with_str(toks, idx)
      return unless toks[idx]

      possible_vol_str(toks, idx) ||
        (possible_vol_str(toks, idx-1) && (toks[idx].raw =~ /^\d+$/ || toks[idx].raw == ',')) ||
        (possible_vol_str(toks, idx-2) && toks[idx-1].raw =~ /^\d+$/ && toks[idx].raw == ',')
    end

    def possible_vol_str(toks, idx)
      if toks[idx]
        if toks[idx].raw =~ /^vol(ume)?\.?\d+.?$/i
          return true
        elsif toks[idx+1]
          return ['vol','volume'].include?(toks[idx].lcnp) && toks[idx+1].raw =~ /^\d+$/
        end
      end
    end

    # "12 ( 3 )" style volume(issue) starting at idx.
    def possible_vol_with_parens(toks, idx)
      if toks[idx] && toks[idx+3]
        toks[idx].raw =~ /^\d+$/ && toks[idx+1].raw == '(' && toks[idx+2].raw =~ /^\d+$/ && toks[idx+3].raw == ')'
      end
    end

    def possible_vol_with_colon(toks, idx)
      if toks[idx] && toks[idx+1]
        # case of <year>: something is common so make sure we exclude it
        if toks[idx].np =~ /^\d{1,3}$/ && toks[idx+1].raw =~ /^:/
          # at this point it's likely a volume, but exclude it if it's not followed by an apparent page or issue
          toks[idx+1].np =~ /^\d+$/ || (toks[idx+1].raw == ':' && toks[idx+2] && toks[idx+2].np =~ /^\d+/)
        end
      end
    end

  end

end
|
data/lib/excite.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: UTF-8

# Top-level namespace for the ExCite citation parser. Declared (empty) up
# front so the files required below can all reopen it.
module Excite
end

# CRF++ Ruby binding the parser is built on — presumably a native extension
# gem; verify it is declared as a runtime dependency in the gemspec.
require 'crfpp'

# Library files, required in dependency order (crfparser last; per this gem's
# file listing it is the largest component and sits atop the others).
require 'excite/array_helpers'
require 'excite/citation'
require 'excite/preprocessor'
require 'excite/postprocessor'
require 'excite/token_features'
require 'excite/crfparser'
|
@@ -0,0 +1,54 @@
|
|
1
|
+
Results for model
|
2
|
+
branch: add_parts_of_speech
|
3
|
+
version: fe881b78325f48e511a2d015ba80edb55ba9a64c Rename module to Excite
|
4
|
+
Test run on:,2013-03-06 15:15:20 -0800
|
5
|
+
K-fold x-validation:,10
|
6
|
+
Corpus size:,500
|
7
|
+
|
8
|
+
truth\test,author,booktitle,date,editor,institution,journal,location,note,pages,publisher,tech,title,volume
|
9
|
+
author,3894,6,0,7,0,0,0,3,0,0,0,1,0
|
10
|
+
booktitle,0,2053,0,8,3,52,8,9,0,0,3,31,6
|
11
|
+
date,1,0,1547,1,0,0,6,2,4,0,0,1,3
|
12
|
+
editor,35,5,0,442,0,4,0,0,0,0,0,1,0
|
13
|
+
institution,3,8,0,4,368,0,6,0,0,1,1,8,0
|
14
|
+
journal,0,35,0,5,0,731,0,0,0,3,0,26,4
|
15
|
+
location,0,21,2,0,14,0,479,0,0,0,0,3,3
|
16
|
+
note,0,32,1,5,2,6,0,99,0,0,7,3,9
|
17
|
+
pages,0,0,2,0,0,0,0,0,772,1,2,0,6
|
18
|
+
publisher,0,10,0,0,11,2,6,0,0,271,0,4,0
|
19
|
+
tech,0,0,0,0,2,0,4,0,4,0,199,30,5
|
20
|
+
title,1,32,0,4,0,4,2,0,0,0,0,4215,2
|
21
|
+
volume,0,3,1,0,0,3,0,0,7,0,3,0,609
|
22
|
+
author,0.9956532856047047,0.001534134492457172,0.0,0.0017898235745333673,0.0,0.0,0.0,0.000767067246228586,0.0,0.0,0.0,0.00025568908207619537,0.0
|
23
|
+
booktitle,0.0,0.9447768062586286,0.0,0.0036815462494247586,0.0013805798435342844,0.02393005062126093,0.0036815462494247586,0.0041417395306028535,0.0,0.0,0.0013805798435342844,0.014265991716520939,0.0027611596870685687
|
24
|
+
date,0.0006389776357827476,0.0,0.9884984025559106,0.0006389776357827476,0.0,0.0,0.0038338658146964857,0.0012779552715654952,0.0025559105431309905,0.0,0.0,0.0006389776357827476,0.0019169329073482429
|
25
|
+
editor,0.07186858316221766,0.01026694045174538,0.0,0.9075975359342916,0.0,0.008213552361396304,0.0,0.0,0.0,0.0,0.0,0.002053388090349076,0.0
|
26
|
+
institution,0.007518796992481203,0.020050125313283207,0.0,0.010025062656641603,0.9223057644110275,0.0,0.015037593984962405,0.0,0.0,0.002506265664160401,0.002506265664160401,0.020050125313283207,0.0
|
27
|
+
journal,0.0,0.043532338308457715,0.0,0.006218905472636816,0.0,0.9092039800995025,0.0,0.0,0.0,0.0037313432835820895,0.0,0.03233830845771144,0.004975124378109453
|
28
|
+
location,0.0,0.040229885057471264,0.0038314176245210726,0.0,0.02681992337164751,0.0,0.9176245210727969,0.0,0.0,0.0,0.0,0.005747126436781609,0.005747126436781609
|
29
|
+
note,0.0,0.1951219512195122,0.006097560975609756,0.03048780487804878,0.012195121951219513,0.036585365853658534,0.0,0.6036585365853658,0.0,0.0,0.042682926829268296,0.018292682926829267,0.054878048780487805
|
30
|
+
pages,0.0,0.0,0.002554278416347382,0.0,0.0,0.0,0.0,0.0,0.9859514687100894,0.001277139208173691,0.002554278416347382,0.0,0.007662835249042145
|
31
|
+
publisher,0.0,0.03289473684210526,0.0,0.0,0.03618421052631579,0.006578947368421052,0.019736842105263157,0.0,0.0,0.8914473684210527,0.0,0.013157894736842105,0.0
|
32
|
+
tech,0.0,0.0,0.0,0.0,0.00819672131147541,0.0,0.01639344262295082,0.0,0.01639344262295082,0.0,0.8155737704918032,0.12295081967213115,0.020491803278688523
|
33
|
+
title,0.00023474178403755868,0.007511737089201878,0.0,0.0009389671361502347,0.0,0.0009389671361502347,0.00046948356807511736,0.0,0.0,0.0,0.0,0.9894366197183099,0.00046948356807511736
|
34
|
+
volume,0.0,0.004792332268370607,0.001597444089456869,0.0,0.0,0.004792332268370607,0.0,0.0,0.011182108626198083,0.0,0.004792332268370607,0.0,0.9728434504792333
|
35
|
+
|
36
|
+
Label,Precision,Recall,F-measure
|
37
|
+
author,0.9898322318251144,0.9956532856047047,0.9927342256214148
|
38
|
+
booktitle,0.9310657596371882,0.9447768062586286,0.9378711740520785
|
39
|
+
date,0.9961365099806826,0.9884984025559106,0.9923027581783195
|
40
|
+
editor,0.9285714285714286,0.9075975359342916,0.9179646936656283
|
41
|
+
institution,0.92,0.9223057644110275,0.9211514392991239
|
42
|
+
journal,0.9114713216957606,0.9092039800995025,0.9103362391033625
|
43
|
+
location,0.9373776908023483,0.9176245210727969,0.9273959341723136
|
44
|
+
note,0.8761061946902655,0.6036585365853658,0.7148014440433211
|
45
|
+
pages,0.9809402795425667,0.9859514687100894,0.9834394904458599
|
46
|
+
publisher,0.9818840579710145,0.8914473684210527,0.9344827586206896
|
47
|
+
tech,0.9255813953488372,0.8155737704918032,0.8671023965141613
|
48
|
+
title,0.9750173490631506,0.9894366197183099,0.9821740650122335
|
49
|
+
volume,0.9412673879443586,0.9728434504792333,0.9567949725058915
|
50
|
+
|
51
|
+
Average accuracy by reference:,0.9621987290789997
|
52
|
+
STD of Average accuracy by reference:,0.09112313872147432
|
53
|
+
Perfect parses:,383,0.766
|
54
|
+
Accuracy:, 0.9653367811845832
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Small statistics mixin for array-like collections (anything responding to
# #size, #zip and Enumerable). Note: #cov calls #mean on a plain Array built
# by zip/map, so this module is intended to be included into Array itself.
module ArrayHelpers

  # Total of all elements; 0 for an empty collection.
  def sum
    reduce(0) { |acc, x| acc + x }
  end

  # Arithmetic mean as a Float; 0 for an empty collection.
  def mean
    return 0 unless size > 0
    sum.to_f / size
  end

  # Population standard deviation; 0 for an empty collection.
  def stddev
    return 0 unless size > 0
    avg = mean
    squared_devs = reduce(0) { |acc, x| acc + (x - avg)**2 }
    Math.sqrt(squared_devs.to_f / size)
  end

  # Population covariance with another equal-length collection:
  # E[XY] - E[X]E[Y].
  def cov(other)
    products = zip(other).map { |x, y| x * y }
    products.mean - mean * other.mean
  end

  # Pearson correlation coefficient with another collection of the same
  # length; 0.0 when either side has zero variance. Raises when lengths
  # differ.
  def pearson_r(other)
    raise "Vectors must be of same length to calculate pearson_r" if size != other.size
    denom = stddev * other.stddev
    denom > 0 ? cov(other) / denom : 0.0
  end

end
|
30
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
Results for model
|
2
|
+
branch: add_parts_of_speech
|
3
|
+
version: 19c853489cbc259793dd24d89e1e16c78222a14b Add possible_volume based on regex (not actually usable unfortunately)
|
4
|
+
Test run on:,2013-03-06 14:52:26 -0800
|
5
|
+
K-fold x-validation:,10
|
6
|
+
Corpus size:,500
|
7
|
+
|
8
|
+
truth\test,author,booktitle,bullet,date,editor,institution,journal,link,location,note,pages,publisher,title,volume,workid
|
9
|
+
author,3503,8,4,2,32,0,2,3,0,7,2,0,72,0,0
|
10
|
+
booktitle,3,1271,0,20,27,15,82,0,13,21,4,13,171,2,0
|
11
|
+
bullet,3,1,31,1,0,0,1,0,0,3,0,0,1,2,0
|
12
|
+
date,1,24,0,1274,5,0,10,0,4,8,5,8,3,8,0
|
13
|
+
editor,35,19,0,0,1012,0,0,4,5,0,0,9,23,0,0
|
14
|
+
institution,0,34,0,2,11,0,18,0,0,10,0,36,29,0,0
|
15
|
+
journal,3,45,0,3,2,4,1130,1,0,8,0,11,62,2,0
|
16
|
+
link,3,16,0,1,0,0,0,326,0,48,7,2,2,1,0
|
17
|
+
location,9,13,0,3,4,0,5,2,441,7,0,34,9,0,0
|
18
|
+
note,9,61,0,13,21,0,16,19,0,175,3,20,107,2,0
|
19
|
+
pages,0,1,0,14,0,0,0,4,0,9,693,3,4,16,0
|
20
|
+
publisher,6,31,0,4,16,0,15,1,12,13,2,532,14,0,0
|
21
|
+
title,54,159,0,12,4,0,20,0,10,34,2,8,6129,0,0
|
22
|
+
volume,0,6,0,18,0,0,8,0,3,2,7,0,0,861,0
|
23
|
+
workid,0,0,0,7,0,0,0,2,4,7,8,0,3,4,64
|
24
|
+
author,0.9636863823933975,0.002200825309491059,0.0011004126547455295,0.0005502063273727648,0.008803301237964236,0.0,0.0005502063273727648,0.0008253094910591472,0.0,0.0019257221458046766,0.0005502063273727648,0.0,0.019807427785419534,0.0,0.0
|
25
|
+
booktitle,0.0018270401948842874,0.7740560292326432,0.0,0.012180267965895249,0.016443361753958587,0.009135200974421437,0.049939098660170524,0.0,0.007917174177831911,0.012789281364190013,0.00243605359317905,0.007917174177831911,0.10414129110840438,0.001218026796589525,0.0
|
26
|
+
bullet,0.06976744186046512,0.023255813953488372,0.7209302325581395,0.023255813953488372,0.0,0.0,0.023255813953488372,0.0,0.0,0.06976744186046512,0.0,0.0,0.023255813953488372,0.046511627906976744,0.0
|
27
|
+
date,0.0007407407407407407,0.017777777777777778,0.0,0.9437037037037037,0.003703703703703704,0.0,0.007407407407407408,0.0,0.002962962962962963,0.005925925925925926,0.003703703703703704,0.005925925925925926,0.0022222222222222222,0.005925925925925926,0.0
|
28
|
+
editor,0.031616982836495035,0.017163504968383016,0.0,0.0,0.9141824751580849,0.0,0.0,0.0036133694670280035,0.004516711833785004,0.0,0.0,0.008130081300813009,0.02077687443541102,0.0,0.0
|
29
|
+
institution,0.0,0.24285714285714285,0.0,0.014285714285714285,0.07857142857142857,0.0,0.12857142857142856,0.0,0.0,0.07142857142857142,0.0,0.2571428571428571,0.20714285714285716,0.0,0.0
|
30
|
+
journal,0.0023603461841070024,0.03540519276160504,0.0,0.0023603461841070024,0.0015735641227380016,0.003147128245476003,0.8890637293469709,0.0007867820613690008,0.0,0.006294256490952006,0.0,0.00865460267505901,0.04878048780487805,0.0015735641227380016,0.0
|
31
|
+
link,0.007389162561576354,0.03940886699507389,0.0,0.0024630541871921183,0.0,0.0,0.0,0.8029556650246306,0.0,0.11822660098522167,0.017241379310344827,0.0049261083743842365,0.0049261083743842365,0.0024630541871921183,0.0
|
32
|
+
location,0.017077798861480076,0.024667931688804556,0.0,0.0056925996204933585,0.007590132827324478,0.0,0.009487666034155597,0.003795066413662239,0.8368121442125237,0.013282732447817837,0.0,0.06451612903225806,0.017077798861480076,0.0,0.0
|
33
|
+
note,0.020179372197309416,0.1367713004484305,0.0,0.02914798206278027,0.04708520179372197,0.0,0.03587443946188341,0.042600896860986545,0.0,0.3923766816143498,0.006726457399103139,0.04484304932735426,0.2399103139013453,0.004484304932735426,0.0
|
34
|
+
pages,0.0,0.0013440860215053765,0.0,0.01881720430107527,0.0,0.0,0.0,0.005376344086021506,0.0,0.012096774193548387,0.9314516129032258,0.004032258064516129,0.005376344086021506,0.021505376344086023,0.0
|
35
|
+
publisher,0.009287925696594427,0.047987616099071206,0.0,0.006191950464396285,0.02476780185758514,0.0,0.02321981424148607,0.0015479876160990713,0.018575851393188854,0.020123839009287926,0.0030959752321981426,0.8235294117647058,0.021671826625386997,0.0,0.0
|
36
|
+
title,0.008395522388059701,0.024720149253731342,0.0,0.0018656716417910447,0.0006218905472636816,0.0,0.003109452736318408,0.0,0.001554726368159204,0.005286069651741294,0.0003109452736318408,0.0012437810945273632,0.9528917910447762,0.0,0.0
|
37
|
+
volume,0.0,0.0066298342541436465,0.0,0.019889502762430938,0.0,0.0,0.008839779005524863,0.0,0.0033149171270718232,0.0022099447513812156,0.0077348066298342545,0.0,0.0,0.9513812154696133,0.0
|
38
|
+
workid,0.0,0.0,0.0,0.0707070707070707,0.0,0.0,0.0,0.020202020202020204,0.04040404040404041,0.0707070707070707,0.08080808080808081,0.0,0.030303030303030304,0.04040404040404041,0.6464646464646465
|
39
|
+
|
40
|
+
Label,Precision,Recall,F-measure
|
41
|
+
author,0.9652796913750344,0.9636863823933975,0.9644823788546256
|
42
|
+
booktitle,0.7525162818235642,0.7740560292326432,0.7631341939357552
|
43
|
+
bullet,0.8857142857142857,0.7209302325581395,0.7948717948717948
|
44
|
+
date,0.9272197962154294,0.9437037037037037,0.9353891336270191
|
45
|
+
editor,0.892416225749559,0.9141824751580849,0.9031682284694332
|
46
|
+
institution,0.0,0.0,NaN
|
47
|
+
journal,0.864575363427697,0.8890637293469709,0.8766485647788983
|
48
|
+
link,0.9005524861878453,0.8029556650246306,0.8489583333333334
|
49
|
+
location,0.8963414634146342,0.8368121442125237,0.8655544651619235
|
50
|
+
note,0.4971590909090909,0.3923766816143498,0.43859649122807015
|
51
|
+
pages,0.9454297407912687,0.9314516129032258,0.938388625592417
|
52
|
+
publisher,0.7869822485207101,0.8235294117647058,0.8048411497730711
|
53
|
+
title,0.9245738422084779,0.9528917910447762,0.9385192557997091
|
54
|
+
volume,0.9587973273942093,0.9513812154696133,0.9550748752079867
|
55
|
+
workid,1.0,0.6464646464646465,0.7852760736196319
|
56
|
+
|
57
|
+
Average accuracy by reference:,0.8980503362135145
|
58
|
+
STD of Average accuracy by reference:,0.150249693017486
|
59
|
+
Perfect parses:,239,0.478
|
60
|
+
Accuracy:, 0.899396689527149
|