excite 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
@@ -0,0 +1,97 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# Script to assist in verifying tagged references
|
4
|
+
|
5
|
+
# Usage: ruby verify.rb TAGGED_REFERENCES_FILE
#
# Reads a file of hand-tagged references (one reference per line), prints a
# warning for every tagging problem it can detect, then prints how often each
# tag was used and the text captured inside each annotation tag.

path = ARGV[0]
abort "Usage: ruby #{$0} TAGGED_REFERENCES_FILE" if path.nil?

# File.readlines (unlike IO.readlines) never treats a leading "|" in the
# path as a command to execute.
cleaned = File.readlines(path).map(&:strip)

tags = []          # every open tag seen in the file, lower-cased
tag_contents = {}  # annotation tag => list of text snippets found inside it

annotation_tags = %w{ author title date booktitle journal volume pages editor workid link publisher location institution bullet tech note }.map { |t| "<#{t}>" }

cleaned.each_with_index do |line, index|
  lineno = index + 1
  lowered = line.downcase

  # Every open tag (except <br>) needs a close tag somewhere after it.
  open_tags = line.scan(/<\s*\w+\s*>/).map(&:downcase)
  open_tags.each do |tag|
    i = lowered.index(tag)
    j = lowered.index(tag.sub('<', '</'))
    if tag != "<br>" && (j.nil? || j <= i)
      puts "Missing close tag for #{tag} on line #{lineno}: #{line}"
    end
  end

  # Every close tag needs its tag name to appear somewhere before it.
  close_tags = line.scan(/<\/\s*\w+\s*>/).map(&:downcase)
  close_tags.each do |tag|
    i = lowered.index(tag)
    j = lowered.index(tag.sub(/<\/\s*/, ''))
    if j.nil? || j >= i
      puts "Missing open tag for #{tag} on line #{lineno}: #{line}"
    end
  end

  tags += open_tags

  # Split into whitespace runs, tags and text, keeping the whitespace so the
  # captured tag content preserves the original spacing.
  toks = line.split(/(\s+)|(?=<)|(?<=>)/)

  start_tag = nil
  tag_content = ''

  toks.each do |tok|
    if annotation_tags.include?(tok)
      if start_tag.nil?
        start_tag = tok
      else
        puts "Started #{tok} within #{start_tag} on line #{lineno}: #{line}"
        start_tag = nil
        # Discard the partial content so it is not attributed to a later tag.
        tag_content = ''
      end
    elsif annotation_tags.include?(tok.sub(/<\/\s*/, '<'))
      if start_tag.nil?
        puts "End tag #{tok} without a corresponding start tag on line #{lineno}: #{line}"
      elsif start_tag != tok.sub(/<\/\s*/, '<')
        puts "End tag #{tok} doesn't match start tag #{start_tag} on line #{lineno}: #{line}"
        start_tag = nil
        # Discard the partial content so it is not attributed to a later tag.
        tag_content = ''
      else
        tag_contents[start_tag] ||= []
        tag_contents[start_tag] << tag_content.strip

        tag_content = ''
        start_tag = nil
      end
    elsif start_tag.nil?
      puts "Token '#{tok}' is not tagged in line #{lineno}: #{line}" unless tok.strip.empty?
    else
      tag_content += tok
    end
  end

  annotation_tags.each do |tag|
    if open_tags.count(tag) > 1
      puts "(Might be ok but...) More than one #{tag} in line #{lineno}: #{line}"
    end
  end

  open_tags.each do |tag|
    if open_tags.count(tag) != close_tags.count(tag.sub('<', '</'))
      puts "Unequal numbers of open and close tags for #{tag} in line #{lineno}: #{line}" unless tag.match(/<\/?br>/)
    end
  end

  unless open_tags.include?("<title>")
    puts "Missing title on line #{lineno}: #{line}"
  end
end

tag_counts = Hash.new(0)
tags.each { |tag| tag_counts[tag] += 1 }

puts "\n\nAnnotation tags used: #{tag_counts.select { |t,c| annotation_tags.include?(t) } }"
puts "Other tags: #{tag_counts.reject { |t,c| annotation_tags.include?(t) }}\n\n\n"

tag_contents.each do |tag, contents|
  puts "#{tag}s:"
  contents.each { |c| puts "\t\t#{c}" }
end
|
@@ -0,0 +1,313 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Excite
|
4
|
+
|
5
|
+
module TokenFeatures
|
6
|
+
|
7
|
+
# Bit flags recording which dictionaries a word was found in. Each flag is
# a distinct power of two, so several can be combined in one integer and
# tested with `&`.
module DictFlags
  PUBLISHER_NAME = 1 << 5 # 32
  PLACE_NAME     = 1 << 4 # 16
  MONTH_NAME     = 1 << 3 # 8
  LAST_NAME      = 1 << 2 # 4
  FIRST_NAME     = 1 << 0 # 1
end
|
14
|
+
|
15
|
+
# Builds the word => flags lookup table from the dictionary files in
# +dir_name+.
#
# Each file is read line by line; lines are stripped, and lines starting
# with "#" are skipped as comments. A word listed in several dictionaries
# ends up with the bitwise OR of the corresponding DictFlags values.
#
# dir_name - directory containing the files 'first-names', 'surnames',
#            'months', 'places' and 'publishers'.
#
# Returns a Hash mapping each word (String) to an Integer flag mask.
# Raises whatever File.foreach raises (e.g. Errno::ENOENT) if a file is missing.
def self.read_dict_files(dir_name)
  sources = [
    ['first-names', DictFlags::FIRST_NAME],
    ['surnames', DictFlags::LAST_NAME],
    ['months', DictFlags::MONTH_NAME],
    ['places', DictFlags::PLACE_NAME],
    ['publishers', DictFlags::PUBLISHER_NAME],
  ]

  sources.each_with_object({}) do |(file_name, flag), dict|
    # File.foreach closes the file even if the block raises.
    File.foreach(File.join(dir_name, file_name)) do |line|
      entry = line.strip
      next if entry.start_with?('#')

      # OR-ing is idempotent, so duplicate lines in a file are harmless.
      dict[entry] = (dict[entry] || 0) | flag
    end
  end
end
|
41
|
+
|
42
|
+
# Directory holding this source file; the bundled resources live below it.
DIR = File.dirname(__FILE__)

# Word => flag-mask table, loaded once when this module is first evaluated.
DICT = TokenFeatures.read_dict_files(File.join(DIR, 'resources', 'dicts'))

# The loader is only needed to build DICT above, so hide it afterwards.
private_class_method :read_dict_files
|
46
|
+
|
47
|
+
# Resets every cached value held in instance variables (@possible_editor,
# @possible_chapter, @dict_status, @is_proceeding) back to nil.
#
# Returns nil.
def clear
  @possible_editor = @possible_chapter = @dict_status = @is_proceeding = nil
end
|
53
|
+
|
54
|
+
# Classifies the final character of the raw text of token +idx+.
#
# Returns 'a' for an ASCII lowercase letter, 'A' for an ASCII uppercase
# letter, the Integer 0 for a digit, and otherwise the character itself
# (nil when the raw text is empty). +author_names+ is unused.
def last_char(toks, idx, author_names=nil)
  final = toks[idx].raw[-1,1]

  if final =~ /[a-z]/
    'a'
  elsif final =~ /[A-Z]/
    'A'
  elsif final =~ /[0-9]/
    0
  else
    final
  end
end
|
66
|
+
|
67
|
+
# Prefix features: the first N characters of the raw text of token +idx+.
# A token shorter than N yields its whole raw text. +author_names+ is unused.
def first_1_char(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 1)
end

def first_2_chars(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 2)
end

def first_3_chars(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 3)
end

def first_4_chars(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 4)
end

def first_5_chars(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 5)
end
|
72
|
+
|
73
|
+
# Suffix features: the last N characters of the raw text of token +idx+.
# For N > 1, a token shorter than N yields its whole raw text.
# last_1_char returns nil for an empty raw text. +author_names+ is unused.
def last_1_char(toks, idx, author_names=nil)
  toks[idx].raw.slice(-1, 1)
end

def last_2_chars(toks, idx, author_names=nil)
  text = toks[idx].raw
  text.slice(-2, 2) || text
end

def last_3_chars(toks, idx, author_names=nil)
  text = toks[idx].raw
  text.slice(-3, 3) || text
end

def last_4_chars(toks, idx, author_names=nil)
  text = toks[idx].raw
  text.slice(-4, 4) || text
end
|
77
|
+
|
78
|
+
# Feature: the +lcnp+ value of token +idx+, returned unchanged.
# NOTE(review): lcnp is presumably the lower-cased, punctuation-stripped
# token text; confirm against the token class. +author_names+ is unused.
def toklcnp(toks, idx, author_names=nil)
  token = toks[idx]
  token.lcnp
end
|
79
|
+
|
80
|
+
# Classifies the capitalization of the +np+ value of token +idx+.
#
# Returns "singleCap" for exactly one uppercase letter, "InitCap" for an
# uppercase letter followed by at least one lowercase letter, "AllCap"
# when every character is uppercase, and "others" for anything else,
# including the literal placeholder "EMPTY". +author_names+ is unused.
def capitalization(toks, idx, author_names=nil)
  text = toks[idx].np

  # "EMPTY" must be checked first; it would otherwise count as AllCap.
  if text == "EMPTY"
    "others"
  elsif text =~ /^[[:upper:]]$/
    "singleCap"
  elsif text =~ /^[[:upper:]][[:lower:]]+/
    "InitCap"
  elsif text =~ /^[[:upper:]]+$/
    "AllCap"
  else
    "others"
  end
end
|
94
|
+
|
95
|
+
# Classifies the numeric content of token +idx+.
#
# Rules are tried in order and the first match wins:
#   "possiblePage" - raw text contains digit-hyphen-digit
#   "year"         - raw text (ignoring surrounding non-digits) or np is a
#                    four-digit number starting with 19 or 20
#   "1dig", "2dig", "3dig", "4+dig" - np is all digits, of that length
#   "ordinal"      - np is digits followed by th/st/nd/rd
#   "hasDig"       - np contains a digit anywhere
#   "nonNum"       - none of the above
# +author_names+ is unused.
def numbers(toks, idx, author_names=nil)
  token = toks[idx]

  if token.raw =~ /[0-9]\-[0-9]/
    "possiblePage"
  elsif token.raw =~ /^\D*(19|20)[0-9][0-9]\D*$/
    "year"
  elsif token.np =~ /^(19|20)[0-9][0-9]$/
    "year"
  elsif token.np =~ /^[0-9]$/
    "1dig"
  elsif token.np =~ /^[0-9][0-9]$/
    "2dig"
  elsif token.np =~ /^[0-9][0-9][0-9]$/
    "3dig"
  elsif token.np =~ /^[0-9]+$/
    "4+dig"
  elsif token.np =~ /^[0-9]+(th|st|nd|rd)$/
    "ordinal"
  elsif token.np =~ /[0-9]/
    "hasDig"
  else
    "nonNum"
  end
end
|
106
|
+
|
107
|
+
# ignores idx
|
108
|
+
# Citation-level feature: does any token look like an editor marker
# (ed, editor, editors, eds, edited)?
#
# The answer is cached in @possible_editor and returned for every later
# call until the cache is set back to nil. +idx+ and +author_names+ are
# unused.
#
# Returns "possibleEditors" or "noEditors".
def possible_editor(toks, idx=nil, author_names=nil)
  return @possible_editor unless @possible_editor.nil?

  editor_words = %w(ed editor editors eds edited)
  found = toks.any? { |t| editor_words.include?(t.lcnp) }
  @possible_editor = found ? "possibleEditors" : "noEditors"
end
|
117
|
+
|
118
|
+
# if there is a possible editor entry and "in" is preceded by punctuation,
|
119
|
+
# this citation may be a book chapter
|
120
|
+
#
|
121
|
+
# ignores idx
|
122
|
+
# Citation-level feature: does the citation look like a book chapter?
#
# A citation qualifies when some interior token "in" is preceded by a
# separator token and either the citation has a possible editor (see
# possible_editor) or the "in" is also followed by a separator token.
# Separators are identified by part_of_speech codes: 'pp', 'ppr', 'ppc',
# 'pps' before, and 'ppl', 'ppc', 'pps' after.
#
# The result is cached in @possible_chapter. Previously the cache was
# checked but never written, so every call rescanned all tokens. +idx+
# and +author_names+ are unused.
#
# Returns "possibleChapter" or "noChapter".
def possible_chapter(toks, idx=nil, author_names=nil)
  return @possible_chapter unless @possible_chapter.nil?

  has_editor = possible_editor(toks) == 'possibleEditors'
  separators_before = %w[pp ppr ppc pps]
  separators_after = %w[ppl ppc pps]

  # Only interior positions qualify: "in" needs a neighbour on both sides.
  has_chapter = (1...(toks.length - 1)).any? do |i|
    toks[i].lcnp == 'in' &&
      separators_before.include?(toks[i-1].part_of_speech) &&
      (has_editor || separators_after.include?(toks[i+1].part_of_speech))
  end

  @possible_chapter = has_chapter ? "possibleChapter" : "noChapter"
end
|
137
|
+
|
138
|
+
# ignores idx
|
139
|
+
# Citation-level feature: does any token (after stripping whitespace from
# its lcnp) read proc, proceeding or proceedings?
#
# The answer is cached in @is_proceeding and returned for every later call
# until the cache is set back to nil. +idx+ and +author_names+ are unused.
#
# Returns 'isProc' or 'noProc'.
def is_proceeding(toks, idx=nil, author_names=nil)
  return @is_proceeding unless @is_proceeding.nil?

  markers = %w( proc proceeding proceedings )
  mentioned = toks.any? { |t| markers.include?(t.lcnp.strip) }
  @is_proceeding = mentioned ? 'isProc' : 'noProc'
end
|
149
|
+
|
150
|
+
# TODO remove duplication with possible_chapter
|
151
|
+
# Token-level feature: is token +idx+ an "in" that introduces a book or
# proceedings title?
#
# That is the case when the token is neither first nor last, its lcnp is
# 'in', the previous token is a separator (part_of_speech 'pp', 'ppr',
# 'ppc' or 'pps'), and the next token is either a separator ('ppl', 'ppc',
# 'pps') or has an np starting with an ASCII capital letter.
# +author_names+ is unused.
#
# Returns "inBook" or "notInBook".
def is_in(toks, idx, author_names=nil)
  interior = idx > 0 && idx < (toks.length-1)
  return "notInBook" unless interior && toks[idx].lcnp == 'in'

  before = toks[idx-1]
  after = toks[idx+1]
  return "notInBook" unless ['pp','ppr','ppc','pps'].include?(before.part_of_speech)

  if ['ppl','ppc','pps'].include?(after.part_of_speech) || after.np =~ /^[A-Z]/
    "inBook"
  else
    "notInBook"
  end
end
|
159
|
+
|
160
|
+
# Token-level feature: the relative position of token +idx+ within the
# token list, as idx / toks.length scaled to tenths and rounded to an
# Integer. +author_names+ is unused.
def location(toks, idx, author_names=nil)
  fraction = idx.to_f / toks.length
  (fraction * 10).round
end
|
163
|
+
|
164
|
+
# Classifies the punctuation of token +idx+.
#
# Rules are tried in order and the first match wins:
#   "multiHyphen" - raw text contains at least two hyphens
#   "truncated"   - raw text contains a letter and ends with a hyphen
#   "abbrev"      - raw text contains a letter and ends with a period
#   "hasPunct"    - np differs from the raw text
#   "others"      - none of the above
# +author_names+ is unused.
def punct(toks, idx, author_names=nil)
  token = toks[idx]

  if token.raw =~ /\-.*\-/
    "multiHyphen"
  elsif token.raw =~ /[[:alpha:]].*\-$/
    "truncated"
  elsif token.raw =~ /[[:alpha:]].*\.$/
    "abbrev"
  elsif token.np != token.raw
    "hasPunct"
  else
    "others"
  end
end
|
170
|
+
|
171
|
+
# Token-level feature: is token +idx+ part of a volume or issue number?
#
# Checks, in order: an explicit "vol"-style marker at this token; an issue
# marker directly after a volume marker one or two tokens back; the first
# number in a "<number> ( <number> )" pattern; any of the three tokens
# following that number; and finally a "<number>:" pattern.
#
# Look-back positions are only examined when they exist. A negative index
# would otherwise wrap to the end of the token list and let tokens at the
# end of the citation influence tokens at the start.
# NOTE(review): this changes the feature value for the first few tokens in
# rare cases; confirm whether the shipped CRF model needs retraining.
#
# +author_names+ is unused.
#
# Returns 'volume', 'issue' or 'noVolume'.
def possible_volume(toks, idx, author_names=nil)
  prev1 = idx - 1
  prev2 = idx - 2

  if possible_vol_with_str(toks, idx)
    'volume'
  elsif prev1 >= 0 && possible_vol_with_str(toks, prev1) && possible_issue_with_str(toks, idx)
    'issue'
  elsif prev2 >= 0 && possible_vol_with_str(toks, prev2) && possible_issue_with_str(toks, prev1) && possible_issue_with_str(toks, idx)
    'issue'
  elsif possible_vol_with_parens(toks, idx)
    'volume'
  elsif (1..3).any? { |i| idx - i >= 0 && possible_vol_with_parens(toks, idx - i) }
    'issue'
  elsif possible_vol_with_colon(toks, idx)
    'volume'
  else
    'noVolume'
  end
end
|
188
|
+
|
189
|
+
# TODO this method is weirdly named b/c of alphabetical ordering hack: remove that
|
190
|
+
# Feature: the raw dictionary flag mask for token +idx+ (see dict_status).
# +author_names+ is unused.
def a_is_in_dict(toks, idx, author_names=nil)
  dict_status(toks, idx)
end

# Feature: 'publisherName' when the token's flag mask has the
# PUBLISHER_NAME bit set, otherwise 'noPublisherName'.
def publisherName(toks, idx, author_names=nil)
  mask = dict_status(toks, idx)
  if (mask & DictFlags::PUBLISHER_NAME) > 0
    'publisherName'
  else
    'noPublisherName'
  end
end

# Feature: 'placeName' when the token's flag mask has the PLACE_NAME bit
# set, otherwise 'noPlaceName'.
def placeName(toks, idx, author_names=nil)
  mask = dict_status(toks, idx)
  if (mask & DictFlags::PLACE_NAME) > 0
    'placeName'
  else
    'noPlaceName'
  end
end

# Feature: 'monthName' when the token's flag mask has the MONTH_NAME bit
# set, otherwise 'noMonthName'.
def monthName(toks, idx, author_names=nil)
  mask = dict_status(toks, idx)
  if (mask & DictFlags::MONTH_NAME) > 0
    'monthName'
  else
    'noMonthName'
  end
end

# Feature: 'lastName' when the token's lcnp equals the last element of
# +author_names+ (if given) or its flag mask has the LAST_NAME bit set;
# otherwise 'noLastName'.
def lastName(toks, idx, author_names=nil)
  if author_names && author_names.last == toks[idx].lcnp
    return 'lastName'
  end

  mask = dict_status(toks, idx)
  if (mask & DictFlags::LAST_NAME) > 0
    'lastName'
  else
    'noLastName'
  end
end

# Feature: 'firstName' when the token's lcnp equals the first element of
# +author_names+ (if given) or its flag mask has the FIRST_NAME bit set;
# otherwise 'noFirstName'.
def firstName(toks, idx, author_names=nil)
  if author_names && author_names.first == toks[idx].lcnp
    return 'firstName'
  end

  mask = dict_status(toks, idx)
  if (mask & DictFlags::FIRST_NAME) > 0
    'firstName'
  else
    'noFirstName'
  end
end

# Looks up the DICT flag mask for the lcnp of token +idx+, using 0 for
# words DICT does not contain.
#
# Results are cached per index in @dict_status, which is sized from the
# first token list seen and persists until it is set back to nil (as
# +clear+ does).
def dict_status(toks, idx)
  @dict_status = Array.new(toks.length) unless @dict_status

  cached = @dict_status[idx]
  return cached if cached

  @dict_status[idx] = DICT[toks[idx].lcnp] || 0
end
|
220
|
+
|
221
|
+
# Maps a lower-cased HTML element name to the coarser node type used as a
# feature value. Several elements share a type (for example 'i' and 'em').
# Frozen so this shared lookup table cannot be modified at runtime.
NODE_TYPES_BY_NAME = {
  'div'    => 'div',
  'p'      => 'p',
  'ul'     => 'div', # lump with div - higher-level structure
  'li'     => 'li',
  'tr'     => 'div', # lump with div - higher-level structure
  'td'     => 'td',
  'span'   => 'span',
  'font'   => 'span',
  'em'     => 'em',
  'i'      => 'em',
  'strong' => 'strong',
  'b'      => 'strong',
  'u'      => 'u',
  'h1'     => 'h',
  'h2'     => 'h',
  'h3'     => 'h',
  'h4'     => 'h',
  'h5'     => 'h',
  'h6'     => 'h',
  'a'      => 'a',
  '#document-fragment' => 'unknown' # the actual tag wasn't captured in the fragment we're parsing
}.freeze
|
244
|
+
|
245
|
+
# Feature: the node type of the element enclosing token +idx+.
#
# The token's node is always a text node, so the informative name is that
# of its parent. The name is lower-cased and mapped through
# NODE_TYPES_BY_NAME; names not in the table yield 'other'.
# +author_names+ is unused.
def tag_name(toks, idx, author_names=nil)
  enclosing = toks[idx].node.parent
  node_type = NODE_TYPES_BY_NAME[enclosing.name.downcase]
  node_type || 'other'
end
|
249
|
+
|
250
|
+
# Feature: the relative position of token +idx+ inside its node, computed
# as idx_in_node / node_token_count scaled to tenths and rounded to an
# Integer. +author_names+ is unused.
def location_in_node(toks, idx, author_names=nil)
  token = toks[idx]
  fraction = token.idx_in_node.to_f / token.node_token_count
  (fraction * 10).round
end
|
253
|
+
|
254
|
+
# Feature: the part_of_speech value of token +idx+, returned unchanged.
# +author_names+ is unused.
def part_of_speech(toks, idx, author_names=nil)
  token = toks[idx]
  token.part_of_speech
end
|
257
|
+
|
258
|
+
private
|
259
|
+
|
260
|
+
# True-ish when token +idx+ is an issue marker (see possible_issue_str),
# or is an all-digit token directly following one.
#
# Returns a falsy value when +idx+ is outside the token list. Negative
# indices are rejected explicitly, and the look-back is skipped for the
# first token, because Ruby would otherwise wrap a negative index to the
# end of the array.
def possible_issue_with_str(toks, idx)
  return unless idx >= 0 && toks[idx]

  possible_issue_str(toks, idx) ||
    (idx > 0 && possible_issue_str(toks, idx - 1) && toks[idx].raw =~ /^\d+$/)
end
|
266
|
+
|
267
|
+
# True-ish when token +idx+ is an issue marker: either a single token such
# as "no.3" / "issue12" (optionally with one trailing character), or the
# word "no" / "issue" directly followed by an all-digit token.
#
# The single-token pattern was /^(no)|(issue)?\.?\d+.?$/i. Alternation
# binds loosest, so it matched any token starting with "no" ("Northern")
# and any token ending in digits ("1999"). It is now anchored the same way
# as the equivalent volume pattern.
# NOTE(review): this changes feature values for such tokens; confirm
# whether the shipped CRF model needs retraining.
#
# Returns a falsy value when +idx+ is negative or past the end of +toks+
# (a negative index would otherwise wrap to the end of the array).
def possible_issue_str(toks, idx)
  return unless idx >= 0 && toks[idx]
  return true if toks[idx].raw =~ /^(no|issue)\.?\d+.?$/i

  following = toks[idx + 1]
  return unless following

  ['no', 'issue'].include?(toks[idx].lcnp) && following.raw =~ /^\d+$/
end
|
276
|
+
|
277
|
+
# True-ish when token +idx+ belongs to an explicitly marked volume:
#   * it is the volume marker itself (see possible_vol_str), or
#   * it directly follows a marker and is all digits or a comma, or
#   * it is a comma following "<marker> <digits>".
#
# Returns a falsy value when +idx+ is outside the token list. Look-back
# positions are only examined when they exist, because Ruby would wrap a
# negative index to the end of the array.
def possible_vol_with_str(toks, idx)
  return unless idx >= 0 && toks[idx]

  raw = toks[idx].raw

  possible_vol_str(toks, idx) ||
    (idx >= 1 && possible_vol_str(toks, idx - 1) && (raw =~ /^\d+$/ || raw == ',')) ||
    (idx >= 2 && possible_vol_str(toks, idx - 2) && toks[idx - 1].raw =~ /^\d+$/ && raw == ',')
end
|
284
|
+
|
285
|
+
# True-ish when token +idx+ is a volume marker: either a single token such
# as "vol.3" / "Volume12" (optionally with one trailing character), or the
# word "vol" / "volume" directly followed by an all-digit token.
#
# Returns a falsy value when +idx+ is negative or past the end of +toks+.
# Negative indices are rejected explicitly because Ruby would otherwise
# wrap them to the end of the array.
def possible_vol_str(toks, idx)
  return unless idx >= 0 && toks[idx]
  return true if toks[idx].raw =~ /^vol(ume)?\.?\d+.?$/i

  following = toks[idx + 1]
  return unless following

  ['vol', 'volume'].include?(toks[idx].lcnp) && following.raw =~ /^\d+$/
end
|
294
|
+
|
295
|
+
# True-ish when the four tokens starting at +idx+ have the raw texts
# "<digits>", "(", "<digits>", ")".
#
# Returns a falsy value when +idx+ is negative or fewer than four tokens
# remain. Negative indices are rejected explicitly because Ruby would
# otherwise wrap them to the end of the array and match a pattern that
# straddles the end and the start of the citation.
def possible_vol_with_parens(toks, idx)
  return unless idx >= 0 && toks[idx] && toks[idx+3]

  toks[idx].raw =~ /^\d+$/ &&
    toks[idx+1].raw == '(' &&
    toks[idx+2].raw =~ /^\d+$/ &&
    toks[idx+3].raw == ')'
end
|
300
|
+
|
301
|
+
# True-ish when token +idx+ looks like a volume number followed by a colon
# and a page or issue number, as in "12:345" or "12 : 345".
#
# Returns nil when +idx+ or the following token is missing, or when the
# token is not a one-to-three digit number followed by a token starting
# with ":". Otherwise returns a truthy value only if a number follows.
def possible_vol_with_colon(toks, idx)
  current = toks[idx]
  following = toks[idx+1]
  return unless current && following

  # "<year>: something" is common, so only one to three digits qualify.
  return unless current.np =~ /^\d{1,3}$/ && following.raw =~ /^:/

  # Likely a volume by now, unless no apparent page or issue follows.
  following.np =~ /^\d+$/ ||
    (following.raw == ':' && toks[idx+2] && toks[idx+2].np =~ /^\d+/)
end
|
310
|
+
|
311
|
+
end
|
312
|
+
|
313
|
+
end
|
data/lib/excite.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Excite
|
4
|
+
end
|
5
|
+
|
6
|
+
require 'crfpp'
|
7
|
+
|
8
|
+
require 'excite/array_helpers'
|
9
|
+
require 'excite/citation'
|
10
|
+
require 'excite/preprocessor'
|
11
|
+
require 'excite/postprocessor'
|
12
|
+
require 'excite/token_features'
|
13
|
+
require 'excite/crfparser'
|
@@ -0,0 +1,54 @@
|
|
1
|
+
Results for model
|
2
|
+
branch: add_parts_of_speech
|
3
|
+
version: fe881b78325f48e511a2d015ba80edb55ba9a64c Rename module to Excite
|
4
|
+
Test run on:,2013-03-06 15:15:20 -0800
|
5
|
+
K-fold x-validation:,10
|
6
|
+
Corpus size:,500
|
7
|
+
|
8
|
+
truth\test,author,booktitle,date,editor,institution,journal,location,note,pages,publisher,tech,title,volume
|
9
|
+
author,3894,6,0,7,0,0,0,3,0,0,0,1,0
|
10
|
+
booktitle,0,2053,0,8,3,52,8,9,0,0,3,31,6
|
11
|
+
date,1,0,1547,1,0,0,6,2,4,0,0,1,3
|
12
|
+
editor,35,5,0,442,0,4,0,0,0,0,0,1,0
|
13
|
+
institution,3,8,0,4,368,0,6,0,0,1,1,8,0
|
14
|
+
journal,0,35,0,5,0,731,0,0,0,3,0,26,4
|
15
|
+
location,0,21,2,0,14,0,479,0,0,0,0,3,3
|
16
|
+
note,0,32,1,5,2,6,0,99,0,0,7,3,9
|
17
|
+
pages,0,0,2,0,0,0,0,0,772,1,2,0,6
|
18
|
+
publisher,0,10,0,0,11,2,6,0,0,271,0,4,0
|
19
|
+
tech,0,0,0,0,2,0,4,0,4,0,199,30,5
|
20
|
+
title,1,32,0,4,0,4,2,0,0,0,0,4215,2
|
21
|
+
volume,0,3,1,0,0,3,0,0,7,0,3,0,609
|
22
|
+
author,0.9956532856047047,0.001534134492457172,0.0,0.0017898235745333673,0.0,0.0,0.0,0.000767067246228586,0.0,0.0,0.0,0.00025568908207619537,0.0
|
23
|
+
booktitle,0.0,0.9447768062586286,0.0,0.0036815462494247586,0.0013805798435342844,0.02393005062126093,0.0036815462494247586,0.0041417395306028535,0.0,0.0,0.0013805798435342844,0.014265991716520939,0.0027611596870685687
|
24
|
+
date,0.0006389776357827476,0.0,0.9884984025559106,0.0006389776357827476,0.0,0.0,0.0038338658146964857,0.0012779552715654952,0.0025559105431309905,0.0,0.0,0.0006389776357827476,0.0019169329073482429
|
25
|
+
editor,0.07186858316221766,0.01026694045174538,0.0,0.9075975359342916,0.0,0.008213552361396304,0.0,0.0,0.0,0.0,0.0,0.002053388090349076,0.0
|
26
|
+
institution,0.007518796992481203,0.020050125313283207,0.0,0.010025062656641603,0.9223057644110275,0.0,0.015037593984962405,0.0,0.0,0.002506265664160401,0.002506265664160401,0.020050125313283207,0.0
|
27
|
+
journal,0.0,0.043532338308457715,0.0,0.006218905472636816,0.0,0.9092039800995025,0.0,0.0,0.0,0.0037313432835820895,0.0,0.03233830845771144,0.004975124378109453
|
28
|
+
location,0.0,0.040229885057471264,0.0038314176245210726,0.0,0.02681992337164751,0.0,0.9176245210727969,0.0,0.0,0.0,0.0,0.005747126436781609,0.005747126436781609
|
29
|
+
note,0.0,0.1951219512195122,0.006097560975609756,0.03048780487804878,0.012195121951219513,0.036585365853658534,0.0,0.6036585365853658,0.0,0.0,0.042682926829268296,0.018292682926829267,0.054878048780487805
|
30
|
+
pages,0.0,0.0,0.002554278416347382,0.0,0.0,0.0,0.0,0.0,0.9859514687100894,0.001277139208173691,0.002554278416347382,0.0,0.007662835249042145
|
31
|
+
publisher,0.0,0.03289473684210526,0.0,0.0,0.03618421052631579,0.006578947368421052,0.019736842105263157,0.0,0.0,0.8914473684210527,0.0,0.013157894736842105,0.0
|
32
|
+
tech,0.0,0.0,0.0,0.0,0.00819672131147541,0.0,0.01639344262295082,0.0,0.01639344262295082,0.0,0.8155737704918032,0.12295081967213115,0.020491803278688523
|
33
|
+
title,0.00023474178403755868,0.007511737089201878,0.0,0.0009389671361502347,0.0,0.0009389671361502347,0.00046948356807511736,0.0,0.0,0.0,0.0,0.9894366197183099,0.00046948356807511736
|
34
|
+
volume,0.0,0.004792332268370607,0.001597444089456869,0.0,0.0,0.004792332268370607,0.0,0.0,0.011182108626198083,0.0,0.004792332268370607,0.0,0.9728434504792333
|
35
|
+
|
36
|
+
Label,Precision,Recall,F-measure
|
37
|
+
author,0.9898322318251144,0.9956532856047047,0.9927342256214148
|
38
|
+
booktitle,0.9310657596371882,0.9447768062586286,0.9378711740520785
|
39
|
+
date,0.9961365099806826,0.9884984025559106,0.9923027581783195
|
40
|
+
editor,0.9285714285714286,0.9075975359342916,0.9179646936656283
|
41
|
+
institution,0.92,0.9223057644110275,0.9211514392991239
|
42
|
+
journal,0.9114713216957606,0.9092039800995025,0.9103362391033625
|
43
|
+
location,0.9373776908023483,0.9176245210727969,0.9273959341723136
|
44
|
+
note,0.8761061946902655,0.6036585365853658,0.7148014440433211
|
45
|
+
pages,0.9809402795425667,0.9859514687100894,0.9834394904458599
|
46
|
+
publisher,0.9818840579710145,0.8914473684210527,0.9344827586206896
|
47
|
+
tech,0.9255813953488372,0.8155737704918032,0.8671023965141613
|
48
|
+
title,0.9750173490631506,0.9894366197183099,0.9821740650122335
|
49
|
+
volume,0.9412673879443586,0.9728434504792333,0.9567949725058915
|
50
|
+
|
51
|
+
Average accuracy by reference:,0.9621987290789997
|
52
|
+
STD of Average accuracy by reference:,0.09112313872147432
|
53
|
+
Perfect parses:,383,0.766
|
54
|
+
Accuracy:, 0.9653367811845832
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Simple descriptive statistics for numeric collections.
#
# Intended to be mixed into Array, or extended onto a single array. The
# receiver must provide inject, size and zip.
module ArrayHelpers

  # Returns +values+ itself if it already has these helpers, otherwise a
  # copy extended with them. This lets cov and pearson_r accept plain
  # arrays without requiring Array itself to be patched.
  def self.wrap(values)
    values.is_a?(ArrayHelpers) ? values : values.dup.extend(ArrayHelpers)
  end

  # Sum of all elements; 0 for an empty collection.
  def sum
    inject(0, :+)
  end

  # Arithmetic mean as a Float; 0 for an empty collection.
  def mean
    (size > 0) ? sum.to_f / size : 0
  end

  # Population standard deviation (divides by size, not size - 1);
  # 0 for an empty collection.
  def stddev
    return 0 unless size > 0

    m = mean
    devsum = inject(0) { |acc, x| acc + (x - m)**2 }
    (devsum.to_f / size) ** 0.5
  end

  # Population covariance with +other+, computed as
  # mean(a*b) - mean(a)*mean(b). +other+ must have at least as many
  # elements as the receiver.
  def cov(other)
    products = zip(other).map { |a, b| a * b }
    ArrayHelpers.wrap(products).mean - (mean * ArrayHelpers.wrap(other).mean)
  end

  # Pearson correlation coefficient with +other+.
  #
  # Returns 0.0 when either side has zero standard deviation.
  # Raises RuntimeError if the two collections differ in size.
  def pearson_r(other)
    unless size == other.size
      raise "Vectors must be of same length to calculate pearson_r"
    end

    devp = stddev * ArrayHelpers.wrap(other).stddev
    (devp > 0) ? cov(other) / devp : 0.0
  end

end
|
30
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
Results for model
|
2
|
+
branch: add_parts_of_speech
|
3
|
+
version: 19c853489cbc259793dd24d89e1e16c78222a14b Add possible_volume based on regex (not actually usable unfortunately)
|
4
|
+
Test run on:,2013-03-06 14:52:26 -0800
|
5
|
+
K-fold x-validation:,10
|
6
|
+
Corpus size:,500
|
7
|
+
|
8
|
+
truth\test,author,booktitle,bullet,date,editor,institution,journal,link,location,note,pages,publisher,title,volume,workid
|
9
|
+
author,3503,8,4,2,32,0,2,3,0,7,2,0,72,0,0
|
10
|
+
booktitle,3,1271,0,20,27,15,82,0,13,21,4,13,171,2,0
|
11
|
+
bullet,3,1,31,1,0,0,1,0,0,3,0,0,1,2,0
|
12
|
+
date,1,24,0,1274,5,0,10,0,4,8,5,8,3,8,0
|
13
|
+
editor,35,19,0,0,1012,0,0,4,5,0,0,9,23,0,0
|
14
|
+
institution,0,34,0,2,11,0,18,0,0,10,0,36,29,0,0
|
15
|
+
journal,3,45,0,3,2,4,1130,1,0,8,0,11,62,2,0
|
16
|
+
link,3,16,0,1,0,0,0,326,0,48,7,2,2,1,0
|
17
|
+
location,9,13,0,3,4,0,5,2,441,7,0,34,9,0,0
|
18
|
+
note,9,61,0,13,21,0,16,19,0,175,3,20,107,2,0
|
19
|
+
pages,0,1,0,14,0,0,0,4,0,9,693,3,4,16,0
|
20
|
+
publisher,6,31,0,4,16,0,15,1,12,13,2,532,14,0,0
|
21
|
+
title,54,159,0,12,4,0,20,0,10,34,2,8,6129,0,0
|
22
|
+
volume,0,6,0,18,0,0,8,0,3,2,7,0,0,861,0
|
23
|
+
workid,0,0,0,7,0,0,0,2,4,7,8,0,3,4,64
|
24
|
+
author,0.9636863823933975,0.002200825309491059,0.0011004126547455295,0.0005502063273727648,0.008803301237964236,0.0,0.0005502063273727648,0.0008253094910591472,0.0,0.0019257221458046766,0.0005502063273727648,0.0,0.019807427785419534,0.0,0.0
|
25
|
+
booktitle,0.0018270401948842874,0.7740560292326432,0.0,0.012180267965895249,0.016443361753958587,0.009135200974421437,0.049939098660170524,0.0,0.007917174177831911,0.012789281364190013,0.00243605359317905,0.007917174177831911,0.10414129110840438,0.001218026796589525,0.0
|
26
|
+
bullet,0.06976744186046512,0.023255813953488372,0.7209302325581395,0.023255813953488372,0.0,0.0,0.023255813953488372,0.0,0.0,0.06976744186046512,0.0,0.0,0.023255813953488372,0.046511627906976744,0.0
|
27
|
+
date,0.0007407407407407407,0.017777777777777778,0.0,0.9437037037037037,0.003703703703703704,0.0,0.007407407407407408,0.0,0.002962962962962963,0.005925925925925926,0.003703703703703704,0.005925925925925926,0.0022222222222222222,0.005925925925925926,0.0
|
28
|
+
editor,0.031616982836495035,0.017163504968383016,0.0,0.0,0.9141824751580849,0.0,0.0,0.0036133694670280035,0.004516711833785004,0.0,0.0,0.008130081300813009,0.02077687443541102,0.0,0.0
|
29
|
+
institution,0.0,0.24285714285714285,0.0,0.014285714285714285,0.07857142857142857,0.0,0.12857142857142856,0.0,0.0,0.07142857142857142,0.0,0.2571428571428571,0.20714285714285716,0.0,0.0
|
30
|
+
journal,0.0023603461841070024,0.03540519276160504,0.0,0.0023603461841070024,0.0015735641227380016,0.003147128245476003,0.8890637293469709,0.0007867820613690008,0.0,0.006294256490952006,0.0,0.00865460267505901,0.04878048780487805,0.0015735641227380016,0.0
|
31
|
+
link,0.007389162561576354,0.03940886699507389,0.0,0.0024630541871921183,0.0,0.0,0.0,0.8029556650246306,0.0,0.11822660098522167,0.017241379310344827,0.0049261083743842365,0.0049261083743842365,0.0024630541871921183,0.0
|
32
|
+
location,0.017077798861480076,0.024667931688804556,0.0,0.0056925996204933585,0.007590132827324478,0.0,0.009487666034155597,0.003795066413662239,0.8368121442125237,0.013282732447817837,0.0,0.06451612903225806,0.017077798861480076,0.0,0.0
|
33
|
+
note,0.020179372197309416,0.1367713004484305,0.0,0.02914798206278027,0.04708520179372197,0.0,0.03587443946188341,0.042600896860986545,0.0,0.3923766816143498,0.006726457399103139,0.04484304932735426,0.2399103139013453,0.004484304932735426,0.0
|
34
|
+
pages,0.0,0.0013440860215053765,0.0,0.01881720430107527,0.0,0.0,0.0,0.005376344086021506,0.0,0.012096774193548387,0.9314516129032258,0.004032258064516129,0.005376344086021506,0.021505376344086023,0.0
|
35
|
+
publisher,0.009287925696594427,0.047987616099071206,0.0,0.006191950464396285,0.02476780185758514,0.0,0.02321981424148607,0.0015479876160990713,0.018575851393188854,0.020123839009287926,0.0030959752321981426,0.8235294117647058,0.021671826625386997,0.0,0.0
|
36
|
+
title,0.008395522388059701,0.024720149253731342,0.0,0.0018656716417910447,0.0006218905472636816,0.0,0.003109452736318408,0.0,0.001554726368159204,0.005286069651741294,0.0003109452736318408,0.0012437810945273632,0.9528917910447762,0.0,0.0
|
37
|
+
volume,0.0,0.0066298342541436465,0.0,0.019889502762430938,0.0,0.0,0.008839779005524863,0.0,0.0033149171270718232,0.0022099447513812156,0.0077348066298342545,0.0,0.0,0.9513812154696133,0.0
|
38
|
+
workid,0.0,0.0,0.0,0.0707070707070707,0.0,0.0,0.0,0.020202020202020204,0.04040404040404041,0.0707070707070707,0.08080808080808081,0.0,0.030303030303030304,0.04040404040404041,0.6464646464646465
|
39
|
+
|
40
|
+
Label,Precision,Recall,F-measure
|
41
|
+
author,0.9652796913750344,0.9636863823933975,0.9644823788546256
|
42
|
+
booktitle,0.7525162818235642,0.7740560292326432,0.7631341939357552
|
43
|
+
bullet,0.8857142857142857,0.7209302325581395,0.7948717948717948
|
44
|
+
date,0.9272197962154294,0.9437037037037037,0.9353891336270191
|
45
|
+
editor,0.892416225749559,0.9141824751580849,0.9031682284694332
|
46
|
+
institution,0.0,0.0,NaN
|
47
|
+
journal,0.864575363427697,0.8890637293469709,0.8766485647788983
|
48
|
+
link,0.9005524861878453,0.8029556650246306,0.8489583333333334
|
49
|
+
location,0.8963414634146342,0.8368121442125237,0.8655544651619235
|
50
|
+
note,0.4971590909090909,0.3923766816143498,0.43859649122807015
|
51
|
+
pages,0.9454297407912687,0.9314516129032258,0.938388625592417
|
52
|
+
publisher,0.7869822485207101,0.8235294117647058,0.8048411497730711
|
53
|
+
title,0.9245738422084779,0.9528917910447762,0.9385192557997091
|
54
|
+
volume,0.9587973273942093,0.9513812154696133,0.9550748752079867
|
55
|
+
workid,1.0,0.6464646464646465,0.7852760736196319
|
56
|
+
|
57
|
+
Average accuracy by reference:,0.8980503362135145
|
58
|
+
STD of Average accuracy by reference:,0.150249693017486
|
59
|
+
Perfect parses:,239,0.478
|
60
|
+
Accuracy:, 0.899396689527149
|