anystyle-parser 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +28 -12
- data/HISTORY.md +6 -0
- data/LICENSE +2 -2
- data/README.md +11 -11
- data/Rakefile +14 -3
- data/anystyle-parser.gemspec +13 -8
- data/features/support/env.rb +18 -0
- data/lib/anystyle/parser/dictionary.rb +35 -37
- data/lib/anystyle/parser/errors.rb +18 -18
- data/lib/anystyle/parser/parser.rb +254 -244
- data/lib/anystyle/parser/utility.rb +18 -18
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/parser_spec.rb +119 -115
- data/spec/spec_helper.rb +9 -2
- metadata +26 -43
- data/.autotest +0 -0
- data/.gitignore +0 -5
- data/.rspec +0 -3
@@ -1,19 +1,19 @@
|
|
1
1
|
module Anystyle
  module Parser

    # Base error type for the parser. Wraps the exception that triggered
    # it, so callers can inspect the underlying cause via #original.
    class Error < StandardError
      # The wrapped lower-level exception (defaults to $! at creation time).
      attr_accessor :original

      # @param message [String, nil] the error message
      # @param original [Exception, nil] the causing exception; defaults to
      #   the exception currently being raised, if any
      def initialize(message = nil, original = $!)
        @original = original
        super(message)
      end
    end

    # Raised when a dictionary cannot be opened or queried.
    DictionaryError = Class.new(Error)

    # Raised when training the CRF model fails.
    TrainingError = Class.new(Error)

  end
end
|
@@ -1,246 +1,256 @@
|
|
1
1
|
module Anystyle
  module Parser

    # Conditional-random-field based reference parser. Uses a Wapiti CRF
    # model to label the tokens of free-form bibliographic references and
    # converts the labelled segments into various output formats.
    class Parser

      # Output formats supported by #parse.
      @formats = [:bibtex, :hash, :citeproc, :tags, :raw].freeze

      # Default options; merged with options passed to #initialize.
      @defaults = {
        :model => File.expand_path('../support/anystyle.mod', __FILE__),
        :pattern => File.expand_path('../support/anystyle.pat', __FILE__),
        :separator => /\s+/,
        :tagged_separator => /\s+|(<\/?[^>]+>)/,
        :strip => /[^[:alnum:]]/,
        :format => :hash,
        :training_data => File.expand_path('../../../../resources/train.txt', __FILE__)
      }.freeze

      @features = Feature.instances
      # Memoized lookup of a feature instance by its name.
      @feature = Hash.new { |h,k| h[k.to_sym] = features.detect { |f| f.name == k.to_sym } }

      class << self

        attr_reader :defaults, :features, :feature, :formats

        # Returns a new Parser backed by the CRF model stored at path.
        def load(path)
          p = new
          p.model = Wapiti.load(path)
          p
        end

        # Returns a default parser instance
        def instance
          @instance ||= new
        end

      end

      attr_reader :options

      attr_accessor :model, :normalizer

      def initialize(options = {})
        @options = Parser.defaults.merge(options)
        @model = Wapiti.load(@options[:model])
        @normalizer = Normalizer.instance
      end

      # Labels the input and renders it in the requested format (one of
      # Parser.formats). Raises ArgumentError for unsupported formats.
      def parse(input, format = options[:format])
        formatter = "format_#{format}".to_sym

        raise ArgumentError, "format not supported: #{formatter}" unless
          respond_to?(formatter, true)

        send(formatter, label(input))
      end

      # Returns an array of label/segment pairs for each line in the passed-in string.
      def label(input, labelled = false)
        string = input_to_s(input)

        model.label(prepare(string, labelled)).map! do |sequence|
          sequence.inject([]) do |ts, (token, label)|
            token, label = token[/^\S+/], label.to_sym
            # Merge consecutive tokens sharing a label into a single segment.
            if (prev = ts[-1]) && prev[0] == label
              prev[1] << ' ' << token
              ts
            else
              ts << [label, token]
            end
          end
        end
      end

      # Returns an array of tokens for each line of input.
      #
      # If the passed-in string is marked as being tagged, extracts labels
      # from the string and returns an array of token/label pairs for each
      # line of input.
      def tokenize(string, tagged = false)
        if tagged
          string.split(/[\n\r]+/).each_with_index.map do |s,i|
            tt, tokens, tags = s.split(options[:tagged_separator]), [], []

            tt.each do |token|
              case token
              when /^$/
                # skip
              when /^<([^\/>][^>]*)>$/
                tags << $1
              when /^<\/([^>]+)>$/
                unless (tag = tags.pop) == $1
                  raise ArgumentError, "mismatched tags on line #{i}: #{$1.inspect} (current tag was #{tag.inspect})"
                end
              else
                # Token outside a tag pair is labelled :unknown.
                tokens << [token, (tags[-1] || :unknown).to_sym]
              end
            end

            tokens
          end
        else
          string.split(/[\n\r]+/).map { |s| s.split(options[:separator]) }
        end
      end

      # Prepares the passed-in string for processing by a CRF tagger. The
      # string is split into separate lines; each line is tokenized and
      # expanded. Returns an array of sequence arrays that can be labelled
      # by the CRF model.
      #
      # If the string is marked as being tagged by passing +true+ as the
      # second argument, training labels will be extracted from the string
      # and appended after feature expansion. The returned sequence arrays
      # can be used for training or testing the CRF model.
      def prepare(input, tagged = false)
        string = input_to_s(input)
        tokenize(string, tagged).map { |tk| tk.each_with_index.map { |(t,l),i| expand(t,tk,i,l) } }
      end


      # Expands the passed-in token string by appending a space separated list
      # of all features for the token.
      def expand(token, sequence = [], offset = 0, label = nil)
        f = features_for(token, strip(token), sequence, offset)
        f.unshift(token)
        f.push(label) unless label.nil?
        f.join(' ')
      end

      # Trains the CRF model on the given (tagged) input. When truncate is
      # true, a fresh model is created from the configured pattern file;
      # otherwise the current model is trained further. Returns the model.
      def train(input = options[:training_data], truncate = true)
        string = input_to_s(input)
        @model = Wapiti::Model.new(:pattern => options[:pattern]) if truncate
        @model.train(prepare(string, true))
        @model.compact
        @model.path = options[:model]
        @model
      end

      # Labels tagged input with the current model (for evaluation).
      def test(input)
        string = input_to_s(input)
        model.options.check!
        model.label(prepare(string, true))
      end

      # Applies the per-label normalizers to the hash, then classifies it.
      def normalize(hash)
        hash.keys.each do |label|
          normalizer.send("normalize_#{label}", hash)
        end
        classify hash
      end

      # Heuristically assigns a BibTeX-style :type to the hash based on
      # which keys are present and on keyword matches in its values.
      # Returns the hash (unchanged if it already has a :type).
      def classify(hash)
        return hash if hash.has_key?(:type)

        keys = hash.keys
        text = hash.values.flatten.join

        case
        when keys.include?(:journal)
          hash[:type] = :article
        when text =~ /proceedings/i
          hash[:type] = :inproceedings
        when keys.include?(:booktitle), keys.include?(:container)
          hash[:type] = :incollection
        when keys.include?(:publisher)
          hash[:type] = :book
        when keys.include?(:institution)
          hash[:type] = :techreport
        when keys.include?(:school) || text =~ /master('s)?\s+thesis/i
          hash[:type] = :mastersthesis
        when text =~ /interview/i
          hash[:type] = :interview
        when text =~ /videotape/i
          hash[:type] = :videotape
        when text =~ /unpublished/i
          hash[:type] = :unpublished
        else
          hash[:type] = :misc
        end

        hash
      end

      private

      # Coerces the input to a string: a short String naming an existing
      # file is read from disk as UTF-8, an Array is joined with newlines,
      # any other String is returned as-is. Raises ArgumentError otherwise.
      def input_to_s(input)
        case input
        when String
          # Heuristic: short strings naming an existing file are treated as
          # paths. Fix: File.exists? was deprecated and removed in Ruby 3.2;
          # File.exist? is the supported, behaviorally identical form.
          if input.length < 128 && File.exist?(input)
            f = File.open(input, 'r:UTF-8')
            f.read
          else
            input
          end
        when Array
          input.join("\n")
        else
          raise ArgumentError, "invalid input: #{input.class}"
        end
      ensure
        f.close if f
      end

      def features_for(*arguments)
        Parser.features.map { |f| f.match(*arguments) }
      end

      def strip(token)
        token.gsub(options[:strip], '')
      end

      def format_bibtex(labels)
        b = BibTeX::Bibliography.new
        format_hash(labels).each do |hash|
          b << BibTeX::Entry.new(hash)
        end
        b
      end

      def format_raw(labels)
        labels.map do |line|
          line.inject([]) do |tokens, (label, segment)|
            tokens.concat segment.split(' ').map { |token| [label, token] }
          end
        end
      end

      def format_hash(labels)
        labels.map do |line|
          hash = line.inject({}) do |h, (label, token)|
            # Repeated labels collect their tokens into an array.
            if h.has_key?(label)
              h[label] = [h[label]].flatten << token
            else
              h[label] = token
            end
            h
          end
          normalize hash
        end
      end

      def format_citeproc(labels)
        format_bibtex(labels).to_citeproc
      end

      def format_tags(labels)
        labels.map do |line|
          line.map { |label, token| "<#{label}>#{token}</#{label}>" }.join(' ')
        end
      end

    end

  end
end
|