anystyle-parser 0.2.1 → 0.3.0

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registries.
@@ -1,19 +1,19 @@
1
1
  module Anystyle
2
- module Parser
3
-
4
- class Error < StandardError
5
-
6
- attr_accessor :original
7
-
8
- def initialize(message = nil, original = $!)
9
- super(message)
10
- @original = original
11
- end
12
-
13
- end
14
-
15
- class DictionaryError < Error; end
16
- class TrainingError < Error; end
17
-
18
- end
19
- end
2
+ module Parser
3
+
4
+ class Error < StandardError
5
+
6
+ attr_accessor :original
7
+
8
+ def initialize(message = nil, original = $!)
9
+ super(message)
10
+ @original = original
11
+ end
12
+
13
+ end
14
+
15
+ class DictionaryError < Error; end
16
+ class TrainingError < Error; end
17
+
18
+ end
19
+ end
@@ -1,246 +1,256 @@
1
1
  module Anystyle
2
- module Parser
3
-
4
- class Parser
5
-
6
- @formats = [:bibtex, :hash, :citeproc, :tags].freeze
7
-
8
- @defaults = {
9
- :model => File.expand_path('../support/anystyle.mod', __FILE__),
10
- :pattern => File.expand_path('../support/anystyle.pat', __FILE__),
11
- :separator => /\s+/,
12
- :tagged_separator => /\s+|(<\/?[^>]+>)/,
13
- :strip => /[^[:alnum:]]/,
14
- :format => :hash,
15
- :training_data => File.expand_path('../../../../resources/train.txt', __FILE__)
16
- }.freeze
17
-
18
- @features = Feature.instances
19
- @feature = Hash.new { |h,k| h[k.to_sym] = features.detect { |f| f.name == k.to_sym } }
20
-
21
- class << self
22
-
23
- attr_reader :defaults, :features, :feature, :formats
24
-
25
- def load(path)
26
- p = new
27
- p.model = Wapiti.load(path)
28
- p
29
- end
30
-
31
- # Returns a default parser instance
32
- def instance
33
- @instance ||= new
34
- end
35
-
36
- end
37
-
38
- attr_reader :options
39
-
40
- attr_accessor :model, :normalizer
41
-
42
- def initialize(options = {})
43
- @options = Parser.defaults.merge(options)
44
- @model = Wapiti.load(@options[:model])
45
- @normalizer = Normalizer.instance
46
- end
47
-
48
- def parse(input, format = options[:format])
49
- formatter = "format_#{format}".to_sym
50
- send(formatter, label(input))
51
- rescue NoMethodError
52
- raise ArgumentError, "format not supported: #{formatter}"
53
- end
54
-
55
- # Returns an array of label/segment pairs for each line in the passed-in string.
56
- def label(input, labelled = false)
57
- string = input_to_s(input)
58
-
59
- model.label(prepare(string, labelled)).map! do |sequence|
60
- sequence.inject([]) do |ts, (token, label)|
61
- token, label = token[/^\S+/], label.to_sym
62
- if (prev = ts[-1]) && prev[0] == label
63
- prev[1] << ' ' << token
64
- ts
65
- else
66
- ts << [label, token]
67
- end
68
- end
69
- end
70
-
71
- end
72
-
73
- # Returns an array of tokens for each line of input.
74
- #
75
- # If the passed-in string is marked as being tagged, extracts labels
76
- # from the string and returns an array of token/label pairs for each
77
- # line of input.
78
- def tokenize(string, tagged = false)
79
- if tagged
80
- string.split(/[\n\r]+/).each_with_index.map do |s,i|
81
- tt, tokens, tags = s.split(options[:tagged_separator]), [], []
82
-
83
- tt.each do |token|
84
- case token
85
- when /^$/
86
- # skip
87
- when /^<([^\/>][^>]*)>$/
88
- tags << $1
89
- when /^<\/([^>]+)>$/
90
- unless (tag = tags.pop) == $1
91
- raise ArgumentError, "mismatched tags on line #{i}: #{$1.inspect} (current tag was #{tag.inspect})"
92
- end
93
- else
94
- tokens << [token, (tags[-1] || :unknown).to_sym]
95
- end
96
- end
97
-
98
- tokens
99
- end
100
- else
101
- string.split(/[\n\r]+/).map { |s| s.split(options[:separator]) }
102
- end
103
- end
104
-
105
- # Prepares the passed-in string for processing by a CRF tagger. The
106
- # string is split into separate lines; each line is tokenized and
107
- # expanded. Returns an array of sequence arrays that can be labelled
108
- # by the CRF model.
109
- #
110
- # If the string is marked as being tagged by passing +true+ as the
111
- # second argument, training labels will be extracted from the string
112
- # and appended after feature expansion. The returned sequence arrays
113
- # can be used for training or testing the CRF model.
114
- def prepare(input, tagged = false)
115
- string = input_to_s(input)
116
- tokenize(string, tagged).map { |tk| tk.each_with_index.map { |(t,l),i| expand(t,tk,i,l) } }
117
- end
118
-
119
-
120
- # Expands the passed-in token string by appending a space separated list
121
- # of all features for the token.
122
- def expand(token, sequence = [], offset = 0, label = nil)
123
- f = features_for(token, strip(token), sequence, offset)
124
- f.unshift(token)
125
- f.push(label) unless label.nil?
126
- f.join(' ')
127
- end
128
-
129
- def train(input = options[:training_data], truncate = true)
130
- string = input_to_s(input)
131
- @model = Wapiti::Model.new(:pattern => options[:pattern]) if truncate
132
- @model.train(prepare(string, true))
133
- @model.compact
134
- @model.path = options[:model]
135
- @model
136
- end
137
-
138
- def test(input)
139
- string = input_to_s(input)
140
- model.options.check!
141
- model.label(prepare(string, true))
142
- end
143
-
144
- def normalize(hash)
145
- hash.keys.each do |label|
146
- normalizer.send("normalize_#{label}", hash)
147
- end
148
- classify hash
149
- end
150
-
151
- def classify(hash)
152
- return hash if hash.has_key?(:type)
153
-
154
- keys = hash.keys
155
- text = hash.values.flatten.join
156
-
157
- case
158
- when keys.include?(:journal)
159
- hash[:type] = :article
160
- when text =~ /proceedings/i
161
- hash[:type] = :inproceedings
162
- when keys.include?(:booktitle), keys.include?(:container)
163
- hash[:type] = :incollection
164
- when keys.include?(:publisher)
165
- hash[:type] = :book
166
- when keys.include?(:institution)
167
- hash[:type] = :techreport
168
- when keys.include?(:school) || text =~ /master('s)?\s+thesis/i
169
- hash[:type] = :mastersthesis
170
- when text =~ /interview/i
171
- hash[:type] = :interview
172
- when text =~ /videotape/i
173
- hash[:type] = :videotape
174
- when text =~ /unpublished/i
175
- hash[:type] = :unpublished
176
- else
177
- hash[:type] = :misc
178
- end
179
-
180
- hash
181
- end
182
-
183
- private
184
-
185
- def input_to_s(input)
186
- case input
187
- when String
188
- if input.length < 128 && File.exists?(input)
189
- f = File.open(input, 'r:UTF-8')
190
- f.read
191
- else
192
- input
193
- end
194
- when Array
195
- input.join("\n")
196
- else
197
- raise ArgumentError, "invalid input: #{input.class}"
198
- end
199
- ensure
200
- f.close if f
201
- end
202
-
203
- def features_for(*arguments)
204
- Parser.features.map { |f| f.match(*arguments) }
205
- end
206
-
207
- def strip(token)
208
- token.gsub(options[:strip], '')
209
- end
210
-
211
- def format_bibtex(labels)
212
- b = BibTeX::Bibliography.new
213
- format_hash(labels).each do |hash|
214
- b << BibTeX::Entry.new(hash)
215
- end
216
- b
217
- end
218
-
219
- def format_hash(labels)
220
- labels.map do |line|
221
- hash = line.inject({}) do |h, (label, token)|
222
- if h.has_key?(label)
223
- h[label] = [h[label]].flatten << token
224
- else
225
- h[label] = token
226
- end
227
- h
228
- end
229
- normalize hash
230
- end
231
- end
232
-
233
- def format_citeproc(labels)
234
- format_bibtex(labels).to_citeproc
235
- end
236
-
237
- def format_tags(labels)
238
- labels.map do |line|
239
- line.map { |label, token| "<#{label}>#{token}</#{label}>" }.join(' ')
240
- end
241
- end
242
-
243
- end
244
-
245
- end
2
+ module Parser
3
+
4
+ class Parser
5
+
6
+ @formats = [:bibtex, :hash, :citeproc, :tags, :raw].freeze
7
+
8
+ @defaults = {
9
+ :model => File.expand_path('../support/anystyle.mod', __FILE__),
10
+ :pattern => File.expand_path('../support/anystyle.pat', __FILE__),
11
+ :separator => /\s+/,
12
+ :tagged_separator => /\s+|(<\/?[^>]+>)/,
13
+ :strip => /[^[:alnum:]]/,
14
+ :format => :hash,
15
+ :training_data => File.expand_path('../../../../resources/train.txt', __FILE__)
16
+ }.freeze
17
+
18
+ @features = Feature.instances
19
+ @feature = Hash.new { |h,k| h[k.to_sym] = features.detect { |f| f.name == k.to_sym } }
20
+
21
+ class << self
22
+
23
+ attr_reader :defaults, :features, :feature, :formats
24
+
25
+ def load(path)
26
+ p = new
27
+ p.model = Wapiti.load(path)
28
+ p
29
+ end
30
+
31
+ # Returns a default parser instance
32
+ def instance
33
+ @instance ||= new
34
+ end
35
+
36
+ end
37
+
38
+ attr_reader :options
39
+
40
+ attr_accessor :model, :normalizer
41
+
42
+ def initialize(options = {})
43
+ @options = Parser.defaults.merge(options)
44
+ @model = Wapiti.load(@options[:model])
45
+ @normalizer = Normalizer.instance
46
+ end
47
+
48
+ def parse(input, format = options[:format])
49
+ formatter = "format_#{format}".to_sym
50
+
51
+ raise ArgumentError, "format not supported: #{formatter}" unless
52
+ respond_to?(formatter, true)
53
+
54
+ send(formatter, label(input))
55
+ end
56
+
57
+ # Returns an array of label/segment pairs for each line in the passed-in string.
58
+ def label(input, labelled = false)
59
+ string = input_to_s(input)
60
+
61
+ model.label(prepare(string, labelled)).map! do |sequence|
62
+ sequence.inject([]) do |ts, (token, label)|
63
+ token, label = token[/^\S+/], label.to_sym
64
+ if (prev = ts[-1]) && prev[0] == label
65
+ prev[1] << ' ' << token
66
+ ts
67
+ else
68
+ ts << [label, token]
69
+ end
70
+ end
71
+ end
72
+
73
+ end
74
+
75
+ # Returns an array of tokens for each line of input.
76
+ #
77
+ # If the passed-in string is marked as being tagged, extracts labels
78
+ # from the string and returns an array of token/label pairs for each
79
+ # line of input.
80
+ def tokenize(string, tagged = false)
81
+ if tagged
82
+ string.split(/[\n\r]+/).each_with_index.map do |s,i|
83
+ tt, tokens, tags = s.split(options[:tagged_separator]), [], []
84
+
85
+ tt.each do |token|
86
+ case token
87
+ when /^$/
88
+ # skip
89
+ when /^<([^\/>][^>]*)>$/
90
+ tags << $1
91
+ when /^<\/([^>]+)>$/
92
+ unless (tag = tags.pop) == $1
93
+ raise ArgumentError, "mismatched tags on line #{i}: #{$1.inspect} (current tag was #{tag.inspect})"
94
+ end
95
+ else
96
+ tokens << [token, (tags[-1] || :unknown).to_sym]
97
+ end
98
+ end
99
+
100
+ tokens
101
+ end
102
+ else
103
+ string.split(/[\n\r]+/).map { |s| s.split(options[:separator]) }
104
+ end
105
+ end
106
+
107
+ # Prepares the passed-in string for processing by a CRF tagger. The
108
+ # string is split into separate lines; each line is tokenized and
109
+ # expanded. Returns an array of sequence arrays that can be labelled
110
+ # by the CRF model.
111
+ #
112
+ # If the string is marked as being tagged by passing +true+ as the
113
+ # second argument, training labels will be extracted from the string
114
+ # and appended after feature expansion. The returned sequence arrays
115
+ # can be used for training or testing the CRF model.
116
+ def prepare(input, tagged = false)
117
+ string = input_to_s(input)
118
+ tokenize(string, tagged).map { |tk| tk.each_with_index.map { |(t,l),i| expand(t,tk,i,l) } }
119
+ end
120
+
121
+
122
+ # Expands the passed-in token string by appending a space separated list
123
+ # of all features for the token.
124
+ def expand(token, sequence = [], offset = 0, label = nil)
125
+ f = features_for(token, strip(token), sequence, offset)
126
+ f.unshift(token)
127
+ f.push(label) unless label.nil?
128
+ f.join(' ')
129
+ end
130
+
131
+ def train(input = options[:training_data], truncate = true)
132
+ string = input_to_s(input)
133
+ @model = Wapiti::Model.new(:pattern => options[:pattern]) if truncate
134
+ @model.train(prepare(string, true))
135
+ @model.compact
136
+ @model.path = options[:model]
137
+ @model
138
+ end
139
+
140
+ def test(input)
141
+ string = input_to_s(input)
142
+ model.options.check!
143
+ model.label(prepare(string, true))
144
+ end
145
+
146
+ def normalize(hash)
147
+ hash.keys.each do |label|
148
+ normalizer.send("normalize_#{label}", hash)
149
+ end
150
+ classify hash
151
+ end
152
+
153
+ def classify(hash)
154
+ return hash if hash.has_key?(:type)
155
+
156
+ keys = hash.keys
157
+ text = hash.values.flatten.join
158
+
159
+ case
160
+ when keys.include?(:journal)
161
+ hash[:type] = :article
162
+ when text =~ /proceedings/i
163
+ hash[:type] = :inproceedings
164
+ when keys.include?(:booktitle), keys.include?(:container)
165
+ hash[:type] = :incollection
166
+ when keys.include?(:publisher)
167
+ hash[:type] = :book
168
+ when keys.include?(:institution)
169
+ hash[:type] = :techreport
170
+ when keys.include?(:school) || text =~ /master('s)?\s+thesis/i
171
+ hash[:type] = :mastersthesis
172
+ when text =~ /interview/i
173
+ hash[:type] = :interview
174
+ when text =~ /videotape/i
175
+ hash[:type] = :videotape
176
+ when text =~ /unpublished/i
177
+ hash[:type] = :unpublished
178
+ else
179
+ hash[:type] = :misc
180
+ end
181
+
182
+ hash
183
+ end
184
+
185
+ private
186
+
187
+ def input_to_s(input)
188
+ case input
189
+ when String
190
+ if input.length < 128 && File.exists?(input)
191
+ f = File.open(input, 'r:UTF-8')
192
+ f.read
193
+ else
194
+ input
195
+ end
196
+ when Array
197
+ input.join("\n")
198
+ else
199
+ raise ArgumentError, "invalid input: #{input.class}"
200
+ end
201
+ ensure
202
+ f.close if f
203
+ end
204
+
205
+ def features_for(*arguments)
206
+ Parser.features.map { |f| f.match(*arguments) }
207
+ end
208
+
209
+ def strip(token)
210
+ token.gsub(options[:strip], '')
211
+ end
212
+
213
+ def format_bibtex(labels)
214
+ b = BibTeX::Bibliography.new
215
+ format_hash(labels).each do |hash|
216
+ b << BibTeX::Entry.new(hash)
217
+ end
218
+ b
219
+ end
220
+
221
+ def format_raw(labels)
222
+ labels.map do |line|
223
+ line.inject([]) do |tokens, (label, segment)|
224
+ tokens.concat segment.split(' ').map { |token| [label, token] }
225
+ end
226
+ end
227
+ end
228
+
229
+ def format_hash(labels)
230
+ labels.map do |line|
231
+ hash = line.inject({}) do |h, (label, token)|
232
+ if h.has_key?(label)
233
+ h[label] = [h[label]].flatten << token
234
+ else
235
+ h[label] = token
236
+ end
237
+ h
238
+ end
239
+ normalize hash
240
+ end
241
+ end
242
+
243
+ def format_citeproc(labels)
244
+ format_bibtex(labels).to_citeproc
245
+ end
246
+
247
+ def format_tags(labels)
248
+ labels.map do |line|
249
+ line.map { |label, token| "<#{label}>#{token}</#{label}>" }.join(' ')
250
+ end
251
+ end
252
+
253
+ end
254
+
255
+ end
246
256
  end