engtagger 0.2.0 → 0.3.0
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.yardopts +5 -0
- data/Gemfile +1 -2
- data/README.md +19 -25
- data/engtagger.gemspec +4 -4
- data/lib/engtagger/porter.rb +12 -12
- data/lib/engtagger/pos_tags.hash +0 -0
- data/lib/engtagger/pos_words.hash +0 -0
- data/lib/engtagger/version.rb +2 -2
- data/lib/engtagger.rb +341 -290
- data/test/test_engtagger.rb +246 -201
- metadata +7 -7
data/lib/engtagger.rb
CHANGED
@@ -3,30 +3,17 @@
 
 $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
 require 'rubygems'
-require 'kconv'
 require 'porter'
+require 'lru_redux'
 
-
-begin
-  require 'hpricot'
-rescue LoadError
-  $no_hpricot = true
-end
-
-# File paths
-$lexpath = File.join(File.dirname(__FILE__), 'engtagger')
-$word_path = File.join($lexpath, "pos_words.hash")
-$tag_path = File.join($lexpath, "pos_tags.hash")
-
-# for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization)
-class Module
-  def memoize(method)
+module BoundedSpaceMemoizable
+  def memoize(method, max_cache_size=100000)
     # alias_method is faster than define_method + old.bind(self).call
     alias_method "__memoized__#{method}", method
     module_eval <<-EOF
-      def #{method}(*a
-      #
-
+      def #{method}(*a)
+        @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
+        @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
       end
     EOF
   end
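The memoizer change above swaps an unbounded Hash for an LRU cache. A minimal sketch of the pattern (not part of the gem; it assumes the lru_redux gem is installed):

    require 'lru_redux'

    # Results are cached per argument; once the cache holds max_cache_size
    # entries, the least-recently-used one is evicted, so memory stays
    # bounded no matter how many distinct inputs are seen.
    cache = LruRedux::Cache.new(3)
    square = ->(n) { cache[n] ||= n * n }

    square.call(2)                        # computed, then cached
    square.call(2)                        # served from the cache
    [3, 4, 5].each { |n| square.call(n) } # three new keys evict the entry for 2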
@@ -34,27 +21,39 @@ end
 
 # English part-of-speech tagger class
 class EngTagger
+  extend BoundedSpaceMemoizable
+
+  # File paths
+  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
+  DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
+  DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
   #################
   # Class methods #
   #################
-
-  # Return a class variable that holds probability data
+
+  # Return a class variable that holds probability data.
+  #
+  # @return [Hash] the probability data
+  #
   def self.hmm
     return @@hmm
   end
 
-  # Return a class variable that holds lexical data
+  # Return a class variable that holds lexical data.
+  #
+  # @return [Hash] the lexicon
+  #
   def self.lexicon
     return @@lexicon
   end
-
-  # Return a regexp from a string argument that matches an XML-style pos tag
+
+  # Return a regexp from a string argument that matches an XML-style pos tag
   def self.get_ext(tag = nil)
     return nil unless tag
     return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
   end
-
+
   # Regexps to match XML-style part-of-speech tags
   NUM = get_ext('cd')
   GER = get_ext('vbg')
@@ -70,22 +69,37 @@ class EngTagger
   VB = get_ext('vb')
   VBG = get_ext('vbg')
   VBD = get_ext('vbd')
-  PART = get_ext('vbn')
+  PART = get_ext('vbn')
   VBP = get_ext('vbp')
   VBZ = get_ext('vbz')
   JJ = get_ext('jj')
   JJR = get_ext('jjr')
   JJS = get_ext('jjs')
+  RB = get_ext('rb')
+  RBR = get_ext('rbr')
+  RBS = get_ext('rbs')
+  RP = get_ext('rp')
+  WRB = get_ext('wrb')
+  WDT = get_ext('wdt')
+  WP = get_ext('wp')
+  WPS = get_ext('wps')
+  CC = get_ext('cc')
+  IN = get_ext('in')
 
-  # Convert a Treebank-style, abbreviated tag into verbose definitions
+  # Convert a Treebank-style, abbreviated tag into verbose definitions
+  #
+  # @param tag [#to_s] the tag in question
+  # @return [String] the definition, if available
+  #
   def self.explain_tag(tag)
+    tag = tag.to_s.downcase
     if TAGS[tag]
       return TAGS[tag]
     else
       return tag
     end
-  end
-
+  end
+
   # The folloging is to make a hash to convert a pos tag to its definition
   # used by the explain_tag method
   tags = [
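Because explain_tag now normalizes its argument with to_s.downcase, symbols and uppercase strings resolve to the same TAGS entry. An illustrative sketch (the returned strings come from the TAGS table built below):

    EngTagger.explain_tag("jj")  # => the verbose definition for adjectives
    EngTagger.explain_tag(:JJ)   # => same lookup; 0.2.0 required a lowercase String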
@@ -132,35 +146,35 @@ class EngTagger
     "PPR", "Punctuation, quotation mark right",
     "PPS", "Punctuation, colon, semicolon, elipsis",
     "LRB", "Punctuation, left bracket",
-    "RRB", "Punctuation, right bracket"
-  ]
+    "RRB", "Punctuation, right bracket"
+  ]
   tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
   tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
   TAGS = Hash[*tags]
-
+
   # Hash storing config values:
   #
   # * :unknown_word_tag
   # => (String) Tag to assign to unknown words
-  # * :stem
+  # * :stem
   # => (Boolean) Stem single words using Porter module
   # * :weight_noun_phrases
-  # => (Boolean) When returning occurrence counts for a noun phrase, multiply
+  # => (Boolean) When returning occurrence counts for a noun phrase, multiply
   # the value by the number of words in the NP.
-  # * :longest_noun_phrase
-  # => (Integer) Will ignore noun phrases longer than this threshold. This
+  # * :longest_noun_phrase
+  # => (Integer) Will ignore noun phrases longer than this threshold. This
   # affects only the get_words() and get_nouns() methods.
-  # * :relax
-  # => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+  # * :relax
+  # => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
   # uncommon words, particularly words used polysemously
   # * :tag_lex
-  # => (String) Name of the YAML file containing a hash of adjacent part of
+  # => (String) Name of the YAML file containing a hash of adjacent part of
   # speech tags and the probability of each
   # * :word_lex
-  # => (String) Name of the YAML file containing a hash of words and corresponding
+  # => (String) Name of the YAML file containing a hash of words and corresponding
   # parts of speech
   # * :unknown_lex
-  # => (String) Name of the YAML file containing a hash of tags for unknown
+  # => (String) Name of the YAML file containing a hash of tags for unknown
   # words and corresponding parts of speech
   # * :tag_path
   # => (String) Directory path of tag_lex
@@ -169,12 +183,12 @@ class EngTagger
   # * :debug
   # => (Boolean) Print debug messages
   attr_accessor :conf
-
+
   ###############
   # Constructor #
   ###############
-
-  # Take a hash of parameters that override default values.
+
+  # Take a hash of parameters that override default values.
   # See above for details.
   def initialize(params = {})
     @conf = Hash.new
@@ -186,13 +200,13 @@ class EngTagger
     @conf[:tag_lex] = 'tags.yml'
     @conf[:word_lex] = 'words.yml'
     @conf[:unknown_lex] = 'unknown.yml'
-    @conf[:word_path] = $word_path
-    @conf[:tag_path] = $tag_path
+    @conf[:word_path] = DEFAULT_WORDPATH
+    @conf[:tag_path] = DEFAULT_TAGPATH
     @conf[:debug] = false
     # assuming that we start analyzing from the beginning of a new sentence...
-    @conf[:current_tag] = 'pp'
-    @conf.merge!(params)
-    unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
+    @conf[:current_tag] = 'pp'
+    @conf.merge!(params) if params
+    unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
       @@hmm = Hash.new
       @@lexicon = Hash.new
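A hedged sketch of constructing a tagger with overrides (the keys are the ones documented in the config comment above; anything not passed keeps the defaults assigned in initialize):

    require 'engtagger'

    # Override a couple of defaults; all other @conf entries are untouched.
    tgr = EngTagger.new(:longest_noun_phrase => 3, :debug => false)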
@@ -206,11 +220,38 @@ class EngTagger
     end
     @@mnp = get_max_noun_regex
   end
-
+
   ##################
   # Public methods #
   ##################
-
+
+  # Return an array of pairs of the form `["word", :tag]`.
+  #
+  # @param text [String] the input text
+  # @return [Array] the tagged words
+  #
+  def tag_pairs(text)
+    return [] unless valid_text(text)
+
+    out = clean_text(text).map do |word|
+      cleaned_word = clean_word word
+      tag = assign_tag(@conf[:current_tag], cleaned_word)
+      @conf[:current_tag] = tag = (tag and !tag.empty?) ? tag : 'nn'
+      [word, tag.to_sym]
+    end
+
+    # reset the tagger state
+    reset
+
+    out
+  end
+
+  # Examine the string provided and return it fully tagged in XML style.
+  #
+  # @param text [String] the input text
+  # @param verbose [false, true] whether to use verbose tags
+  # @return [String] the marked-up string
+  #
   # Examine the string provided and return it fully tagged in XML style
   def add_tags(text, verbose = false)
     return nil unless valid_text(text)
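A usage sketch for the new tag_pairs API (the output shown is illustrative; actual tags depend on the bundled lexicon):

    tgr = EngTagger.new
    tgr.tag_pairs("The dog runs")
    # => something like [["The", :det], ["dog", :nn], ["runs", :vbz]]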
@@ -222,15 +263,15 @@ class EngTagger
       tag = assign_tag(@conf[:current_tag], cleaned_word)
       @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
       tag = EngTagger.explain_tag(tag) if verbose
-      tagged << '<' + tag + '>' + word + '</' + tag + '>'
+      tagged << '<' + tag + '>' + word + '</' + tag + '>'
     end
     reset
     return tagged.join(' ')
   end
-
-  # Given a text string, return as many nouns and noun phrases as possible.
+
+  # Given a text string, return as many nouns and noun phrases as possible.
   # Applies add_tags and involves three stages:
-  #
+  #
   # * Tag the text
   # * Extract all the maximal noun phrases
   # * Recursively extract all noun phrases from the MNPs
|
|
244
285
|
return get_noun_phrases(tagged)
|
245
286
|
end
|
246
287
|
end
|
247
|
-
|
248
|
-
# Return an easy-on-the-eyes tagged version of a text string.
|
288
|
+
|
289
|
+
# Return an easy-on-the-eyes tagged version of a text string.
|
249
290
|
# Applies add_tags and reformats to be easier to read.
|
250
291
|
def get_readable(text, verbose = false)
|
251
292
|
return nil unless valid_text(text)
|
252
293
|
tagged = add_tags(text, verbose)
|
253
|
-
tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
|
294
|
+
tagged = tagged.gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
|
295
|
+
#!!!# tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
|
254
296
|
$1 + '/' + $2.upcase
|
255
297
|
end
|
256
|
-
return tagged
|
257
298
|
end
|
258
|
-
|
259
|
-
# Return an array of sentences (without POS tags) from a text.
|
299
|
+
|
300
|
+
# Return an array of sentences (without POS tags) from a text.
|
260
301
|
def get_sentences(text)
|
261
302
|
return nil unless valid_text(text)
|
262
303
|
tagged = add_tags(text)
|
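Illustrative output for the readable format (tags shown are examples only, and depend on the lexicon):

    tgr = EngTagger.new
    tgr.get_readable("The dog runs")
    # => something like "The/DET dog/NN runs/VBZ"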
@@ -270,25 +311,19 @@ class EngTagger
       sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
       sentence.gsub(Regexp.new(" (\W+)$")){$1}
       sentence.gsub(Regexp.new("^(`+) ")){$1}
-    end
+    end
     return sentences
   end
-
+
   # Given a POS-tagged text, this method returns a hash of all proper nouns
   # and their occurrence frequencies. The method is greedy and will
   # return multi-word phrases, if possible, so it would find ``Linguistic
-  # Data Consortium'' as a single unit, rather than as three individual
-  # proper nouns. This method does not stem the found words.
+  # Data Consortium'' as a single unit, rather than as three individual
+  # proper nouns. This method does not stem the found words.
   def get_proper_nouns(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(NNP).map do |n|
-      strip_tags(n)
-    end
-    nnp = Hash.new(0)
-    trimmed.each do |n|
-      next unless n.length < 100 # sanity check on word length
-      nnp[n] += 1 unless n =~ /\A\s*\z/
-    end
+    tags = [NNP]
+    nnp = build_matches_hash(build_trimmed(tagged, tags))
     # Now for some fancy resolution stuff...
     nnp.keys.each do |key|
       words = key.split(/\s/)
|
|
301
336
|
/\A([a-z])[a-z]*\z/ =~ word
|
302
337
|
$1
|
303
338
|
end.join ''
|
304
|
-
# If that acronym has been seen,
|
339
|
+
# If that acronym has been seen,
|
305
340
|
# remove it and add the values to
|
306
341
|
# the full name
|
307
342
|
if nnp[acronym]
|
@@ -312,167 +347,170 @@ class EngTagger
     end
     return nnp
   end
-
-  # Given a POS-tagged text, this method returns all nouns and their
-  # occurrence frequencies.
+
+  # Given a POS-tagged text, this method returns all nouns and their
+  # occurrence frequencies.
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_nouns(tagged)
     return nil unless valid_text(tagged)
-    nn = tagged.scan(NN)
-    trimmed = nn.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [NN]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
+
+  # Returns all types of verbs and does not descriminate between the
+  # various kinds. Combines all other verb methods listed in this
+  # class.
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
+  def get_verbs(tagged)
+    return nil unless valid_text(tagged)
+    tags = [VB, VBD, VBG, PART, VBP, VBZ]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
+
   def get_infinitive_verbs(tagged)
     return nil unless valid_text(tagged)
-    vb = tagged.scan(VB)
-    trimmed = vb.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VB]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_past_tense_verbs(tagged)
     return nil unless valid_text(tagged)
-    vbd = tagged.scan(VBD)
-    trimmed = vbd.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VBD]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_gerund_verbs(tagged)
     return nil unless valid_text(tagged)
-    vbg = tagged.scan(VBG)
-    trimmed = vbg.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VBG]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_passive_verbs(tagged)
     return nil unless valid_text(tagged)
-    part = tagged.scan(PART)
-    trimmed = part.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [PART]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
-
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_base_present_verbs(tagged)
     return nil unless valid_text(tagged)
-    vbp = tagged.scan(VBP)
-    trimmed = vbp.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VBP]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
  def get_present_verbs(tagged)
     return nil unless valid_text(tagged)
-    vbz = tagged.scan(VBZ)
-    trimmed = vbz.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VBZ]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_adjectives(tagged)
     return nil unless valid_text(tagged)
-    jj = tagged.scan(JJ)
-    trimmed = jj.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [JJ]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_comparative_adjectives(tagged)
     return nil unless valid_text(tagged)
-    jjr = tagged.scan(JJR)
-    trimmed = jjr.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
-  end
+    tags = [JJR]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
 
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_superlative_adjectives(tagged)
     return nil unless valid_text(tagged)
-    jjs = tagged.scan(JJS)
-    trimmed = jjs.map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [JJS]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
+
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
+  def get_adverbs(tagged)
+    return nil unless valid_text(tagged)
+    tags = [RB, RBR, RBS, RP]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
+
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
+  def get_interrogatives(tagged)
+    return nil unless valid_text(tagged)
+    tags = [WRB, WDT, WP, WPS]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
+
+  # To be consistent with documentation's naming of 'interrogative'
+  # parts of speech as 'question'
+  alias_method :get_question_parts, :get_interrogatives
+
+  # Returns all types of conjunctions and does not discriminate
+  # between the various kinds. E.g. coordinating, subordinating,
+  # correlative...
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
+  def get_conjunctions(tagged)
+    return nil unless valid_text(tagged)
+    tags = [CC, IN]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   # Given a POS-tagged text, this method returns only the maximal noun phrases.
-  # May be called directly, but is also used by get_noun_phrases
+  # May be called directly, but is also used by `get_noun_phrases`.
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_max_noun_phrases(tagged)
-    return unless valid_text(tagged)
-    mn_phrases = tagged.scan(@@mnp).map do |m|
-      strip_tags(m)
-    end
+    return nil unless valid_text(tagged)
+    tags = [@@mnp]
+    mn_phrases = build_trimmed(tagged, tags)
     ret = Hash.new(0)
     mn_phrases.each do |p|
       p = stem(p) unless p =~ /\s/ # stem single words
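A quick sketch of the new extractor family in use (all outputs are illustrative and depend on how the lexicon tags each word):

    tgr = EngTagger.new
    tagged = tgr.add_tags("When will the clearly better answer arrive?")
    tgr.get_adverbs(tagged)        # => e.g. {"clearly" => 1}
    tgr.get_interrogatives(tagged) # => e.g. {"When" => 1}
    tgr.get_conjunctions(tagged)   # collects words under the CC and IN tags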
@@ -482,11 +520,15 @@ class EngTagger
   end
 
   # Similar to get_words, but requires a POS-tagged text as an argument.
+  #
+  # @param tagged [String] the tagged text
+  # @return [Hash] the hash of matches
+  #
   def get_noun_phrases(tagged)
     return nil unless valid_text(tagged)
     found = Hash.new(0)
     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-    scanned = tagged.scan(@@mnp)
+    scanned = tagged.scan(@@mnp)
     # Find MNPs in the text, one sentence at a time
     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
     mn_phrases = []
@@ -495,9 +537,9 @@ class EngTagger
       mn_phrases += m.split(phrase_ext)
     end
     mn_phrases.each do |mnp|
-      # Split the phrase into an array of words, and create a loop for each word,
-      # shortening the phrase by removing the word in the first position.
-      # Record the phrase and any single nouns that are found
+      # Split the phrase into an array of words, and create a loop for each word,
+      # shortening the phrase by removing the word in the first position.
+      # Record the phrase and any single nouns that are found
       words = mnp.split
       words.length.times do |i|
         found[words.join(' ')] += 1 if words.length > 1
@@ -519,12 +561,12 @@ class EngTagger
       multiplier = word_count if @conf[:weight_noun_phrases]
       ret[k] += multiplier * v
     end
-    return ret
+    return ret
   end
-
-  # Reads some included corpus data and saves it in a stored hash on the
-  # local file system. This is called automatically if the tagger can't
-  # find the stored lexicon.
+
+  # Reads some included corpus data and saves it in a stored hash on the
+  # local file system. This is called automatically if the tagger can't
+  # find the stored lexicon.
   def install
     puts "Creating part-of-speech lexicon" if @conf[:debug]
     load_tags(@conf[:tag_lex])
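Noun-phrase extraction in practice, as a hedged sketch (exact keys depend on the lexicon and on the :longest_noun_phrase setting):

    tgr = EngTagger.new
    tagged = tgr.add_tags("The fat cat sat on the mat")
    tgr.get_noun_phrases(tagged)
    # => e.g. {"fat cat" => 1, "cat" => 1, "mat" => 1}
    # With :weight_noun_phrases => true, each count is multiplied by the
    # number of words in the phrase, per the config comment above.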
@@ -542,7 +584,23 @@ class EngTagger
   # Private methods #
   ###################
 
-
+  private
+
+  def build_trimmed(tagged, tags)
+    tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
+      strip_tags(n)
+    end
+  end
+
+  def build_matches_hash(trimmed)
+    ret = Hash.new(0)
+    trimmed.each do |n|
+      n = stem(n)
+      next unless n.length < 100 # sanity check on word length
+      ret[n] += 1 unless n =~ /\A\s*\z/
+    end
+    ret
+  end
 
   # Downcase the first letter of word
   def lcfirst(word)
@@ -552,8 +610,8 @@ class EngTagger
   # Upcase the first letter of word
   def ucfirst(word)
     word.split(//)[0].upcase + word.split(//)[1..-1].join
-  end
-
+  end
+
   # Return the word stem as given by Stemmable module. This can be
   # turned off with the class parameter @conf[:stem] => false.
   def stem(word)
@@ -561,8 +619,8 @@ class EngTagger
     return word.stem
   end
 
-  # This method will reset the preceeding tag to a sentence ender (PP).
-  # This prepares the first word of a new sentence to be tagged correctly.
+  # This method will reset the preceeding tag to a sentence ender (PP).
+  # This prepares the first word of a new sentence to be tagged correctly.
   def reset
     @conf[:current_tag] = 'pp'
   end
@@ -581,7 +639,7 @@ class EngTagger
       return true
     end
   end
-
+
   # Return a text string with the part-of-speech tags removed
   def strip_tags(tagged, downcase = false)
     return nil unless valid_text(tagged)
@@ -595,18 +653,11 @@ class EngTagger
       return text
     end
   end
-
-  # Strip the provided text
-  # in preparation for tagging
+
+  # Strip the provided text and separate off any punctuation in preparation for tagging
   def clean_text(text)
     return false unless valid_text(text)
-
-    unless $no_hpricot
-      # Strip out any markup and convert entities to their proper form
-      cleaned_text = Hpricot(text).inner_text
-    else
-      cleaned_text = text
-    end
+    cleaned_text = text.encode('utf-8')
     tokenized = []
     # Tokenize the text (splitting on punctuation as you go)
     cleaned_text.split(/\s+/).each do |line|
@@ -615,41 +666,43 @@ class EngTagger
     words = split_sentences(tokenized)
     return words
   end
-
-  # This handles all of the trailing periods, keeping those that
+
+  # This handles all of the trailing periods, keeping those that
   # belong on abbreviations and removing those that seem to be
   # at the end of sentences. This method makes some assumptions
   # about the use of capitalization in the incoming text
   def split_sentences(array)
     tokenized = array
-    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
                 supt det mssrs rev)
     army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
     inst = %w(dept univ assn bros ph.d)
-    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
                hwy hway la pde pd plz pl rd st tce)
     comp = %w(mfg inc ltd co corp)
-    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
-               ind ia kans kan ken ky la me md is mass mich minn miss mo mont
-               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+               ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
                va wash wis wisc wy wyo usafa alta man ont que sask yuk)
     month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
     misc = %w(vs etc no esp)
-    abbr = Hash.new
+    abbr = Hash.new
     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
       abbr[i] = true
     end
     words = Array.new
     tokenized.each_with_index do |t, i|
-      if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
+      if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
+         tokenized[i] =~ /\A(.+)\.\z/
        w = $1
-        # Don't separate the period off words that
+        # Don't separate the period off words that
         # meet any of the following conditions:
         #
         # 1. It is defined in one of the lists above
-        # 2. It is only one letter long: Alfred E. Sloan
+        # 2. It is only one letter long: Alfred E. Sloan
         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-        unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
+        unless abbr[w.downcase] or
+           [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
          words << w
          words << '.'
          next
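A sketch of the abbreviation-aware sentence splitting (output illustrative):

    tgr = EngTagger.new
    tgr.get_sentences("Dr. Smith arrived. He sat down.")
    # => e.g. ["Dr. Smith arrived.", "He sat down."]
    # "Dr." keeps its period because "dr" appears in the people list above.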
@@ -664,8 +717,8 @@ class EngTagger
     end
     return words
   end
-
-  # Separate punctuation from words, where appropriate. This leaves trailing
+
+  # Separate punctuation from words, where appropriate. This leaves trailing
   # periods in place to be dealt with later. Called by the clean_text method.
   def split_punct(text)
     # If there's no punctuation, return immediately
@@ -675,27 +728,27 @@ class EngTagger
 
     # Put quotes into a standard format
     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
-    text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
+    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
     text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
-
+
     # Handle all other punctuation
     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
-    text = text.gsub(/:/o, " :") # Shift semicolons off
-    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
+    text = text.gsub(/:/o, " : ") # Shift semicolons off
+    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
     text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
     text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
 
     # English-specific contractions
     text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
-    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
     text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
     result = text.split(' ')
     return result
-  end
-
+  end
+
   # Given a preceding tag, assign a tag word. Called by the add_tags method.
   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
   def assign_tag(prev_tag, word)
@@ -709,7 +762,7 @@ class EngTagger
     best_so_far = 0
     w = @@lexicon[word]
     t = @@hmm
-
+
     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
     # which is used in most POS taggers
     best_tag = ""
@@ -724,9 +777,9 @@ class EngTagger
       else
        next
       end
-
-      # Bayesian logic:
-      # P = P( tag | prev_tag ) * P( tag | word )
+
+      # Bayesian logic:
+      # P = P( tag | prev_tag ) * P( tag | word )
       probability = t[prev_tag][tag] * (pw + 1)
       # Set the tag with maximal probability
       if probability > best_so_far
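A toy illustration of the scoring rule above, with invented numbers (not from the shipped lexicon):

    # Score each candidate tag for a word following a determiner and keep
    # the argmax of P(tag | prev_tag) * (count(tag | word) + 1).
    t = { "det" => { "nn" => 0.5, "vb" => 0.01 } } # transition probabilities
    w = { "nn" => 30, "vb" => 2 }                  # tag counts for this word
    best_tag = w.keys.max_by { |tag| t["det"][tag] * (w[tag] + 1) }
    # => "nn" (0.5 * 31 beats 0.01 * 3)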
@@ -735,18 +788,18 @@ class EngTagger
       end
     end
     return best_tag
-  end
-
-  # This method determines whether a word should be considered in its
+  end
+
+  # This method determines whether a word should be considered in its
   # lower or upper case form. This is useful in considering proper nouns
-  # and words that begin sentences. Called by add_tags.
+  # and words that begin sentences. Called by add_tags.
   def clean_word(word)
     lcf = lcfirst(word)
     # seen this word as it appears (lower or upper case)
     if @@lexicon[word]
       return word
     elsif @@lexicon[lcf]
-      # seen this word only as lower case
+      # seen this word only as lower case
       return lcf
     else
       # never seen this word. guess.
@@ -754,14 +807,13 @@ class EngTagger
     end
   end
 
-  # This changes any word not appearing in the lexicon to identifiable
-  # classes of words handled by a simple unknown word classification
+  # This changes any word not appearing in the lexicon to identifiable
+  # classes of words handled by a simple unknown word classification
   # metric. Called by the clean_word method.
   def classify_unknown_word(word)
     if /[\(\{\[]/ =~ word # Left brackets
       classified = "*LRB*"
-    elsif
-      /[\)\}\]]/ =~ word # Right brackets
+    elsif /[\)\}\]]/ =~ word # Right brackets
       classified = "*RRB*"
     elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
       classified = "*NUM*"
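How the classifier buckets unseen tokens, using the regexes visible above (classify_unknown_word sits behind private in 0.3.0, so send is used here purely for demonstration):

    tgr = EngTagger.new
    tgr.send(:classify_unknown_word, "(")    # => "*LRB*"
    tgr.send(:classify_unknown_word, "42.5") # => "*NUM*"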
@@ -800,33 +852,33 @@ class EngTagger
     end
     return classified
   end
-
-  # This returns a compiled regexp for extracting maximal noun phrases
+
+  # This returns a compiled regexp for extracting maximal noun phrases
   # from a POS-tagged text.
   def get_max_noun_regex
     regex = /
-      # optional number, gerund - adjective -participle
-      (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
-      # Followed by one or more nouns
-      (?:#{NN})+
-      (?:
-        # Optional preposition, determinant, cardinal
-        (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
-        # Optional gerund-adjective -participle
-        (?:#{GER}|#{ADJ}|#{PART})*
-        # one or more nouns
-        (?:#{NN})+
-      )*
-    /xo #/
-    return regex
-  end
-
-  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
-  # YAML data parser. It will load a YAML document with a collection of key:
-  # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
-  # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
-  def load_tags(lexicon)
-    path = File.join($lexpath, lexicon)
+      # optional number, gerund - adjective -participle
+      (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
+      # Followed by one or more nouns
+      (?:#{NN})+
+      (?:
+        # Optional preposition, determinant, cardinal
+        (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+        # Optional gerund-adjective -participle
+        (?:#{GER}|#{ADJ}|#{PART})*
+        # one or more nouns
+        (?:#{NN})+
+      )*
+    /xo #/
+    return regex
+  end
+
+  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+  # YAML data parser. It will load a YAML document with a collection of key:
+  # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+  # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+  def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
+    path = File.join(lexpath, lexicon)
     fh = File.open(path, 'r')
     while line = fh.gets
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
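The loaders now take an optional lexicon directory, defaulting to the gem's own data. A hypothetical sketch (the custom path is made up, and send is used because the loaders are private in 0.3.0):

    tgr = EngTagger.new
    # Default: read tags.yml from DEFAULT_LEXPATH inside the gem.
    tgr.send(:load_tags, 'tags.yml')
    # Or point at a custom lexicon directory instead.
    tgr.send(:load_tags, 'tags.yml', '/path/to/custom/lexicon')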
@@ -837,19 +889,19 @@ class EngTagger
       pairs = {}
       items.each do |i|
         /([^:]+):\s*(.+)/ =~ i
-        pairs[$1] = $2.to_f
+        pairs[$1] = $2.to_f
       end
       @@hmm[key] = pairs
     end
     fh.close
   end
 
-  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
-  # YAML data parser. It will load a YAML document with a collection of key:
-  # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
-  # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
-  def load_words(lexicon)
-    path = File.join($lexpath, lexicon)
+  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+  # YAML data parser. It will load a YAML document with a collection of key:
+  # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+  # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+  def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
+    path = File.join(lexpath, lexicon)
     fh = File.open(path, 'r')
     while line = fh.gets
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
@@ -860,15 +912,14 @@ class EngTagger
       pairs = {}
       items.each do |i|
         /([^:]+):\s*(.+)/ =~ i
-        pairs[$1] = $2.to_f
+        pairs[$1] = $2.to_f
       end
       @@lexicon[key] = pairs
     end
     fh.close
   end
-
-  #memoize the stem and assign_tag methods
+
+  #memoize the stem and assign_tag methods
   memoize("stem")
-  memoize("assign_tag")
+  memoize("assign_tag")
 end
-