engtagger 0.1.1 → 0.1.2
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/{LICENSE.txt → LICENSE} +22 -0
- data/README.md +137 -0
- data/Rakefile +2 -24
- data/engtagger.gemspec +17 -0
- data/lib/engtagger.rb +730 -729
- data/lib/engtagger/porter.rb +2 -6
- data/lib/engtagger/pos_tags.hash +0 -0
- data/lib/engtagger/pos_words.hash +0 -0
- data/lib/engtagger/version.rb +3 -0
- metadata +41 -64
- data/History.txt +0 -10
- data/Manifest.txt +0 -13
- data/README.txt +0 -140
data/.gitignore
ADDED
data/Gemfile
ADDED
data/{LICENSE.txt → LICENSE}
RENAMED
@@ -338,3 +338,25 @@ proprietary programs. If your program is a subroutine library, you may
 consider it more useful to permit linking proprietary applications with the
 library. If this is what you want to do, use the GNU Library General
 Public License instead of this License.
+Copyright (c) 2012 Yoichiro Hasebe
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,137 @@
+# EngTagger
+
+English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
+
+### Description
+
+A Ruby port of Perl Lingua::EN::Tagger, a probability-based, corpus-trained
+tagger that assigns POS tags to English text based on a lookup dictionary and
+a set of probability values. The tagger assigns appropriate tags based on
+conditional probabilities: it examines the preceding tag to determine the
+appropriate tag for the current word. Unknown words are classified according to
+word morphology or can be set to be treated as nouns or other parts of speech.
+The tagger also extracts as many nouns and noun phrases as it can, using a set
+of regular expressions.
+
+### Features
+
+* Assigns POS tags to English text
+* Extracts noun phrases from tagged text
+* Extracts proper nouns and maximal noun phrases, with occurrence counts
+
+### Synopsis
+
+    require 'rubygems'
+    require 'engtagger'
+
+    # Create a parser object
+    tgr = EngTagger.new
+
+    # Sample text
+    text = "Alice chased the big fat cat."
+
+    # Add part-of-speech tags to text
+    tagged = tgr.add_tags(text)
+
+    #=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj> <nn>cat</nn> <pp>.</pp>"
+
+    # Get a list of all nouns and noun phrases with occurrence counts
+    word_list = tgr.get_words(text)
+
+    #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
+
+    # Get a readable version of the tagged text
+    readable = tgr.get_readable(text)
+
+    #=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
+
+    # Get all nouns from a tagged output
+    nouns = tgr.get_nouns(tagged)
+
+    #=> {"cat"=>1, "Alice"=>1}
+
+    # Get all proper nouns
+    proper = tgr.get_proper_nouns(tagged)
+
+    #=> {"Alice"=>1}
+
+    # Get all noun phrases of any syntactic level
+    # (same as word_list but takes a tagged input)
+    nps = tgr.get_noun_phrases(tagged)
+
+    #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
+
+### Tag Set
+
+The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
+
+    CC      Conjunction, coordinating                 and, or
+    CD      Adjective, cardinal number                3, fifteen
+    DET     Determiner                                this, each, some
+    EX      Pronoun, existential there                there
+    FW      Foreign words
+    IN      Preposition / Conjunction                 for, of, although, that
+    JJ      Adjective                                 happy, bad
+    JJR     Adjective, comparative                    happier, worse
+    JJS     Adjective, superlative                    happiest, worst
+    LS      Symbol, list item                         A, A.
+    MD      Verb, modal                               can, could, 'll
+    NN      Noun                                      aircraft, data
+    NNP     Noun, proper                              London, Michael
+    NNPS    Noun, proper, plural                      Australians, Methodists
+    NNS     Noun, plural                              women, books
+    PDT     Determiner, prequalifier                  quite, all, half
+    POS     Possessive                                's, '
+    PRP     Determiner, possessive second             mine, yours
+    PRPS    Determiner, possessive                    their, your
+    RB      Adverb                                    often, not, very, here
+    RBR     Adverb, comparative                       faster
+    RBS     Adverb, superlative                       fastest
+    RP      Adverb, particle                          up, off, out
+    SYM     Symbol                                    *
+    TO      Preposition                               to
+    UH      Interjection                              oh, yes, mmm
+    VB      Verb, infinitive                          take, live
+    VBD     Verb, past tense                          took, lived
+    VBG     Verb, gerund                              taking, living
+    VBN     Verb, past/passive participle             taken, lived
+    VBP     Verb, base present form                   take, live
+    VBZ     Verb, present 3SG -s form                 takes, lives
+    WDT     Determiner, question                      which, whatever
+    WP      Pronoun, question                         who, whoever
+    WPS     Determiner, possessive & question         whose
+    WRB     Adverb, question                          when, how, however
+
+    PP      Punctuation, sentence ender               ., !, ?
+    PPC     Punctuation, comma                        ,
+    PPD     Punctuation, dollar sign                  $
+    PPL     Punctuation, quotation mark left          ``
+    PPR     Punctuation, quotation mark right         ''
+    PPS     Punctuation, colon, semicolon, ellipsis   :, ..., -
+    LRB     Punctuation, left bracket                 (, {, [
+    RRB     Punctuation, right bracket                ), }, ]
+
+### Requirements
+
+* [Hpricot](http://code.whytheluckystiff.net/hpricot/) (optional)
+
+### Install
+
+    (sudo) gem install engtagger
+
+### Author
+
+Of this Ruby library:
+
+* Yoichiro Hasebe (yohasebe [at] gmail.com)
+
+### Acknowledgement
+
+This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
+The credit for the crucial part of its algorithm/design therefore goes to
+Aaron Coburn, the author of the original Perl version.
+
+### License
+
+This library is distributed under the GPL. Please see the LICENSE file.
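Beyond the synopsis above, the new lib/engtagger.rb (shown in full below) also offers verbose tag names and tag-free sentence splitting. A minimal sketch, assuming the gem is installed; the sample text is illustrative and the outputs are described rather than captured from a real run:

    require 'engtagger'

    tgr = EngTagger.new
    text = "Alice chased the cat. It ran."

    # Passing verbose = true expands each tag via EngTagger.explain_tag,
    # so the readable output uses names like noun_proper instead of NNP
    readable = tgr.get_readable(text, true)

    # get_sentences splits on the sentence-ender tag (pp) and strips all tags
    sentences = tgr.get_sentences(text)  #=> one plain string per sentence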
data/Rakefile
CHANGED
@@ -1,24 +1,2 @@
-require 'rubygems'
-require 'hoe'
-require './lib/engtagger.rb'
-
-Hoe.new('EngTagger', EngTagger::VERSION) do |p|
-  p.name = "engtagger"
-  p.author = "Yoichiro Hasebe"
-  p.description = p.paragraphs_of('README.txt', 3).join("\n\n")
-  p.email = 'yohasebe@gmail.com'
-  p.summary = p.paragraphs_of('README.txt', 1).join("\n\n")
-  p.url = "http://engtagger.rubyforge.org"
-  p.remote_rdoc_dir = '' # Release to root
-  p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
-  p.extra_deps << ['hpricot']
-  p.rdoc_pattern = /^(.+\.rb|.+\.txt|.+\.yaml|[^\.]+)$/
-  p.need_zip = true
-end
-
-desc "Release and publish documentation"
-task :repubdoc => [:release, :publish_docs]
-
-# vim: syntax=Ruby
+#!/usr/bin/env rake
+require "bundler/gem_tasks"
data/engtagger.gemspec
ADDED
@@ -0,0 +1,17 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/engtagger/version', __FILE__)
+
+Gem::Specification.new do |gem|
+  gem.authors       = ["Yoichiro Hasebe"]
+  gem.email         = ["yohasebe@gmail.com"]
+  gem.summary       = %q{A probability based, corpus-trained English POS tagger}
+  gem.description   = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
+  gem.homepage      = "http://github.com/yohasebe/engtagger"
+
+  gem.files         = `git ls-files`.split($\)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.name          = "engtagger"
+  gem.require_paths = ["lib"]
+  gem.version       = EngTagger::VERSION
+end
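The summary above also lists data/lib/engtagger/version.rb (+3 -0), which the viewer does not expand. Since the gemspec reads EngTagger::VERSION, the new file presumably looks like this (an assumption, not shown in the diff):

    # lib/engtagger/version.rb -- assumed content, 3 added lines per the summary
    module EngTagger
      VERSION = "0.1.2"
    end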
data/lib/engtagger.rb
CHANGED
@@ -1,729 +1,730 @@
[The previous 729-line version of lib/engtagger.rb is removed wholesale by this diff. Only scattered fragments of the removed lines survived the page extraction, so they are not reproduced here; the 730-line replacement follows in full.]

+#!/usr/bin/env ruby
+# -*- coding: utf-8 -*-
+
+$LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
+require 'rubygems'
+require 'kconv'
+require 'porter'
+
+# Use Hpricot for extracting English text from docs with XML-like tags
+begin
+  require 'hpricot'
+rescue LoadError
+  $no_hpricot = true
+end
+
+# File paths
+$lexpath = File.join(File.dirname(__FILE__), 'engtagger')
+$word_path = File.join($lexpath, "pos_words.hash")
+$tag_path = File.join($lexpath, "pos_tags.hash")
+
+# For memoization (code snippet from http://eigenclass.org/hiki/bounded-space-memoization)
+class Module
+  def memoize(method)
+    # alias_method is faster than define_method + old.bind(self).call
+    alias_method "__memoized__#{method}", method
+    module_eval <<-EOF
+      def #{method}(*a, &b)
+        # assumes the block won't change the result if the args are the same
+        (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
+      end
+    EOF
+  end
+end
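Aside (not part of the diff): a minimal sketch of what the Module#memoize helper above does. The Fib class here is hypothetical.

    # Memoizing a recursive method with the helper above: the original method
    # is aliased away and results are cached per argument list.
    class Fib
      def fib(n)
        n < 2 ? n : fib(n - 1) + fib(n - 2)
      end
      memoize("fib")  # recursive calls now consult the per-argument cache
    end

    Fib.new.fib(30)  #=> 832040, each sub-problem computed only once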
+
+# English part-of-speech tagger class
+class EngTagger
+
+  #################
+  # Class methods #
+  #################
+
+  # Return a class variable that holds probability data
+  def self.hmm
+    return @@hmm
+  end
+
+  # Return a class variable that holds lexical data
+  def self.lexicon
+    return @@lexicon
+  end
+
+  # Return a regexp from a string argument that matches an XML-style pos tag
+  def self.get_ext(tag = nil)
+    return nil unless tag
+    return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
+  end
+
+  # Regexps to match XML-style part-of-speech tags
+  NUM   = get_ext('cd')
+  GER   = get_ext('vbg')
+  ADJ   = get_ext('jj[rs]*')
+  PART  = get_ext('vbn')
+  NN    = get_ext('nn[sp]*')
+  NNP   = get_ext('nnp')
+  PREP  = get_ext('in')
+  DET   = get_ext('det')
+  PAREN = get_ext('[lr]rb')
+  QUOT  = get_ext('ppr')
+  SEN   = get_ext('pp')
+  WORD  = get_ext('\w+')
+
+  # Convert a Treebank-style, abbreviated tag into verbose definitions
+  def self.explain_tag(tag)
+    if TAGS[tag]
+      return TAGS[tag]
+    else
+      return tag
+    end
+  end
+
+  # The following makes a hash to convert a pos tag to its definition,
+  # used by the explain_tag method
+  tags = [
+    "CC",   "Conjunction, coordinating",
+    "CD",   "Adjective, cardinal number",
+    "DET",  "Determiner",
+    "EX",   "Pronoun, existential there",
+    "FW",   "Foreign words",
+    "IN",   "Preposition / Conjunction",
+    "JJ",   "Adjective",
+    "JJR",  "Adjective, comparative",
+    "JJS",  "Adjective, superlative",
+    "LS",   "Symbol, list item",
+    "MD",   "Verb, modal",
+    "NN",   "Noun",
+    "NNP",  "Noun, proper",
+    "NNPS", "Noun, proper, plural",
+    "NNS",  "Noun, plural",
+    "PDT",  "Determiner, prequalifier",
+    "POS",  "Possessive",
+    "PRP",  "Determiner, possessive second",
+    "PRPS", "Determiner, possessive",
+    "RB",   "Adverb",
+    "RBR",  "Adverb, comparative",
+    "RBS",  "Adverb, superlative",
+    "RP",   "Adverb, particle",
+    "SYM",  "Symbol",
+    "TO",   "Preposition",
+    "UH",   "Interjection",
+    "VB",   "Verb, infinitive",
+    "VBD",  "Verb, past tense",
+    "VBG",  "Verb, gerund",
+    "VBN",  "Verb, past/passive participle",
+    "VBP",  "Verb, base present form",
+    "VBZ",  "Verb, present 3SG -s form",
+    "WDT",  "Determiner, question",
+    "WP",   "Pronoun, question",
+    "WPS",  "Determiner, possessive & question",
+    "WRB",  "Adverb, question",
+    "PP",   "Punctuation, sentence ender",
+    "PPC",  "Punctuation, comma",
+    "PPD",  "Punctuation, dollar sign",
+    "PPL",  "Punctuation, quotation mark left",
+    "PPR",  "Punctuation, quotation mark right",
+    "PPS",  "Punctuation, colon, semicolon, ellipsis",
+    "LRB",  "Punctuation, left bracket",
+    "RRB",  "Punctuation, right bracket"
+  ]
+  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
+  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+  TAGS = Hash[*tags]
+
+  # Hash storing config values:
+  #
+  # * :unknown_word_tag
+  #   => (String) Tag to assign to unknown words
+  # * :stem
+  #   => (Boolean) Stem single words using the Porter module
+  # * :weight_noun_phrases
+  #   => (Boolean) When returning occurrence counts for a noun phrase, multiply
+  #      the value by the number of words in the NP.
+  # * :longest_noun_phrase
+  #   => (Integer) Will ignore noun phrases longer than this threshold. This
+  #      affects only the get_words() and get_nouns() methods.
+  # * :relax
+  #   => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+  #      uncommon words, particularly words used polysemously
+  # * :tag_lex
+  #   => (String) Name of the YAML file containing a hash of adjacent part of
+  #      speech tags and the probability of each
+  # * :word_lex
+  #   => (String) Name of the YAML file containing a hash of words and corresponding
+  #      parts of speech
+  # * :unknown_lex
+  #   => (String) Name of the YAML file containing a hash of tags for unknown
+  #      words and corresponding parts of speech
+  # * :tag_path
+  #   => (String) Directory path of tag_lex
+  # * :word_path
+  #   => (String) Directory path of word_lex and unknown_lex
+  # * :debug
+  #   => (Boolean) Print debug messages
+  attr_accessor :conf
+
+  ###############
+  # Constructor #
+  ###############
+
+  # Take a hash of parameters that override default values.
+  # See above for details.
+  def initialize(params = {})
+    @conf = Hash.new
+    @conf[:unknown_word_tag] = ''
+    @conf[:stem] = false
+    @conf[:weight_noun_phrases] = false
+    @conf[:longest_noun_phrase] = 5
+    @conf[:relax] = false
+    @conf[:tag_lex] = 'tags.yml'
+    @conf[:word_lex] = 'words.yml'
+    @conf[:unknown_lex] = 'unknown.yml'
+    @conf[:word_path] = $word_path
+    @conf[:tag_path] = $tag_path
+    @conf[:debug] = false
+    # assuming that we start analyzing from the beginning of a new sentence...
+    @conf[:current_tag] = 'pp'
+    @conf.merge!(params) if params
+    unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
+      print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
+      @@hmm = Hash.new
+      @@lexicon = Hash.new
+    else
+      lexf = File.open(@conf[:word_path], 'r')
+      @@lexicon = Marshal.load(lexf)
+      lexf.close
+      hmmf = File.open(@conf[:tag_path], 'r')
+      @@hmm = Marshal.load(hmmf)
+      hmmf.close
+    end
+    @@mnp = get_max_noun_regex
+  end
+
+  ##################
+  # Public methods #
+  ##################
+
+  # Examine the string provided and return it fully tagged in XML style
+  def add_tags(text, verbose = false)
+    return nil unless valid_text(text)
+    tagged = []
+    words = clean_text(text)
+    words.each do |word|
+      cleaned_word = clean_word(word)
+      tag = assign_tag(@conf[:current_tag], cleaned_word)
+      @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
+      tag = EngTagger.explain_tag(tag) if verbose
+      tagged << '<' + tag + '>' + word + '</' + tag + '>'
+    end
+    reset
+    return tagged.join(' ')
+  end
+
+  # Given a text string, return as many nouns and noun phrases as possible.
+  # Applies add_tags and involves three stages:
+  #
+  # * Tag the text
+  # * Extract all the maximal noun phrases
+  # * Recursively extract all noun phrases from the MNPs
+  #
+  def get_words(text)
+    return false unless valid_text(text)
+    tagged = add_tags(text)
+    if(@conf[:longest_noun_phrase] <= 1)
+      return get_nouns(tagged)
+    else
+      return get_noun_phrases(tagged)
+    end
+  end
+
+  # Return an easy-on-the-eyes tagged version of a text string.
+  # Applies add_tags and reformats to be easier to read.
+  def get_readable(text, verbose = false)
+    return nil unless valid_text(text)
+    tagged = add_tags(text, verbose)
+    tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+      $1 + '/' + $2.upcase
+    end
+    return tagged
+  end
+
+  # Return an array of sentences (without POS tags) from a text.
+  def get_sentences(text)
+    return nil unless valid_text(text)
+    tagged = add_tags(text)
+    sentences = Array.new
+    tagged.split(/<\/pp>/).each do |line|
+      sentences << strip_tags(line)
+    end
+    sentences = sentences.map do |sentence|
+      sentence = sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
+      sentence = sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
+      sentence = sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
+      sentence = sentence.gsub(Regexp.new(" (\W+)$")){$1}
+      sentence.gsub(Regexp.new("^(`+) ")){$1}
+    end
+    return sentences
+  end
+
+  # Given a POS-tagged text, this method returns a hash of all proper nouns
+  # and their occurrence frequencies. The method is greedy and will
+  # return multi-word phrases, if possible, so it would find ``Linguistic
+  # Data Consortium'' as a single unit, rather than as three individual
+  # proper nouns. This method does not stem the found words.
+  def get_proper_nouns(tagged)
+    return nil unless valid_text(tagged)
+    trimmed = tagged.scan(NNP).map do |n|
+      strip_tags(n)
+    end
+    nnp = Hash.new(0)
+    trimmed.each do |n|
+      next unless n.length < 100 # sanity check on word length
+      nnp[n] += 1 unless n =~ /\A\s*\z/
+    end
+    # Now for some fancy resolution stuff...
+    nnp.keys.each do |key|
+      words = key.split(/\s/)
+      # Let's say this is an organization's name --
+      # (and it's got at least three words)
+      # is there a corresponding acronym in this hash?
+      if words.length > 2
+        # Make a (naive) acronym out of this name
+        acronym = words.map do |word|
+          /\A([a-z])[a-z]*\z/ =~ word
+          $1
+        end.join ''
+        # If that acronym has been seen,
+        # remove it and add the values to
+        # the full name
+        if nnp[acronym]
+          nnp[key] += nnp[acronym]
+          nnp.delete(acronym)
+        end
+      end
+    end
+    return nnp
+  end
+
+  # Given a POS-tagged text, this method returns all nouns and their
+  # occurrence frequencies.
+  def get_nouns(tagged)
+    return nil unless valid_text(tagged)
+    trimmed = tagged.scan(NN).map do |n|
+      strip_tags(n)
+    end
+    ret = Hash.new(0)
+    trimmed.each do |n|
+      n = stem(n)
+      next unless n.length < 100 # sanity check on word length
+      ret[n] += 1 unless n =~ /\A\s*\z/
+    end
+    return ret
+  end
+
+  # Given a POS-tagged text, this method returns only the maximal noun phrases.
+  # May be called directly, but is also used by get_noun_phrases
+  def get_max_noun_phrases(tagged)
+    return unless valid_text(tagged)
+    mn_phrases = tagged.scan(@@mnp).map do |m|
+      strip_tags(m)
+    end
+    ret = Hash.new(0)
+    mn_phrases.each do |p|
+      p = stem(p) unless p =~ /\s/ # stem single words
+      ret[p] += 1 unless p =~ /\A\s*\z/
+    end
+    return ret
+  end
+
+  # Similar to get_words, but requires a POS-tagged text as an argument.
+  def get_noun_phrases(tagged)
+    return nil unless valid_text(tagged)
+    found = Hash.new(0)
+    phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
+    scanned = tagged.scan(@@mnp)
+    # Find MNPs in the text, one sentence at a time
+    # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
+    mn_phrases = []
+    scanned.each do |m|
+      found[m] += 1 if phrase_ext =~ m
+      mn_phrases += m.split(phrase_ext)
+    end
+    mn_phrases.each do |mnp|
+      # Split the phrase into an array of words, and create a loop for each word,
+      # shortening the phrase by removing the word in the first position.
+      # Record the phrase and any single nouns that are found
+      words = mnp.split
+      words.length.times do |i|
+        found[words.join(' ')] += 1 if words.length > 1
+        w = words.shift
+        found[w] += 1 if w =~ /#{NN}/
+      end
+    end
+    ret = Hash.new(0)
+    found.keys.each do |f|
+      k = strip_tags(f)
+      v = found[f]
+      # We weight by the word count to favor long noun phrases
+      space_count = k.scan(/\s+/)
+      word_count = space_count.length + 1
+      # Throttle MNPs if necessary
+      next if word_count > @conf[:longest_noun_phrase]
+      k = stem(k) unless word_count > 1 # stem single words
+      multiplier = 1
+      multiplier = word_count if @conf[:weight_noun_phrases]
+      ret[k] += multiplier * v
+    end
+    return ret
+  end
+
+  # Reads some included corpus data and saves it in a stored hash on the
+  # local file system. This is called automatically if the tagger can't
+  # find the stored lexicon.
+  def install
+    puts "Creating part-of-speech lexicon" if @conf[:debug]
+    load_tags(@conf[:tag_lex])
+    load_words(@conf[:word_lex])
+    load_words(@conf[:unknown_lex])
+    File.open(@conf[:word_path], 'w') do |f|
+      Marshal.dump(@@lexicon, f)
+    end
+    File.open(@conf[:tag_path], 'w') do |f|
+      Marshal.dump(@@hmm, f)
+    end
+  end
+
+  ###################
+  # Private methods #
+  ###################
+
+  private
+
+  # Downcase the first letter of word
+  def lcfirst(word)
+    word.split(//)[0].downcase + word.split(//)[1..-1].join
+  end
+
+  # Upcase the first letter of word
+  def ucfirst(word)
+    word.split(//)[0].upcase + word.split(//)[1..-1].join
+  end
+
+  # Return the word stem as given by the Stemmable module. This can be
+  # turned off with the class parameter @conf[:stem] => false.
+  def stem(word)
+    return word unless @conf[:stem]
+    return word.stem
+  end
+
+  # This method will reset the preceding tag to a sentence ender (PP).
+  # This prepares the first word of a new sentence to be tagged correctly.
+  def reset
+    @conf[:current_tag] = 'pp'
+  end
+
+  # Check whether the text is a valid string
+  def valid_text(text)
+    if !text
+      # there's nothing to parse
+      print "method call on uninitialized variable" if @conf[:debug]
+      return false
+    elsif /\A\s*\z/ =~ text
+      # text is an empty string, nothing to parse
+      return false
+    else
+      # text is valid
+      return true
+    end
+  end
+
+  # Return a text string with the part-of-speech tags removed
+  def strip_tags(tagged, downcase = false)
+    return nil unless valid_text(tagged)
+    text = tagged.gsub(/<[^>]+>/m, "")
+    text = text.gsub(/\s+/m, " ")
+    text = text.gsub(/\A\s*/, "")
+    text = text.gsub(/\s*\z/, "")
+    if downcase
+      return text.downcase
+    else
+      return text
+    end
+  end
+
+  # Strip the provided text of HTML-style tags and separate off any punctuation
+  # in preparation for tagging
+  def clean_text(text)
+    return false unless valid_text(text)
+    text = text.toutf8
+    unless $no_hpricot
+      # Strip out any markup and convert entities to their proper form
+      cleaned_text = Hpricot(text).inner_text
+    else
+      cleaned_text = text
+    end
+    tokenized = []
+    # Tokenize the text (splitting on punctuation as you go)
+    cleaned_text.split(/\s+/).each do |line|
+      tokenized += split_punct(line)
+    end
+    words = split_sentences(tokenized)
+    return words
+  end
+
+  # This handles all of the trailing periods, keeping those that
+  # belong on abbreviations and removing those that seem to be
+  # at the end of sentences. This method makes some assumptions
+  # about the use of capitalization in the incoming text
+  def split_sentences(array)
+    tokenized = array
+    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys
+                supt det mssrs rev)
+    army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
+    inst = %w(dept univ assn bros ph.d)
+    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+               hwy hway la pde pd plz pl rd st tce)
+    comp = %w(mfg inc ltd co corp)
+    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+               ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+               va wash wis wisc wy wyo usafa alta man ont que sask yuk)
+    month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
+    misc = %w(vs etc no esp)
+    abbr = Hash.new
+    [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
+      abbr[i] = true
+    end
+    words = Array.new
+    tokenized.each_with_index do |t, i|
+      if tokenized[i + 1] and tokenized[i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
+        w = $1
+        # Don't separate the period off words that
+        # meet any of the following conditions:
+        #
+        # 1. It is defined in one of the lists above
+        # 2. It is only one letter long: Alfred E. Sloan
+        # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
+        unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
+          words << w
+          words << '.'
+          next
+        end
+      end
+      words << tokenized[i]
+    end
+    # If the final word ends in a period...
+    if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
+      words[-1] = $1
+      words.push '.'
+    end
+    return words
+  end
+
+  # Separate punctuation from words, where appropriate. This leaves trailing
+  # periods in place to be dealt with later. Called by the clean_text method.
+  def split_punct(text)
+    # If there's no punctuation, return immediately
+    return [text] if /\A\w+\z/ =~ text
+    # Sanity checks
+    text = text.gsub(/\W{10,}/o, " ")
+
+    # Put quotes into a standard format
+    text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
+    text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
+    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
+    text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+
+    # Handle all other punctuation
+    text = text.gsub(/--+/o, " - ") # Convert and separate dashes
+    text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
+    text = text.gsub(/:/o, " :") # Shift colons off
+    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
+    text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
+    text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
+
+    # English-specific contractions
+    text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
+    result = text.split(' ')
+    return result
+  end
+
+  # Given a preceding tag, assign a tag to the current word. Called by the
+  # add_tags method. This method is a modified version of the Viterbi
+  # algorithm for part-of-speech tagging.
+  def assign_tag(prev_tag, word)
+    if word == "-unknown-"
+      # classify unknown words accordingly
+      return @conf[:unknown_word_tag]
+    elsif word == "-sym-"
+      # If this is a symbol, tag it as a symbol
+      return "sym"
+    end
+    best_so_far = 0
+    w = @@lexicon[word]
+    t = @@hmm
+
+    # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
+    # which is used in most POS taggers
+    best_tag = ""
+    t[prev_tag].keys.each do |tag|
+      # With @conf[:relax] set, this method
+      # will also include any `open classes' of POS tags
+      pw = 0
+      if w[tag]
+        pw = w[tag]
+      elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+        pw = 0
+      else
+        next
+      end
+
+      # Bayesian logic:
+      # P = P( tag | prev_tag ) * P( tag | word )
+      probability = t[prev_tag][tag] * (pw + 1)
+      # Set the tag with maximal probability
+      if probability > best_so_far
+        best_so_far = probability
+        best_tag = tag
+      end
+    end
+    return best_tag
+  end
+
+  # This method determines whether a word should be considered in its
+  # lower or upper case form. This is useful in considering proper nouns
+  # and words that begin sentences. Called by add_tags.
+  def clean_word(word)
+    lcf = lcfirst(word)
+    # seen this word as it appears (lower or upper case)
+    if @@lexicon[word]
+      return word
+    elsif @@lexicon[lcf]
+      # seen this word only as lower case
+      return lcf
+    else
+      # never seen this word. guess.
+      return classify_unknown_word(word)
+    end
+  end
+
+  # This changes any word not appearing in the lexicon to identifiable
+  # classes of words handled by a simple unknown word classification
+  # metric. Called by the clean_word method.
+  def classify_unknown_word(word)
+    if /[\(\{\[]/ =~ word # Left brackets
+      classified = "*LRB*"
+    elsif /[\)\}\]]/ =~ word # Right brackets
+      classified = "*RRB*"
+    elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
+      classified = "*NUM*"
+    elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs
+      classified = "*NUM*"
+    elsif /\A-?\d+\w+\z/o =~ word # Ordinal number
+      classified = "*ORD*"
+    elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
+      classified = "-abr-"
+    elsif /\w-\w/o =~ word # Hyphenated word
+      /-([^-]+)\z/ =~ word
+      h_suffix = $1
+      if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
+        # last part of this is defined as an adjective
+        classified = "-hyp-adj-"
+      else
+        # last part of this is not defined as an adjective
+        classified = "-hyp-"
+      end
+    elsif /\A\W+\z/o =~ word
+      classified = "-sym-" # Symbol
+    elsif word == ucfirst(word)
+      classified = "-cap-" # Capitalized word
+    elsif /ing\z/o =~ word
+      classified = "-ing-" # Ends in 'ing'
+    elsif /s\z/o =~ word
+      classified = "-s-" # Ends in 's'
+    elsif /tion\z/o =~ word
+      classified = "-tion-" # Ends in 'tion'
+    elsif /ly\z/o =~ word
+      classified = "-ly-" # Ends in 'ly'
+    elsif /ed\z/o =~ word
+      classified = "-ed-" # Ends in 'ed'
+    else
+      classified = "-unknown-" # Completely unknown
+    end
+    return classified
+  end
+
+  # This returns a compiled regexp for extracting maximal noun phrases
+  # from a POS-tagged text.
+  def get_max_noun_regex
+    regex = /
+      # optional number, gerund, adjective, participle
+      (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
+      # followed by one or more nouns
+      (?:#{NN})+
+      (?:
+        # optional preposition, determinant, cardinal
+        (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+        # optional gerund, adjective, participle
+        (?:#{GER}|#{ADJ}|#{PART})*
+        # one or more nouns
+        (?:#{NN})+
+      )*
+    /xo #/
+    return regex
+  end
+
+  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+  # YAML data parser. It will load a YAML document with a collection of key:
+  # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+  # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+  def load_tags(lexicon)
+    path = File.join($lexpath, lexicon)
+    fh = File.open(path, 'r')
+    while line = fh.gets
+      /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+      next unless $1 and $2
+      key, data = $1, $2
+      items = data.split(/,\s+/)
+      pairs = {}
+      items.each do |i|
+        /([^:]+):\s*(.+)/ =~ i
+        pairs[$1] = $2.to_f
+      end
+      @@hmm[key] = pairs
+    end
+    fh.close
+  end
+
+  # Load the word lexicon into a hash from YAML data: This is a naive (but fast)
+  # YAML data parser. It will load a YAML document with a collection of key:
+  # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+  # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+  def load_words(lexicon)
+    path = File.join($lexpath, lexicon)
+    fh = File.open(path, 'r')
+    while line = fh.gets
+      /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+      next unless $1 and $2
+      key, data = $1, $2
+      items = data.split(/,\s+/)
+      pairs = {}
+      items.each do |i|
+        /([^:]+):\s*(.+)/ =~ i
+        pairs[$1] = $2.to_f
+      end
+      @@lexicon[key] = pairs
+    end
+    fh.close
+  end
+
+  # Memoize the stem and assign_tag methods
+  memoize("stem")
+  memoize("assign_tag")
+end
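To make the scoring rule in assign_tag concrete, here is a toy calculation (not from the diff) with hypothetical lexicon counts and transition probabilities for the word "chased" following a proper noun (nnp):

    # probability = t[prev_tag][tag] * (pw + 1), per assign_tag above
    t = { 'nnp' => { 'vbd' => 0.25, 'nn' => 0.10 } }  # P(tag | prev_tag)
    w = { 'vbd' => 12, 'nn' => 1 }                    # lexicon counts for "chased"

    scores = t['nnp'].map { |tag, prob| [tag, prob * (w.fetch(tag, 0) + 1)] }.to_h
    #=> {"vbd"=>3.25, "nn"=>0.2}; 'vbd' wins, matching chased/VBD in the README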