engtagger 0.3.2 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +75 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +33 -31
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +38 -60
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -206
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b61370e322595bd880097f51fe0728780fa6a01ee9975e6eb333c8720ff36d8
|
4
|
+
data.tar.gz: 0f990be4f4d5f71908d76f0fb52f2c925a2a01891a815cbc70eaf7a39f77edfe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ade5d1cf6fc11553519fe9217dffb06453e0ab7d69ab1532b3f2e2079dd05d035d90ce5ce92e4d0e1195f2a8f79df5b4d44c4cedb27f14df529ac0b0e91cf730
|
7
|
+
data.tar.gz: ff085546b0db152df0983dabea49ec5b0cf47525cca6118d3776378e908ea04fd675f0bb1daceb944d6be141615e3a5d9da5774025a0dc6ef609dd8b311b1412
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
AllCops:
|
2
|
+
NewCops: disable
|
3
|
+
SuggestExtensions: false
|
4
|
+
TargetRubyVersion: 2.6
|
5
|
+
|
6
|
+
Documentation:
|
7
|
+
Enabled: false
|
8
|
+
|
9
|
+
Naming/AccessorMethodName:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Naming/VariableNumber:
|
13
|
+
Enabled: false
|
14
|
+
|
15
|
+
Naming/FileName:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Security/MarshalLoad:
|
19
|
+
Enabled: false
|
20
|
+
|
21
|
+
Layout/EndOfLine:
|
22
|
+
Enabled: False
|
23
|
+
|
24
|
+
Style/ClassVars:
|
25
|
+
Enabled: false
|
26
|
+
|
27
|
+
Style/OptionalBooleanParameter:
|
28
|
+
Enabled: false
|
29
|
+
|
30
|
+
Style/StringConcatenation:
|
31
|
+
Enabled: false
|
32
|
+
|
33
|
+
Style/PerlBackrefs:
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
Style/StringLiterals:
|
37
|
+
Enabled: true
|
38
|
+
EnforcedStyle: double_quotes
|
39
|
+
|
40
|
+
Style/StringLiteralsInInterpolation:
|
41
|
+
Enabled: true
|
42
|
+
EnforcedStyle: double_quotes
|
43
|
+
|
44
|
+
Style/WordArray:
|
45
|
+
Enabled: false
|
46
|
+
|
47
|
+
Style/EvalWithLocation:
|
48
|
+
Enabled: false
|
49
|
+
|
50
|
+
Layout/LineLength:
|
51
|
+
Max: 400
|
52
|
+
|
53
|
+
Metrics/MethodLength:
|
54
|
+
Max: 80
|
55
|
+
|
56
|
+
Metrics/BlockLength:
|
57
|
+
Max: 60
|
58
|
+
|
59
|
+
Metrics/AbcSize:
|
60
|
+
Max: 60
|
61
|
+
|
62
|
+
Metrics/PerceivedComplexity:
|
63
|
+
Max: 60
|
64
|
+
|
65
|
+
Metrics/ClassLength:
|
66
|
+
Max: 800
|
67
|
+
|
68
|
+
Metrics/CyclomaticComplexity:
|
69
|
+
Max: 60
|
70
|
+
|
71
|
+
Metrics/ParameterLists:
|
72
|
+
Max: 8
|
73
|
+
|
74
|
+
Metrics/ModuleLength:
|
75
|
+
Max: 200
|
data/.solargraph.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
---
|
2
|
+
include:
|
3
|
+
- "**/*.rb"
|
4
|
+
exclude:
|
5
|
+
- spec/**/*
|
6
|
+
- test/**/*
|
7
|
+
- vendor/**/*
|
8
|
+
- ".bundle/**/*"
|
9
|
+
require: []
|
10
|
+
domains: []
|
11
|
+
reporters:
|
12
|
+
- rubocop
|
13
|
+
# - require_not_found
|
14
|
+
formatter:
|
15
|
+
rubocop:
|
16
|
+
cops: safe
|
17
|
+
except: []
|
18
|
+
only: []
|
19
|
+
extra_args: []
|
20
|
+
require_paths: []
|
21
|
+
plugins: []
|
22
|
+
max_files: 5000
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -19,56 +19,58 @@ of regular expressions.
|
|
19
19
|
* Extract noun phrases from tagged text
|
20
20
|
* etc.
|
21
21
|
|
22
|
-
### Synopsis
|
22
|
+
### Synopsis
|
23
23
|
|
24
|
-
|
24
|
+
```ruby
|
25
|
+
require 'engtagger'
|
25
26
|
|
26
|
-
|
27
|
-
|
27
|
+
# Create a parser object
|
28
|
+
tgr = EngTagger.new
|
28
29
|
|
29
|
-
|
30
|
-
|
30
|
+
# Sample text
|
31
|
+
text = "Alice chased the big fat cat."
|
31
32
|
|
32
|
-
|
33
|
-
|
33
|
+
# Add part-of-speech tags to text
|
34
|
+
tagged = tgr.add_tags(text)
|
34
35
|
|
35
|
-
|
36
|
+
#=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj> <nn>cat</nn> <pp>.</pp>"
|
36
37
|
|
37
|
-
|
38
|
-
|
38
|
+
# Get a list of all nouns and noun phrases with occurrence counts
|
39
|
+
word_list = tgr.get_words(text)
|
39
40
|
|
40
|
-
|
41
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
41
42
|
|
42
|
-
|
43
|
-
|
43
|
+
# Get a readable version of the tagged text
|
44
|
+
readable = tgr.get_readable(text)
|
44
45
|
|
45
|
-
|
46
|
+
#=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
|
46
47
|
|
47
|
-
|
48
|
-
|
48
|
+
# Get all nouns from a tagged output
|
49
|
+
nouns = tgr.get_nouns(tagged)
|
49
50
|
|
50
|
-
|
51
|
+
#=> {"cat"=>1, "Alice"=>1}
|
51
52
|
|
52
|
-
|
53
|
-
|
53
|
+
# Get all proper nouns
|
54
|
+
proper = tgr.get_proper_nouns(tagged)
|
54
55
|
|
55
|
-
|
56
|
+
#=> {"Alice"=>1}
|
56
57
|
|
57
|
-
|
58
|
-
|
58
|
+
# Get all past tense verbs
|
59
|
+
pt_verbs = tgr.get_past_tense_verbs(tagged)
|
59
60
|
|
60
|
-
|
61
|
+
#=> {"chased"=>1}
|
61
62
|
|
62
|
-
|
63
|
-
|
63
|
+
# Get all the adjectives
|
64
|
+
adj = tgr.get_adjectives(tagged)
|
64
65
|
|
65
|
-
|
66
|
+
#=> {"big"=>1, "fat"=>1}
|
66
67
|
|
67
|
-
|
68
|
-
|
69
|
-
|
68
|
+
# Get all noun phrases of any syntactic level
|
69
|
+
# (same as word_list, but takes tagged input)
|
70
|
+
nps = tgr.get_noun_phrases(tagged)
|
70
71
|
|
71
|
-
|
72
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
73
|
+
```
|
72
74
|
|
73
75
|
### Tag Set
|
74
76
|
|
data/Rakefile
CHANGED
data/engtagger.gemspec
CHANGED
@@ -1,19 +1,22 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/engtagger/version"
|
3
4
|
|
4
5
|
Gem::Specification.new do |gem|
|
5
6
|
gem.authors = ["Yoichiro Hasebe"]
|
6
7
|
gem.email = ["yohasebe@gmail.com"]
|
7
|
-
gem.summary
|
8
|
-
gem.description
|
9
|
-
gem.homepage
|
10
|
-
|
11
|
-
gem.
|
12
|
-
gem.
|
8
|
+
gem.summary = "A probability based, corpus-trained English POS tagger"
|
9
|
+
gem.description = "A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values."
|
10
|
+
gem.homepage = "http://github.com/yohasebe/engtagger"
|
11
|
+
gem.license = "GPL"
|
12
|
+
gem.required_ruby_version = Gem::Requirement.new(">= 2.6")
|
13
|
+
gem.files = Dir.chdir(File.expand_path(__dir__)) do
|
14
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
|
15
|
+
end
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
13
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
18
|
gem.name = "engtagger"
|
15
19
|
gem.require_paths = ["lib"]
|
16
20
|
gem.version = EngTagger::VERSION
|
17
|
-
|
18
|
-
gem.add_runtime_dependency 'lru_redux'
|
21
|
+
gem.add_dependency "lru_redux"
|
19
22
|
end
|
data/lib/engtagger/porter.rb
CHANGED
@@ -1,23 +1,20 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
3
|
module Stemmable
|
5
|
-
|
6
4
|
STEP_2_LIST = {
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
}
|
5
|
+
"ational" => "ate", "tional" => "tion", "enci" => "ence", "anci" => "ance",
|
6
|
+
"izer" => "ize", "bli" => "ble",
|
7
|
+
"alli" => "al", "entli" => "ent", "eli" => "e", "ousli" => "ous",
|
8
|
+
"ization" => "ize", "ation" => "ate",
|
9
|
+
"ator" => "ate", "alism" => "al", "iveness" => "ive", "fulness" => "ful",
|
10
|
+
"ousness" => "ous", "aliti" => "al",
|
11
|
+
"iviti" => "ive", "biliti" => "ble", "logi" => "log"
|
12
|
+
}.freeze
|
15
13
|
|
16
14
|
STEP_3_LIST = {
|
17
|
-
|
18
|
-
|
19
|
-
}
|
20
|
-
|
15
|
+
"icate" => "ic", "ative" => "", "alize" => "al", "iciti" => "ic",
|
16
|
+
"ical" => "ic", "ful" => "", "ness" => ""
|
17
|
+
}.freeze
|
21
18
|
|
22
19
|
SUFFIX_1_REGEXP = /(
|
23
20
|
ational |
|
@@ -40,7 +37,7 @@ module Stemmable
|
|
40
37
|
aliti |
|
41
38
|
iviti |
|
42
39
|
biliti |
|
43
|
-
logi)$/x
|
40
|
+
logi)$/x.freeze
|
44
41
|
|
45
42
|
|
46
43
|
SUFFIX_2_REGEXP = /(
|
@@ -61,20 +58,18 @@ module Stemmable
|
|
61
58
|
iti |
|
62
59
|
ous |
|
63
60
|
ive |
|
64
|
-
ize)$/x
|
65
|
-
|
61
|
+
ize)$/x.freeze
|
66
62
|
|
67
|
-
C = "[^aeiou]"
|
68
|
-
V = "[aeiouy]"
|
69
|
-
CC = "#{C}(?>[^aeiouy]*)"
|
70
|
-
VV = "#{V}(?>[aeiou]*)"
|
63
|
+
C = "[^aeiou]" # consonant
|
64
|
+
V = "[aeiouy]" # vowel
|
65
|
+
CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
|
66
|
+
VV = "#{V}(?>[aeiou]*)" # vowel sequence
|
71
67
|
|
72
|
-
MGR0 = /^(#{CC})?#{VV}#{CC}/o
|
73
|
-
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o
|
74
|
-
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o
|
75
|
-
VOWEL_IN_STEM
|
68
|
+
MGR0 = /^(#{CC})?#{VV}#{CC}/o.freeze # [cc]vvcc... is m>0
|
69
|
+
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o.freeze # [cc]vvcc[vv] is m=1
|
70
|
+
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o.freeze # [cc]vvccvvcc... is m>1
|
71
|
+
VOWEL_IN_STEM = /^(#{CC})?#{V}/o.freeze # vowel in stem
|
76
72
|
|
77
|
-
#
|
78
73
|
# Porter stemmer in Ruby.
|
79
74
|
#
|
80
75
|
# This is the Porter stemming algorithm, ported to Ruby from the
|
@@ -90,30 +85,31 @@ module Stemmable
|
|
90
85
|
#
|
91
86
|
|
92
87
|
def stem_porter
|
93
|
-
|
94
88
|
# make a copy of the given object and convert it to a string.
|
95
|
-
w =
|
89
|
+
w = dup.to_str
|
96
90
|
|
97
91
|
return w if w.length < 3
|
98
92
|
|
99
93
|
# now map initial y to Y so that the patterns never treat it as a vowel
|
100
|
-
w[0] =
|
94
|
+
w[0] = "Y" if w[0] == "y"
|
101
95
|
|
102
96
|
# Step 1a
|
103
|
-
|
97
|
+
case w
|
98
|
+
when /(ss|i)es$/
|
104
99
|
w = $` + $1
|
105
|
-
|
100
|
+
when /([^s])s$/
|
106
101
|
w = $` + $1
|
107
102
|
end
|
108
103
|
|
109
104
|
# Step 1b
|
110
|
-
|
105
|
+
case w
|
106
|
+
when /eed$/
|
111
107
|
w.chop! if $` =~ MGR0
|
112
|
-
|
108
|
+
when /(ed|ing)$/
|
113
109
|
stem = $`
|
114
110
|
if stem =~ VOWEL_IN_STEM
|
115
111
|
w = stem
|
116
|
-
|
112
|
+
case w
|
117
113
|
when /(at|bl|iz)$/ then w << "e"
|
118
114
|
when /([^aeiouylsz])\1$/ then w.chop!
|
119
115
|
when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
|
@@ -131,59 +127,41 @@ module Stemmable
|
|
131
127
|
stem = $`
|
132
128
|
suffix = $1
|
133
129
|
# print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
|
134
|
-
if stem =~ MGR0
|
135
|
-
w = stem + STEP_2_LIST[suffix]
|
136
|
-
end
|
130
|
+
w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
|
137
131
|
end
|
138
132
|
|
139
133
|
# Step 3
|
140
134
|
if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
|
141
135
|
stem = $`
|
142
136
|
suffix = $1
|
143
|
-
if stem =~ MGR0
|
144
|
-
w = stem + STEP_3_LIST[suffix]
|
145
|
-
end
|
137
|
+
w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
|
146
138
|
end
|
147
139
|
|
148
140
|
# Step 4
|
149
141
|
if w =~ SUFFIX_2_REGEXP
|
150
142
|
stem = $`
|
151
|
-
if stem =~ MGR1
|
152
|
-
w = stem
|
153
|
-
end
|
143
|
+
w = stem if stem =~ MGR1
|
154
144
|
elsif w =~ /(s|t)(ion)$/
|
155
145
|
stem = $` + $1
|
156
|
-
if stem =~ MGR1
|
157
|
-
w = stem
|
158
|
-
end
|
146
|
+
w = stem if stem =~ MGR1
|
159
147
|
end
|
160
148
|
|
161
149
|
# Step 5
|
162
150
|
if w =~ /e$/
|
163
151
|
stem = $`
|
164
|
-
if (stem =~ MGR1) ||
|
165
|
-
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
166
|
-
w = stem
|
167
|
-
end
|
152
|
+
w = stem if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
168
153
|
end
|
169
154
|
|
170
|
-
if w =~ /ll$/ && w =~ MGR1
|
171
|
-
w.chop!
|
172
|
-
end
|
155
|
+
w.chop! if w =~ /ll$/ && w =~ MGR1
|
173
156
|
|
174
157
|
# and turn initial Y back to y
|
175
|
-
w[0] =
|
176
|
-
|
158
|
+
w[0] = "y" if w[0] == "Y"
|
177
159
|
w
|
178
160
|
end
|
179
161
|
|
180
|
-
|
181
|
-
#
|
182
162
|
# make the stem_porter the default stem method, just in case we
|
183
163
|
# feel like having multiple stemmers available later.
|
184
|
-
#
|
185
164
|
alias stem stem_porter
|
186
|
-
|
187
165
|
end
|
188
166
|
|
189
167
|
# Add stem method to all Strings
|
data/lib/engtagger/version.rb
CHANGED