engtagger 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +75 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +33 -31
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +38 -60
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -206
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b61370e322595bd880097f51fe0728780fa6a01ee9975e6eb333c8720ff36d8
|
4
|
+
data.tar.gz: 0f990be4f4d5f71908d76f0fb52f2c925a2a01891a815cbc70eaf7a39f77edfe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ade5d1cf6fc11553519fe9217dffb06453e0ab7d69ab1532b3f2e2079dd05d035d90ce5ce92e4d0e1195f2a8f79df5b4d44c4cedb27f14df529ac0b0e91cf730
|
7
|
+
data.tar.gz: ff085546b0db152df0983dabea49ec5b0cf47525cca6118d3776378e908ea04fd675f0bb1daceb944d6be141615e3a5d9da5774025a0dc6ef609dd8b311b1412
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
AllCops:
|
2
|
+
NewCops: disable
|
3
|
+
SuggestExtensions: false
|
4
|
+
TargetRubyVersion: 2.6
|
5
|
+
|
6
|
+
Documentation:
|
7
|
+
Enabled: false
|
8
|
+
|
9
|
+
Naming/AccessorMethodName:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Naming/VariableNumber:
|
13
|
+
Enabled: false
|
14
|
+
|
15
|
+
Naming/FileName:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Security/MarshalLoad:
|
19
|
+
Enabled: false
|
20
|
+
|
21
|
+
Layout/EndOfLine:
|
22
|
+
Enabled: False
|
23
|
+
|
24
|
+
Style/ClassVars:
|
25
|
+
Enabled: false
|
26
|
+
|
27
|
+
Style/OptionalBooleanParameter:
|
28
|
+
Enabled: false
|
29
|
+
|
30
|
+
Style/StringConcatenation:
|
31
|
+
Enabled: false
|
32
|
+
|
33
|
+
Style/PerlBackrefs:
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
Style/StringLiterals:
|
37
|
+
Enabled: true
|
38
|
+
EnforcedStyle: double_quotes
|
39
|
+
|
40
|
+
Style/StringLiteralsInInterpolation:
|
41
|
+
Enabled: true
|
42
|
+
EnforcedStyle: double_quotes
|
43
|
+
|
44
|
+
Style/WordArray:
|
45
|
+
Enabled: false
|
46
|
+
|
47
|
+
Style/EvalWithLocation:
|
48
|
+
Enabled: false
|
49
|
+
|
50
|
+
Layout/LineLength:
|
51
|
+
Max: 400
|
52
|
+
|
53
|
+
Metrics/MethodLength:
|
54
|
+
Max: 80
|
55
|
+
|
56
|
+
Metrics/BlockLength:
|
57
|
+
Max: 60
|
58
|
+
|
59
|
+
Metrics/AbcSize:
|
60
|
+
Max: 60
|
61
|
+
|
62
|
+
Metrics/PerceivedComplexity:
|
63
|
+
Max: 60
|
64
|
+
|
65
|
+
Metrics/ClassLength:
|
66
|
+
Max: 800
|
67
|
+
|
68
|
+
Metrics/CyclomaticComplexity:
|
69
|
+
Max: 60
|
70
|
+
|
71
|
+
Metrics/ParameterLists:
|
72
|
+
Max: 8
|
73
|
+
|
74
|
+
Metrics/ModuleLength:
|
75
|
+
Max: 200
|
data/.solargraph.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
---
|
2
|
+
include:
|
3
|
+
- "**/*.rb"
|
4
|
+
exclude:
|
5
|
+
- spec/**/*
|
6
|
+
- test/**/*
|
7
|
+
- vendor/**/*
|
8
|
+
- ".bundle/**/*"
|
9
|
+
require: []
|
10
|
+
domains: []
|
11
|
+
reporters:
|
12
|
+
- rubocop
|
13
|
+
# - require_not_found
|
14
|
+
formatter:
|
15
|
+
rubocop:
|
16
|
+
cops: safe
|
17
|
+
except: []
|
18
|
+
only: []
|
19
|
+
extra_args: []
|
20
|
+
require_paths: []
|
21
|
+
plugins: []
|
22
|
+
max_files: 5000
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -19,56 +19,58 @@ of regular expressions.
|
|
19
19
|
* Extract noun phrases from tagged text
|
20
20
|
* etc.
|
21
21
|
|
22
|
-
### Synopsis
|
22
|
+
### Synopsis
|
23
23
|
|
24
|
-
|
24
|
+
```ruby
|
25
|
+
require 'engtagger'
|
25
26
|
|
26
|
-
|
27
|
-
|
27
|
+
# Create a parser object
|
28
|
+
tgr = EngTagger.new
|
28
29
|
|
29
|
-
|
30
|
-
|
30
|
+
# Sample text
|
31
|
+
text = "Alice chased the big fat cat."
|
31
32
|
|
32
|
-
|
33
|
-
|
33
|
+
# Add part-of-speech tags to text
|
34
|
+
tagged = tgr.add_tags(text)
|
34
35
|
|
35
|
-
|
36
|
+
#=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj> <nn>cat</nn> <pp>.</pp>"
|
36
37
|
|
37
|
-
|
38
|
-
|
38
|
+
# Get a list of all nouns and noun phrases with occurrence counts
|
39
|
+
word_list = tgr.get_words(text)
|
39
40
|
|
40
|
-
|
41
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
41
42
|
|
42
|
-
|
43
|
-
|
43
|
+
# Get a readable version of the tagged text
|
44
|
+
readable = tgr.get_readable(text)
|
44
45
|
|
45
|
-
|
46
|
+
#=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
|
46
47
|
|
47
|
-
|
48
|
-
|
48
|
+
# Get all nouns from a tagged output
|
49
|
+
nouns = tgr.get_nouns(tagged)
|
49
50
|
|
50
|
-
|
51
|
+
#=> {"cat"=>1, "Alice"=>1}
|
51
52
|
|
52
|
-
|
53
|
-
|
53
|
+
# Get all proper nouns
|
54
|
+
proper = tgr.get_proper_nouns(tagged)
|
54
55
|
|
55
|
-
|
56
|
+
#=> {"Alice"=>1}
|
56
57
|
|
57
|
-
|
58
|
-
|
58
|
+
# Get all past tense verbs
|
59
|
+
pt_verbs = tgr.get_past_tense_verbs(tagged)
|
59
60
|
|
60
|
-
|
61
|
+
#=> {"chased"=>1}
|
61
62
|
|
62
|
-
|
63
|
-
|
63
|
+
# Get all the adjectives
|
64
|
+
adj = tgr.get_adjectives(tagged)
|
64
65
|
|
65
|
-
|
66
|
+
#=> {"big"=>1, "fat"=>1}
|
66
67
|
|
67
|
-
|
68
|
-
|
69
|
-
|
68
|
+
# Get all noun phrases of any syntactic level
|
69
|
+
# (same as word_list but takes a tagged input)
|
70
|
+
nps = tgr.get_noun_phrases(tagged)
|
70
71
|
|
71
|
-
|
72
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
73
|
+
```
|
72
74
|
|
73
75
|
### Tag Set
|
74
76
|
|
data/Rakefile
CHANGED
data/engtagger.gemspec
CHANGED
@@ -1,19 +1,22 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/engtagger/version"
|
3
4
|
|
4
5
|
Gem::Specification.new do |gem|
|
5
6
|
gem.authors = ["Yoichiro Hasebe"]
|
6
7
|
gem.email = ["yohasebe@gmail.com"]
|
7
|
-
gem.summary
|
8
|
-
gem.description
|
9
|
-
gem.homepage
|
10
|
-
|
11
|
-
gem.
|
12
|
-
gem.
|
8
|
+
gem.summary = "A probability based, corpus-trained English POS tagger"
|
9
|
+
gem.description = "A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values."
|
10
|
+
gem.homepage = "http://github.com/yohasebe/engtagger"
|
11
|
+
gem.license = "GPL"
|
12
|
+
gem.required_ruby_version = Gem::Requirement.new(">= 2.6")
|
13
|
+
gem.files = Dir.chdir(File.expand_path(__dir__)) do
|
14
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
|
15
|
+
end
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
13
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
18
|
gem.name = "engtagger"
|
15
19
|
gem.require_paths = ["lib"]
|
16
20
|
gem.version = EngTagger::VERSION
|
17
|
-
|
18
|
-
gem.add_runtime_dependency 'lru_redux'
|
21
|
+
gem.add_dependency "lru_redux"
|
19
22
|
end
|
data/lib/engtagger/porter.rb
CHANGED
@@ -1,23 +1,20 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
3
|
module Stemmable
|
5
|
-
|
6
4
|
STEP_2_LIST = {
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
}
|
5
|
+
"ational" => "ate", "tional" => "tion", "enci" => "ence", "anci" => "ance",
|
6
|
+
"izer" => "ize", "bli" => "ble",
|
7
|
+
"alli" => "al", "entli" => "ent", "eli" => "e", "ousli" => "ous",
|
8
|
+
"ization" => "ize", "ation" => "ate",
|
9
|
+
"ator" => "ate", "alism" => "al", "iveness" => "ive", "fulness" => "ful",
|
10
|
+
"ousness" => "ous", "aliti" => "al",
|
11
|
+
"iviti" => "ive", "biliti" => "ble", "logi" => "log"
|
12
|
+
}.freeze
|
15
13
|
|
16
14
|
STEP_3_LIST = {
|
17
|
-
|
18
|
-
|
19
|
-
}
|
20
|
-
|
15
|
+
"icate" => "ic", "ative" => "", "alize" => "al", "iciti" => "ic",
|
16
|
+
"ical" => "ic", "ful" => "", "ness" => ""
|
17
|
+
}.freeze
|
21
18
|
|
22
19
|
SUFFIX_1_REGEXP = /(
|
23
20
|
ational |
|
@@ -40,7 +37,7 @@ module Stemmable
|
|
40
37
|
aliti |
|
41
38
|
iviti |
|
42
39
|
biliti |
|
43
|
-
logi)$/x
|
40
|
+
logi)$/x.freeze
|
44
41
|
|
45
42
|
|
46
43
|
SUFFIX_2_REGEXP = /(
|
@@ -61,20 +58,18 @@ module Stemmable
|
|
61
58
|
iti |
|
62
59
|
ous |
|
63
60
|
ive |
|
64
|
-
ize)$/x
|
65
|
-
|
61
|
+
ize)$/x.freeze
|
66
62
|
|
67
|
-
C = "[^aeiou]"
|
68
|
-
V = "[aeiouy]"
|
69
|
-
CC = "#{C}(?>[^aeiouy]*)"
|
70
|
-
VV = "#{V}(?>[aeiou]*)"
|
63
|
+
C = "[^aeiou]" # consonant
|
64
|
+
V = "[aeiouy]" # vowel
|
65
|
+
CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
|
66
|
+
VV = "#{V}(?>[aeiou]*)" # vowel sequence
|
71
67
|
|
72
|
-
MGR0 = /^(#{CC})?#{VV}#{CC}/o
|
73
|
-
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o
|
74
|
-
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o
|
75
|
-
VOWEL_IN_STEM
|
68
|
+
MGR0 = /^(#{CC})?#{VV}#{CC}/o.freeze # [cc]vvcc... is m>0
|
69
|
+
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o.freeze # [cc]vvcc[vv] is m=1
|
70
|
+
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o.freeze # [cc]vvccvvcc... is m>1
|
71
|
+
VOWEL_IN_STEM = /^(#{CC})?#{V}/o.freeze # vowel in stem
|
76
72
|
|
77
|
-
#
|
78
73
|
# Porter stemmer in Ruby.
|
79
74
|
#
|
80
75
|
# This is the Porter stemming algorithm, ported to Ruby from the
|
@@ -90,30 +85,31 @@ module Stemmable
|
|
90
85
|
#
|
91
86
|
|
92
87
|
def stem_porter
|
93
|
-
|
94
88
|
# make a copy of the given object and convert it to a string.
|
95
|
-
w =
|
89
|
+
w = dup.to_str
|
96
90
|
|
97
91
|
return w if w.length < 3
|
98
92
|
|
99
93
|
# now map initial y to Y so that the patterns never treat it as vowel
|
100
|
-
w[0] =
|
94
|
+
w[0] = "Y" if w[0] == "y"
|
101
95
|
|
102
96
|
# Step 1a
|
103
|
-
|
97
|
+
case w
|
98
|
+
when /(ss|i)es$/
|
104
99
|
w = $` + $1
|
105
|
-
|
100
|
+
when /([^s])s$/
|
106
101
|
w = $` + $1
|
107
102
|
end
|
108
103
|
|
109
104
|
# Step 1b
|
110
|
-
|
105
|
+
case w
|
106
|
+
when /eed$/
|
111
107
|
w.chop! if $` =~ MGR0
|
112
|
-
|
108
|
+
when /(ed|ing)$/
|
113
109
|
stem = $`
|
114
110
|
if stem =~ VOWEL_IN_STEM
|
115
111
|
w = stem
|
116
|
-
|
112
|
+
case w
|
117
113
|
when /(at|bl|iz)$/ then w << "e"
|
118
114
|
when /([^aeiouylsz])\1$/ then w.chop!
|
119
115
|
when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
|
@@ -131,59 +127,41 @@ module Stemmable
|
|
131
127
|
stem = $`
|
132
128
|
suffix = $1
|
133
129
|
# print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
|
134
|
-
if stem =~ MGR0
|
135
|
-
w = stem + STEP_2_LIST[suffix]
|
136
|
-
end
|
130
|
+
w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
|
137
131
|
end
|
138
132
|
|
139
133
|
# Step 3
|
140
134
|
if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
|
141
135
|
stem = $`
|
142
136
|
suffix = $1
|
143
|
-
if stem =~ MGR0
|
144
|
-
w = stem + STEP_3_LIST[suffix]
|
145
|
-
end
|
137
|
+
w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
|
146
138
|
end
|
147
139
|
|
148
140
|
# Step 4
|
149
141
|
if w =~ SUFFIX_2_REGEXP
|
150
142
|
stem = $`
|
151
|
-
if stem =~ MGR1
|
152
|
-
w = stem
|
153
|
-
end
|
143
|
+
w = stem if stem =~ MGR1
|
154
144
|
elsif w =~ /(s|t)(ion)$/
|
155
145
|
stem = $` + $1
|
156
|
-
if stem =~ MGR1
|
157
|
-
w = stem
|
158
|
-
end
|
146
|
+
w = stem if stem =~ MGR1
|
159
147
|
end
|
160
148
|
|
161
149
|
# Step 5
|
162
150
|
if w =~ /e$/
|
163
151
|
stem = $`
|
164
|
-
if (stem =~ MGR1) ||
|
165
|
-
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
166
|
-
w = stem
|
167
|
-
end
|
152
|
+
w = stem if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
168
153
|
end
|
169
154
|
|
170
|
-
if w =~ /ll$/ && w =~ MGR1
|
171
|
-
w.chop!
|
172
|
-
end
|
155
|
+
w.chop! if w =~ /ll$/ && w =~ MGR1
|
173
156
|
|
174
157
|
# and turn initial Y back to y
|
175
|
-
w[0] =
|
176
|
-
|
158
|
+
w[0] = "y" if w[0] == "Y"
|
177
159
|
w
|
178
160
|
end
|
179
161
|
|
180
|
-
|
181
|
-
#
|
182
162
|
# make the stem_porter the default stem method, just in case we
|
183
163
|
# feel like having multiple stemmers available later.
|
184
|
-
#
|
185
164
|
alias stem stem_porter
|
186
|
-
|
187
165
|
end
|
188
166
|
|
189
167
|
# Add stem method to all Strings
|
data/lib/engtagger/version.rb
CHANGED