engtagger 0.3.2 → 0.4.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6aa6da6cfb58bffd900843f62675d5895e80428be7295ae056ed73327286233d
4
- data.tar.gz: dd412266b905ba4d378521540247a368bc4f73dfa89e8d6e58c220625c46e40d
3
+ metadata.gz: 0b61370e322595bd880097f51fe0728780fa6a01ee9975e6eb333c8720ff36d8
4
+ data.tar.gz: 0f990be4f4d5f71908d76f0fb52f2c925a2a01891a815cbc70eaf7a39f77edfe
5
5
  SHA512:
6
- metadata.gz: de1aa006ea943270e4dcea78690e8a10551c42819abbf3c27b6d2629d600745124ec5cfa6a6104d3cb4c87dbfc14d09e643e7b2143979dee27485841fd76b0fe
7
- data.tar.gz: 3404a699868beb475daee809cc67788a70152c0d5eba045b7d3c007e3b3fccb66ee6bb432832a8e9872cd6d3faf281fab60bf151c01eaf1cf52d6275644012bb
6
+ metadata.gz: ade5d1cf6fc11553519fe9217dffb06453e0ab7d69ab1532b3f2e2079dd05d035d90ce5ce92e4d0e1195f2a8f79df5b4d44c4cedb27f14df529ac0b0e91cf730
7
+ data.tar.gz: ff085546b0db152df0983dabea49ec5b0cf47525cca6118d3776378e908ea04fd675f0bb1daceb944d6be141615e3a5d9da5774025a0dc6ef609dd8b311b1412
data/.rubocop.yml ADDED
@@ -0,0 +1,75 @@
1
+ AllCops:
2
+ NewCops: disable
3
+ SuggestExtensions: false
4
+ TargetRubyVersion: 2.6
5
+
6
+ Documentation:
7
+ Enabled: false
8
+
9
+ Naming/AccessorMethodName:
10
+ Enabled: false
11
+
12
+ Naming/VariableNumber:
13
+ Enabled: false
14
+
15
+ Naming/FileName:
16
+ Enabled: false
17
+
18
+ Security/MarshalLoad:
19
+ Enabled: false
20
+
21
+ Layout/EndOfLine:
22
+ Enabled: False
23
+
24
+ Style/ClassVars:
25
+ Enabled: false
26
+
27
+ Style/OptionalBooleanParameter:
28
+ Enabled: false
29
+
30
+ Style/StringConcatenation:
31
+ Enabled: false
32
+
33
+ Style/PerlBackrefs:
34
+ Enabled: false
35
+
36
+ Style/StringLiterals:
37
+ Enabled: true
38
+ EnforcedStyle: double_quotes
39
+
40
+ Style/StringLiteralsInInterpolation:
41
+ Enabled: true
42
+ EnforcedStyle: double_quotes
43
+
44
+ Style/WordArray:
45
+ Enabled: false
46
+
47
+ Style/EvalWithLocation:
48
+ Enabled: false
49
+
50
+ Layout/LineLength:
51
+ Max: 400
52
+
53
+ Metrics/MethodLength:
54
+ Max: 80
55
+
56
+ Metrics/BlockLength:
57
+ Max: 60
58
+
59
+ Metrics/AbcSize:
60
+ Max: 60
61
+
62
+ Metrics/PerceivedComplexity:
63
+ Max: 60
64
+
65
+ Metrics/ClassLength:
66
+ Max: 800
67
+
68
+ Metrics/CyclomaticComplexity:
69
+ Max: 60
70
+
71
+ Metrics/ParameterLists:
72
+ Max: 8
73
+
74
+ Metrics/ModuleLength:
75
+ Max: 200
data/.solargraph.yml ADDED
@@ -0,0 +1,22 @@
1
+ ---
2
+ include:
3
+ - "**/*.rb"
4
+ exclude:
5
+ - spec/**/*
6
+ - test/**/*
7
+ - vendor/**/*
8
+ - ".bundle/**/*"
9
+ require: []
10
+ domains: []
11
+ reporters:
12
+ - rubocop
13
+ # - require_not_found
14
+ formatter:
15
+ rubocop:
16
+ cops: safe
17
+ except: []
18
+ only: []
19
+ extra_args: []
20
+ require_paths: []
21
+ plugins: []
22
+ max_files: 5000
data/Gemfile CHANGED
@@ -1,3 +1,7 @@
1
- source 'https://rubygems.org'
1
+ # frozen_string_literal: true
2
2
 
3
- gem 'lru_redux'
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
6
+
7
+ gem "lru_redux"
data/README.md CHANGED
@@ -19,56 +19,58 @@ of regular expressions.
19
19
  * Extract noun phrases from tagged text
20
20
  * etc.
21
21
 
22
- ### Synopsis:
22
+ ### Synopsis
23
23
 
24
- require 'engtagger'
24
+ ```ruby
25
+ require 'engtagger'
25
26
 
26
- # Create a parser object
27
- tgr = EngTagger.new
27
+ # Create a parser object
28
+ tgr = EngTagger.new
28
29
 
29
- # Sample text
30
- text = "Alice chased the big fat cat."
30
+ # Sample text
31
+ text = "Alice chased the big fat cat."
31
32
 
32
- # Add part-of-speech tags to text
33
- tagged = tgr.add_tags(text)
33
+ # Add part-of-speech tags to text
34
+ tagged = tgr.add_tags(text)
34
35
 
35
- #=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj><nn>cat</nn> <pp>.</pp>"
36
+ #=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj><nn>cat</nn> <pp>.</pp>"
36
37
 
37
- # Get a list of all nouns and noun phrases with occurrence counts
38
- word_list = tgr.get_words(text)
38
+ # Get a list of all nouns and noun phrases with occurrence counts
39
+ word_list = tgr.get_words(text)
39
40
 
40
- #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
41
+ #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
41
42
 
42
- # Get a readable version of the tagged text
43
- readable = tgr.get_readable(text)
43
+ # Get a readable version of the tagged text
44
+ readable = tgr.get_readable(text)
44
45
 
45
- #=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
46
+ #=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
46
47
 
47
- # Get all nouns from a tagged output
48
- nouns = tgr.get_nouns(tagged)
48
+ # Get all nouns from a tagged output
49
+ nouns = tgr.get_nouns(tagged)
49
50
 
50
- #=> {"cat"=>1, "Alice"=>1}
51
+ #=> {"cat"=>1, "Alice"=>1}
51
52
 
52
- # Get all proper nouns
53
- proper = tgr.get_proper_nouns(tagged)
53
+ # Get all proper nouns
54
+ proper = tgr.get_proper_nouns(tagged)
54
55
 
55
- #=> {"Alice"=>1}
56
+ #=> {"Alice"=>1}
56
57
 
57
- # Get all past tense verbs
58
- pt_verbs = tgr.get_past_tense_verbs(tagged)
58
+ # Get all past tense verbs
59
+ pt_verbs = tgr.get_past_tense_verbs(tagged)
59
60
 
60
- #=> {"chased"=>1}
61
+ #=> {"chased"=>1}
61
62
 
62
- # Get all the adjectives
63
- adj = tgr.get_adjectives(tagged)
63
+ # Get all the adjectives
64
+ adj = tgr.get_adjectives(tagged)
64
65
 
65
- #=> {"big"=>1, "fat"=>1}
66
+ #=> {"big"=>1, "fat"=>1}
66
67
 
67
- # Get all noun phrases of any syntactic level
68
- # (same as word_list but take a tagged input)
69
- nps = tgr.get_noun_phrases(tagged)
68
+ # Get all noun phrases of any syntactic level
69
+ # (same as word_list but take a tagged input)
70
+ nps = tgr.get_noun_phrases(tagged)
70
71
 
71
- #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
72
+ #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
73
+ ```
72
74
 
73
75
  ### Tag Set
74
76
 
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
- #!/usr/bin/env rake
1
+ # frozen_string_literal: true
2
+
2
3
  require "bundler/gem_tasks"
4
+ require "rake/testtask"
5
+
6
+ Rake::TestTask.new do |t|
7
+ t.libs << "test"
8
+ t.test_files = FileList["test/test*.rb"]
9
+ t.verbose = true
10
+ end
data/engtagger.gemspec CHANGED
@@ -1,19 +1,22 @@
1
- # -*- encoding: utf-8 -*-
2
- require File.expand_path('../lib/engtagger/version', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/engtagger/version"
3
4
 
4
5
  Gem::Specification.new do |gem|
5
6
  gem.authors = ["Yoichiro Hasebe"]
6
7
  gem.email = ["yohasebe@gmail.com"]
7
- gem.summary = %q{A probability based, corpus-trained English POS tagger}
8
- gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
9
- gem.homepage = "http://github.com/yohasebe/engtagger"
10
-
11
- gem.files = `git ls-files`.split($\)
12
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
8
+ gem.summary = "A probability based, corpus-trained English POS tagger"
9
+ gem.description = "A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values."
10
+ gem.homepage = "http://github.com/yohasebe/engtagger"
11
+ gem.license = "GPL"
12
+ gem.required_ruby_version = Gem::Requirement.new(">= 2.6")
13
+ gem.files = Dir.chdir(File.expand_path(__dir__)) do
14
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
15
+ end
16
+ gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
13
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
18
  gem.name = "engtagger"
15
19
  gem.require_paths = ["lib"]
16
20
  gem.version = EngTagger::VERSION
17
-
18
- gem.add_runtime_dependency 'lru_redux'
21
+ gem.add_dependency "lru_redux"
19
22
  end
@@ -1,23 +1,20 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
1
+ # frozen_string_literal: true
3
2
 
4
3
  module Stemmable
5
-
6
4
  STEP_2_LIST = {
7
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
8
- 'izer'=>'ize', 'bli'=>'ble',
9
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
10
- 'ization'=>'ize', 'ation'=>'ate',
11
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
12
- 'ousness'=>'ous', 'aliti'=>'al',
13
- 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
14
- }
5
+ "ational" => "ate", "tional" => "tion", "enci" => "ence", "anci" => "ance",
6
+ "izer" => "ize", "bli" => "ble",
7
+ "alli" => "al", "entli" => "ent", "eli" => "e", "ousli" => "ous",
8
+ "ization" => "ize", "ation" => "ate",
9
+ "ator" => "ate", "alism" => "al", "iveness" => "ive", "fulness" => "ful",
10
+ "ousness" => "ous", "aliti" => "al",
11
+ "iviti" => "ive", "biliti" => "ble", "logi" => "log"
12
+ }.freeze
15
13
 
16
14
  STEP_3_LIST = {
17
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
18
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
19
- }
20
-
15
+ "icate" => "ic", "ative" => "", "alize" => "al", "iciti" => "ic",
16
+ "ical" => "ic", "ful" => "", "ness" => ""
17
+ }.freeze
21
18
 
22
19
  SUFFIX_1_REGEXP = /(
23
20
  ational |
@@ -40,7 +37,7 @@ module Stemmable
40
37
  aliti |
41
38
  iviti |
42
39
  biliti |
43
- logi)$/x
40
+ logi)$/x.freeze
44
41
 
45
42
 
46
43
  SUFFIX_2_REGEXP = /(
@@ -61,20 +58,18 @@ module Stemmable
61
58
  iti |
62
59
  ous |
63
60
  ive |
64
- ize)$/x
65
-
61
+ ize)$/x.freeze
66
62
 
67
- C = "[^aeiou]" # consonant
68
- V = "[aeiouy]" # vowel
69
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
70
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
63
+ C = "[^aeiou]" # consonant
64
+ V = "[aeiouy]" # vowel
65
+ CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
66
+ VV = "#{V}(?>[aeiou]*)" # vowel sequence
71
67
 
72
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
73
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
74
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
75
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
68
+ MGR0 = /^(#{CC})?#{VV}#{CC}/o.freeze # [cc]vvcc... is m>0
69
+ MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o.freeze # [cc]vvcc[vv] is m=1
70
+ MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o.freeze # [cc]vvccvvcc... is m>1
71
+ VOWEL_IN_STEM = /^(#{CC})?#{V}/o.freeze # vowel in stem
76
72
 
77
- #
78
73
  # Porter stemmer in Ruby.
79
74
  #
80
75
  # This is the Porter stemming algorithm, ported to Ruby from the
@@ -90,30 +85,31 @@ module Stemmable
90
85
  #
91
86
 
92
87
  def stem_porter
93
-
94
88
  # make a copy of the given object and convert it to a string.
95
- w = self.dup.to_str
89
+ w = dup.to_str
96
90
 
97
91
  return w if w.length < 3
98
92
 
99
93
  # now map initial y to Y so that the patterns never treat it as vowel
100
- w[0] = 'Y' if w[0] == ?y
94
+ w[0] = "Y" if w[0] == "y"
101
95
 
102
96
  # Step 1a
103
- if w =~ /(ss|i)es$/
97
+ case w
98
+ when /(ss|i)es$/
104
99
  w = $` + $1
105
- elsif w =~ /([^s])s$/
100
+ when /([^s])s$/
106
101
  w = $` + $1
107
102
  end
108
103
 
109
104
  # Step 1b
110
- if w =~ /eed$/
105
+ case w
106
+ when /eed$/
111
107
  w.chop! if $` =~ MGR0
112
- elsif w =~ /(ed|ing)$/
108
+ when /(ed|ing)$/
113
109
  stem = $`
114
110
  if stem =~ VOWEL_IN_STEM
115
111
  w = stem
116
- case w
112
+ case w
117
113
  when /(at|bl|iz)$/ then w << "e"
118
114
  when /([^aeiouylsz])\1$/ then w.chop!
119
115
  when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
@@ -131,59 +127,41 @@ module Stemmable
131
127
  stem = $`
132
128
  suffix = $1
133
129
  # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
134
- if stem =~ MGR0
135
- w = stem + STEP_2_LIST[suffix]
136
- end
130
+ w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
137
131
  end
138
132
 
139
133
  # Step 3
140
134
  if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
141
135
  stem = $`
142
136
  suffix = $1
143
- if stem =~ MGR0
144
- w = stem + STEP_3_LIST[suffix]
145
- end
137
+ w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
146
138
  end
147
139
 
148
140
  # Step 4
149
141
  if w =~ SUFFIX_2_REGEXP
150
142
  stem = $`
151
- if stem =~ MGR1
152
- w = stem
153
- end
143
+ w = stem if stem =~ MGR1
154
144
  elsif w =~ /(s|t)(ion)$/
155
145
  stem = $` + $1
156
- if stem =~ MGR1
157
- w = stem
158
- end
146
+ w = stem if stem =~ MGR1
159
147
  end
160
148
 
161
149
  # Step 5
162
150
  if w =~ /e$/
163
151
  stem = $`
164
- if (stem =~ MGR1) ||
165
- (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
166
- w = stem
167
- end
152
+ w = stem if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
168
153
  end
169
154
 
170
- if w =~ /ll$/ && w =~ MGR1
171
- w.chop!
172
- end
155
+ w.chop! if w =~ /ll$/ && w =~ MGR1
173
156
 
174
157
  # and turn initial Y back to y
175
- w[0] = 'y' if w[0] == ?Y
176
-
158
+ w[0] = "y" if w[0] == "Y"
177
159
  w
178
160
  end
179
161
 
180
-
181
- #
182
162
  # make the stem_porter the default stem method, just in case we
183
163
  # feel like having multiple stemmers available later.
184
- #
185
164
  alias stem stem_porter
186
-
187
165
  end
188
166
 
189
167
  # Add stem method to all Strings
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class EngTagger
2
- VERSION = "0.3.2"
4
+ VERSION = "0.4.0"
3
5
  end