engtagger 0.3.2 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6aa6da6cfb58bffd900843f62675d5895e80428be7295ae056ed73327286233d
4
- data.tar.gz: dd412266b905ba4d378521540247a368bc4f73dfa89e8d6e58c220625c46e40d
3
+ metadata.gz: 0b61370e322595bd880097f51fe0728780fa6a01ee9975e6eb333c8720ff36d8
4
+ data.tar.gz: 0f990be4f4d5f71908d76f0fb52f2c925a2a01891a815cbc70eaf7a39f77edfe
5
5
  SHA512:
6
- metadata.gz: de1aa006ea943270e4dcea78690e8a10551c42819abbf3c27b6d2629d600745124ec5cfa6a6104d3cb4c87dbfc14d09e643e7b2143979dee27485841fd76b0fe
7
- data.tar.gz: 3404a699868beb475daee809cc67788a70152c0d5eba045b7d3c007e3b3fccb66ee6bb432832a8e9872cd6d3faf281fab60bf151c01eaf1cf52d6275644012bb
6
+ metadata.gz: ade5d1cf6fc11553519fe9217dffb06453e0ab7d69ab1532b3f2e2079dd05d035d90ce5ce92e4d0e1195f2a8f79df5b4d44c4cedb27f14df529ac0b0e91cf730
7
+ data.tar.gz: ff085546b0db152df0983dabea49ec5b0cf47525cca6118d3776378e908ea04fd675f0bb1daceb944d6be141615e3a5d9da5774025a0dc6ef609dd8b311b1412
data/.rubocop.yml ADDED
@@ -0,0 +1,75 @@
1
+ AllCops:
2
+ NewCops: disable
3
+ SuggestExtensions: false
4
+ TargetRubyVersion: 2.6
5
+
6
+ Documentation:
7
+ Enabled: false
8
+
9
+ Naming/AccessorMethodName:
10
+ Enabled: false
11
+
12
+ Naming/VariableNumber:
13
+ Enabled: false
14
+
15
+ Naming/FileName:
16
+ Enabled: false
17
+
18
+ Security/MarshalLoad:
19
+ Enabled: false
20
+
21
+ Layout/EndOfLine:
22
+ Enabled: False
23
+
24
+ Style/ClassVars:
25
+ Enabled: false
26
+
27
+ Style/OptionalBooleanParameter:
28
+ Enabled: false
29
+
30
+ Style/StringConcatenation:
31
+ Enabled: false
32
+
33
+ Style/PerlBackrefs:
34
+ Enabled: false
35
+
36
+ Style/StringLiterals:
37
+ Enabled: true
38
+ EnforcedStyle: double_quotes
39
+
40
+ Style/StringLiteralsInInterpolation:
41
+ Enabled: true
42
+ EnforcedStyle: double_quotes
43
+
44
+ Style/WordArray:
45
+ Enabled: false
46
+
47
+ Style/EvalWithLocation:
48
+ Enabled: false
49
+
50
+ Layout/LineLength:
51
+ Max: 400
52
+
53
+ Metrics/MethodLength:
54
+ Max: 80
55
+
56
+ Metrics/BlockLength:
57
+ Max: 60
58
+
59
+ Metrics/AbcSize:
60
+ Max: 60
61
+
62
+ Metrics/PerceivedComplexity:
63
+ Max: 60
64
+
65
+ Metrics/ClassLength:
66
+ Max: 800
67
+
68
+ Metrics/CyclomaticComplexity:
69
+ Max: 60
70
+
71
+ Metrics/ParameterLists:
72
+ Max: 8
73
+
74
+ Metrics/ModuleLength:
75
+ Max: 200
data/.solargraph.yml ADDED
@@ -0,0 +1,22 @@
1
+ ---
2
+ include:
3
+ - "**/*.rb"
4
+ exclude:
5
+ - spec/**/*
6
+ - test/**/*
7
+ - vendor/**/*
8
+ - ".bundle/**/*"
9
+ require: []
10
+ domains: []
11
+ reporters:
12
+ - rubocop
13
+ # - require_not_found
14
+ formatter:
15
+ rubocop:
16
+ cops: safe
17
+ except: []
18
+ only: []
19
+ extra_args: []
20
+ require_paths: []
21
+ plugins: []
22
+ max_files: 5000
data/Gemfile CHANGED
@@ -1,3 +1,7 @@
1
- source 'https://rubygems.org'
1
+ # frozen_string_literal: true
2
2
 
3
- gem 'lru_redux'
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
6
+
7
+ gem "lru_redux"
data/README.md CHANGED
@@ -19,56 +19,58 @@ of regular expressions.
19
19
  * Extract noun phrases from tagged text
20
20
  * etc.
21
21
 
22
- ### Synopsis:
22
+ ### Synopsis
23
23
 
24
- require 'engtagger'
24
+ ```ruby
25
+ require 'engtagger'
25
26
 
26
- # Create a parser object
27
- tgr = EngTagger.new
27
+ # Create a parser object
28
+ tgr = EngTagger.new
28
29
 
29
- # Sample text
30
- text = "Alice chased the big fat cat."
30
+ # Sample text
31
+ text = "Alice chased the big fat cat."
31
32
 
32
- # Add part-of-speech tags to text
33
- tagged = tgr.add_tags(text)
33
+ # Add part-of-speech tags to text
34
+ tagged = tgr.add_tags(text)
34
35
 
35
- #=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj><nn>cat</nn> <pp>.</pp>"
36
+ #=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj><nn>cat</nn> <pp>.</pp>"
36
37
 
37
- # Get a list of all nouns and noun phrases with occurrence counts
38
- word_list = tgr.get_words(text)
38
+ # Get a list of all nouns and noun phrases with occurrence counts
39
+ word_list = tgr.get_words(text)
39
40
 
40
- #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
41
+ #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
41
42
 
42
- # Get a readable version of the tagged text
43
- readable = tgr.get_readable(text)
43
+ # Get a readable version of the tagged text
44
+ readable = tgr.get_readable(text)
44
45
 
45
- #=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
46
+ #=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
46
47
 
47
- # Get all nouns from a tagged output
48
- nouns = tgr.get_nouns(tagged)
48
+ # Get all nouns from a tagged output
49
+ nouns = tgr.get_nouns(tagged)
49
50
 
50
- #=> {"cat"=>1, "Alice"=>1}
51
+ #=> {"cat"=>1, "Alice"=>1}
51
52
 
52
- # Get all proper nouns
53
- proper = tgr.get_proper_nouns(tagged)
53
+ # Get all proper nouns
54
+ proper = tgr.get_proper_nouns(tagged)
54
55
 
55
- #=> {"Alice"=>1}
56
+ #=> {"Alice"=>1}
56
57
 
57
- # Get all past tense verbs
58
- pt_verbs = tgr.get_past_tense_verbs(tagged)
58
+ # Get all past tense verbs
59
+ pt_verbs = tgr.get_past_tense_verbs(tagged)
59
60
 
60
- #=> {"chased"=>1}
61
+ #=> {"chased"=>1}
61
62
 
62
- # Get all the adjectives
63
- adj = tgr.get_adjectives(tagged)
63
+ # Get all the adjectives
64
+ adj = tgr.get_adjectives(tagged)
64
65
 
65
- #=> {"big"=>1, "fat"=>1}
66
+ #=> {"big"=>1, "fat"=>1}
66
67
 
67
- # Get all noun phrases of any syntactic level
68
- # (same as word_list but take a tagged input)
69
- nps = tgr.get_noun_phrases(tagged)
68
+ # Get all noun phrases of any syntactic level
69
+ # (same as word_list but take a tagged input)
70
+ nps = tgr.get_noun_phrases(tagged)
70
71
 
71
- #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
72
+ #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
73
+ ```
72
74
 
73
75
  ### Tag Set
74
76
 
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
- #!/usr/bin/env rake
1
+ # frozen_string_literal: true
2
+
2
3
  require "bundler/gem_tasks"
4
+ require "rake/testtask"
5
+
6
+ Rake::TestTask.new do |t|
7
+ t.libs << "test"
8
+ t.test_files = FileList["test/test*.rb"]
9
+ t.verbose = true
10
+ end
data/engtagger.gemspec CHANGED
@@ -1,19 +1,22 @@
1
- # -*- encoding: utf-8 -*-
2
- require File.expand_path('../lib/engtagger/version', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/engtagger/version"
3
4
 
4
5
  Gem::Specification.new do |gem|
5
6
  gem.authors = ["Yoichiro Hasebe"]
6
7
  gem.email = ["yohasebe@gmail.com"]
7
- gem.summary = %q{A probability based, corpus-trained English POS tagger}
8
- gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
9
- gem.homepage = "http://github.com/yohasebe/engtagger"
10
-
11
- gem.files = `git ls-files`.split($\)
12
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
8
+ gem.summary = "A probability based, corpus-trained English POS tagger"
9
+ gem.description = "A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values."
10
+ gem.homepage = "http://github.com/yohasebe/engtagger"
11
+ gem.license = "GPL"
12
+ gem.required_ruby_version = Gem::Requirement.new(">= 2.6")
13
+ gem.files = Dir.chdir(File.expand_path(__dir__)) do
14
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
15
+ end
16
+ gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
13
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
18
  gem.name = "engtagger"
15
19
  gem.require_paths = ["lib"]
16
20
  gem.version = EngTagger::VERSION
17
-
18
- gem.add_runtime_dependency 'lru_redux'
21
+ gem.add_dependency "lru_redux"
19
22
  end
@@ -1,23 +1,20 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
1
+ # frozen_string_literal: true
3
2
 
4
3
  module Stemmable
5
-
6
4
  STEP_2_LIST = {
7
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
8
- 'izer'=>'ize', 'bli'=>'ble',
9
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
10
- 'ization'=>'ize', 'ation'=>'ate',
11
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
12
- 'ousness'=>'ous', 'aliti'=>'al',
13
- 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
14
- }
5
+ "ational" => "ate", "tional" => "tion", "enci" => "ence", "anci" => "ance",
6
+ "izer" => "ize", "bli" => "ble",
7
+ "alli" => "al", "entli" => "ent", "eli" => "e", "ousli" => "ous",
8
+ "ization" => "ize", "ation" => "ate",
9
+ "ator" => "ate", "alism" => "al", "iveness" => "ive", "fulness" => "ful",
10
+ "ousness" => "ous", "aliti" => "al",
11
+ "iviti" => "ive", "biliti" => "ble", "logi" => "log"
12
+ }.freeze
15
13
 
16
14
  STEP_3_LIST = {
17
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
18
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
19
- }
20
-
15
+ "icate" => "ic", "ative" => "", "alize" => "al", "iciti" => "ic",
16
+ "ical" => "ic", "ful" => "", "ness" => ""
17
+ }.freeze
21
18
 
22
19
  SUFFIX_1_REGEXP = /(
23
20
  ational |
@@ -40,7 +37,7 @@ module Stemmable
40
37
  aliti |
41
38
  iviti |
42
39
  biliti |
43
- logi)$/x
40
+ logi)$/x.freeze
44
41
 
45
42
 
46
43
  SUFFIX_2_REGEXP = /(
@@ -61,20 +58,18 @@ module Stemmable
61
58
  iti |
62
59
  ous |
63
60
  ive |
64
- ize)$/x
65
-
61
+ ize)$/x.freeze
66
62
 
67
- C = "[^aeiou]" # consonant
68
- V = "[aeiouy]" # vowel
69
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
70
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
63
+ C = "[^aeiou]" # consonant
64
+ V = "[aeiouy]" # vowel
65
+ CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
66
+ VV = "#{V}(?>[aeiou]*)" # vowel sequence
71
67
 
72
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
73
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
74
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
75
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
68
+ MGR0 = /^(#{CC})?#{VV}#{CC}/o.freeze # [cc]vvcc... is m>0
69
+ MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o.freeze # [cc]vvcc[vv] is m=1
70
+ MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o.freeze # [cc]vvccvvcc... is m>1
71
+ VOWEL_IN_STEM = /^(#{CC})?#{V}/o.freeze # vowel in stem
76
72
 
77
- #
78
73
  # Porter stemmer in Ruby.
79
74
  #
80
75
  # This is the Porter stemming algorithm, ported to Ruby from the
@@ -90,30 +85,31 @@ module Stemmable
90
85
  #
91
86
 
92
87
  def stem_porter
93
-
94
88
  # make a copy of the given object and convert it to a string.
95
- w = self.dup.to_str
89
+ w = dup.to_str
96
90
 
97
91
  return w if w.length < 3
98
92
 
99
93
  # now map initial y to Y so that the patterns never treat it as vowel
100
- w[0] = 'Y' if w[0] == ?y
94
+ w[0] = "Y" if w[0] == "y"
101
95
 
102
96
  # Step 1a
103
- if w =~ /(ss|i)es$/
97
+ case w
98
+ when /(ss|i)es$/
104
99
  w = $` + $1
105
- elsif w =~ /([^s])s$/
100
+ when /([^s])s$/
106
101
  w = $` + $1
107
102
  end
108
103
 
109
104
  # Step 1b
110
- if w =~ /eed$/
105
+ case w
106
+ when /eed$/
111
107
  w.chop! if $` =~ MGR0
112
- elsif w =~ /(ed|ing)$/
108
+ when /(ed|ing)$/
113
109
  stem = $`
114
110
  if stem =~ VOWEL_IN_STEM
115
111
  w = stem
116
- case w
112
+ case w
117
113
  when /(at|bl|iz)$/ then w << "e"
118
114
  when /([^aeiouylsz])\1$/ then w.chop!
119
115
  when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
@@ -131,59 +127,41 @@ module Stemmable
131
127
  stem = $`
132
128
  suffix = $1
133
129
  # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
134
- if stem =~ MGR0
135
- w = stem + STEP_2_LIST[suffix]
136
- end
130
+ w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
137
131
  end
138
132
 
139
133
  # Step 3
140
134
  if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
141
135
  stem = $`
142
136
  suffix = $1
143
- if stem =~ MGR0
144
- w = stem + STEP_3_LIST[suffix]
145
- end
137
+ w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
146
138
  end
147
139
 
148
140
  # Step 4
149
141
  if w =~ SUFFIX_2_REGEXP
150
142
  stem = $`
151
- if stem =~ MGR1
152
- w = stem
153
- end
143
+ w = stem if stem =~ MGR1
154
144
  elsif w =~ /(s|t)(ion)$/
155
145
  stem = $` + $1
156
- if stem =~ MGR1
157
- w = stem
158
- end
146
+ w = stem if stem =~ MGR1
159
147
  end
160
148
 
161
149
  # Step 5
162
150
  if w =~ /e$/
163
151
  stem = $`
164
- if (stem =~ MGR1) ||
165
- (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
166
- w = stem
167
- end
152
+ w = stem if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
168
153
  end
169
154
 
170
- if w =~ /ll$/ && w =~ MGR1
171
- w.chop!
172
- end
155
+ w.chop! if w =~ /ll$/ && w =~ MGR1
173
156
 
174
157
  # and turn initial Y back to y
175
- w[0] = 'y' if w[0] == ?Y
176
-
158
+ w[0] = "y" if w[0] == "Y"
177
159
  w
178
160
  end
179
161
 
180
-
181
- #
182
162
  # make the stem_porter the default stem method, just in case we
183
163
  # feel like having multiple stemmers available later.
184
- #
185
164
  alias stem stem_porter
186
-
187
165
  end
188
166
 
189
167
  # Add stem method to all Strings
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class EngTagger
2
- VERSION = "0.3.2"
4
+ VERSION = "0.4.0"
3
5
  end