engtagger 0.3.2 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +72 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +74 -42
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +169 -192
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -206
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe357706e69ed72bec9569babe91cc8531e2c1d0eac71ac8d248bdd74b97ba98
|
4
|
+
data.tar.gz: 02e6bb2ba29ecabf8e5087c5a2dc92ccad57ef3578fbd9c844f93188a4d39ced
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e477b0d839e825e8d49135cb6d6c72c21555454d6f722d00b994442cdaaba2b1afa84ab8f22f82f64b9540a0e24914180c59a563830ea11b2a0239921d3e88e
|
7
|
+
data.tar.gz: 49b02532d7ad940b25b19ba59df364fc553373371f7850f068729af3a339773417ad7f8d5e0e58ecc3facc6ef2168ae86bb9f94a46f9a412bf51d7de36fdab1e
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
AllCops:
|
2
|
+
NewCops: disable
|
3
|
+
SuggestExtensions: false
|
4
|
+
TargetRubyVersion: 2.6
|
5
|
+
|
6
|
+
Documentation:
|
7
|
+
Enabled: false
|
8
|
+
|
9
|
+
Naming/AccessorMethodName:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Naming/VariableNumber:
|
13
|
+
Enabled: false
|
14
|
+
|
15
|
+
Naming/FileName:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Security/MarshalLoad:
|
19
|
+
Enabled: false
|
20
|
+
|
21
|
+
Style/ClassVars:
|
22
|
+
Enabled: false
|
23
|
+
|
24
|
+
Style/OptionalBooleanParameter:
|
25
|
+
Enabled: false
|
26
|
+
|
27
|
+
Style/StringConcatenation:
|
28
|
+
Enabled: false
|
29
|
+
|
30
|
+
Style/PerlBackrefs:
|
31
|
+
Enabled: false
|
32
|
+
|
33
|
+
Style/StringLiterals:
|
34
|
+
Enabled: true
|
35
|
+
EnforcedStyle: double_quotes
|
36
|
+
|
37
|
+
Style/StringLiteralsInInterpolation:
|
38
|
+
Enabled: true
|
39
|
+
EnforcedStyle: double_quotes
|
40
|
+
|
41
|
+
Style/WordArray:
|
42
|
+
Enabled: false
|
43
|
+
|
44
|
+
Style/EvalWithLocation:
|
45
|
+
Enabled: false
|
46
|
+
|
47
|
+
Layout/LineLength:
|
48
|
+
Max: 400
|
49
|
+
|
50
|
+
Metrics/MethodLength:
|
51
|
+
Max: 80
|
52
|
+
|
53
|
+
Metrics/BlockLength:
|
54
|
+
Max: 60
|
55
|
+
|
56
|
+
Metrics/AbcSize:
|
57
|
+
Max: 60
|
58
|
+
|
59
|
+
Metrics/PerceivedComplexity:
|
60
|
+
Max: 60
|
61
|
+
|
62
|
+
Metrics/ClassLength:
|
63
|
+
Max: 800
|
64
|
+
|
65
|
+
Metrics/CyclomaticComplexity:
|
66
|
+
Max: 60
|
67
|
+
|
68
|
+
Metrics/ParameterLists:
|
69
|
+
Max: 8
|
70
|
+
|
71
|
+
Metrics/ModuleLength:
|
72
|
+
Max: 200
|
data/.solargraph.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
---
|
2
|
+
include:
|
3
|
+
- "**/*.rb"
|
4
|
+
exclude:
|
5
|
+
- spec/**/*
|
6
|
+
- test/**/*
|
7
|
+
- vendor/**/*
|
8
|
+
- ".bundle/**/*"
|
9
|
+
require: []
|
10
|
+
domains: []
|
11
|
+
reporters:
|
12
|
+
- rubocop
|
13
|
+
# - require_not_found
|
14
|
+
formatter:
|
15
|
+
rubocop:
|
16
|
+
cops: safe
|
17
|
+
except: []
|
18
|
+
only: []
|
19
|
+
extra_args: []
|
20
|
+
require_paths: []
|
21
|
+
plugins: []
|
22
|
+
max_files: 5000
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
|
4
4
|
|
5
|
-
|
5
|
+
## Description
|
6
6
|
|
7
7
|
A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
8
8
|
tagger that assigns POS tags to English text based on a lookup dictionary and
|
@@ -13,64 +13,66 @@ word morphology or can be set to be treated as nouns or other parts of speech.
|
|
13
13
|
The tagger also extracts as many nouns and noun phrases as it can, using a set
|
14
14
|
of regular expressions.
|
15
15
|
|
16
|
-
|
16
|
+
## Features
|
17
17
|
|
18
18
|
* Assigns POS tags to English text
|
19
19
|
* Extracts noun phrases from tagged text
|
20
20
|
* etc.
|
21
21
|
|
22
|
-
|
22
|
+
## Synopsis
|
23
23
|
|
24
|
-
|
24
|
+
```ruby
|
25
|
+
require 'engtagger'
|
25
26
|
|
26
|
-
|
27
|
-
|
27
|
+
# Create a parser object
|
28
|
+
tgr = EngTagger.new
|
28
29
|
|
29
|
-
|
30
|
-
|
30
|
+
# Sample text
|
31
|
+
text = "Alice chased the big fat cat."
|
31
32
|
|
32
|
-
|
33
|
-
|
33
|
+
# Add part-of-speech tags to text
|
34
|
+
tagged = tgr.add_tags(text)
|
34
35
|
|
35
|
-
|
36
|
+
#=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj> <nn>cat</nn> <pp>.</pp>"
|
36
37
|
|
37
|
-
|
38
|
-
|
38
|
+
# Get a list of all nouns and noun phrases with occurrence counts
|
39
|
+
word_list = tgr.get_words(text)
|
39
40
|
|
40
|
-
|
41
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
41
42
|
|
42
|
-
|
43
|
-
|
43
|
+
# Get a readable version of the tagged text
|
44
|
+
readable = tgr.get_readable(text)
|
44
45
|
|
45
|
-
|
46
|
+
#=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
|
46
47
|
|
47
|
-
|
48
|
-
|
48
|
+
# Get all nouns from a tagged output
|
49
|
+
nouns = tgr.get_nouns(tagged)
|
49
50
|
|
50
|
-
|
51
|
+
#=> {"cat"=>1, "Alice"=>1}
|
51
52
|
|
52
|
-
|
53
|
-
|
53
|
+
# Get all proper nouns
|
54
|
+
proper = tgr.get_proper_nouns(tagged)
|
54
55
|
|
55
|
-
|
56
|
+
#=> {"Alice"=>1}
|
56
57
|
|
57
|
-
|
58
|
-
|
58
|
+
# Get all past tense verbs
|
59
|
+
pt_verbs = tgr.get_past_tense_verbs(tagged)
|
59
60
|
|
60
|
-
|
61
|
+
#=> {"chased"=>1}
|
61
62
|
|
62
|
-
|
63
|
-
|
63
|
+
# Get all the adjectives
|
64
|
+
adj = tgr.get_adjectives(tagged)
|
64
65
|
|
65
|
-
|
66
|
+
#=> {"big"=>1, "fat"=>1}
|
66
67
|
|
67
|
-
|
68
|
-
|
69
|
-
|
68
|
+
# Get all noun phrases of any syntactic level
|
69
|
+
# (same as word_list, but takes tagged input)
|
70
|
+
nps = tgr.get_noun_phrases(tagged)
|
70
71
|
|
71
|
-
|
72
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
73
|
+
```
|
72
74
|
|
73
|
-
|
75
|
+
## Tag Set
|
74
76
|
|
75
77
|
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
|
76
78
|
|
@@ -120,26 +122,56 @@ The set of POS tags used here is a modified version of the Penn Treebank tagset.
|
|
120
122
|
LRB Punctuation, left bracket (, {, [
|
121
123
|
RRB Punctuation, right bracket ), }, ]
|
122
124
|
|
123
|
-
|
125
|
+
## Installation
|
124
126
|
|
125
|
-
|
127
|
+
**Recommended Approach (without sudo):**
|
126
128
|
|
127
|
-
|
129
|
+
It is recommended to install the `engtagger` gem within your user environment without root privileges. This ensures proper file permissions and avoids potential issues. You can achieve this by using Ruby version managers like `rbenv` or `rvm` to manage your Ruby versions and gemsets.
|
128
130
|
|
129
|
-
|
131
|
+
To install without `sudo`, simply run:
|
130
132
|
|
131
|
-
|
133
|
+
```bash
|
134
|
+
gem install engtagger
|
135
|
+
```
|
132
136
|
|
133
|
-
|
137
|
+
**Alternative Approach (with sudo):**
|
138
|
+
|
139
|
+
If you must use `sudo` for installation, you'll need to adjust file permissions afterward to ensure accessibility.
|
140
|
+
|
141
|
+
1. Install the gem with `sudo`:
|
142
|
+
|
143
|
+
```bash
|
144
|
+
sudo gem install engtagger
|
145
|
+
```
|
146
|
+
|
147
|
+
2. Grant necessary permissions to your user:
|
148
|
+
|
149
|
+
```bash
|
150
|
+
sudo chown -R $(whoami) /Library/Ruby/Gems/2.6.0/gems/engtagger-0.4.1
|
151
|
+
```
|
152
|
+
|
153
|
+
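**Note:** The path above assumes the macOS system Ruby (gem directory version 2.6.0) and engtagger 0.4.1. If you are using a different Ruby or gem version, you will need to modify the path accordingly. You can find your Ruby version by running `ruby -v`, and your gem installation directory by running `gem environment gemdir`.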
|
154
|
+
|
155
|
+
## Troubleshooting
|
156
|
+
|
157
|
+
**Permission Issues:**
|
158
|
+
|
159
|
+
If you encounter "cannot load such file" errors after installation, they may be caused by incorrect file permissions. Ensure you've followed the instructions for adjusting permissions if you used `sudo` during installation.
|
160
|
+
|
161
|
+
## Author
|
162
|
+
|
163
|
+
Yoichiro Hasebe (yohasebe [at] gmail.com)
|
164
|
+
|
165
|
+
## Contributors
|
134
166
|
|
135
167
|
Many thanks to the collaborators listed in the right column of this GitHub page.
|
136
168
|
|
137
|
-
|
169
|
+
## Acknowledgement
|
138
170
|
|
139
171
|
This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
|
140
172
|
The credit for the crucial part of its algorithm/design therefore goes to
|
141
173
|
Aaron Coburn, the author of the original Perl version.
|
142
174
|
|
143
|
-
|
175
|
+
## License
|
144
176
|
|
145
177
|
This library is distributed under the GPL. Please see the LICENSE file.
|
data/Rakefile
CHANGED
data/engtagger.gemspec
CHANGED
@@ -1,19 +1,22 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/engtagger/version"
|
3
4
|
|
4
5
|
Gem::Specification.new do |gem|
|
5
6
|
gem.authors = ["Yoichiro Hasebe"]
|
6
7
|
gem.email = ["yohasebe@gmail.com"]
|
7
|
-
gem.summary
|
8
|
-
gem.description
|
9
|
-
gem.homepage
|
10
|
-
|
11
|
-
gem.
|
12
|
-
gem.
|
8
|
+
gem.summary = "A probability based, corpus-trained English POS tagger"
|
9
|
+
gem.description = "A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values."
|
10
|
+
gem.homepage = "http://github.com/yohasebe/engtagger"
|
11
|
+
gem.license = "GPL"
|
12
|
+
gem.required_ruby_version = Gem::Requirement.new(">= 2.6")
|
13
|
+
gem.files = Dir.chdir(File.expand_path(__dir__)) do
|
14
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
|
15
|
+
end
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
13
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
18
|
gem.name = "engtagger"
|
15
19
|
gem.require_paths = ["lib"]
|
16
20
|
gem.version = EngTagger::VERSION
|
17
|
-
|
18
|
-
gem.add_runtime_dependency 'lru_redux'
|
21
|
+
gem.add_dependency "lru_redux"
|
19
22
|
end
|