lemmatizer 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +19 -11
- data/lib/lemmatizer/lemmatizer.rb +6 -2
- data/lib/lemmatizer/version.rb +1 -1
- data/spec/lemmatizer_spec.rb +20 -10
- data/spec/user.dict1.txt +3 -0
- data/spec/user.dict2.txt +2 -0
- metadata +6 -4
- data/spec/user.dict.txt +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ffb812c39d3601bb86da6d6e44c8375cc8078bfb88089a8357c14c92473209a
|
4
|
+
data.tar.gz: fe6a030e331e8138231dfaf48b136bf5dc7e4e8be40a8457664bdd8bf2b8f6b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2505986c5b0919c7691fbec86afc814df9567e23024f68e9c7f1daa0afd1485534fcd700aa131146d12f8d7dbfb95fbcbce798fc37e4408afd7976d077d750f
|
7
|
+
data.tar.gz: 3e9e7f6289d906b2e4f984ccf285e24024de933b7aa9065f99559f057392299a69696ab82706624781f780d925eb6f1565f629462ec9a1f98fadfc3bea8810ba
|
data/README.md
CHANGED
@@ -41,25 +41,33 @@ p lem.lemma("MacBooks", :noun) # => "MacBooks"
|
|
41
41
|
p lem.lemma("higher", :adj) # => "higher" not "high"!
|
42
42
|
|
43
43
|
# The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
|
44
|
-
#
|
44
|
+
# To fix this, modify the original dict directly (lib/dict/index.{noun|verb|adj|adv})
|
45
|
+
# or supply custom dict files (recommended).
|
45
46
|
```
|
46
47
|
|
47
48
|
Supplying a user dict
|
48
49
|
-----------
|
49
50
|
```ruby
|
50
|
-
# You can supply
|
51
|
-
# The data in user supplied files overrides the preset data
|
51
|
+
# You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
|
52
|
+
# The data in user-supplied files overrides the preset data. Here's a sample.
|
52
53
|
|
53
|
-
|
54
|
-
adj higher high
|
55
|
-
adj highest high
|
56
|
-
noun MacBooks MacBook
|
57
|
-
|
54
|
+
# --- sample.dict.txt (don't include hash symbol on the left) ---
|
55
|
+
# adj higher high
|
56
|
+
# adj highest high
|
57
|
+
# noun MacBooks MacBook
|
58
|
+
# ---------------------------------------------------------------
|
58
59
|
|
59
60
|
lem = Lemmatizer.new("sample.dict.txt")
|
60
|
-
|
61
|
-
|
62
|
-
p lem.lemma("
|
61
|
+
# => 3 lexical items added from dict file provided
|
62
|
+
|
63
|
+
p lem.lemma("higher", :adj) # => "high"
|
64
|
+
p lem.lemma("highest", :adj) # => "high"
|
65
|
+
p lem.lemma("MacBooks", :noun) # => "MacBook"
|
66
|
+
|
67
|
+
# The argument to Lemmatizer.new can be either of the following:
|
68
|
+
# 1) a path string to a dict file (e.g. "/path/to/dict.txt")
|
69
|
+
# 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
|
70
|
+
|
63
71
|
```
|
64
72
|
|
65
73
|
Author
|
@@ -68,7 +68,11 @@ module Lemmatizer
|
|
68
68
|
load_wordnet_files(pos, pair[0], pair[1])
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if dict
|
72
|
+
[dict].flatten.each do |d|
|
73
|
+
load_provided_dict(d)
|
74
|
+
end
|
75
|
+
end
|
72
76
|
end
|
73
77
|
|
74
78
|
def lemma(form, pos = nil)
|
@@ -182,7 +186,7 @@ module Lemmatizer
|
|
182
186
|
end
|
183
187
|
end
|
184
188
|
end
|
185
|
-
puts "#{num_lex_added}
|
189
|
+
puts "#{num_lex_added} items added from #{File.basename dict}"
|
186
190
|
end
|
187
191
|
end
|
188
192
|
end
|
data/lib/lemmatizer/version.rb
CHANGED
data/spec/lemmatizer_spec.rb
CHANGED
@@ -5,8 +5,10 @@ describe 'Lemmatizer' do
|
|
5
5
|
|
6
6
|
before(:all) do
|
7
7
|
@lemmatizer = Lemmatizer.new
|
8
|
-
|
9
|
-
|
8
|
+
user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
|
9
|
+
user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
|
10
|
+
@lemmatizer_single_userdict = Lemmatizer.new(user_data1)
|
11
|
+
@lemmatizer_multiple_userdicts = Lemmatizer.new([user_data1, user_data2])
|
10
12
|
end
|
11
13
|
|
12
14
|
describe '#lemma' do
|
@@ -75,19 +77,27 @@ describe 'Lemmatizer' do
|
|
75
77
|
expect(result_n2).not_to eq('MacBook')
|
76
78
|
end
|
77
79
|
|
78
|
-
it 'can load user dict that overrides presets' do
|
80
|
+
it 'can load a user dict that overrides presets' do
|
79
81
|
# 'MacBooks' -> 'MacBook'
|
80
|
-
result_u1 = @
|
82
|
+
result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
|
81
83
|
expect(result_u1).to eq('MacBook')
|
84
|
+
result_u2 = @lemmatizer_single_userdict.lemma('crying', :verb)
|
85
|
+
expect(result_u2).to eq('cry')
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'can load user dicts that override presets' do
|
89
|
+
# 'MacBooks' -> 'MacBook'
|
90
|
+
result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
|
91
|
+
expect(result_ud1).to eq('MacBook')
|
82
92
|
# 'higher' -> 'high'
|
83
|
-
|
84
|
-
expect(
|
93
|
+
result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
|
94
|
+
expect(result_ud2).to eq('high')
|
85
95
|
# 'highest' -> 'high'
|
86
|
-
|
87
|
-
expect(
|
96
|
+
result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
|
97
|
+
expect(result_ud3).to eq('high')
|
88
98
|
# check if (unoverridden) preset data is kept intact
|
89
|
-
|
90
|
-
expect(
|
99
|
+
result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
|
100
|
+
expect(result_ud4).to eq('cry')
|
91
101
|
end
|
92
102
|
end
|
93
103
|
end
|
data/spec/user.dict1.txt
ADDED
data/spec/user.dict2.txt
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lemmatizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -53,7 +53,8 @@ files:
|
|
53
53
|
- lib/lemmatizer/version.rb
|
54
54
|
- spec/lemmatizer_spec.rb
|
55
55
|
- spec/spec_helper.rb
|
56
|
-
- spec/user.dict.txt
|
56
|
+
- spec/user.dict1.txt
|
57
|
+
- spec/user.dict2.txt
|
57
58
|
homepage: http://github.com/yohasebe/lemmatizer
|
58
59
|
licenses:
|
59
60
|
- MIT
|
@@ -80,4 +81,5 @@ summary: English lemmatizer in Ruby
|
|
80
81
|
test_files:
|
81
82
|
- spec/lemmatizer_spec.rb
|
82
83
|
- spec/spec_helper.rb
|
83
|
-
- spec/user.dict.txt
|
84
|
+
- spec/user.dict1.txt
|
85
|
+
- spec/user.dict2.txt
|