lemmatizer 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +19 -11
- data/lib/lemmatizer/lemmatizer.rb +6 -2
- data/lib/lemmatizer/version.rb +1 -1
- data/spec/lemmatizer_spec.rb +20 -10
- data/spec/user.dict1.txt +3 -0
- data/spec/user.dict2.txt +2 -0
- metadata +6 -4
- data/spec/user.dict.txt +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ffb812c39d3601bb86da6d6e44c8375cc8078bfb88089a8357c14c92473209a
|
4
|
+
data.tar.gz: fe6a030e331e8138231dfaf48b136bf5dc7e4e8be40a8457664bdd8bf2b8f6b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2505986c5b0919c7691fbec86afc814df9567e23024f68e9c7f1daa0afd1485534fcd700aa131146d12f8d7dbfb95fbcbce798fc37e4408afd7976d077d750f
|
7
|
+
data.tar.gz: 3e9e7f6289d906b2e4f984ccf285e24024de933b7aa9065f99559f057392299a69696ab82706624781f780d925eb6f1565f629462ec9a1f98fadfc3bea8810ba
|
data/README.md
CHANGED
@@ -41,25 +41,33 @@ p lem.lemma("MacBooks", :noun) # => "MacBooks"
|
|
41
41
|
p lem.lemma("higher", :adj) # => "higher" not "high"!
|
42
42
|
|
43
43
|
# The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
|
44
|
-
#
|
44
|
+
# To fix this, modify the original dict directly (lib/dict/index.{noun|verb|adj|adv})
|
45
|
+
# or supply custom dict files (recommended).
|
45
46
|
```
|
46
47
|
|
47
48
|
Supplying a user dict
|
48
49
|
-----------
|
49
50
|
```ruby
|
50
|
-
# You can supply
|
51
|
-
# The data in user supplied files overrides the preset data
|
51
|
+
# You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
|
52
|
+
# The data in user supplied files overrides the preset data. Here's a sample.
|
52
53
|
|
53
|
-
|
54
|
-
adj higher high
|
55
|
-
adj highest high
|
56
|
-
noun MacBooks MacBook
|
57
|
-
|
54
|
+
# --- sample.dict.txt (don't include hash symbol on the left) ---
|
55
|
+
# adj higher high
|
56
|
+
# adj highest high
|
57
|
+
# noun MacBooks MacBook
|
58
|
+
# ---------------------------------------------------------------
|
58
59
|
|
59
60
|
lem = Lemmatizer.new("sample.dict.txt")
|
60
|
-
|
61
|
-
|
62
|
-
p lem.lemma("
|
61
|
+
# => 3 lexical items added from dict file provided
|
62
|
+
|
63
|
+
p lem.lemma("higher", :adj) # => "high"
|
64
|
+
p lem.lemma("highest", :adj) # => "high"
|
65
|
+
p lem.lemma("MacBooks", :noun) # => "MacBook"
|
66
|
+
|
67
|
+
# The argument to Lemmatizer.new can be either of the following:
|
68
|
+
# 1) a path string to a dict file (e.g. "/path/to/dict.txt")
|
69
|
+
# 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
|
70
|
+
|
63
71
|
```
|
64
72
|
|
65
73
|
Author
|
data/lib/lemmatizer/lemmatizer.rb
CHANGED
@@ -68,7 +68,11 @@ module Lemmatizer
|
|
68
68
|
load_wordnet_files(pos, pair[0], pair[1])
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if dict
|
72
|
+
[dict].flatten.each do |d|
|
73
|
+
load_provided_dict(d)
|
74
|
+
end
|
75
|
+
end
|
72
76
|
end
|
73
77
|
|
74
78
|
def lemma(form, pos = nil)
|
@@ -182,7 +186,7 @@ module Lemmatizer
|
|
182
186
|
end
|
183
187
|
end
|
184
188
|
end
|
185
|
-
puts "#{num_lex_added}
|
189
|
+
puts "#{num_lex_added} items added from #{File.basename dict}"
|
186
190
|
end
|
187
191
|
end
|
188
192
|
end
|
data/lib/lemmatizer/version.rb
CHANGED
data/spec/lemmatizer_spec.rb
CHANGED
@@ -5,8 +5,10 @@ describe 'Lemmatizer' do
|
|
5
5
|
|
6
6
|
before(:all) do
|
7
7
|
@lemmatizer = Lemmatizer.new
|
8
|
-
|
9
|
-
|
8
|
+
user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
|
9
|
+
user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
|
10
|
+
@lemmatizer_single_userdict = Lemmatizer.new(user_data1)
|
11
|
+
@lemmatizer_multiple_userdicts = Lemmatizer.new([user_data1, user_data2])
|
10
12
|
end
|
11
13
|
|
12
14
|
describe '#lemma' do
|
@@ -75,19 +77,27 @@ describe 'Lemmatizer' do
|
|
75
77
|
expect(result_n2).not_to eq('MacBook')
|
76
78
|
end
|
77
79
|
|
78
|
-
it 'can load user dict that overrides presets' do
|
80
|
+
it 'can load a user dict that overrides presets' do
|
79
81
|
# 'MacBooks' -> 'MacBook'
|
80
|
-
result_u1 = @
|
82
|
+
result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
|
81
83
|
expect(result_u1).to eq('MacBook')
|
84
|
+
result_u2 = @lemmatizer_single_userdict.lemma('crying', :verb)
|
85
|
+
expect(result_u2).to eq('cry')
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'can load user dicts that override presets' do
|
89
|
+
# 'MacBooks' -> 'MacBook'
|
90
|
+
result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
|
91
|
+
expect(result_ud1).to eq('MacBook')
|
82
92
|
# 'higher' -> 'high'
|
83
|
-
|
84
|
-
expect(
|
93
|
+
result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
|
94
|
+
expect(result_ud2).to eq('high')
|
85
95
|
# 'highest' -> 'high'
|
86
|
-
|
87
|
-
expect(
|
96
|
+
result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
|
97
|
+
expect(result_ud3).to eq('high')
|
88
98
|
# check if (unoverridden) preset data is kept intact
|
89
|
-
|
90
|
-
expect(
|
99
|
+
result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
|
100
|
+
expect(result_ud4).to eq('cry')
|
91
101
|
end
|
92
102
|
end
|
93
103
|
end
|
data/spec/user.dict1.txt
ADDED
data/spec/user.dict2.txt
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lemmatizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -53,7 +53,8 @@ files:
|
|
53
53
|
- lib/lemmatizer/version.rb
|
54
54
|
- spec/lemmatizer_spec.rb
|
55
55
|
- spec/spec_helper.rb
|
56
|
-
- spec/user.dict.txt
|
56
|
+
- spec/user.dict1.txt
|
57
|
+
- spec/user.dict2.txt
|
57
58
|
homepage: http://github.com/yohasebe/lemmatizer
|
58
59
|
licenses:
|
59
60
|
- MIT
|
@@ -80,4 +81,5 @@ summary: English lemmatizer in Ruby
|
|
80
81
|
test_files:
|
81
82
|
- spec/lemmatizer_spec.rb
|
82
83
|
- spec/spec_helper.rb
|
83
|
-
- spec/user.dict.txt
|
84
|
+
- spec/user.dict1.txt
|
85
|
+
- spec/user.dict2.txt
|