lemmatizer 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +27 -3
- data/lib/lemmatizer/lemmatizer.rb +18 -4
- data/lib/lemmatizer/version.rb +1 -1
- data/spec/lemmatizer_spec.rb +24 -10
- data/spec/user.dict3.txt +6 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 622ecac5f6d2ffa86a231a0bf2a1689bc5f6746d53a388c9efc4041c30f2e151
|
4
|
+
data.tar.gz: 7886e4ed919288df6216495f2f0b047e6555c6008c36dc1233c19925fb244501
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de93d602e0a85badd29aa47f9fa4e2970fc3f2b4feb24e596be1921820c57cc3a3d162127bfff817e5b06094e25612be01bbe4f5751bad8278cbd7d6b7bad38c
|
7
|
+
data.tar.gz: d80dd24c7cb03de971f5c1e8bb1cf3a3a19bb41c0a5dfdf0a079c4b205ba524a360296d28370d3d72328cc0ba91fc67da5862552d11baf80efdd1d263d8b3353
|
data/README.md
CHANGED
@@ -51,14 +51,14 @@ Supplying with user dict
|
|
51
51
|
# You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
|
52
52
|
# The data in user supplied files overrides the preset data. Here's the sample.
|
53
53
|
|
54
|
-
# --- sample.
|
54
|
+
# --- sample.dict1.txt (don't include hash symbol on the left) ---
|
55
55
|
# adj higher high
|
56
56
|
# adj highest high
|
57
57
|
# noun MacBooks MacBook
|
58
58
|
# ---------------------------------------------------------------
|
59
59
|
|
60
|
-
lem = Lemmatizer.new("sample.
|
61
|
-
# => 3
|
60
|
+
lem = Lemmatizer.new("sample.dict1.txt")
|
61
|
+
# => 3 items added from dict file provided
|
62
62
|
|
63
63
|
p lem.lemma("higher", :adj) # => "high"
|
64
64
|
p lem.lemma("highest", :adj) # => "high"
|
@@ -67,14 +67,38 @@ p lem.lemma("MacBooks", :noun) # => "MacBook"
|
|
67
67
|
# The argument to Lemmatizer.new can be either of the following:
|
68
68
|
# 1) a path string to a dict file (e.g. "/path/to/dict.txt")
|
69
69
|
# 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
|
70
|
+
```
|
71
|
+
|
72
|
+
Resolving abbreviations
|
73
|
+
-----------
|
74
|
+
```ruby
|
75
|
+
# You can use 'abbr' tag in user dicts to resolve abbreviations in text.
|
76
|
+
|
77
|
+
# --- sample.dict2.txt (don't include hash symbol on the left) ---
|
78
|
+
# abbr utexas University of Texas
|
79
|
+
# abbr mit Massachusetts Institute of Technology
|
80
|
+
# ---------------------------------------------------------------
|
81
|
+
|
82
|
+
# <NOTE>
|
83
|
+
# 1. Expressions on the right (substitutes) can contain white spaces,
|
84
|
+
# while expressions in the middle (words to be replaced) cannot.
|
85
|
+
# 2. Double or single quotation marks can be used with substitute expressions,
|
86
|
+
# but not with original expressions.
|
87
|
+
|
88
|
+
lem = Lemmatizer.new("sample.dict2.txt")
|
89
|
+
# => 2 items added from dict file provided
|
70
90
|
|
91
|
+
p lem.lemma("utexas", :abbr) # => "University of Texas"
|
92
|
+
p lem.lemma("mit", :abbr) # => "Massachusetts Institute of Technology"
|
71
93
|
```
|
72
94
|
|
73
95
|
Author
|
74
96
|
------
|
97
|
+
|
75
98
|
* Yoichiro Hasebe <yohasebe@gmail.com>
|
76
99
|
|
77
100
|
Thanks for assistance and contributions:
|
101
|
+
|
78
102
|
* Vladimir Ivic <http://vladimirivic.com>
|
79
103
|
|
80
104
|
License
|
@@ -1,3 +1,4 @@
|
|
1
|
+
|
1
2
|
module Lemmatizer
|
2
3
|
class Lemmatizer
|
3
4
|
DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
|
@@ -51,6 +52,8 @@ module Lemmatizer
|
|
51
52
|
],
|
52
53
|
:adv => [
|
53
54
|
],
|
55
|
+
:abbr => [
|
56
|
+
],
|
54
57
|
:unknown => [
|
55
58
|
]
|
56
59
|
}
|
@@ -77,7 +80,7 @@ module Lemmatizer
|
|
77
80
|
|
78
81
|
def lemma(form, pos = nil)
|
79
82
|
unless pos
|
80
|
-
[:verb, :noun, :adj, :adv].each do |p|
|
83
|
+
[:verb, :noun, :adj, :adv, :abbr].each do |p|
|
81
84
|
result = lemma(form, p)
|
82
85
|
return result unless result == form
|
83
86
|
end
|
@@ -168,6 +171,8 @@ module Lemmatizer
|
|
168
171
|
return :adj
|
169
172
|
when "r", "adverb", "adv"
|
170
173
|
return :adv
|
174
|
+
when "b", "abbrev", "abbr", "abr"
|
175
|
+
return :abbr
|
171
176
|
else
|
172
177
|
return :unknown
|
173
178
|
end
|
@@ -178,15 +183,24 @@ module Lemmatizer
|
|
178
183
|
open_file(dict) do |io|
|
179
184
|
io.each_line do |line|
|
180
185
|
# pos must be one of n|v|r|a|b or noun|verb|adverb|adjective|abbr (see str_to_pos for the accepted aliases)
|
181
|
-
p, w, s = line.split(/\s
|
186
|
+
p, w, s = line.split(/\s+/, 3)
|
182
187
|
pos = str_to_pos(p)
|
188
|
+
word = w
|
189
|
+
substitute = s.strip
|
190
|
+
if /\A\"(.*)\"\z/ =~ substitute
|
191
|
+
substitute = $1
|
192
|
+
end
|
193
|
+
if /\A\'(.*)\'\z/ =~ substitute
|
194
|
+
substitute = $1
|
195
|
+
end
|
196
|
+
next unless (pos && word && substitute)
|
183
197
|
if @wordlists[pos]
|
184
|
-
@wordlists[pos][
|
198
|
+
@wordlists[pos][word] = substitute
|
185
199
|
num_lex_added += 1
|
186
200
|
end
|
187
201
|
end
|
188
202
|
end
|
189
|
-
puts "#{num_lex_added} items added from #{File.basename dict}"
|
203
|
+
# puts "#{num_lex_added} items added from #{File.basename dict}"
|
190
204
|
end
|
191
205
|
end
|
192
206
|
end
|
data/lib/lemmatizer/version.rb
CHANGED
data/spec/lemmatizer_spec.rb
CHANGED
@@ -7,8 +7,9 @@ describe 'Lemmatizer' do
|
|
7
7
|
@lemmatizer = Lemmatizer.new
|
8
8
|
user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
|
9
9
|
user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
|
10
|
+
user_data3 = File.join(File.dirname(__FILE__), "user.dict3.txt")
|
10
11
|
@lemmatizer_single_userdict = Lemmatizer.new(user_data1)
|
11
|
-
@lemmatizer_multiple_userdicts = Lemmatizer.new([
|
12
|
+
@lemmatizer_multiple_userdicts = Lemmatizer.new([user_data2, user_data3])
|
12
13
|
end
|
13
14
|
|
14
15
|
describe '#lemma' do
|
@@ -48,7 +49,7 @@ describe 'Lemmatizer' do
|
|
48
49
|
end
|
49
50
|
|
50
51
|
it 'gives a result when no pos is given' do
|
51
|
-
# Order: :verb, :noun, :adv, or :
|
52
|
+
# Order: :verb, :noun, :adj, :adv, or :abbr
|
52
53
|
result_1 = @lemmatizer.lemma('plays')
|
53
54
|
expect(result_1).to eq('play')
|
54
55
|
|
@@ -81,23 +82,36 @@ describe 'Lemmatizer' do
|
|
81
82
|
# 'MacBooks' -> 'MacBook'
|
82
83
|
result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
|
83
84
|
expect(result_u1).to eq('MacBook')
|
84
|
-
|
85
|
-
|
85
|
+
# 'iPhones' -> 'iPhone'
|
86
|
+
result_u2 = @lemmatizer_single_userdict.lemma('iPhones', :noun)
|
87
|
+
expect(result_u2).to eq('iPhone')
|
86
88
|
end
|
87
89
|
|
88
90
|
it 'can load uder dicts that override presets' do
|
89
|
-
# 'MacBooks' -> 'MacBook'
|
90
|
-
result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
|
91
|
-
expect(result_ud1).to eq('MacBook')
|
92
91
|
# 'higher' -> 'high'
|
93
|
-
result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
|
94
|
-
expect(result_ud2).to eq('high')
|
95
|
-
# 'highest' -> 'high'
|
96
92
|
result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
|
97
93
|
expect(result_ud3).to eq('high')
|
98
94
|
# check if (unoverridden) preset data is kept intact
|
99
95
|
result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
|
100
96
|
expect(result_ud4).to eq('cry')
|
97
|
+
# 'I'm' -> 'I am'
|
98
|
+
result_ud5 = @lemmatizer_multiple_userdicts.lemma("I'm", :abbr)
|
99
|
+
expect(result_ud5).to eq('I am')
|
100
|
+
# 'You're' -> 'you are'
|
101
|
+
result_ud6 = @lemmatizer_multiple_userdicts.lemma("You're", :abbr)
|
102
|
+
expect(result_ud6).to eq("you are")
|
103
|
+
# 'you're' -> 'you are'
|
104
|
+
result_ud7 = @lemmatizer_multiple_userdicts.lemma("you're", :abbr)
|
105
|
+
expect(result_ud7).to eq("you are")
|
106
|
+
# 'h2s' -> 'Hydrogen Sulphide'
|
107
|
+
result_ud8 = @lemmatizer_multiple_userdicts.lemma("h2s", :abbr)
|
108
|
+
expect(result_ud8).to eq("Hydrogen Sulphide")
|
109
|
+
# 'utexas' -> 'University of Texas'
|
110
|
+
result_ud9 = @lemmatizer_multiple_userdicts.lemma("utexas", :abbr)
|
111
|
+
expect(result_ud9).to eq("University of Texas")
|
112
|
+
# 'mit' -> 'Massachusetts Institute of Technology'
|
113
|
+
result_ud10 = @lemmatizer_multiple_userdicts.lemma("mit", :abbr)
|
114
|
+
expect(result_ud10).to eq("Massachusetts Institute of Technology")
|
101
115
|
end
|
102
116
|
end
|
103
117
|
end
|
data/spec/user.dict3.txt
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lemmatizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -55,6 +55,7 @@ files:
|
|
55
55
|
- spec/spec_helper.rb
|
56
56
|
- spec/user.dict1.txt
|
57
57
|
- spec/user.dict2.txt
|
58
|
+
- spec/user.dict3.txt
|
58
59
|
homepage: http://github.com/yohasebe/lemmatizer
|
59
60
|
licenses:
|
60
61
|
- MIT
|
@@ -83,3 +84,4 @@ test_files:
|
|
83
84
|
- spec/spec_helper.rb
|
84
85
|
- spec/user.dict1.txt
|
85
86
|
- spec/user.dict2.txt
|
87
|
+
- spec/user.dict3.txt
|