lemmatizer 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +27 -3
- data/lib/lemmatizer/lemmatizer.rb +18 -4
- data/lib/lemmatizer/version.rb +1 -1
- data/spec/lemmatizer_spec.rb +24 -10
- data/spec/user.dict3.txt +6 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 622ecac5f6d2ffa86a231a0bf2a1689bc5f6746d53a388c9efc4041c30f2e151
|
4
|
+
data.tar.gz: 7886e4ed919288df6216495f2f0b047e6555c6008c36dc1233c19925fb244501
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de93d602e0a85badd29aa47f9fa4e2970fc3f2b4feb24e596be1921820c57cc3a3d162127bfff817e5b06094e25612be01bbe4f5751bad8278cbd7d6b7bad38c
|
7
|
+
data.tar.gz: d80dd24c7cb03de971f5c1e8bb1cf3a3a19bb41c0a5dfdf0a079c4b205ba524a360296d28370d3d72328cc0ba91fc67da5862552d11baf80efdd1d263d8b3353
|
data/README.md
CHANGED
@@ -51,14 +51,14 @@ Supplying with user dict
|
|
51
51
|
# You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
|
52
52
|
# The data in user supplied files overrides the preset data. Here's the sample.
|
53
53
|
|
54
|
-
# --- sample.
|
54
|
+
# --- sample.dict1.txt (don't include hash symbol on the left) ---
|
55
55
|
# adj higher high
|
56
56
|
# adj highest high
|
57
57
|
# noun MacBooks MacBook
|
58
58
|
# ---------------------------------------------------------------
|
59
59
|
|
60
|
-
lem = Lemmatizer.new("sample.
|
61
|
-
# => 3
|
60
|
+
lem = Lemmatizer.new("sample.dict1.txt")
|
61
|
+
# => 3 items added from dict file provided
|
62
62
|
|
63
63
|
p lem.lemma("higher", :adj) # => "high"
|
64
64
|
p lem.lemma("highest", :adj) # => "high"
|
@@ -67,14 +67,38 @@ p lem.lemma("MacBooks", :noun) # => "MacBook"
|
|
67
67
|
# The argument to Lemmatizer.new can be either of the following:
|
68
68
|
# 1) a path string to a dict file (e.g. "/path/to/dict.txt")
|
69
69
|
# 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
|
70
|
+
```
|
71
|
+
|
72
|
+
Resolving abbreviations
|
73
|
+
-----------
|
74
|
+
```ruby
|
75
|
+
# You can use 'abbr' tag in user dicts to resolve abbreviations in text.
|
76
|
+
|
77
|
+
# --- sample.dict2.txt (don't include hash symbol on the left) ---
|
78
|
+
# abbr utexas University of Texas
|
79
|
+
# abbr mit Massachusetts Institute of Technology
|
80
|
+
# ---------------------------------------------------------------
|
81
|
+
|
82
|
+
# <NOTE>
|
83
|
+
# 1. Expressions on the right (substitutes) can contain white spaces,
|
84
|
+
# while expressions in the middle (words to be replaced) cannot.
|
85
|
+
# 2. Double/Single quotations could be used with substitute expressions,
|
86
|
+
# but not with original expressions.
|
87
|
+
|
88
|
+
lem = Lemmatizer.new("sample.dict2.txt")
|
89
|
+
# => 2 items added from dict file provided
|
70
90
|
|
91
|
+
p lem.lemma("utexas", :abbr) # => "University of Texas"
|
92
|
+
p lem.lemma("mit", :abbr) # => "Massachusetts Institute of Technology"
|
71
93
|
```
|
72
94
|
|
73
95
|
Author
|
74
96
|
------
|
97
|
+
|
75
98
|
* Yoichiro Hasebe <yohasebe@gmail.com>
|
76
99
|
|
77
100
|
Thanks for assistance and contributions:
|
101
|
+
|
78
102
|
* Vladimir Ivic <http://vladimirivic.com>
|
79
103
|
|
80
104
|
License
|
@@ -1,3 +1,4 @@
|
|
1
|
+
|
1
2
|
module Lemmatizer
|
2
3
|
class Lemmatizer
|
3
4
|
DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
|
@@ -51,6 +52,8 @@ module Lemmatizer
|
|
51
52
|
],
|
52
53
|
:adv => [
|
53
54
|
],
|
55
|
+
:abbr => [
|
56
|
+
],
|
54
57
|
:unknown => [
|
55
58
|
]
|
56
59
|
}
|
@@ -77,7 +80,7 @@ module Lemmatizer
|
|
77
80
|
|
78
81
|
def lemma(form, pos = nil)
|
79
82
|
unless pos
|
80
|
-
[:verb, :noun, :adj, :adv].each do |p|
|
83
|
+
[:verb, :noun, :adj, :adv, :abbr].each do |p|
|
81
84
|
result = lemma(form, p)
|
82
85
|
return result unless result == form
|
83
86
|
end
|
@@ -168,6 +171,8 @@ module Lemmatizer
|
|
168
171
|
return :adj
|
169
172
|
when "r", "adverb", "adv"
|
170
173
|
return :adv
|
174
|
+
when "b", "abbrev", "abbr", "abr"
|
175
|
+
return :abbr
|
171
176
|
else
|
172
177
|
return :unknown
|
173
178
|
end
|
@@ -178,15 +183,24 @@ module Lemmatizer
|
|
178
183
|
open_file(dict) do |io|
|
179
184
|
io.each_line do |line|
|
180
185
|
# pos must be either n|v|r|a or noun|verb|adverb|adjective
|
181
|
-
p, w, s = line.split(/\s
|
186
|
+
p, w, s = line.split(/\s+/, 3)
|
182
187
|
pos = str_to_pos(p)
|
188
|
+
word = w
|
189
|
+
substitute = s.strip
|
190
|
+
if /\A\"(.*)\"\z/ =~ substitute
|
191
|
+
substitute = $1
|
192
|
+
end
|
193
|
+
if /\A\'(.*)\'\z/ =~ substitute
|
194
|
+
substitute = $1
|
195
|
+
end
|
196
|
+
next unless (pos && word && substitute)
|
183
197
|
if @wordlists[pos]
|
184
|
-
@wordlists[pos][
|
198
|
+
@wordlists[pos][word] = substitute
|
185
199
|
num_lex_added += 1
|
186
200
|
end
|
187
201
|
end
|
188
202
|
end
|
189
|
-
puts "#{num_lex_added} items added from #{File.basename dict}"
|
203
|
+
# puts "#{num_lex_added} items added from #{File.basename dict}"
|
190
204
|
end
|
191
205
|
end
|
192
206
|
end
|
data/lib/lemmatizer/version.rb
CHANGED
data/spec/lemmatizer_spec.rb
CHANGED
@@ -7,8 +7,9 @@ describe 'Lemmatizer' do
|
|
7
7
|
@lemmatizer = Lemmatizer.new
|
8
8
|
user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
|
9
9
|
user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
|
10
|
+
user_data3 = File.join(File.dirname(__FILE__), "user.dict3.txt")
|
10
11
|
@lemmatizer_single_userdict = Lemmatizer.new(user_data1)
|
11
|
-
@lemmatizer_multiple_userdicts = Lemmatizer.new([
|
12
|
+
@lemmatizer_multiple_userdicts = Lemmatizer.new([user_data2, user_data3])
|
12
13
|
end
|
13
14
|
|
14
15
|
describe '#lemma' do
|
@@ -48,7 +49,7 @@ describe 'Lemmatizer' do
|
|
48
49
|
end
|
49
50
|
|
50
51
|
it 'gives a result when no pos is given' do
|
51
|
-
# Order: :verb, :noun, :adv, or :
|
52
|
+
# Order: :verb, :noun, :adv, :adj, or :abbr
|
52
53
|
result_1 = @lemmatizer.lemma('plays')
|
53
54
|
expect(result_1).to eq('play')
|
54
55
|
|
@@ -81,23 +82,36 @@ describe 'Lemmatizer' do
|
|
81
82
|
# 'MacBooks' -> 'MacBook'
|
82
83
|
result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
|
83
84
|
expect(result_u1).to eq('MacBook')
|
84
|
-
|
85
|
-
|
85
|
+
# 'iPhones' -> 'iPhone'
|
86
|
+
result_u2 = @lemmatizer_single_userdict.lemma('iPhones', :noun)
|
87
|
+
expect(result_u2).to eq('iPhone')
|
86
88
|
end
|
87
89
|
|
88
90
|
it 'can load uder dicts that override presets' do
|
89
|
-
# 'MacBooks' -> 'MacBook'
|
90
|
-
result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
|
91
|
-
expect(result_ud1).to eq('MacBook')
|
92
91
|
# 'higher' -> 'high'
|
93
|
-
result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
|
94
|
-
expect(result_ud2).to eq('high')
|
95
|
-
# 'highest' -> 'high'
|
96
92
|
result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
|
97
93
|
expect(result_ud3).to eq('high')
|
98
94
|
# check if (unoverridden) preset data is kept intact
|
99
95
|
result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
|
100
96
|
expect(result_ud4).to eq('cry')
|
97
|
+
# 'I'm' -> 'I am'
|
98
|
+
result_ud5 = @lemmatizer_multiple_userdicts.lemma("I'm", :abbr)
|
99
|
+
expect(result_ud5).to eq('I am')
|
100
|
+
# 'You're' -> 'you are'
|
101
|
+
result_ud6 = @lemmatizer_multiple_userdicts.lemma("You're", :abbr)
|
102
|
+
expect(result_ud6).to eq("you are")
|
103
|
+
# 'you're' -> 'you are'
|
104
|
+
result_ud7 = @lemmatizer_multiple_userdicts.lemma("you're", :abbr)
|
105
|
+
expect(result_ud7).to eq("you are")
|
106
|
+
# 'h2s' -> 'Hydrogen Sulphide'
|
107
|
+
result_ud8 = @lemmatizer_multiple_userdicts.lemma("h2s", :abbr)
|
108
|
+
expect(result_ud8).to eq("Hydrogen Sulphide")
|
109
|
+
# 'utexas' -> 'University of Texas'
|
110
|
+
result_ud9 = @lemmatizer_multiple_userdicts.lemma("utexas", :abbr)
|
111
|
+
expect(result_ud9).to eq("University of Texas")
|
112
|
+
# 'mit' -> 'Massachusetts Institute of Technology'
|
113
|
+
result_ud10 = @lemmatizer_multiple_userdicts.lemma("mit", :abbr)
|
114
|
+
expect(result_ud10).to eq("Massachusetts Institute of Technology")
|
101
115
|
end
|
102
116
|
end
|
103
117
|
end
|
data/spec/user.dict3.txt
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lemmatizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -55,6 +55,7 @@ files:
|
|
55
55
|
- spec/spec_helper.rb
|
56
56
|
- spec/user.dict1.txt
|
57
57
|
- spec/user.dict2.txt
|
58
|
+
- spec/user.dict3.txt
|
58
59
|
homepage: http://github.com/yohasebe/lemmatizer
|
59
60
|
licenses:
|
60
61
|
- MIT
|
@@ -83,3 +84,4 @@ test_files:
|
|
83
84
|
- spec/spec_helper.rb
|
84
85
|
- spec/user.dict1.txt
|
85
86
|
- spec/user.dict2.txt
|
87
|
+
- spec/user.dict3.txt
|