lemmatizer 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3962f3411b45a381c605ddd975f34bfad4055d8bc63bfdf385ed9341f395f5c4
4
- data.tar.gz: 7dd90d196fda8c109f5847a7fafb0917f99c6f508dd0488cc33d60ced2c435e6
3
+ metadata.gz: 9ffb812c39d3601bb86da6d6e44c8375cc8078bfb88089a8357c14c92473209a
4
+ data.tar.gz: fe6a030e331e8138231dfaf48b136bf5dc7e4e8be40a8457664bdd8bf2b8f6b5
5
5
  SHA512:
6
- metadata.gz: eef23c892d9d9544637196fa61dde985fe440308157fd05f0202511723f3c84585b294cea6d827b4652a1473ff89f2b1f7a4b242aed17ae97c1700ffdee302e6
7
- data.tar.gz: 1e95bb9907884803e9413251df611bd999d698baf2a6c709e5eb8f4ccbe9b272894ce179931c3609fa50e37fcac08f444cc10e05602312362b72f99fb871c77f
6
+ metadata.gz: f2505986c5b0919c7691fbec86afc814df9567e23024f68e9c7f1daa0afd1485534fcd700aa131146d12f8d7dbfb95fbcbce798fc37e4408afd7976d077d750f
7
+ data.tar.gz: 3e9e7f6289d906b2e4f984ccf285e24024de933b7aa9065f99559f057392299a69696ab82706624781f780d925eb6f1565f629462ec9a1f98fadfc3bea8810ba
data/README.md CHANGED
@@ -41,25 +41,33 @@ p lem.lemma("MacBooks", :noun) # => "MacBooks"
41
41
  p lem.lemma("higher", :adj) # => "higher" not "high"!
42
42
 
43
43
  # The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
44
- # Modify dict/index.{noun|verb|adj|adv} if necessary.
44
+ # To fix this, modify the original dict directly (lib/dict/index.{noun|verb|adj|adv})
45
+ # or supply custom dict files (recommended).
45
46
  ```
46
47
 
47
48
  Supplying a user dict
48
49
  -----------
49
50
  ```ruby
50
- # You can supply files with additional dict data consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
51
- # The data in user supplied files overrides the preset data
51
+ # You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
52
+ # The data in user-supplied files overrides the preset data. Here is a sample.
52
53
 
53
- ------ sample.dict.txt -----
54
- adj higher high
55
- adj highest high
56
- noun MacBooks MacBook
57
- ----------------------------
54
+ # --- sample.dict.txt (don't include hash symbol on the left) ---
55
+ # adj higher high
56
+ # adj highest high
57
+ # noun MacBooks MacBook
58
+ # ---------------------------------------------------------------
58
59
 
59
60
  lem = Lemmatizer.new("sample.dict.txt")
60
- p lem.lemma("higher", :adj) # => "high"
61
- p lem.lemma("highest", :adj) # => "high"
62
- p lem.lemma("MacBooks", :noun # => "MacBook"
61
+ # => 3 lexical items added from dict file provided
62
+
63
+ p lem.lemma("higher", :adj) # => "high"
64
+ p lem.lemma("highest", :adj) # => "high"
65
+ p lem.lemma("MacBooks", :noun) # => "MacBook"
66
+
67
+ # The argument to Lemmatizer.new can be either of the following:
68
+ # 1) a path string to a dict file (e.g. "/path/to/dict.txt")
69
+ # 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
70
+
63
71
  ```
64
72
 
65
73
  Author
@@ -68,7 +68,11 @@ module Lemmatizer
68
68
  load_wordnet_files(pos, pair[0], pair[1])
69
69
  end
70
70
 
71
- load_provided_dict(dict) if dict
71
+ if dict
72
+ [dict].flatten.each do |d|
73
+ load_provided_dict(d)
74
+ end
75
+ end
72
76
  end
73
77
 
74
78
  def lemma(form, pos = nil)
@@ -182,7 +186,7 @@ module Lemmatizer
182
186
  end
183
187
  end
184
188
  end
185
- puts "#{num_lex_added} lexical items added from dict file provided"
189
+ puts "#{num_lex_added} items added from #{File.basename dict}"
186
190
  end
187
191
  end
188
192
  end
@@ -1,3 +1,3 @@
1
1
  module Lemmatizer
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end
@@ -5,8 +5,10 @@ describe 'Lemmatizer' do
5
5
 
6
6
  before(:all) do
7
7
  @lemmatizer = Lemmatizer.new
8
- user_data = File.join(File.dirname(__FILE__), "user.dict.txt")
9
- @lemmatizer_with_userdata = Lemmatizer.new(user_data)
8
+ user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
9
+ user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
10
+ @lemmatizer_single_userdict = Lemmatizer.new(user_data1)
11
+ @lemmatizer_multiple_userdicts = Lemmatizer.new([user_data1, user_data2])
10
12
  end
11
13
 
12
14
  describe '#lemma' do
@@ -75,19 +77,27 @@ describe 'Lemmatizer' do
75
77
  expect(result_n2).not_to eq('MacBook')
76
78
  end
77
79
 
78
- it 'can load user dict that overrides presets' do
80
+ it 'can load a user dict that overrides presets' do
79
81
  # 'MacBooks' -> 'MacBook'
80
- result_u1 = @lemmatizer_with_userdata.lemma('MacBooks', :noun)
82
+ result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
81
83
  expect(result_u1).to eq('MacBook')
84
+ result_u2 = @lemmatizer_single_userdict.lemma('crying', :verb)
85
+ expect(result_u2).to eq('cry')
86
+ end
87
+
88
+ it 'can load user dicts that override presets' do
89
+ # 'MacBooks' -> 'MacBook'
90
+ result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
91
+ expect(result_ud1).to eq('MacBook')
82
92
  # 'higher' -> 'high'
83
- result_u2 = @lemmatizer_with_userdata.lemma('higher', :adj)
84
- expect(result_u2).to eq('high')
93
+ result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
94
+ expect(result_ud2).to eq('high')
85
95
  # 'highest' -> 'high'
86
- result_u3 = @lemmatizer_with_userdata.lemma('higher')
87
- expect(result_u3).to eq('high')
96
+ result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
97
+ expect(result_ud3).to eq('high')
88
98
  # check if (unoverridden) preset data is kept intact
89
- result_u4 = @lemmatizer_with_userdata.lemma('crying', :verb)
90
- expect(result_u4).to eq('cry')
99
+ result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
100
+ expect(result_ud4).to eq('cry')
91
101
  end
92
102
  end
93
103
  end
@@ -0,0 +1,3 @@
1
+ noun MacBooks MacBook
2
+ noun iPhones iPhone
3
+ noun iPads iPad
@@ -0,0 +1,2 @@
1
+ adj higher high
2
+ adj highest high
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-15 00:00:00.000000000 Z
11
+ date: 2019-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -53,7 +53,8 @@ files:
53
53
  - lib/lemmatizer/version.rb
54
54
  - spec/lemmatizer_spec.rb
55
55
  - spec/spec_helper.rb
56
- - spec/user.dict.txt
56
+ - spec/user.dict1.txt
57
+ - spec/user.dict2.txt
57
58
  homepage: http://github.com/yohasebe/lemmatizer
58
59
  licenses:
59
60
  - MIT
@@ -80,4 +81,5 @@ summary: English lemmatizer in Ruby
80
81
  test_files:
81
82
  - spec/lemmatizer_spec.rb
82
83
  - spec/spec_helper.rb
83
- - spec/user.dict.txt
84
+ - spec/user.dict1.txt
85
+ - spec/user.dict2.txt
@@ -1,5 +0,0 @@
1
- n MacBooks MacBook
2
- n iPhones iPhone
3
- n iPads iPad
4
- adj higher high
5
- adj highest high