lemmatizer 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3962f3411b45a381c605ddd975f34bfad4055d8bc63bfdf385ed9341f395f5c4
4
- data.tar.gz: 7dd90d196fda8c109f5847a7fafb0917f99c6f508dd0488cc33d60ced2c435e6
3
+ metadata.gz: 9ffb812c39d3601bb86da6d6e44c8375cc8078bfb88089a8357c14c92473209a
4
+ data.tar.gz: fe6a030e331e8138231dfaf48b136bf5dc7e4e8be40a8457664bdd8bf2b8f6b5
5
5
  SHA512:
6
- metadata.gz: eef23c892d9d9544637196fa61dde985fe440308157fd05f0202511723f3c84585b294cea6d827b4652a1473ff89f2b1f7a4b242aed17ae97c1700ffdee302e6
7
- data.tar.gz: 1e95bb9907884803e9413251df611bd999d698baf2a6c709e5eb8f4ccbe9b272894ce179931c3609fa50e37fcac08f444cc10e05602312362b72f99fb871c77f
6
+ metadata.gz: f2505986c5b0919c7691fbec86afc814df9567e23024f68e9c7f1daa0afd1485534fcd700aa131146d12f8d7dbfb95fbcbce798fc37e4408afd7976d077d750f
7
+ data.tar.gz: 3e9e7f6289d906b2e4f984ccf285e24024de933b7aa9065f99559f057392299a69696ab82706624781f780d925eb6f1565f629462ec9a1f98fadfc3bea8810ba
data/README.md CHANGED
@@ -41,25 +41,33 @@ p lem.lemma("MacBooks", :noun) # => "MacBooks"
41
41
  p lem.lemma("higher", :adj) # => "higher" not "high"!
42
42
 
43
43
  # The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
44
- # Modify dict/index.{noun|verb|adj|adv} if necessary.
44
+ # To fix this, modify the original dict directly (lib/dict/index.{noun|verb|adj|adv})
45
+ # or supply custom dict files (recommended).
45
46
  ```
46
47
 
47
48
  Supplying with user dict
48
49
  -----------
49
50
  ```ruby
50
- # You can supply files with additional dict data consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
51
- # The data in user supplied files overrides the preset data
51
+ # You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
52
+ # The data in user-supplied files overrides the preset data. Here is a sample:
52
53
 
53
- ------ sample.dict.txt -----
54
- adj higher high
55
- adj highest high
56
- noun MacBooks MacBook
57
- ----------------------------
54
+ # --- sample.dict.txt (omit the leading hash symbols in the real file) ---
55
+ # adj higher high
56
+ # adj highest high
57
+ # noun MacBooks MacBook
58
+ # ---------------------------------------------------------------
58
59
 
59
60
  lem = Lemmatizer.new("sample.dict.txt")
60
- p lem.lemma("higher", :adj) # => "high"
61
- p lem.lemma("highest", :adj) # => "high"
62
- p lem.lemma("MacBooks", :noun # => "MacBook"
61
+ # => 3 lexical items added from dict file provided
62
+
63
+ p lem.lemma("higher", :adj) # => "high"
64
+ p lem.lemma("highest", :adj) # => "high"
65
+ p lem.lemma("MacBooks", :noun) # => "MacBook"
66
+
67
+ # The argument to Lemmatizer.new can be either of the following:
68
+ # 1) a path string to a dict file (e.g. "/path/to/dict.txt")
69
+ # 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
70
+
63
71
  ```
64
72
 
65
73
  Author
@@ -68,7 +68,11 @@ module Lemmatizer
68
68
  load_wordnet_files(pos, pair[0], pair[1])
69
69
  end
70
70
 
71
- load_provided_dict(dict) if dict
71
+ if dict
72
+ [dict].flatten.each do |d|
73
+ load_provided_dict(d)
74
+ end
75
+ end
72
76
  end
73
77
 
74
78
  def lemma(form, pos = nil)
@@ -182,7 +186,7 @@ module Lemmatizer
182
186
  end
183
187
  end
184
188
  end
185
- puts "#{num_lex_added} lexical items added from dict file provided"
189
+ puts "#{num_lex_added} items added from #{File.basename dict}"
186
190
  end
187
191
  end
188
192
  end
@@ -1,3 +1,3 @@
1
1
  module Lemmatizer
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end
@@ -5,8 +5,10 @@ describe 'Lemmatizer' do
5
5
 
6
6
  before(:all) do
7
7
  @lemmatizer = Lemmatizer.new
8
- user_data = File.join(File.dirname(__FILE__), "user.dict.txt")
9
- @lemmatizer_with_userdata = Lemmatizer.new(user_data)
8
+ user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
9
+ user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
10
+ @lemmatizer_single_userdict = Lemmatizer.new(user_data1)
11
+ @lemmatizer_multiple_userdicts = Lemmatizer.new([user_data1, user_data2])
10
12
  end
11
13
 
12
14
  describe '#lemma' do
@@ -75,19 +77,27 @@ describe 'Lemmatizer' do
75
77
  expect(result_n2).not_to eq('MacBook')
76
78
  end
77
79
 
78
- it 'can load user dict that overrides presets' do
80
+ it 'can load a user dict that overrides presets' do
79
81
  # 'MacBooks' -> 'MacBook'
80
- result_u1 = @lemmatizer_with_userdata.lemma('MacBooks', :noun)
82
+ result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
81
83
  expect(result_u1).to eq('MacBook')
84
+ result_u2 = @lemmatizer_single_userdict.lemma('crying', :verb)
85
+ expect(result_u2).to eq('cry')
86
+ end
87
+
88
+ it 'can load uder dicts that override presets' do
89
+ # 'MacBooks' -> 'MacBook'
90
+ result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
91
+ expect(result_ud1).to eq('MacBook')
82
92
  # 'higher' -> 'high'
83
- result_u2 = @lemmatizer_with_userdata.lemma('higher', :adj)
84
- expect(result_u2).to eq('high')
93
+ result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
94
+ expect(result_ud2).to eq('high')
85
95
  # 'higher' -> 'high' (no part of speech given)
86
- result_u3 = @lemmatizer_with_userdata.lemma('higher')
87
- expect(result_u3).to eq('high')
96
+ result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
97
+ expect(result_ud3).to eq('high')
88
98
  # check if (unoverridden) preset data is kept intact
89
- result_u4 = @lemmatizer_with_userdata.lemma('crying', :verb)
90
- expect(result_u4).to eq('cry')
99
+ result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
100
+ expect(result_ud4).to eq('cry')
91
101
  end
92
102
  end
93
103
  end
@@ -0,0 +1,3 @@
1
+ noun MacBooks MacBook
2
+ noun iPhones iPhone
3
+ noun iPads iPad
@@ -0,0 +1,2 @@
1
+ adj higher high
2
+ adj highest high
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-15 00:00:00.000000000 Z
11
+ date: 2019-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -53,7 +53,8 @@ files:
53
53
  - lib/lemmatizer/version.rb
54
54
  - spec/lemmatizer_spec.rb
55
55
  - spec/spec_helper.rb
56
- - spec/user.dict.txt
56
+ - spec/user.dict1.txt
57
+ - spec/user.dict2.txt
57
58
  homepage: http://github.com/yohasebe/lemmatizer
58
59
  licenses:
59
60
  - MIT
@@ -80,4 +81,5 @@ summary: Englsh lemmatizer in Ruby
80
81
  test_files:
81
82
  - spec/lemmatizer_spec.rb
82
83
  - spec/spec_helper.rb
83
- - spec/user.dict.txt
84
+ - spec/user.dict1.txt
85
+ - spec/user.dict2.txt
@@ -1,5 +0,0 @@
1
- n MacBooks MacBook
2
- n iPhones iPhone
3
- n iPads iPad
4
- adj higher high
5
- adj highest high