RubyGems - lemmatizer - Versions diffs - 0.2.0 → 0.2.1 - Mend

lemmatizer 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3962f3411b45a381c605ddd975f34bfad4055d8bc63bfdf385ed9341f395f5c4
-  data.tar.gz: 7dd90d196fda8c109f5847a7fafb0917f99c6f508dd0488cc33d60ced2c435e6
+  metadata.gz: 9ffb812c39d3601bb86da6d6e44c8375cc8078bfb88089a8357c14c92473209a
+  data.tar.gz: fe6a030e331e8138231dfaf48b136bf5dc7e4e8be40a8457664bdd8bf2b8f6b5
 SHA512:
-  metadata.gz: eef23c892d9d9544637196fa61dde985fe440308157fd05f0202511723f3c84585b294cea6d827b4652a1473ff89f2b1f7a4b242aed17ae97c1700ffdee302e6
-  data.tar.gz: 1e95bb9907884803e9413251df611bd999d698baf2a6c709e5eb8f4ccbe9b272894ce179931c3609fa50e37fcac08f444cc10e05602312362b72f99fb871c77f
+  metadata.gz: f2505986c5b0919c7691fbec86afc814df9567e23024f68e9c7f1daa0afd1485534fcd700aa131146d12f8d7dbfb95fbcbce798fc37e4408afd7976d077d750f
+  data.tar.gz: 3e9e7f6289d906b2e4f984ccf285e24024de933b7aa9065f99559f057392299a69696ab82706624781f780d925eb6f1565f629462ec9a1f98fadfc3bea8810ba

data/README.md CHANGED

@@ -41,25 +41,33 @@ p lem.lemma("MacBooks", :noun) # => "MacBooks"
 p lem.lemma("higher", :adj) # => "higher" not "high"!
 # The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
-# Modify dict/index.{noun|verb|adj|adv} if necessary.
+# To fix this, modify the original dict directly (lib/dict/index.{noun|verb|adj|adv})
+# or supply with custom dict files (recommended).
 ```
 Supplying with user dict
 -----------
 ```ruby
-# You can supply files with additional dict data consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
-# The data in user supplied files overrides the preset data
+# You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
+# The data in user supplied files overrides the preset data. Here's the sample.
------- sample.dict.txt -----
-adj   higher   high
-adj   highest  high
-noun  MacBooks MacBook
-----------------------------
+# --- sample.dict.txt (don't include hash symbol on the left) ---
+# adj   higher   high
+# adj   highest  high
+# noun  MacBooks MacBook
+# ---------------------------------------------------------------
 lem = Lemmatizer.new("sample.dict.txt")
-p lem.lemma("higher", :adj)    # => "high"
-p lem.lemma("highest", :adj)   # => "high"
-p lem.lemma("MacBooks", :noun  # => "MacBook"
+# => 3 lexical items added from dict file provided
+p lem.lemma("higher", :adj)     # => "high"
+p lem.lemma("highest", :adj)    # => "high"
+p lem.lemma("MacBooks", :noun)  # => "MacBook"
+# The argument to Lemmatizer.new can be either of the following:
+# 1) a path string to a dict file (e.g. "/path/to/dict.txt")
+# 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
 ```
 Author

data/lib/lemmatizer/lemmatizer.rb CHANGED

@@ -68,7 +68,11 @@ module Lemmatizer
         load_wordnet_files(pos, pair[0], pair[1])
       end
-      load_provided_dict(dict) if dict
+      if dict
+        [dict].flatten.each do |d|
+          load_provided_dict(d)
+        end
+      end
     end
     def lemma(form, pos = nil)
@@ -182,7 +186,7 @@ module Lemmatizer
           end
         end
       end
-      puts "#{num_lex_added} lexical items added from dict file provided"
+      puts "#{num_lex_added} items added from #{File.basename dict}"
     end
   end
 end

data/lib/lemmatizer/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Lemmatizer
-  VERSION = '0.2.0'
+  VERSION = '0.2.1'
 end

data/spec/lemmatizer_spec.rb CHANGED

@@ -5,8 +5,10 @@ describe 'Lemmatizer' do
   before(:all) do
     @lemmatizer = Lemmatizer.new
-    user_data = File.join(File.dirname(__FILE__), "user.dict.txt")
-    @lemmatizer_with_userdata = Lemmatizer.new(user_data)
+    user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
+    user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
+    @lemmatizer_single_userdict = Lemmatizer.new(user_data1)
+    @lemmatizer_multiple_userdicts = Lemmatizer.new([user_data1, user_data2])
   end
   describe '#lemma' do
@@ -75,19 +77,27 @@ describe 'Lemmatizer' do
       expect(result_n2).not_to eq('MacBook')
     end
-    it 'can load user dict that overrides presets' do
+    it 'can load a user dict that overrides presets' do
       # 'MacBooks' -> 'MacBook'
-      result_u1 = @lemmatizer_with_userdata.lemma('MacBooks', :noun)
+      result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
       expect(result_u1).to eq('MacBook')
+      result_u2 = @lemmatizer_single_userdict.lemma('crying', :verb)
+      expect(result_u2).to eq('cry')
+    end
+    it 'can load uder dicts that override presets' do
+      # 'MacBooks' -> 'MacBook'
+      result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
+      expect(result_ud1).to eq('MacBook')
       # 'higher' -> 'high'
-      result_u2 = @lemmatizer_with_userdata.lemma('higher', :adj)
-      expect(result_u2).to eq('high')
+      result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
+      expect(result_ud2).to eq('high')
       # 'highest' -> 'high'
-      result_u3 = @lemmatizer_with_userdata.lemma('higher')
-      expect(result_u3).to eq('high')
+      result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
+      expect(result_ud3).to eq('high')
       # check if (unoverridden) preset data is kept intact
-      result_u4 = @lemmatizer_with_userdata.lemma('crying', :verb)
-      expect(result_u4).to eq('cry')
+      result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
+      expect(result_ud4).to eq('cry')
     end
   end
 end

data/spec/user.dict1.txt ADDED

@@ -0,0 +1,3 @@
+noun MacBooks MacBook
+noun iPhones  iPhone
+noun iPads    iPad

data/spec/user.dict2.txt ADDED

@@ -0,0 +1,2 @@
+adj higher high
+adj highest high

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: lemmatizer
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-02-15 00:00:00.000000000 Z
+date: 2019-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -53,7 +53,8 @@ files:
 - lib/lemmatizer/version.rb
 - spec/lemmatizer_spec.rb
 - spec/spec_helper.rb
-- spec/user.dict.txt
+- spec/user.dict1.txt
+- spec/user.dict2.txt
 homepage: http://github.com/yohasebe/lemmatizer
 licenses:
 - MIT
@@ -80,4 +81,5 @@ summary: Englsh lemmatizer in Ruby
 test_files:
 - spec/lemmatizer_spec.rb
 - spec/spec_helper.rb
-- spec/user.dict.txt
+- spec/user.dict1.txt
+- spec/user.dict2.txt

data/spec/user.dict.txt DELETED

@@ -1,5 +0,0 @@
-n MacBooks MacBook
-n iPhones iPhone
-n iPads iPad
-adj higher high
-adj highest high