lemmatizer 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- YmRjOGYyZWE4ZWJjNWJjYWU3MzFkY2M5MjU1OTM4MzMxOGM1OGYwYw==
5
- data.tar.gz: !binary |-
6
- ZjRiZGQ1NjI1MzU2NTEyM2JmMzg0NGZiNDI2ZGRiMzExNmNlNDllNw==
2
+ SHA256:
3
+ metadata.gz: 3962f3411b45a381c605ddd975f34bfad4055d8bc63bfdf385ed9341f395f5c4
4
+ data.tar.gz: 7dd90d196fda8c109f5847a7fafb0917f99c6f508dd0488cc33d60ced2c435e6
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- NTM0YThiNDVhYWVlYjZkNDZlZGNmMTg2OTYxMDE0ZDYwNWM0NWE5MGE2YjA5
10
- ODZmYTM2YmE1MmM5MGJhODUzOWUxYmQzYTcwMzBhMmIxNmRiOTEwOGRmOWFk
11
- YmNmNzc1YWI5ZDMzMDk2NDBhNmExNTUyZDgwYTJhZjFlOTZkN2Y=
12
- data.tar.gz: !binary |-
13
- ZmFlNWQ2OWYzYTBmMjM1MmVlOThlMWNlMTIwNjAwYjgwMzYxNWM0YmUzMThj
14
- YjJiODA3NGNmOTk0MzQ4ZmY2YTc2ODM1YmJhMzgxOTQ1ZmEzNTY4ZDNkMDky
15
- YTdmYTk4NDY5MzAzZjk2M2ZhY2RmOTJjZDQwMmY3ODE5N2ViOTY=
6
+ metadata.gz: eef23c892d9d9544637196fa61dde985fe440308157fd05f0202511723f3c84585b294cea6d827b4652a1473ff89f2b1f7a4b242aed17ae97c1700ffdee302e6
7
+ data.tar.gz: 1e95bb9907884803e9413251df611bd999d698baf2a6c709e5eb8f4ccbe9b272894ce179931c3609fa50e37fcac08f444cc10e05602312362b72f99fb871c77f
data/README.md CHANGED
@@ -4,6 +4,8 @@ Lemmatizer for text in English. Inspired by Python's [nltk.corpus.reader.wordne
4
4
 
5
5
  Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
6
6
 
7
+ Version 0.2 adds the ability to supply additional dictionary data at runtime.
8
+
7
9
  Installation
8
10
  ------------
9
11
  sudo gem install lemmatizer
@@ -42,6 +44,24 @@ p lem.lemma("higher", :adj) # => "higher" not "high"!
42
44
  # Modify dict/index.{noun|verb|adj|adv} if necessary.
43
45
  ```
44
46
 
47
+ Supplying a user dictionary
48
+ -----------
49
+ ```ruby
50
+ # You can supply files with additional dict data consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
51
+ # The data in user supplied files overrides the preset data
52
+
53
+ ------ sample.dict.txt -----
54
+ adj higher high
55
+ adj highest high
56
+ noun MacBooks MacBook
57
+ ----------------------------
58
+
59
+ lem = Lemmatizer.new("sample.dict.txt")
60
+ p lem.lemma("higher", :adj) # => "high"
61
+ p lem.lemma("highest", :adj) # => "high"
62
+ p lem.lemma("MacBooks", :noun) # => "MacBook"
63
+ ```
64
+
45
65
  Author
46
66
  ------
47
67
  * Yoichiro Hasebe <yohasebe@gmail.com>
@@ -51,4 +71,4 @@ Thanks for assistance and contributions:
51
71
 
52
72
  License
53
73
  -------
54
- Licensed under the MIT license.
74
+ Licensed under the MIT license.
data/Rakefile CHANGED
@@ -3,7 +3,7 @@ require 'rspec/core'
3
3
  require 'rspec/core/rake_task'
4
4
 
5
5
  RSpec::Core::RakeTask.new(:spec) do |spec|
6
- spec.pattern = FileList['spec/**/*_spec.rb']
6
+ spec.pattern = FileList['spec/**/*_spec.rb']
7
7
  end
8
8
 
9
- task :default => :spec
9
+ task :default => :spec
@@ -1,5 +1,3 @@
1
- # -*- encoding: utf-8 -*-
2
-
3
1
  lib = File.expand_path('../lib', __FILE__)
4
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
3
 
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ target = ARGV[0]
5
+ base = File.basename(target)
6
+
7
+ case base
8
+ when /\.noun/
9
+ mode = :noun
10
+ when /\.verb/
11
+ mode = :verb
12
+ when /\.adj/
13
+ mode = :adj
14
+ when /\.adv/
15
+ mode = :adv
16
+ end
17
+
18
+ newtarget = base + "-mod"
19
+
20
+ infile = File.open(target)
21
+ lines = infile.readlines
22
+ infile.close
23
+
24
+ results = {}
25
+ lines.each do |line|
26
+ /^([^\s]+)/ =~ line
27
+ case mode
28
+ when :noun
29
+ lemma = $1.sub(/s\z/, "").sub(/e\z/, "")
30
+ when :verb
31
+ lemma = $1.sub(/s\z/, "").sub(/d\z/, "").sub(/ing\z/, "").sub(/e\z/, "")
32
+ when :adj
33
+ lemma = $1.sub(/r\z/, "").sub(/st\z/, "").sub(/e\z/, "").sub(/i\z/, "")
34
+ when :adv
35
+ lemma = $1.sub(/r\z/, "").sub(/st\z/, "").sub(/e\z/, "").sub(/i\z/, "")
36
+ end
37
+ if results[lemma]
38
+ next
39
+ else
40
+ results[lemma] = line
41
+ end
42
+ end
43
+
44
+ outfile = File.open(newtarget, "w")
45
+ outfile.write(results.values.join(""))
46
+ outfile.close
@@ -1,12 +1,10 @@
1
- # -*- coding: utf-8; mode: ruby -*-
2
-
3
1
  require 'stringio'
4
2
  require 'lemmatizer/version'
5
3
  require 'lemmatizer/core_ext'
6
4
  require 'lemmatizer/lemmatizer'
7
5
 
8
6
  module Lemmatizer
9
- def self.new
10
- Lemmatizer.new
7
+ def self.new(dict = nil)
8
+ Lemmatizer.new(dict)
11
9
  end
12
10
  end
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8; mode: ruby -*-
2
-
3
1
  module Lemmatizer
4
2
  class ::String
5
3
  def endwith(s)
@@ -1,74 +1,74 @@
1
- # -*- coding: utf-8; mode: ruby -*-
2
-
3
1
  module Lemmatizer
4
2
  class Lemmatizer
5
3
  DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
6
-
4
+
7
5
  WN_FILES = {
8
6
  :noun => [
9
- DATA_DIR + '/dict/index.noun',
7
+ DATA_DIR + '/dict/index.noun',
10
8
  DATA_DIR + '/dict/noun.exc'
11
9
  ],
12
10
  :verb => [
13
- DATA_DIR + '/dict/index.verb',
11
+ DATA_DIR + '/dict/index.verb',
14
12
  DATA_DIR + '/dict/verb.exc'
15
13
  ],
16
14
  :adj => [
17
- DATA_DIR + '/dict/index.adj',
15
+ DATA_DIR + '/dict/index.adj',
18
16
  DATA_DIR + '/dict/adj.exc'
19
17
  ],
20
18
  :adv => [
21
- DATA_DIR + '/dict/index.adv',
19
+ DATA_DIR + '/dict/index.adv',
22
20
  DATA_DIR + '/dict/adv.exc'
23
21
  ]
24
22
  }
25
23
 
26
24
  MORPHOLOGICAL_SUBSTITUTIONS = {
27
25
  :noun => [
28
- ['s', '' ],
29
- ['ses', 's' ],
30
- ['ves', 'f' ],
26
+ ['s', '' ],
27
+ ['ses', 's' ],
28
+ ['ves', 'f' ],
31
29
  ['xes', 'x' ],
32
- ['zes', 'z' ],
33
- ['ches', 'ch' ],
30
+ ['zes', 'z' ],
31
+ ['ches', 'ch' ],
34
32
  ['shes', 'sh' ],
35
- ['men', 'man'],
33
+ ['men', 'man'],
36
34
  ['ies', 'y' ]
37
35
  ],
38
36
  :verb => [
39
- ['s', '' ],
40
- ['ies', 'y'],
41
- ['es', 'e'],
37
+ ['s', '' ],
38
+ ['ies', 'y'],
39
+ ['es', 'e'],
42
40
  ['es', '' ],
43
- ['ed', 'e'],
44
- ['ed', '' ],
45
- ['ing', 'e'],
41
+ ['ed', 'e'],
42
+ ['ed', '' ],
43
+ ['ing', 'e'],
46
44
  ['ing', '' ]
47
45
  ],
48
46
  :adj => [
49
- ['er', '' ],
50
- ['est', '' ],
51
- ['er', 'e'],
47
+ ['er', '' ],
48
+ ['est', '' ],
49
+ ['er', 'e'],
52
50
  ['est', 'e']
53
51
  ],
54
52
  :adv => [
53
+ ],
54
+ :unknown => [
55
55
  ]
56
56
  }
57
57
 
58
- def initialize(files = WN_FILES)
58
+ def initialize(dict = nil)
59
59
  @wordlists = {}
60
60
  @exceptions = {}
61
-
61
+
62
62
  MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
63
63
  @wordlists[x] = {}
64
64
  @exceptions[x] = {}
65
65
  end
66
-
67
- if files
68
- files.each_pair do |pos, pair|
69
- load_wordnet_files(pos, pair[0], pair[1])
70
- end
66
+
67
+ WN_FILES.each_pair do |pos, pair|
68
+ load_wordnet_files(pos, pair[0], pair[1])
71
69
  end
70
+
71
+ load_provided_dict(dict) if dict
72
72
  end
73
73
 
74
74
  def lemma(form, pos = nil)
@@ -79,15 +79,20 @@ module Lemmatizer
79
79
  end
80
80
 
81
81
  return form
82
- end
82
+ end
83
83
 
84
84
  each_lemma(form, pos) do |x|
85
85
  return x
86
86
  end
87
-
87
+
88
88
  form
89
89
  end
90
-
90
+
91
+ # Return the short to_s form rather than the default inspect output
92
+ def inspect
93
+ "#{self}"
94
+ end
95
+
91
96
  private
92
97
 
93
98
  def open_file(*args)
@@ -142,11 +147,42 @@ module Lemmatizer
142
147
  yield x + 'ful'
143
148
  end
144
149
  else
145
-
150
+
146
151
  each_substitutions(form, pos) do|x|
147
152
  yield x
148
153
  end
149
154
  end
150
155
  end
156
+
157
+ def str_to_pos(str)
158
+ case str
159
+ when "n", "noun"
160
+ return :noun
161
+ when "v", "verb"
162
+ return :verb
163
+ when "a", "j", "adjective", "adj"
164
+ return :adj
165
+ when "r", "adverb", "adv"
166
+ return :adv
167
+ else
168
+ return :unknown
169
+ end
170
+ end
171
+
172
+ def load_provided_dict(dict)
173
+ num_lex_added = 0
174
+ open_file(dict) do |io|
175
+ io.each_line do |line|
176
+ # pos must be one of n|noun, v|verb, a|j|adj|adjective, or r|adv|adverb
177
+ p, w, s = line.split(/\s+/)
178
+ pos = str_to_pos(p)
179
+ if @wordlists[pos]
180
+ @wordlists[pos][w] = s
181
+ num_lex_added += 1
182
+ end
183
+ end
184
+ end
185
+ puts "#{num_lex_added} lexical items added from dict file provided"
186
+ end
151
187
  end
152
188
  end
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8; mode: ruby -*-
2
-
3
1
  module Lemmatizer
4
- VERSION = '0.1.1'
2
+ VERSION = '0.2.0'
5
3
  end
@@ -1,65 +1,93 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  require 'spec_helper'
4
2
  require 'lemmatizer'
5
3
 
6
- describe "Lemmatizer" do
7
- before do
8
- @lemmatizer = Lemmatizer.new
9
- end
10
-
11
- describe "#lemma" do
12
- it "takes a word form and its part-of-speech symbol (:noun, :verb, :adj, :adv) and then returns its lemma form" do
13
- result_n1 = @lemmatizer.lemma("analyses", :noun)
14
- result_n1.should == "analysis"
15
-
16
- # Lemmatizer leaves alone words that its dictionary does not contain to keep proper names such as "James" intact.
17
- result_n2 = @lemmatizer.lemma("MacBooks", :noun)
18
- result_n2.should_not == "MacBook"
19
-
20
- result_n3 = @lemmatizer.lemma("desks", :noun)
21
- result_n3.should == "desk"
22
-
23
- result_v1 = @lemmatizer.lemma("hired", :verb)
24
- result_v1.should == "hire"
25
-
26
- result_v2 = @lemmatizer.lemma("worried", :verb)
27
- result_v2.should == "worry"
28
-
29
- result_v3 = @lemmatizer.lemma("partying", :verb)
30
- result_v3.should == "party"
31
-
32
- result_a1 = @lemmatizer.lemma("better", :adj)
33
- result_a1.should == "good"
34
-
35
- result_a2 = @lemmatizer.lemma("hotter", :adj)
36
- result_a2.should == "hot"
37
-
38
- result_r1 = @lemmatizer.lemma("best", :adv)
39
- result_r1.should == "well"
40
-
41
- result_r2 = @lemmatizer.lemma("best", :adv)
42
- result_r2.should_not == "good"
43
-
44
- # Lemmatizer give a result even when no pos is given, by assuming it to be :verb, :noun, :adv, or :adj.
45
- result_1 = @lemmatizer.lemma("plays")
46
- result_1.should == "play"
47
-
48
- result_2 = @lemmatizer.lemma("oxen")
49
- result_2.should == "ox"
50
-
51
- result_3 = @lemmatizer.lemma("higher")
52
- result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
53
-
54
- result_2 = @lemmatizer.lemma("asdfassda") # non-existing word
55
- result_2.should == "asdfassda"
56
-
57
- # test cases for words used in README
58
- result_t1 = @lemmatizer.lemma("fired")
59
- result_t1.should == "fire"
60
-
61
- result_t2 = @lemmatizer.lemma("slower")
62
- result_t2.should == "slow"
63
- end
64
- end
4
+ describe 'Lemmatizer' do
5
+
6
+ before(:all) do
7
+ @lemmatizer = Lemmatizer.new
8
+ user_data = File.join(File.dirname(__FILE__), "user.dict.txt")
9
+ @lemmatizer_with_userdata = Lemmatizer.new(user_data)
10
+ end
11
+
12
+ describe '#lemma' do
13
+ it 'takes a noun and returns its lemma' do
14
+ result_n1 = @lemmatizer.lemma('analyses', :noun)
15
+ expect(result_n1).to eq('analysis')
16
+
17
+ result_n3 = @lemmatizer.lemma('desks', :noun)
18
+ expect(result_n3).to eq('desk')
19
+ end
20
+
21
+ it 'takes a verb and returns its lemma' do
22
+ result_v1 = @lemmatizer.lemma('hired', :verb)
23
+ expect(result_v1).to eq('hire')
24
+
25
+ result_v2 = @lemmatizer.lemma('worried', :verb)
26
+ expect(result_v2).to eq('worry')
27
+
28
+ result_v3 = @lemmatizer.lemma('partying', :verb)
29
+ expect(result_v3).to eq('party')
30
+ end
31
+
32
+ it 'takes an adjective and returns its lemma' do
33
+ result_a1 = @lemmatizer.lemma('better', :adj)
34
+ expect(result_a1).to eq('good')
35
+
36
+ result_a2 = @lemmatizer.lemma('hotter', :adj)
37
+ expect(result_a2).to eq('hot')
38
+ end
39
+
40
+ it 'takes an adverb and returns its lemma' do
41
+ result_r1 = @lemmatizer.lemma('best', :adv)
42
+ expect(result_r1).to eq('well')
43
+
44
+ result_r2 = @lemmatizer.lemma('best', :adv)
45
+ expect(result_r2).not_to eq('good')
46
+ end
47
+
48
+ it 'gives a result when no pos is given' do
49
+ # Order: :verb, :noun, :adv, or :adj
50
+ result_1 = @lemmatizer.lemma('plays')
51
+ expect(result_1).to eq('play')
52
+
53
+ result_2 = @lemmatizer.lemma('oxen')
54
+ expect(result_2).to eq('ox')
55
+
56
+ # 'higher' is itself contained in the adj list.
57
+ result_3 = @lemmatizer.lemma('higher')
58
+ expect(result_3).not_to eq('high')
59
+
60
+ # Non-existing word
61
+ result_2 = @lemmatizer.lemma('asdfassda')
62
+ expect(result_2).to eq('asdfassda')
63
+
64
+ # Test cases for words used in README
65
+ result_t1 = @lemmatizer.lemma('fired')
66
+ expect(result_t1).to eq('fire')
67
+
68
+ result_t2 = @lemmatizer.lemma('slower')
69
+ expect(result_t2).to eq('slow')
70
+ end
71
+
72
+ it 'leaves alone words that dictionary does not contain' do
73
+ # Such as 'James' or 'MacBooks'
74
+ result_n2 = @lemmatizer.lemma('MacBooks', :noun)
75
+ expect(result_n2).not_to eq('MacBook')
76
+ end
77
+
78
+ it 'can load user dict that overrides presets' do
79
+ # 'MacBooks' -> 'MacBook'
80
+ result_u1 = @lemmatizer_with_userdata.lemma('MacBooks', :noun)
81
+ expect(result_u1).to eq('MacBook')
82
+ # 'higher' -> 'high'
83
+ result_u2 = @lemmatizer_with_userdata.lemma('higher', :adj)
84
+ expect(result_u2).to eq('high')
85
+ # 'higher' -> 'high' (no pos given)
86
+ result_u3 = @lemmatizer_with_userdata.lemma('higher')
87
+ expect(result_u3).to eq('high')
88
+ # check if (unoverridden) preset data is kept intact
89
+ result_u4 = @lemmatizer_with_userdata.lemma('crying', :verb)
90
+ expect(result_u4).to eq('cry')
91
+ end
92
+ end
65
93
  end
@@ -0,0 +1,5 @@
1
+ n MacBooks MacBook
2
+ n iPhones iPhone
3
+ n iPads iPad
4
+ adj higher high
5
+ adj highest high
metadata CHANGED
@@ -1,30 +1,30 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-03 00:00:00.000000000 Z
11
+ date: 2019-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- description: ! "\n Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
27
+ description: "\n Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
28
28
  package.\n "
29
29
  email:
30
30
  - yohasebe@gmail.com
@@ -32,7 +32,7 @@ executables: []
32
32
  extensions: []
33
33
  extra_rdoc_files: []
34
34
  files:
35
- - .gitignore
35
+ - ".gitignore"
36
36
  - Gemfile
37
37
  - LICENSE.txt
38
38
  - README.md
@@ -40,6 +40,7 @@ files:
40
40
  - lemmatizer.gemspec
41
41
  - lib/dict/adj.exc
42
42
  - lib/dict/adv.exc
43
+ - lib/dict/cleanup.rb
43
44
  - lib/dict/index.adj
44
45
  - lib/dict/index.adv
45
46
  - lib/dict/index.noun
@@ -52,6 +53,7 @@ files:
52
53
  - lib/lemmatizer/version.rb
53
54
  - spec/lemmatizer_spec.rb
54
55
  - spec/spec_helper.rb
56
+ - spec/user.dict.txt
55
57
  homepage: http://github.com/yohasebe/lemmatizer
56
58
  licenses:
57
59
  - MIT
@@ -62,20 +64,20 @@ require_paths:
62
64
  - lib
63
65
  required_ruby_version: !ruby/object:Gem::Requirement
64
66
  requirements:
65
- - - ! '>='
67
+ - - ">="
66
68
  - !ruby/object:Gem::Version
67
69
  version: '0'
68
70
  required_rubygems_version: !ruby/object:Gem::Requirement
69
71
  requirements:
70
- - - ! '>='
72
+ - - ">="
71
73
  - !ruby/object:Gem::Version
72
74
  version: '0'
73
75
  requirements: []
74
- rubyforge_project:
75
- rubygems_version: 2.1.9
76
+ rubygems_version: 3.0.1
76
77
  signing_key:
77
78
  specification_version: 4
78
79
  summary: English lemmatizer in Ruby
79
80
  test_files:
80
81
  - spec/lemmatizer_spec.rb
81
82
  - spec/spec_helper.rb
83
+ - spec/user.dict.txt