lemmatizer 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- YmRjOGYyZWE4ZWJjNWJjYWU3MzFkY2M5MjU1OTM4MzMxOGM1OGYwYw==
5
- data.tar.gz: !binary |-
6
- ZjRiZGQ1NjI1MzU2NTEyM2JmMzg0NGZiNDI2ZGRiMzExNmNlNDllNw==
2
+ SHA256:
3
+ metadata.gz: 3962f3411b45a381c605ddd975f34bfad4055d8bc63bfdf385ed9341f395f5c4
4
+ data.tar.gz: 7dd90d196fda8c109f5847a7fafb0917f99c6f508dd0488cc33d60ced2c435e6
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- NTM0YThiNDVhYWVlYjZkNDZlZGNmMTg2OTYxMDE0ZDYwNWM0NWE5MGE2YjA5
10
- ODZmYTM2YmE1MmM5MGJhODUzOWUxYmQzYTcwMzBhMmIxNmRiOTEwOGRmOWFk
11
- YmNmNzc1YWI5ZDMzMDk2NDBhNmExNTUyZDgwYTJhZjFlOTZkN2Y=
12
- data.tar.gz: !binary |-
13
- ZmFlNWQ2OWYzYTBmMjM1MmVlOThlMWNlMTIwNjAwYjgwMzYxNWM0YmUzMThj
14
- YjJiODA3NGNmOTk0MzQ4ZmY2YTc2ODM1YmJhMzgxOTQ1ZmEzNTY4ZDNkMDky
15
- YTdmYTk4NDY5MzAzZjk2M2ZhY2RmOTJjZDQwMmY3ODE5N2ViOTY=
6
+ metadata.gz: eef23c892d9d9544637196fa61dde985fe440308157fd05f0202511723f3c84585b294cea6d827b4652a1473ff89f2b1f7a4b242aed17ae97c1700ffdee302e6
7
+ data.tar.gz: 1e95bb9907884803e9413251df611bd999d698baf2a6c709e5eb8f4ccbe9b272894ce179931c3609fa50e37fcac08f444cc10e05602312362b72f99fb871c77f
data/README.md CHANGED
@@ -4,6 +4,8 @@ Lemmatizer for text in English. Inspired by Python's [nltk.corpus.reader.wordne
4
4
 
5
5
  Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
6
6
 
7
+ Version 0.2 adds the ability to supply user-defined dictionary data at runtime.
8
+
7
9
  Installation
8
10
  ------------
9
11
  sudo gem install lemmatizer
@@ -42,6 +44,24 @@ p lem.lemma("higher", :adj) # => "higher" not "high"!
42
44
  # Modify dict/index.{noun|verb|adj|adv} if necessary.
43
45
  ```
44
46
 
47
+ Supplying a user dictionary
48
+ -----------
49
+ ```ruby
50
+ # You can supply files with additional dict data consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
51
+ # The data in user supplied files overrides the preset data
52
+
53
+ ------ sample.dict.txt -----
54
+ adj higher high
55
+ adj highest high
56
+ noun MacBooks MacBook
57
+ ----------------------------
58
+
59
+ lem = Lemmatizer.new("sample.dict.txt")
60
+ p lem.lemma("higher", :adj) # => "high"
61
+ p lem.lemma("highest", :adj) # => "high"
62
+ p lem.lemma("MacBooks", :noun) # => "MacBook"
63
+ ```
64
+
45
65
  Author
46
66
  ------
47
67
  * Yoichiro Hasebe <yohasebe@gmail.com>
@@ -51,4 +71,4 @@ Thanks for assistance and contributions:
51
71
 
52
72
  License
53
73
  -------
54
- Licensed under the MIT license.
74
+ Licensed under the MIT license.
data/Rakefile CHANGED
@@ -3,7 +3,7 @@ require 'rspec/core'
3
3
  require 'rspec/core/rake_task'
4
4
 
5
5
  RSpec::Core::RakeTask.new(:spec) do |spec|
6
- spec.pattern = FileList['spec/**/*_spec.rb']
6
+ spec.pattern = FileList['spec/**/*_spec.rb']
7
7
  end
8
8
 
9
- task :default => :spec
9
+ task :default => :spec
@@ -1,5 +1,3 @@
1
- # -*- encoding: utf-8 -*-
2
-
3
1
  lib = File.expand_path('../lib', __FILE__)
4
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
3
 
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ target = ARGV[0]
5
+ base = File.basename(target)
6
+
7
+ case base
8
+ when /\.noun/
9
+ mode = :noun
10
+ when /\.verb/
11
+ mode = :verb
12
+ when /\.adj/
13
+ mode = :adj
14
+ when /\.adv/
15
+ mode = :adv
16
+ end
17
+
18
+ newtarget = base + "-mod"
19
+
20
+ infile = File.open(target)
21
+ lines = infile.readlines
22
+ infile.close
23
+
24
+ results = {}
25
+ lines.each do |line|
26
+ /^([^\s]+)/ =~ line
27
+ case mode
28
+ when :noun
29
+ lemma = $1.sub(/s\z/, "").sub(/e\z/, "")
30
+ when :verb
31
+ lemma = $1.sub(/s\z/, "").sub(/d\z/, "").sub(/ing\z/, "").sub(/e\z/, "")
32
+ when :adj
33
+ lemma = $1.sub(/r\z/, "").sub(/st\z/, "").sub(/e\z/, "").sub(/i\z/, "")
34
+ when :adv
35
+ lemma = $1.sub(/r\z/, "").sub(/st\z/, "").sub(/e\z/, "").sub(/i\z/, "")
36
+ end
37
+ if results[lemma]
38
+ next
39
+ else
40
+ results[lemma] = line
41
+ end
42
+ end
43
+
44
+ outfile = File.open(newtarget, "w")
45
+ outfile.write(results.values.join(""))
46
+ outfile.close
@@ -1,12 +1,10 @@
1
- # -*- coding: utf-8; mode: ruby -*-
2
-
3
1
  require 'stringio'
4
2
  require 'lemmatizer/version'
5
3
  require 'lemmatizer/core_ext'
6
4
  require 'lemmatizer/lemmatizer'
7
5
 
8
6
  module Lemmatizer
9
- def self.new
10
- Lemmatizer.new
7
+ def self.new(dict = nil)
8
+ Lemmatizer.new(dict)
11
9
  end
12
10
  end
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8; mode: ruby -*-
2
-
3
1
  module Lematizer
4
2
  class ::String
5
3
  def endwith(s)
@@ -1,74 +1,74 @@
1
- # -*- coding: utf-8; mode: ruby -*-
2
-
3
1
  module Lemmatizer
4
2
  class Lemmatizer
5
3
  DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
6
-
4
+
7
5
  WN_FILES = {
8
6
  :noun => [
9
- DATA_DIR + '/dict/index.noun',
7
+ DATA_DIR + '/dict/index.noun',
10
8
  DATA_DIR + '/dict/noun.exc'
11
9
  ],
12
10
  :verb => [
13
- DATA_DIR + '/dict/index.verb',
11
+ DATA_DIR + '/dict/index.verb',
14
12
  DATA_DIR + '/dict/verb.exc'
15
13
  ],
16
14
  :adj => [
17
- DATA_DIR + '/dict/index.adj',
15
+ DATA_DIR + '/dict/index.adj',
18
16
  DATA_DIR + '/dict/adj.exc'
19
17
  ],
20
18
  :adv => [
21
- DATA_DIR + '/dict/index.adv',
19
+ DATA_DIR + '/dict/index.adv',
22
20
  DATA_DIR + '/dict/adv.exc'
23
21
  ]
24
22
  }
25
23
 
26
24
  MORPHOLOGICAL_SUBSTITUTIONS = {
27
25
  :noun => [
28
- ['s', '' ],
29
- ['ses', 's' ],
30
- ['ves', 'f' ],
26
+ ['s', '' ],
27
+ ['ses', 's' ],
28
+ ['ves', 'f' ],
31
29
  ['xes', 'x' ],
32
- ['zes', 'z' ],
33
- ['ches', 'ch' ],
30
+ ['zes', 'z' ],
31
+ ['ches', 'ch' ],
34
32
  ['shes', 'sh' ],
35
- ['men', 'man'],
33
+ ['men', 'man'],
36
34
  ['ies', 'y' ]
37
35
  ],
38
36
  :verb => [
39
- ['s', '' ],
40
- ['ies', 'y'],
41
- ['es', 'e'],
37
+ ['s', '' ],
38
+ ['ies', 'y'],
39
+ ['es', 'e'],
42
40
  ['es', '' ],
43
- ['ed', 'e'],
44
- ['ed', '' ],
45
- ['ing', 'e'],
41
+ ['ed', 'e'],
42
+ ['ed', '' ],
43
+ ['ing', 'e'],
46
44
  ['ing', '' ]
47
45
  ],
48
46
  :adj => [
49
- ['er', '' ],
50
- ['est', '' ],
51
- ['er', 'e'],
47
+ ['er', '' ],
48
+ ['est', '' ],
49
+ ['er', 'e'],
52
50
  ['est', 'e']
53
51
  ],
54
52
  :adv => [
53
+ ],
54
+ :unknown => [
55
55
  ]
56
56
  }
57
57
 
58
- def initialize(files = WN_FILES)
58
+ def initialize(dict = nil)
59
59
  @wordlists = {}
60
60
  @exceptions = {}
61
-
61
+
62
62
  MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
63
63
  @wordlists[x] = {}
64
64
  @exceptions[x] = {}
65
65
  end
66
-
67
- if files
68
- files.each_pair do |pos, pair|
69
- load_wordnet_files(pos, pair[0], pair[1])
70
- end
66
+
67
+ WN_FILES.each_pair do |pos, pair|
68
+ load_wordnet_files(pos, pair[0], pair[1])
71
69
  end
70
+
71
+ load_provided_dict(dict) if dict
72
72
  end
73
73
 
74
74
  def lemma(form, pos = nil)
@@ -79,15 +79,20 @@ module Lemmatizer
79
79
  end
80
80
 
81
81
  return form
82
- end
82
+ end
83
83
 
84
84
  each_lemma(form, pos) do |x|
85
85
  return x
86
86
  end
87
-
87
+
88
88
  form
89
89
  end
90
-
90
+
91
+ # Print object only on init
92
+ def inspect
93
+ "#{self}"
94
+ end
95
+
91
96
  private
92
97
 
93
98
  def open_file(*args)
@@ -142,11 +147,42 @@ module Lemmatizer
142
147
  yield x + 'ful'
143
148
  end
144
149
  else
145
-
150
+
146
151
  each_substitutions(form, pos) do|x|
147
152
  yield x
148
153
  end
149
154
  end
150
155
  end
156
+
157
+ def str_to_pos(str)
158
+ case str
159
+ when "n", "noun"
160
+ return :noun
161
+ when "v", "verb"
162
+ return :noun
163
+ when "a", "j", "adjective", "adj"
164
+ return :adj
165
+ when "r", "adverb", "adv"
166
+ return :adv
167
+ else
168
+ return :unknown
169
+ end
170
+ end
171
+
172
+ def load_provided_dict(dict)
173
+ num_lex_added = 0
174
+ open_file(dict) do |io|
175
+ io.each_line do |line|
176
+ # pos must be one of n|v|a|j|r or noun|verb|adj|adjective|adv|adverb
177
+ p, w, s = line.split(/\s+/)
178
+ pos = str_to_pos(p)
179
+ if @wordlists[pos]
180
+ @wordlists[pos][w] = s
181
+ num_lex_added += 1
182
+ end
183
+ end
184
+ end
185
+ puts "#{num_lex_added} lexical items added from dict file provided"
186
+ end
151
187
  end
152
188
  end
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8; mode: ruby -*-
2
-
3
1
  module Lemmatizer
4
- VERSION = '0.1.1'
2
+ VERSION = '0.2.0'
5
3
  end
@@ -1,65 +1,93 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  require 'spec_helper'
4
2
  require 'lemmatizer'
5
3
 
6
- describe "Lemmatizer" do
7
- before do
8
- @lemmatizer = Lemmatizer.new
9
- end
10
-
11
- describe "#lemma" do
12
- it "takes a word form and its part-of-speech symbol (:noun, :verb, :adj, :adv) and then returns its lemma form" do
13
- result_n1 = @lemmatizer.lemma("analyses", :noun)
14
- result_n1.should == "analysis"
15
-
16
- # Lemmatizer leaves alone words that its dictionary does not contain to keep proper names such as "James" intact.
17
- result_n2 = @lemmatizer.lemma("MacBooks", :noun)
18
- result_n2.should_not == "MacBook"
19
-
20
- result_n3 = @lemmatizer.lemma("desks", :noun)
21
- result_n3.should == "desk"
22
-
23
- result_v1 = @lemmatizer.lemma("hired", :verb)
24
- result_v1.should == "hire"
25
-
26
- result_v2 = @lemmatizer.lemma("worried", :verb)
27
- result_v2.should == "worry"
28
-
29
- result_v3 = @lemmatizer.lemma("partying", :verb)
30
- result_v3.should == "party"
31
-
32
- result_a1 = @lemmatizer.lemma("better", :adj)
33
- result_a1.should == "good"
34
-
35
- result_a2 = @lemmatizer.lemma("hotter", :adj)
36
- result_a2.should == "hot"
37
-
38
- result_r1 = @lemmatizer.lemma("best", :adv)
39
- result_r1.should == "well"
40
-
41
- result_r2 = @lemmatizer.lemma("best", :adv)
42
- result_r2.should_not == "good"
43
-
44
- # Lemmatizer give a result even when no pos is given, by assuming it to be :verb, :noun, :adv, or :adj.
45
- result_1 = @lemmatizer.lemma("plays")
46
- result_1.should == "play"
47
-
48
- result_2 = @lemmatizer.lemma("oxen")
49
- result_2.should == "ox"
50
-
51
- result_3 = @lemmatizer.lemma("higher")
52
- result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
53
-
54
- result_2 = @lemmatizer.lemma("asdfassda") # non-existing word
55
- result_2.should == "asdfassda"
56
-
57
- # test cases for words used in README
58
- result_t1 = @lemmatizer.lemma("fired")
59
- result_t1.should == "fire"
60
-
61
- result_t2 = @lemmatizer.lemma("slower")
62
- result_t2.should == "slow"
63
- end
64
- end
4
+ describe 'Lemmatizer' do
5
+
6
+ before(:all) do
7
+ @lemmatizer = Lemmatizer.new
8
+ user_data = File.join(File.dirname(__FILE__), "user.dict.txt")
9
+ @lemmatizer_with_userdata = Lemmatizer.new(user_data)
10
+ end
11
+
12
+ describe '#lemma' do
13
+ it 'takes a noun and returns its lemma' do
14
+ result_n1 = @lemmatizer.lemma('analyses', :noun)
15
+ expect(result_n1).to eq('analysis')
16
+
17
+ result_n3 = @lemmatizer.lemma('desks', :noun)
18
+ expect(result_n3).to eq('desk')
19
+ end
20
+
21
+ it 'takes a verb and returns its lemma' do
22
+ result_v1 = @lemmatizer.lemma('hired', :verb)
23
+ expect(result_v1).to eq('hire')
24
+
25
+ result_v2 = @lemmatizer.lemma('worried', :verb)
26
+ expect(result_v2).to eq('worry')
27
+
28
+ result_v3 = @lemmatizer.lemma('partying', :verb)
29
+ expect(result_v3).to eq('party')
30
+ end
31
+
32
+ it 'takes an adjective and returns its lemma' do
33
+ result_a1 = @lemmatizer.lemma('better', :adj)
34
+ expect(result_a1).to eq('good')
35
+
36
+ result_a2 = @lemmatizer.lemma('hotter', :adj)
37
+ expect(result_a2).to eq('hot')
38
+ end
39
+
40
+ it 'takes an adverb and returns its lemma' do
41
+ result_r1 = @lemmatizer.lemma('best', :adv)
42
+ expect(result_r1).to eq('well')
43
+
44
+ result_r2 = @lemmatizer.lemma('best', :adv)
45
+ expect(result_r2).not_to eq('good')
46
+ end
47
+
48
+ it 'gives a result when no pos is given' do
49
+ # Order: :verb, :noun, :adv, or :adj
50
+ result_1 = @lemmatizer.lemma('plays')
51
+ expect(result_1).to eq('play')
52
+
53
+ result_2 = @lemmatizer.lemma('oxen')
54
+ expect(result_2).to eq('ox')
55
+
56
+ # 'higher' is itself contained in the adj list.
57
+ result_3 = @lemmatizer.lemma('higher')
58
+ expect(result_3).not_to eq('high')
59
+
60
+ # Non-existing word
61
+ result_2 = @lemmatizer.lemma('asdfassda')
62
+ expect(result_2).to eq('asdfassda')
63
+
64
+ # Test cases for words used in README
65
+ result_t1 = @lemmatizer.lemma('fired')
66
+ expect(result_t1).to eq('fire')
67
+
68
+ result_t2 = @lemmatizer.lemma('slower')
69
+ expect(result_t2).to eq('slow')
70
+ end
71
+
72
+ it 'leaves alone words that dictionary does not contain' do
73
+ # Such as 'James' or 'MacBooks'
74
+ result_n2 = @lemmatizer.lemma('MacBooks', :noun)
75
+ expect(result_n2).not_to eq('MacBook')
76
+ end
77
+
78
+ it 'can load user dict that overrides presets' do
79
+ # 'MacBooks' -> 'MacBook'
80
+ result_u1 = @lemmatizer_with_userdata.lemma('MacBooks', :noun)
81
+ expect(result_u1).to eq('MacBook')
82
+ # 'higher' -> 'high'
83
+ result_u2 = @lemmatizer_with_userdata.lemma('higher', :adj)
84
+ expect(result_u2).to eq('high')
85
+ # 'higher' -> 'high' (no pos given)
86
+ result_u3 = @lemmatizer_with_userdata.lemma('higher')
87
+ expect(result_u3).to eq('high')
88
+ # check if (unoverridden) preset data is kept intact
89
+ result_u4 = @lemmatizer_with_userdata.lemma('crying', :verb)
90
+ expect(result_u4).to eq('cry')
91
+ end
92
+ end
65
93
  end
@@ -0,0 +1,5 @@
1
+ n MacBooks MacBook
2
+ n iPhones iPhone
3
+ n iPads iPad
4
+ adj higher high
5
+ adj highest high
metadata CHANGED
@@ -1,30 +1,30 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-03 00:00:00.000000000 Z
11
+ date: 2019-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- description: ! "\n Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
27
+ description: "\n Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
28
28
  package.\n "
29
29
  email:
30
30
  - yohasebe@gmail.com
@@ -32,7 +32,7 @@ executables: []
32
32
  extensions: []
33
33
  extra_rdoc_files: []
34
34
  files:
35
- - .gitignore
35
+ - ".gitignore"
36
36
  - Gemfile
37
37
  - LICENSE.txt
38
38
  - README.md
@@ -40,6 +40,7 @@ files:
40
40
  - lemmatizer.gemspec
41
41
  - lib/dict/adj.exc
42
42
  - lib/dict/adv.exc
43
+ - lib/dict/cleanup.rb
43
44
  - lib/dict/index.adj
44
45
  - lib/dict/index.adv
45
46
  - lib/dict/index.noun
@@ -52,6 +53,7 @@ files:
52
53
  - lib/lemmatizer/version.rb
53
54
  - spec/lemmatizer_spec.rb
54
55
  - spec/spec_helper.rb
56
+ - spec/user.dict.txt
55
57
  homepage: http://github.com/yohasebe/lemmatizer
56
58
  licenses:
57
59
  - MIT
@@ -62,20 +64,20 @@ require_paths:
62
64
  - lib
63
65
  required_ruby_version: !ruby/object:Gem::Requirement
64
66
  requirements:
65
- - - ! '>='
67
+ - - ">="
66
68
  - !ruby/object:Gem::Version
67
69
  version: '0'
68
70
  required_rubygems_version: !ruby/object:Gem::Requirement
69
71
  requirements:
70
- - - ! '>='
72
+ - - ">="
71
73
  - !ruby/object:Gem::Version
72
74
  version: '0'
73
75
  requirements: []
74
- rubyforge_project:
75
- rubygems_version: 2.1.9
76
+ rubygems_version: 3.0.1
76
77
  signing_key:
77
78
  specification_version: 4
78
79
  summary: English lemmatizer in Ruby
79
80
  test_files:
80
81
  - spec/lemmatizer_spec.rb
81
82
  - spec/spec_helper.rb
83
+ - spec/user.dict.txt