lemmatizer 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ YmRjOGYyZWE4ZWJjNWJjYWU3MzFkY2M5MjU1OTM4MzMxOGM1OGYwYw==
5
+ data.tar.gz: !binary |-
6
+ ZjRiZGQ1NjI1MzU2NTEyM2JmMzg0NGZiNDI2ZGRiMzExNmNlNDllNw==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ NTM0YThiNDVhYWVlYjZkNDZlZGNmMTg2OTYxMDE0ZDYwNWM0NWE5MGE2YjA5
10
+ ODZmYTM2YmE1MmM5MGJhODUzOWUxYmQzYTcwMzBhMmIxNmRiOTEwOGRmOWFk
11
+ YmNmNzc1YWI5ZDMzMDk2NDBhNmExNTUyZDgwYTJhZjFlOTZkN2Y=
12
+ data.tar.gz: !binary |-
13
+ ZmFlNWQ2OWYzYTBmMjM1MmVlOThlMWNlMTIwNjAwYjgwMzYxNWM0YmUzMThj
14
+ YjJiODA3NGNmOTk0MzQ4ZmY2YTc2ODM1YmJhMzgxOTQ1ZmEzNTY4ZDNkMDky
15
+ YTdmYTk4NDY5MzAzZjk2M2ZhY2RmOTJjZDQwMmY3ODE5N2ViOTY=
@@ -1,4 +1,4 @@
1
- Copyright (c) 2012 Yoichiro Hasebe
1
+ Copyright (c) 2012-2013 Yoichiro Hasebe
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -1,40 +1,54 @@
1
1
  lemmatizer
2
2
  ==========
3
-
4
3
  Lemmatizer for text in English. Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
5
4
 
6
5
  Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
7
6
 
8
7
  Installation
9
8
  ------------
10
-
11
9
  sudo gem install lemmatizer
12
10
 
13
11
 
14
12
  Usage
15
13
  -----
16
-
17
- require "lemmatizer"
18
-
19
- lem = Lemmatizer.new
20
-
21
- p lem.lemma("dogs", :noun ) # => "dog"
22
- p lem.lemma("hired", :verb ) # => "hire"
23
- p lem.lemma("hotter", :adj ) # => "hot"
24
- p lem.lemma("better", :adv ) # => "well"
25
-
26
- # when part-of-speech symbol is not specified as the second argument, lemmatizer tries :verb, :noun, :adj, or :adv one by one in this order.
27
- p lem.lemma("fired") # => "fire"
28
- p lem.lemma("slow") # => "slow"
14
+ ```ruby
15
+ require "lemmatizer"
16
+
17
+ lem = Lemmatizer.new
18
+
19
+ p lem.lemma("dogs", :noun ) # => "dog"
20
+ p lem.lemma("hired", :verb ) # => "hire"
21
+ p lem.lemma("hotter", :adj ) # => "hot"
22
+ p lem.lemma("better", :adv ) # => "well"
23
+
24
+ # when part-of-speech symbol is not specified as the second argument,
25
+ # lemmatizer tries :verb, :noun, :adj, and :adv one by one in this order.
26
+ p lem.lemma("fired") # => "fire"
27
+ p lem.lemma("slow") # => "slow"
28
+ ```
29
29
 
30
30
  Limitations
31
31
  -----------
32
-
33
- # Lemmatizer leaves alone words that its dictionary does not contain. This keeps proper names such as "James" intact.
34
- p lem.lemma("MacBooks", :noun) # => "MacBooks"
35
-
36
- # If an inflected form of word is included as a lemma in the word list, lemmatizer may not give the expected result.
37
- p lem.lemma("higher", :adj) # => "higher" not "high"!
38
-
39
- # The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
40
- # Modify dict/index.{noun|verb|adj|adv} if necessary.
32
+ ```ruby
33
+ # Lemmatizer leaves alone words that its dictionary does not contain.
34
+ # This keeps proper names such as "James" intact.
35
+ p lem.lemma("MacBooks", :noun) # => "MacBooks"
36
+
37
+ # If an inflected form is included as a lemma in the word index,
38
+ # lemmatizer may not give an expected result.
39
+ p lem.lemma("higher", :adj) # => "higher" not "high"!
40
+
41
+ # The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
42
+ # Modify dict/index.{noun|verb|adj|adv} if necessary.
43
+ ```
44
+
45
+ Author
46
+ ------
47
+ * Yoichiro Hasebe <yohasebe@gmail.com>
48
+
49
+ Thanks for assistance and contributions:
50
+ * Vladimir Ivic <http://vladimirivic.com>
51
+
52
+ License
53
+ -------
54
+ Licensed under the MIT license.
@@ -1,21 +1,25 @@
1
1
  # -*- encoding: utf-8 -*-
2
+
2
3
  lib = File.expand_path('../lib', __FILE__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+
4
6
  require 'lemmatizer/version'
5
7
 
6
8
  Gem::Specification.new do |gem|
7
- gem.name = "lemmatizer"
9
+ gem.name = 'lemmatizer'
8
10
  gem.version = Lemmatizer::VERSION
9
- gem.authors = ["Yoichiro Hasebe"]
10
- gem.email = ["yohasebe@gmail.com"]
11
- gem.description = %q{Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package.}
12
- gem.summary = %q{Englsh lemmatizer in Ruby}
13
- gem.homepage = "http://github.com/yohasebe/lemmatizer"
14
-
11
+ gem.authors = ['Yoichiro Hasebe']
12
+ gem.email = ['yohasebe@gmail.com']
13
+ gem.description = %q(
14
+ Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package.
15
+ )
16
+ gem.summary = 'Englsh lemmatizer in Ruby'
17
+ gem.homepage = 'http://github.com/yohasebe/lemmatizer'
18
+ gem.licenses = ['MIT']
15
19
  gem.files = `git ls-files`.split($/)
16
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
- gem.require_paths = ["lib"]
20
+ gem.executables = gem.files.grep(%r(^bin/)).map{ |f| File.basename(f) }
21
+ gem.test_files = gem.files.grep(%r(^(test|spec|features)/))
22
+ gem.require_paths = ['lib']
19
23
 
20
- gem.add_development_dependency "rspec"
24
+ gem.add_development_dependency 'rspec'
21
25
  end
@@ -1,117 +1,12 @@
1
- #! /usr/bin/env ruby
2
1
  # -*- coding: utf-8; mode: ruby -*-
3
2
 
4
- # Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
5
- # Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
3
+ require 'stringio'
4
+ require 'lemmatizer/version'
5
+ require 'lemmatizer/core_ext'
6
+ require 'lemmatizer/lemmatizer'
6
7
 
7
-
8
- require "lemmatizer/version"
9
- require "stringio"
10
-
11
- class String
12
- def endwith(s)
13
- self =~ /#{s}$/
14
- end
8
+ module Lemmatizer
9
+ def self.new
10
+ Lemmatizer.new
11
+ end
15
12
  end
16
-
17
- class Lemmatizer
18
- current_dir = File.expand_path(File.dirname(__FILE__))
19
- WN_FILES = {:noun => [current_dir + "/dict/index.noun", current_dir + "/dict/noun.exc"],
20
- :verb => [current_dir + "/dict/index.verb", current_dir + "/dict/verb.exc"],
21
- :adj => [current_dir + "/dict/index.adj", current_dir + "/dict/adj.exc"],
22
- :adv => [current_dir + "/dict/index.adv", current_dir + "/dict/adv.exc"]}
23
-
24
- MORPHOLOGICAL_SUBSTITUTIONS = {
25
- :noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
26
- ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
27
- ['men', 'man'], ['ies', 'y']],
28
- :verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
29
- ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
30
- :adj => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
31
- :adv => []}
32
-
33
- def initialize(files = WN_FILES)
34
- @wordlists = {}
35
- @exceptions = {}
36
- MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
37
- @wordlists[x] = {}
38
- @exceptions[x] = {}
39
- end
40
- if files then
41
- files.each_pair do |pos, pair|
42
- load_wordnet_files(pos, pair[0], pair[1])
43
- end
44
- end
45
- end
46
-
47
- def open_file(*args)
48
- if args[0].is_a? IO or args[0].is_a? StringIO then
49
- yield args[0]
50
- else
51
- File.open(*args) do |io|
52
- yield io
53
- end
54
- end
55
- end
56
-
57
- def load_wordnet_files(pos, list, exc)
58
- open_file(list) do |io|
59
- io.each_line do |line|
60
- w = line.split(/\s+/)[0]
61
- @wordlists[pos][w] = w
62
- end
63
- end
64
-
65
- open_file(exc) do |io|
66
- io.each_line do |line|
67
- w, s = line.split(/\s+/)
68
- @exceptions[pos][w] ||= []
69
- @exceptions[pos][w] << s
70
- end
71
- end
72
- end
73
-
74
- def _each_substitutions(form, pos)
75
- if lemma = @wordlists[pos][form] then
76
- yield lemma
77
- end
78
- MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
79
- old, new = *entry
80
- if form.endwith(old)
81
- _each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
82
- yield x
83
- end
84
- end
85
- end
86
- end
87
-
88
- def each_lemma(form, pos)
89
- if lemma = @exceptions[pos][form] then
90
- lemma.each{|x |yield x}
91
- end
92
- if pos == :noun and form.endwith('ful')
93
- each_lemma(form[0, form.length-3], pos) do |x|
94
- yield x+'ful'
95
- end
96
- else
97
- _each_substitutions(form, pos) do|x|
98
- yield x
99
- end
100
- end
101
- end
102
-
103
- def lemma(form, pos = nil)
104
- if !pos
105
- [:verb, :noun, :adj, :adv].each do |p|
106
- result = lemma(form, p)
107
- return result unless result == form
108
- end
109
- return form
110
- end
111
- each_lemma(form, pos) do |x|
112
- return x
113
- end
114
- return form
115
- end
116
-
117
- end
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8; mode: ruby -*-
2
+
3
+ module Lematizer
4
+ class ::String
5
+ def endwith(s)
6
+ self =~ /#{s}$/
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,152 @@
1
+ # -*- coding: utf-8; mode: ruby -*-
2
+
3
+ module Lemmatizer
4
+ class Lemmatizer
5
+ DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
6
+
7
+ WN_FILES = {
8
+ :noun => [
9
+ DATA_DIR + '/dict/index.noun',
10
+ DATA_DIR + '/dict/noun.exc'
11
+ ],
12
+ :verb => [
13
+ DATA_DIR + '/dict/index.verb',
14
+ DATA_DIR + '/dict/verb.exc'
15
+ ],
16
+ :adj => [
17
+ DATA_DIR + '/dict/index.adj',
18
+ DATA_DIR + '/dict/adj.exc'
19
+ ],
20
+ :adv => [
21
+ DATA_DIR + '/dict/index.adv',
22
+ DATA_DIR + '/dict/adv.exc'
23
+ ]
24
+ }
25
+
26
+ MORPHOLOGICAL_SUBSTITUTIONS = {
27
+ :noun => [
28
+ ['s', '' ],
29
+ ['ses', 's' ],
30
+ ['ves', 'f' ],
31
+ ['xes', 'x' ],
32
+ ['zes', 'z' ],
33
+ ['ches', 'ch' ],
34
+ ['shes', 'sh' ],
35
+ ['men', 'man'],
36
+ ['ies', 'y' ]
37
+ ],
38
+ :verb => [
39
+ ['s', '' ],
40
+ ['ies', 'y'],
41
+ ['es', 'e'],
42
+ ['es', '' ],
43
+ ['ed', 'e'],
44
+ ['ed', '' ],
45
+ ['ing', 'e'],
46
+ ['ing', '' ]
47
+ ],
48
+ :adj => [
49
+ ['er', '' ],
50
+ ['est', '' ],
51
+ ['er', 'e'],
52
+ ['est', 'e']
53
+ ],
54
+ :adv => [
55
+ ]
56
+ }
57
+
58
+ def initialize(files = WN_FILES)
59
+ @wordlists = {}
60
+ @exceptions = {}
61
+
62
+ MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
63
+ @wordlists[x] = {}
64
+ @exceptions[x] = {}
65
+ end
66
+
67
+ if files
68
+ files.each_pair do |pos, pair|
69
+ load_wordnet_files(pos, pair[0], pair[1])
70
+ end
71
+ end
72
+ end
73
+
74
+ def lemma(form, pos = nil)
75
+ unless pos
76
+ [:verb, :noun, :adj, :adv].each do |p|
77
+ result = lemma(form, p)
78
+ return result unless result == form
79
+ end
80
+
81
+ return form
82
+ end
83
+
84
+ each_lemma(form, pos) do |x|
85
+ return x
86
+ end
87
+
88
+ form
89
+ end
90
+
91
+ private
92
+
93
+ def open_file(*args)
94
+ if args[0].is_a? IO or args[0].is_a? StringIO
95
+ yield args[0]
96
+ else
97
+ File.open(*args) do |io|
98
+ yield io
99
+ end
100
+ end
101
+ end
102
+
103
+ def load_wordnet_files(pos, list, exc)
104
+ open_file(list) do |io|
105
+ io.each_line do |line|
106
+ w = line.split(/\s+/)[0]
107
+ @wordlists[pos][w] = w
108
+ end
109
+ end
110
+
111
+ open_file(exc) do |io|
112
+ io.each_line do |line|
113
+ w, s = line.split(/\s+/)
114
+ @exceptions[pos][w] ||= []
115
+ @exceptions[pos][w] << s
116
+ end
117
+ end
118
+ end
119
+
120
+ def each_substitutions(form, pos)
121
+ if lemma = @wordlists[pos][form]
122
+ yield lemma
123
+ end
124
+
125
+ MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
126
+ old, new = *entry
127
+ if form.endwith(old)
128
+ each_substitutions(form[0, form.length - old.length] + new, pos) do |x|
129
+ yield x
130
+ end
131
+ end
132
+ end
133
+ end
134
+
135
+ def each_lemma(form, pos)
136
+ if lemma = @exceptions[pos][form]
137
+ lemma.each { |x| yield x }
138
+ end
139
+
140
+ if pos == :noun && form.endwith('ful')
141
+ each_lemma(form[0, form.length-3], pos) do |x|
142
+ yield x + 'ful'
143
+ end
144
+ else
145
+
146
+ each_substitutions(form, pos) do|x|
147
+ yield x
148
+ end
149
+ end
150
+ end
151
+ end
152
+ end
@@ -1,3 +1,5 @@
1
- class Lemmatizer
2
- VERSION = "0.1.0"
1
+ # -*- coding: utf-8; mode: ruby -*-
2
+
3
+ module Lemmatizer
4
+ VERSION = '0.1.1'
3
5
  end
@@ -1,18 +1,14 @@
1
- #!/usr/bin/env ruby
2
1
  # -*- coding: utf-8 -*-
3
2
 
4
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
3
+ require 'spec_helper'
5
4
  require 'lemmatizer'
6
5
 
7
6
  describe "Lemmatizer" do
8
- it "contains lemmatizing functions:" do
9
- end
10
-
11
7
  before do
12
8
  @lemmatizer = Lemmatizer.new
13
9
  end
14
10
 
15
- describe "lemma" do
11
+ describe "#lemma" do
16
12
  it "takes a word form and its part-of-speech symbol (:noun, :verb, :adj, :adv) and then returns its lemma form" do
17
13
  result_n1 = @lemmatizer.lemma("analyses", :noun)
18
14
  result_n1.should == "analysis"
@@ -55,6 +51,9 @@ describe "Lemmatizer" do
55
51
  result_3 = @lemmatizer.lemma("higher")
56
52
  result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
57
53
 
54
+ result_2 = @lemmatizer.lemma("asdfassda") # non-existing word
55
+ result_2.should == "asdfassda"
56
+
58
57
  # test cases for words used in README
59
58
  result_t1 = @lemmatizer.lemma("fired")
60
59
  result_t1.should == "fire"
@@ -63,5 +62,4 @@ describe "Lemmatizer" do
63
62
  result_t2.should == "slow"
64
63
  end
65
64
  end
66
-
67
- end
65
+ end
@@ -1,6 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
1
  require 'rspec'
3
2
 
4
- RSpec.configure do |config|
5
- # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more infomation
6
- end
3
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
metadata CHANGED
@@ -1,29 +1,31 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
5
- prerelease:
4
+ version: 0.1.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Yoichiro Hasebe
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-10-29 00:00:00.000000000 Z
11
+ date: 2013-11-03 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rspec
16
- requirement: &70314483330880 !ruby/object:Gem::Requirement
17
- none: false
15
+ requirement: !ruby/object:Gem::Requirement
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
22
20
  type: :development
23
21
  prerelease: false
24
- version_requirements: *70314483330880
25
- description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
26
- package.
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: ! "\n Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
28
+ package.\n "
27
29
  email:
28
30
  - yohasebe@gmail.com
29
31
  executables: []
@@ -45,32 +47,34 @@ files:
45
47
  - lib/dict/noun.exc
46
48
  - lib/dict/verb.exc
47
49
  - lib/lemmatizer.rb
50
+ - lib/lemmatizer/core_ext.rb
51
+ - lib/lemmatizer/lemmatizer.rb
48
52
  - lib/lemmatizer/version.rb
49
53
  - spec/lemmatizer_spec.rb
50
54
  - spec/spec_helper.rb
51
55
  homepage: http://github.com/yohasebe/lemmatizer
52
- licenses: []
56
+ licenses:
57
+ - MIT
58
+ metadata: {}
53
59
  post_install_message:
54
60
  rdoc_options: []
55
61
  require_paths:
56
62
  - lib
57
63
  required_ruby_version: !ruby/object:Gem::Requirement
58
- none: false
59
64
  requirements:
60
65
  - - ! '>='
61
66
  - !ruby/object:Gem::Version
62
67
  version: '0'
63
68
  required_rubygems_version: !ruby/object:Gem::Requirement
64
- none: false
65
69
  requirements:
66
70
  - - ! '>='
67
71
  - !ruby/object:Gem::Version
68
72
  version: '0'
69
73
  requirements: []
70
74
  rubyforge_project:
71
- rubygems_version: 1.8.17
75
+ rubygems_version: 2.1.9
72
76
  signing_key:
73
- specification_version: 3
77
+ specification_version: 4
74
78
  summary: Englsh lemmatizer in Ruby
75
79
  test_files:
76
80
  - spec/lemmatizer_spec.rb