lemmatizer 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ YmRjOGYyZWE4ZWJjNWJjYWU3MzFkY2M5MjU1OTM4MzMxOGM1OGYwYw==
5
+ data.tar.gz: !binary |-
6
+ ZjRiZGQ1NjI1MzU2NTEyM2JmMzg0NGZiNDI2ZGRiMzExNmNlNDllNw==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ NTM0YThiNDVhYWVlYjZkNDZlZGNmMTg2OTYxMDE0ZDYwNWM0NWE5MGE2YjA5
10
+ ODZmYTM2YmE1MmM5MGJhODUzOWUxYmQzYTcwMzBhMmIxNmRiOTEwOGRmOWFk
11
+ YmNmNzc1YWI5ZDMzMDk2NDBhNmExNTUyZDgwYTJhZjFlOTZkN2Y=
12
+ data.tar.gz: !binary |-
13
+ ZmFlNWQ2OWYzYTBmMjM1MmVlOThlMWNlMTIwNjAwYjgwMzYxNWM0YmUzMThj
14
+ YjJiODA3NGNmOTk0MzQ4ZmY2YTc2ODM1YmJhMzgxOTQ1ZmEzNTY4ZDNkMDky
15
+ YTdmYTk4NDY5MzAzZjk2M2ZhY2RmOTJjZDQwMmY3ODE5N2ViOTY=
@@ -1,4 +1,4 @@
1
- Copyright (c) 2012 Yoichiro Hasebe
1
+ Copyright (c) 2012-2013 Yoichiro Hasebe
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -1,40 +1,54 @@
1
1
  lemmatizer
2
2
  ==========
3
-
4
3
  Lemmatizer for text in English. Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
5
4
 
6
5
  Based on code posted by mtbr at his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer)
7
6
 
8
7
  Installation
9
8
  ------------
10
-
11
9
  sudo gem install lemmatizer
12
10
 
13
11
 
14
12
  Usage
15
13
  -----
16
-
17
- require "lemmatizer"
18
-
19
- lem = Lemmatizer.new
20
-
21
- p lem.lemma("dogs", :noun ) # => "dog"
22
- p lem.lemma("hired", :verb ) # => "hire"
23
- p lem.lemma("hotter", :adj ) # => "hot"
24
- p lem.lemma("better", :adv ) # => "well"
25
-
26
- # when part-of-speech symbol is not specified as the second argument, lemmatizer tries :verb, :noun, :adj, or :adv one by one in this order.
27
- p lem.lemma("fired") # => "fire"
28
- p lem.lemma("slow") # => "slow"
14
+ ```ruby
15
+ require "lemmatizer"
16
+
17
+ lem = Lemmatizer.new
18
+
19
+ p lem.lemma("dogs", :noun ) # => "dog"
20
+ p lem.lemma("hired", :verb ) # => "hire"
21
+ p lem.lemma("hotter", :adj ) # => "hot"
22
+ p lem.lemma("better", :adv ) # => "well"
23
+
24
+ # when part-of-speech symbol is not specified as the second argument,
25
+ # lemmatizer tries :verb, :noun, :adj, and :adv one by one in this order.
26
+ p lem.lemma("fired") # => "fire"
27
+ p lem.lemma("slow") # => "slow"
28
+ ```
29
29
 
30
30
  Limitations
31
31
  -----------
32
-
33
- # Lemmatizer leaves alone words that its dictionary does not contain. This keeps proper names such as "James" intact.
34
- p lem.lemma("MacBooks", :noun) # => "MacBooks"
35
-
36
- # If an inflected form of word is included as a lemma in the word list, lemmatizer may not give the expected result.
37
- p lem.lemma("higher", :adj) # => "higher" not "high"!
38
-
39
- # The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
40
- # Modify dict/index.{noun|verb|adj|adv} if necessary.
32
+ ```ruby
33
+ # Lemmatizer leaves alone words that its dictionary does not contain.
34
+ # This keeps proper names such as "James" intact.
35
+ p lem.lemma("MacBooks", :noun) # => "MacBooks"
36
+
37
+ # If an inflected form is included as a lemma in the word index,
38
+ # lemmatizer may not give an expected result.
39
+ p lem.lemma("higher", :adj) # => "higher" not "high"!
40
+
41
+ # The above has to happen because "higher" is itself an entry word listed in dict/index.adj .
42
+ # Modify dict/index.{noun|verb|adj|adv} if necessary.
43
+ ```
44
+
45
+ Author
46
+ ------
47
+ * Yoichiro Hasebe <yohasebe@gmail.com>
48
+
49
+ Thanks for assistance and contributions:
50
+ * Vladimir Ivic <http://vladimirivic.com>
51
+
52
+ License
53
+ -------
54
+ Licensed under the MIT license.
@@ -1,21 +1,25 @@
1
1
  # -*- encoding: utf-8 -*-
2
+
2
3
  lib = File.expand_path('../lib', __FILE__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+
4
6
  require 'lemmatizer/version'
5
7
 
6
8
  Gem::Specification.new do |gem|
7
- gem.name = "lemmatizer"
9
+ gem.name = 'lemmatizer'
8
10
  gem.version = Lemmatizer::VERSION
9
- gem.authors = ["Yoichiro Hasebe"]
10
- gem.email = ["yohasebe@gmail.com"]
11
- gem.description = %q{Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package.}
12
- gem.summary = %q{Englsh lemmatizer in Ruby}
13
- gem.homepage = "http://github.com/yohasebe/lemmatizer"
14
-
11
+ gem.authors = ['Yoichiro Hasebe']
12
+ gem.email = ['yohasebe@gmail.com']
13
+ gem.description = %q(
14
+ Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package.
15
+ )
16
+ gem.summary = 'Englsh lemmatizer in Ruby'
17
+ gem.homepage = 'http://github.com/yohasebe/lemmatizer'
18
+ gem.licenses = ['MIT']
15
19
  gem.files = `git ls-files`.split($/)
16
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
- gem.require_paths = ["lib"]
20
+ gem.executables = gem.files.grep(%r(^bin/)).map{ |f| File.basename(f) }
21
+ gem.test_files = gem.files.grep(%r(^(test|spec|features)/))
22
+ gem.require_paths = ['lib']
19
23
 
20
- gem.add_development_dependency "rspec"
24
+ gem.add_development_dependency 'rspec'
21
25
  end
@@ -1,117 +1,12 @@
1
- #! /usr/bin/env ruby
2
1
  # -*- coding: utf-8; mode: ruby -*-
3
2
 
4
- # Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
5
- # Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
3
+ require 'stringio'
4
+ require 'lemmatizer/version'
5
+ require 'lemmatizer/core_ext'
6
+ require 'lemmatizer/lemmatizer'
6
7
 
7
-
8
- require "lemmatizer/version"
9
- require "stringio"
10
-
11
- class String
12
- def endwith(s)
13
- self =~ /#{s}$/
14
- end
8
+ module Lemmatizer
9
+ def self.new
10
+ Lemmatizer.new
11
+ end
15
12
  end
16
-
17
- class Lemmatizer
18
- current_dir = File.expand_path(File.dirname(__FILE__))
19
- WN_FILES = {:noun => [current_dir + "/dict/index.noun", current_dir + "/dict/noun.exc"],
20
- :verb => [current_dir + "/dict/index.verb", current_dir + "/dict/verb.exc"],
21
- :adj => [current_dir + "/dict/index.adj", current_dir + "/dict/adj.exc"],
22
- :adv => [current_dir + "/dict/index.adv", current_dir + "/dict/adv.exc"]}
23
-
24
- MORPHOLOGICAL_SUBSTITUTIONS = {
25
- :noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
26
- ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
27
- ['men', 'man'], ['ies', 'y']],
28
- :verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
29
- ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
30
- :adj => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
31
- :adv => []}
32
-
33
- def initialize(files = WN_FILES)
34
- @wordlists = {}
35
- @exceptions = {}
36
- MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
37
- @wordlists[x] = {}
38
- @exceptions[x] = {}
39
- end
40
- if files then
41
- files.each_pair do |pos, pair|
42
- load_wordnet_files(pos, pair[0], pair[1])
43
- end
44
- end
45
- end
46
-
47
- def open_file(*args)
48
- if args[0].is_a? IO or args[0].is_a? StringIO then
49
- yield args[0]
50
- else
51
- File.open(*args) do |io|
52
- yield io
53
- end
54
- end
55
- end
56
-
57
- def load_wordnet_files(pos, list, exc)
58
- open_file(list) do |io|
59
- io.each_line do |line|
60
- w = line.split(/\s+/)[0]
61
- @wordlists[pos][w] = w
62
- end
63
- end
64
-
65
- open_file(exc) do |io|
66
- io.each_line do |line|
67
- w, s = line.split(/\s+/)
68
- @exceptions[pos][w] ||= []
69
- @exceptions[pos][w] << s
70
- end
71
- end
72
- end
73
-
74
- def _each_substitutions(form, pos)
75
- if lemma = @wordlists[pos][form] then
76
- yield lemma
77
- end
78
- MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
79
- old, new = *entry
80
- if form.endwith(old)
81
- _each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
82
- yield x
83
- end
84
- end
85
- end
86
- end
87
-
88
- def each_lemma(form, pos)
89
- if lemma = @exceptions[pos][form] then
90
- lemma.each{|x |yield x}
91
- end
92
- if pos == :noun and form.endwith('ful')
93
- each_lemma(form[0, form.length-3], pos) do |x|
94
- yield x+'ful'
95
- end
96
- else
97
- _each_substitutions(form, pos) do|x|
98
- yield x
99
- end
100
- end
101
- end
102
-
103
- def lemma(form, pos = nil)
104
- if !pos
105
- [:verb, :noun, :adj, :adv].each do |p|
106
- result = lemma(form, p)
107
- return result unless result == form
108
- end
109
- return form
110
- end
111
- each_lemma(form, pos) do |x|
112
- return x
113
- end
114
- return form
115
- end
116
-
117
- end
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8; mode: ruby -*-
2
+
3
+ module Lematizer
4
+ class ::String
5
+ def endwith(s)
6
+ self =~ /#{s}$/
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,152 @@
1
+ # -*- coding: utf-8; mode: ruby -*-
2
+
3
+ module Lemmatizer
4
+ class Lemmatizer
5
+ DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
6
+
7
+ WN_FILES = {
8
+ :noun => [
9
+ DATA_DIR + '/dict/index.noun',
10
+ DATA_DIR + '/dict/noun.exc'
11
+ ],
12
+ :verb => [
13
+ DATA_DIR + '/dict/index.verb',
14
+ DATA_DIR + '/dict/verb.exc'
15
+ ],
16
+ :adj => [
17
+ DATA_DIR + '/dict/index.adj',
18
+ DATA_DIR + '/dict/adj.exc'
19
+ ],
20
+ :adv => [
21
+ DATA_DIR + '/dict/index.adv',
22
+ DATA_DIR + '/dict/adv.exc'
23
+ ]
24
+ }
25
+
26
+ MORPHOLOGICAL_SUBSTITUTIONS = {
27
+ :noun => [
28
+ ['s', '' ],
29
+ ['ses', 's' ],
30
+ ['ves', 'f' ],
31
+ ['xes', 'x' ],
32
+ ['zes', 'z' ],
33
+ ['ches', 'ch' ],
34
+ ['shes', 'sh' ],
35
+ ['men', 'man'],
36
+ ['ies', 'y' ]
37
+ ],
38
+ :verb => [
39
+ ['s', '' ],
40
+ ['ies', 'y'],
41
+ ['es', 'e'],
42
+ ['es', '' ],
43
+ ['ed', 'e'],
44
+ ['ed', '' ],
45
+ ['ing', 'e'],
46
+ ['ing', '' ]
47
+ ],
48
+ :adj => [
49
+ ['er', '' ],
50
+ ['est', '' ],
51
+ ['er', 'e'],
52
+ ['est', 'e']
53
+ ],
54
+ :adv => [
55
+ ]
56
+ }
57
+
58
+ def initialize(files = WN_FILES)
59
+ @wordlists = {}
60
+ @exceptions = {}
61
+
62
+ MORPHOLOGICAL_SUBSTITUTIONS.keys.each do |x|
63
+ @wordlists[x] = {}
64
+ @exceptions[x] = {}
65
+ end
66
+
67
+ if files
68
+ files.each_pair do |pos, pair|
69
+ load_wordnet_files(pos, pair[0], pair[1])
70
+ end
71
+ end
72
+ end
73
+
74
+ def lemma(form, pos = nil)
75
+ unless pos
76
+ [:verb, :noun, :adj, :adv].each do |p|
77
+ result = lemma(form, p)
78
+ return result unless result == form
79
+ end
80
+
81
+ return form
82
+ end
83
+
84
+ each_lemma(form, pos) do |x|
85
+ return x
86
+ end
87
+
88
+ form
89
+ end
90
+
91
+ private
92
+
93
+ def open_file(*args)
94
+ if args[0].is_a? IO or args[0].is_a? StringIO
95
+ yield args[0]
96
+ else
97
+ File.open(*args) do |io|
98
+ yield io
99
+ end
100
+ end
101
+ end
102
+
103
+ def load_wordnet_files(pos, list, exc)
104
+ open_file(list) do |io|
105
+ io.each_line do |line|
106
+ w = line.split(/\s+/)[0]
107
+ @wordlists[pos][w] = w
108
+ end
109
+ end
110
+
111
+ open_file(exc) do |io|
112
+ io.each_line do |line|
113
+ w, s = line.split(/\s+/)
114
+ @exceptions[pos][w] ||= []
115
+ @exceptions[pos][w] << s
116
+ end
117
+ end
118
+ end
119
+
120
+ def each_substitutions(form, pos)
121
+ if lemma = @wordlists[pos][form]
122
+ yield lemma
123
+ end
124
+
125
+ MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
126
+ old, new = *entry
127
+ if form.endwith(old)
128
+ each_substitutions(form[0, form.length - old.length] + new, pos) do |x|
129
+ yield x
130
+ end
131
+ end
132
+ end
133
+ end
134
+
135
+ def each_lemma(form, pos)
136
+ if lemma = @exceptions[pos][form]
137
+ lemma.each { |x| yield x }
138
+ end
139
+
140
+ if pos == :noun && form.endwith('ful')
141
+ each_lemma(form[0, form.length-3], pos) do |x|
142
+ yield x + 'ful'
143
+ end
144
+ else
145
+
146
+ each_substitutions(form, pos) do|x|
147
+ yield x
148
+ end
149
+ end
150
+ end
151
+ end
152
+ end
@@ -1,3 +1,5 @@
1
- class Lemmatizer
2
- VERSION = "0.1.0"
1
+ # -*- coding: utf-8; mode: ruby -*-
2
+
3
+ module Lemmatizer
4
+ VERSION = '0.1.1'
3
5
  end
@@ -1,18 +1,14 @@
1
- #!/usr/bin/env ruby
2
1
  # -*- coding: utf-8 -*-
3
2
 
4
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
3
+ require 'spec_helper'
5
4
  require 'lemmatizer'
6
5
 
7
6
  describe "Lemmatizer" do
8
- it "contains lemmatizing functions:" do
9
- end
10
-
11
7
  before do
12
8
  @lemmatizer = Lemmatizer.new
13
9
  end
14
10
 
15
- describe "lemma" do
11
+ describe "#lemma" do
16
12
  it "takes a word form and its part-of-speech symbol (:noun, :verb, :adj, :adv) and then returns its lemma form" do
17
13
  result_n1 = @lemmatizer.lemma("analyses", :noun)
18
14
  result_n1.should == "analysis"
@@ -55,6 +51,9 @@ describe "Lemmatizer" do
55
51
  result_3 = @lemmatizer.lemma("higher")
56
52
  result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
57
53
 
54
+ result_2 = @lemmatizer.lemma("asdfassda") # non-existing word
55
+ result_2.should == "asdfassda"
56
+
58
57
  # test cases for words used in README
59
58
  result_t1 = @lemmatizer.lemma("fired")
60
59
  result_t1.should == "fire"
@@ -63,5 +62,4 @@ describe "Lemmatizer" do
63
62
  result_t2.should == "slow"
64
63
  end
65
64
  end
66
-
67
- end
65
+ end
@@ -1,6 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
1
  require 'rspec'
3
2
 
4
- RSpec.configure do |config|
5
- # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more information
6
- end
3
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
metadata CHANGED
@@ -1,29 +1,31 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
5
- prerelease:
4
+ version: 0.1.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Yoichiro Hasebe
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-10-29 00:00:00.000000000 Z
11
+ date: 2013-11-03 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rspec
16
- requirement: &70314483330880 !ruby/object:Gem::Requirement
17
- none: false
15
+ requirement: !ruby/object:Gem::Requirement
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
22
20
  type: :development
23
21
  prerelease: false
24
- version_requirements: *70314483330880
25
- description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
26
- package.
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: ! "\n Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
28
+ package.\n "
27
29
  email:
28
30
  - yohasebe@gmail.com
29
31
  executables: []
@@ -45,32 +47,34 @@ files:
45
47
  - lib/dict/noun.exc
46
48
  - lib/dict/verb.exc
47
49
  - lib/lemmatizer.rb
50
+ - lib/lemmatizer/core_ext.rb
51
+ - lib/lemmatizer/lemmatizer.rb
48
52
  - lib/lemmatizer/version.rb
49
53
  - spec/lemmatizer_spec.rb
50
54
  - spec/spec_helper.rb
51
55
  homepage: http://github.com/yohasebe/lemmatizer
52
- licenses: []
56
+ licenses:
57
+ - MIT
58
+ metadata: {}
53
59
  post_install_message:
54
60
  rdoc_options: []
55
61
  require_paths:
56
62
  - lib
57
63
  required_ruby_version: !ruby/object:Gem::Requirement
58
- none: false
59
64
  requirements:
60
65
  - - ! '>='
61
66
  - !ruby/object:Gem::Version
62
67
  version: '0'
63
68
  required_rubygems_version: !ruby/object:Gem::Requirement
64
- none: false
65
69
  requirements:
66
70
  - - ! '>='
67
71
  - !ruby/object:Gem::Version
68
72
  version: '0'
69
73
  requirements: []
70
74
  rubyforge_project:
71
- rubygems_version: 1.8.17
75
+ rubygems_version: 2.1.9
72
76
  signing_key:
73
- specification_version: 3
77
+ specification_version: 4
74
78
  summary: Englsh lemmatizer in Ruby
75
79
  test_files:
76
80
  - spec/lemmatizer_spec.rb