lemmatizer 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,7 +1,15 @@
1
1
  lemmatizer
2
2
  ==========
3
3
 
4
- Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package
4
+ Lemmatizer for text in English. Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
5
+
6
+ Based on code posted by mtbr in his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer).
7
+
8
+ Installation
9
+ ------------
10
+
11
+ sudo gem install lemmatizer
12
+
5
13
 
6
14
  Usage
7
15
  -----
@@ -14,7 +22,19 @@ Usage
14
22
  p lem.lemma("hired", :verb ) # => "hire"
15
23
  p lem.lemma("hotter", :adj ) # => "hot"
16
24
  p lem.lemma("better", :adv ) # => "well"
25
+
26
+ # When the part-of-speech symbol is not given as the second argument, lemmatizer tries :verb, :noun, :adj, and :adv one by one, in this order, and returns the first result that differs from the input word.
27
+ p lem.lemma("fired") # => "fire"
28
+ p lem.lemma("slow") # => "slow"
29
+
30
+ Limitations
31
+ -----------
17
32
 
18
33
  # Lemmatizer leaves alone words that its dictionary does not contain. This keeps proper names such as "James" intact.
19
34
  p lem.lemma("MacBooks", :noun) # => "MacBooks"
20
-
35
+
36
+ # If an inflected form of a word is itself listed as a lemma in the word list, lemmatizer may not give the expected result.
37
+ p lem.lemma("higher", :adj) # => "higher" not "high"!
38
+
39
+ # This happens because "higher" is itself an entry word listed in dict/index.adj.
40
+ # Modify dict/index.{noun|verb|adj|adv} if necessary.
@@ -1,32 +1,3 @@
1
- 1 This software and database is being provided to you, the LICENSEE, by
2
- 2 Princeton University under the following license. By obtaining, using
3
- 3 and/or copying this software and database, you agree that you have
4
- 4 read, understood, and will comply with these terms and conditions.:
5
- 5
6
- 6 Permission to use, copy, modify and distribute this software and
7
- 7 database and its documentation for any purpose and without fee or
8
- 8 royalty is hereby granted, provided that you agree to comply with
9
- 9 the following copyright notice and statements, including the disclaimer,
10
- 10 and that the same appear on ALL copies of the software, database and
11
- 11 documentation, including modifications that you make for internal
12
- 12 use or for distribution.
13
- 13
14
- 14 WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
15
- 15
16
- 16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
17
- 17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
18
- 18 IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
19
- 19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
20
- 20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
21
- 21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
22
- 22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
23
- 23 OTHER RIGHTS.
24
- 24
25
- 25 The name of Princeton University or Princeton may not be used in
26
- 26 advertising or publicity pertaining to distribution of the software
27
- 27 and/or database. Title to copyright in this software, database and
28
- 28 any associated documentation shall at all times remain with
29
- 29 Princeton University and LICENSEE agrees to preserve same.
30
1
  .22-caliber a 1 1 \ 1 0 03146310
31
2
  .22-calibre a 1 1 \ 1 0 03146310
32
3
  .22_caliber a 1 1 \ 1 0 03146310
@@ -1,32 +1,3 @@
1
- 1 This software and database is being provided to you, the LICENSEE, by
2
- 2 Princeton University under the following license. By obtaining, using
3
- 3 and/or copying this software and database, you agree that you have
4
- 4 read, understood, and will comply with these terms and conditions.:
5
- 5
6
- 6 Permission to use, copy, modify and distribute this software and
7
- 7 database and its documentation for any purpose and without fee or
8
- 8 royalty is hereby granted, provided that you agree to comply with
9
- 9 the following copyright notice and statements, including the disclaimer,
10
- 10 and that the same appear on ALL copies of the software, database and
11
- 11 documentation, including modifications that you make for internal
12
- 12 use or for distribution.
13
- 13
14
- 14 WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
15
- 15
16
- 16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
17
- 17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
18
- 18 IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
19
- 19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
20
- 20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
21
- 21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
22
- 22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
23
- 23 OTHER RIGHTS.
24
- 24
25
- 25 The name of Princeton University or Princeton may not be used in
26
- 26 advertising or publicity pertaining to distribution of the software
27
- 27 and/or database. Title to copyright in this software, database and
28
- 28 any associated documentation shall at all times remain with
29
- 29 Princeton University and LICENSEE agrees to preserve same.
30
1
  'tween r 1 0 1 0 00250898
31
2
  'tween_decks r 1 0 1 0 00498293
32
3
  a.d. r 1 0 1 0 00001837
@@ -1,4 +1,3 @@
1
- # lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset [synset_offset...]
2
1
  aah v 1 1 @ 1 0 00865776
3
2
  abacinate v 1 1 @ 1 0 02168378
4
3
  abandon v 5 4 @ ~ $ + 5 5 02228031 02227741 02076676 00613393 00614057
@@ -1,8 +1,9 @@
1
1
  #! /usr/bin/env ruby
2
2
  # -*- coding: utf-8; mode: ruby -*-
3
3
 
4
- # inspired by nltk.corpus.reader.wordnet.morphy
5
- # http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
4
+ # Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
5
+ # Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
6
+
6
7
 
7
8
  require "lemmatizer/version"
8
9
  require "stringio"
@@ -22,11 +23,10 @@ class Lemmatizer
22
23
 
23
24
  MORPHOLOGICAL_SUBSTITUTIONS = {
24
25
  :noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
25
- ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
26
- ['men', 'man'], ['ies', 'y']],
26
+ ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
27
+ ['men', 'man'], ['ies', 'y']],
27
28
  :verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
28
- ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
29
-
29
+ ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
30
30
  :adj => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
31
31
  :adv => []}
32
32
 
@@ -38,7 +38,7 @@ class Lemmatizer
38
38
  @exceptions[x] = {}
39
39
  end
40
40
  if files then
41
- files.each_pair do |pos,pair|
41
+ files.each_pair do |pos, pair|
42
42
  load_wordnet_files(pos, pair[0], pair[1])
43
43
  end
44
44
  end
@@ -64,19 +64,33 @@ class Lemmatizer
64
64
 
65
65
  open_file(exc) do |io|
66
66
  io.each_line do |line|
67
- w,s = line.split(/\s+/)
67
+ w, s = line.split(/\s+/)
68
68
  @exceptions[pos][w] ||= []
69
69
  @exceptions[pos][w] << s
70
70
  end
71
71
  end
72
72
  end
73
73
 
74
+ def _each_substitutions(form, pos)
75
+ if lemma = @wordlists[pos][form] then
76
+ yield lemma
77
+ end
78
+ MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
79
+ old, new = *entry
80
+ if form.endwith(old)
81
+ _each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
82
+ yield x
83
+ end
84
+ end
85
+ end
86
+ end
87
+
74
88
  def each_lemma(form, pos)
75
89
  if lemma = @exceptions[pos][form] then
76
90
  lemma.each{|x |yield x}
77
91
  end
78
92
  if pos == :noun and form.endwith('ful')
79
- each_lemma(form[0,form.length-3], pos) do |x|
93
+ each_lemma(form[0, form.length-3], pos) do |x|
80
94
  yield x+'ful'
81
95
  end
82
96
  else
@@ -86,23 +100,18 @@ class Lemmatizer
86
100
  end
87
101
  end
88
102
 
89
- def lemma(form,pos)
103
+ def lemma(form, pos = nil)
104
+ if !pos
105
+ [:verb, :noun, :adj, :adv].each do |p|
106
+ result = lemma(form, p)
107
+ return result unless result == form
108
+ end
109
+ return form
110
+ end
90
111
  each_lemma(form, pos) do |x|
91
112
  return x
92
113
  end
93
114
  return form
94
115
  end
95
- def _each_substitutions(form, pos)
96
- if lemma = @wordlists[pos][form] then
97
- yield lemma
98
- end
99
- MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
100
- old, new = *entry
101
- if form.endwith(old)
102
- _each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
103
- yield x
104
- end
105
- end
106
- end
107
- end
116
+
108
117
  end
@@ -1,3 +1,3 @@
1
1
  class Lemmatizer
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -44,6 +44,23 @@ describe "Lemmatizer" do
44
44
 
45
45
  result_r2 = @lemmatizer.lemma("best", :adv)
46
46
  result_r2.should_not == "good"
47
+
48
+ # Lemmatizer gives a result even when no pos is given, by trying :verb, :noun, :adj, and :adv in that order.
49
+ result_1 = @lemmatizer.lemma("plays")
50
+ result_1.should == "play"
51
+
52
+ result_2 = @lemmatizer.lemma("oxen")
53
+ result_2.should == "ox"
54
+
55
+ result_3 = @lemmatizer.lemma("higher")
56
+ result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
57
+
58
+ # test cases for words used in README
59
+ result_t1 = @lemmatizer.lemma("fired")
60
+ result_t1.should == "fire"
61
+
62
+ result_t2 = @lemmatizer.lemma("slower")
63
+ result_t2.should == "slow"
47
64
  end
48
65
  end
49
66
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-28 00:00:00.000000000 Z
12
+ date: 2012-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70234015642620 !ruby/object:Gem::Requirement
16
+ requirement: &70314483330880 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70234015642620
24
+ version_requirements: *70314483330880
25
25
  description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
26
26
  package.
27
27
  email: