lemmatizer 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,7 +1,15 @@
1
1
  lemmatizer
2
2
  ==========
3
3
 
4
- Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy package
4
+ Lemmatizer for text in English. Inspired by Python's [nltk.corpus.reader.wordnet.morphy](http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy) package.
5
+
6
+ Based on code posted by mtbr in his blog entry [WordNet-based lemmatizer](http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer).
7
+
8
+ Installation
9
+ ------------
10
+
11
+ sudo gem install lemmatizer
12
+
5
13
 
6
14
  Usage
7
15
  -----
@@ -14,7 +22,19 @@ Usage
14
22
  p lem.lemma("hired", :verb ) # => "hire"
15
23
  p lem.lemma("hotter", :adj ) # => "hot"
16
24
  p lem.lemma("better", :adv ) # => "well"
25
+
26
+ # When the part-of-speech symbol is not specified as the second argument, lemmatizer tries :verb, :noun, :adj, and :adv one by one, in this order.
27
+ p lem.lemma("fired") # => "fire"
28
+ p lem.lemma("slow") # => "slow"
29
+
30
+ Limitations
31
+ -----------
17
32
 
18
33
  # Lemmatizer leaves alone words that its dictionary does not contain. This keeps proper names such as "James" intact.
19
34
  p lem.lemma("MacBooks", :noun) # => "MacBooks"
20
-
35
+
36
+ # If an inflected form of a word is itself included as a lemma in the word list, lemmatizer may not give the expected result.
37
+ p lem.lemma("higher", :adj) # => "higher" not "high"!
38
+
39
+ # This happens because "higher" is itself an entry word listed in dict/index.adj.
40
+ # Modify dict/index.{noun|verb|adj|adv} if necessary.
@@ -1,32 +1,3 @@
1
- 1 This software and database is being provided to you, the LICENSEE, by
2
- 2 Princeton University under the following license. By obtaining, using
3
- 3 and/or copying this software and database, you agree that you have
4
- 4 read, understood, and will comply with these terms and conditions.:
5
- 5
6
- 6 Permission to use, copy, modify and distribute this software and
7
- 7 database and its documentation for any purpose and without fee or
8
- 8 royalty is hereby granted, provided that you agree to comply with
9
- 9 the following copyright notice and statements, including the disclaimer,
10
- 10 and that the same appear on ALL copies of the software, database and
11
- 11 documentation, including modifications that you make for internal
12
- 12 use or for distribution.
13
- 13
14
- 14 WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
15
- 15
16
- 16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
17
- 17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
18
- 18 IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
19
- 19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
20
- 20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
21
- 21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
22
- 22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
23
- 23 OTHER RIGHTS.
24
- 24
25
- 25 The name of Princeton University or Princeton may not be used in
26
- 26 advertising or publicity pertaining to distribution of the software
27
- 27 and/or database. Title to copyright in this software, database and
28
- 28 any associated documentation shall at all times remain with
29
- 29 Princeton University and LICENSEE agrees to preserve same.
30
1
  .22-caliber a 1 1 \ 1 0 03146310
31
2
  .22-calibre a 1 1 \ 1 0 03146310
32
3
  .22_caliber a 1 1 \ 1 0 03146310
@@ -1,32 +1,3 @@
1
- 1 This software and database is being provided to you, the LICENSEE, by
2
- 2 Princeton University under the following license. By obtaining, using
3
- 3 and/or copying this software and database, you agree that you have
4
- 4 read, understood, and will comply with these terms and conditions.:
5
- 5
6
- 6 Permission to use, copy, modify and distribute this software and
7
- 7 database and its documentation for any purpose and without fee or
8
- 8 royalty is hereby granted, provided that you agree to comply with
9
- 9 the following copyright notice and statements, including the disclaimer,
10
- 10 and that the same appear on ALL copies of the software, database and
11
- 11 documentation, including modifications that you make for internal
12
- 12 use or for distribution.
13
- 13
14
- 14 WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
15
- 15
16
- 16 THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
17
- 17 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
18
- 18 IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
19
- 19 UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
20
- 20 ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
21
- 21 OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
22
- 22 INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
23
- 23 OTHER RIGHTS.
24
- 24
25
- 25 The name of Princeton University or Princeton may not be used in
26
- 26 advertising or publicity pertaining to distribution of the software
27
- 27 and/or database. Title to copyright in this software, database and
28
- 28 any associated documentation shall at all times remain with
29
- 29 Princeton University and LICENSEE agrees to preserve same.
30
1
  'tween r 1 0 1 0 00250898
31
2
  'tween_decks r 1 0 1 0 00498293
32
3
  a.d. r 1 0 1 0 00001837
@@ -1,4 +1,3 @@
1
- # lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset [synset_offset...]
2
1
  aah v 1 1 @ 1 0 00865776
3
2
  abacinate v 1 1 @ 1 0 02168378
4
3
  abandon v 5 4 @ ~ $ + 5 5 02228031 02227741 02076676 00613393 00614057
@@ -1,8 +1,9 @@
1
1
  #! /usr/bin/env ruby
2
2
  # -*- coding: utf-8; mode: ruby -*-
3
3
 
4
- # inspired by nltk.corpus.reader.wordnet.morphy
5
- # http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
4
+ # Inspired by nltk.corpus.reader.wordnet.morphy http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.wordnet-pysrc.html#WordNetCorpusReader.morphy
5
+ # Original code posted by mtbr at http://d.hatena.ne.jp/mtbr/20090303/prfrnlprubyWordNetbasedlemmatizer
6
+
6
7
 
7
8
  require "lemmatizer/version"
8
9
  require "stringio"
@@ -22,11 +23,10 @@ class Lemmatizer
22
23
 
23
24
  MORPHOLOGICAL_SUBSTITUTIONS = {
24
25
  :noun => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
25
- ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
26
- ['men', 'man'], ['ies', 'y']],
26
+ ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
27
+ ['men', 'man'], ['ies', 'y']],
27
28
  :verb => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
28
- ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
29
-
29
+ ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
30
30
  :adj => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
31
31
  :adv => []}
32
32
 
@@ -38,7 +38,7 @@ class Lemmatizer
38
38
  @exceptions[x] = {}
39
39
  end
40
40
  if files then
41
- files.each_pair do |pos,pair|
41
+ files.each_pair do |pos, pair|
42
42
  load_wordnet_files(pos, pair[0], pair[1])
43
43
  end
44
44
  end
@@ -64,19 +64,33 @@ class Lemmatizer
64
64
 
65
65
  open_file(exc) do |io|
66
66
  io.each_line do |line|
67
- w,s = line.split(/\s+/)
67
+ w, s = line.split(/\s+/)
68
68
  @exceptions[pos][w] ||= []
69
69
  @exceptions[pos][w] << s
70
70
  end
71
71
  end
72
72
  end
73
73
 
74
+ def _each_substitutions(form, pos)
75
+ if lemma = @wordlists[pos][form] then
76
+ yield lemma
77
+ end
78
+ MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
79
+ old, new = *entry
80
+ if form.endwith(old)
81
+ _each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
82
+ yield x
83
+ end
84
+ end
85
+ end
86
+ end
87
+
74
88
  def each_lemma(form, pos)
75
89
  if lemma = @exceptions[pos][form] then
76
90
  lemma.each{|x |yield x}
77
91
  end
78
92
  if pos == :noun and form.endwith('ful')
79
- each_lemma(form[0,form.length-3], pos) do |x|
93
+ each_lemma(form[0, form.length-3], pos) do |x|
80
94
  yield x+'ful'
81
95
  end
82
96
  else
@@ -86,23 +100,18 @@ class Lemmatizer
86
100
  end
87
101
  end
88
102
 
89
- def lemma(form,pos)
103
+ def lemma(form, pos = nil)
104
+ if !pos
105
+ [:verb, :noun, :adj, :adv].each do |p|
106
+ result = lemma(form, p)
107
+ return result unless result == form
108
+ end
109
+ return form
110
+ end
90
111
  each_lemma(form, pos) do |x|
91
112
  return x
92
113
  end
93
114
  return form
94
115
  end
95
- def _each_substitutions(form, pos)
96
- if lemma = @wordlists[pos][form] then
97
- yield lemma
98
- end
99
- MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |entry|
100
- old, new = *entry
101
- if form.endwith(old)
102
- _each_substitutions(form[0, form.length - old.length] + new, pos) do|x|
103
- yield x
104
- end
105
- end
106
- end
107
- end
116
+
108
117
  end
@@ -1,3 +1,3 @@
1
1
  class Lemmatizer
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -44,6 +44,23 @@ describe "Lemmatizer" do
44
44
 
45
45
  result_r2 = @lemmatizer.lemma("best", :adv)
46
46
  result_r2.should_not == "good"
47
+
48
+ # Lemmatizer gives a result even when no pos is given, by trying :verb, :noun, :adj, and :adv in this order.
49
+ result_1 = @lemmatizer.lemma("plays")
50
+ result_1.should == "play"
51
+
52
+ result_2 = @lemmatizer.lemma("oxen")
53
+ result_2.should == "ox"
54
+
55
+ result_3 = @lemmatizer.lemma("higher")
56
+ result_3.should_not == "high" # since 'higher' is itself contained in the adj list.
57
+
58
+ # test cases for words used in README
59
+ result_t1 = @lemmatizer.lemma("fired")
60
+ result_t1.should == "fire"
61
+
62
+ result_t2 = @lemmatizer.lemma("slower")
63
+ result_t2.should == "slow"
47
64
  end
48
65
  end
49
66
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-28 00:00:00.000000000 Z
12
+ date: 2012-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70234015642620 !ruby/object:Gem::Requirement
16
+ requirement: &70314483330880 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70234015642620
24
+ version_requirements: *70314483330880
25
25
  description: Lemmatizer for text in English. Inspired by Python's nltk.corpus.reader.wordnet.morphy
26
26
  package.
27
27
  email: