lemmatizer 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9ffb812c39d3601bb86da6d6e44c8375cc8078bfb88089a8357c14c92473209a
4
- data.tar.gz: fe6a030e331e8138231dfaf48b136bf5dc7e4e8be40a8457664bdd8bf2b8f6b5
3
+ metadata.gz: 622ecac5f6d2ffa86a231a0bf2a1689bc5f6746d53a388c9efc4041c30f2e151
4
+ data.tar.gz: 7886e4ed919288df6216495f2f0b047e6555c6008c36dc1233c19925fb244501
5
5
  SHA512:
6
- metadata.gz: f2505986c5b0919c7691fbec86afc814df9567e23024f68e9c7f1daa0afd1485534fcd700aa131146d12f8d7dbfb95fbcbce798fc37e4408afd7976d077d750f
7
- data.tar.gz: 3e9e7f6289d906b2e4f984ccf285e24024de933b7aa9065f99559f057392299a69696ab82706624781f780d925eb6f1565f629462ec9a1f98fadfc3bea8810ba
6
+ metadata.gz: de93d602e0a85badd29aa47f9fa4e2970fc3f2b4feb24e596be1921820c57cc3a3d162127bfff817e5b06094e25612be01bbe4f5751bad8278cbd7d6b7bad38c
7
+ data.tar.gz: d80dd24c7cb03de971f5c1e8bb1cf3a3a19bb41c0a5dfdf0a079c4b205ba524a360296d28370d3d72328cc0ba91fc67da5862552d11baf80efdd1d263d8b3353
data/README.md CHANGED
@@ -51,14 +51,14 @@ Supplying with user dict
51
51
  # You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
52
52
  # The data in user supplied files overrides the preset data. Here's the sample.
53
53
 
54
- # --- sample.dict.txt (don't include hash symbol on the left) ---
54
+ # --- sample.dict1.txt (don't include hash symbol on the left) ---
55
55
  # adj higher high
56
56
  # adj highest high
57
57
  # noun MacBooks MacBook
58
58
  # ---------------------------------------------------------------
59
59
 
60
- lem = Lemmatizer.new("sample.dict.txt")
61
- # => 3 lexical items added from dict file provided
60
+ lem = Lemmatizer.new("sample.dict1.txt")
61
+ # => 3 items added from dict file provided
62
62
 
63
63
  p lem.lemma("higher", :adj) # => "high"
64
64
  p lem.lemma("highest", :adj) # => "high"
@@ -67,14 +67,38 @@ p lem.lemma("MacBooks", :noun) # => "MacBook"
67
67
  # The argument to Lemmatizer.new can be either of the following:
68
68
  # 1) a path string to a dict file (e.g. "/path/to/dict.txt")
69
69
  # 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
70
+ ```
71
+
72
+ Resolving abbreviations
73
+ -----------
74
+ ```ruby
75
+ # You can use the 'abbr' tag in user dicts to resolve abbreviations in text.
76
+
77
+ # --- sample.dict2.txt (don't include hash symbol on the left) ---
78
+ # abbr utexas University of Texas
79
+ # abbr mit Massachusetts Institute of Technology
80
+ # ---------------------------------------------------------------
81
+
82
+ # <NOTE>
83
+ # 1. Expressions on the right (substitutes) can contain whitespace,
84
+ # while expressions in the middle (words to be replaced) cannot.
85
+ # 2. Double or single quotation marks can be used around substitute expressions,
86
+ # but not with original expressions.
87
+
88
+ lem = Lemmatizer.new("sample.dict2.txt")
89
+ # => 2 items added from dict file provided
70
90
 
91
+ p lem.lemma("utexas", :abbr) # => "University of Texas"
92
+ p lem.lemma("mit", :abbr) # => "Massachusetts Institute of Technology"
71
93
  ```
72
94
 
73
95
  Author
74
96
  ------
97
+
75
98
  * Yoichiro Hasebe <yohasebe@gmail.com>
76
99
 
77
100
  Thanks for assistance and contributions:
101
+
78
102
  * Vladimir Ivic <http://vladimirivic.com>
79
103
 
80
104
  License
@@ -1,3 +1,4 @@
1
+
1
2
  module Lemmatizer
2
3
  class Lemmatizer
3
4
  DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
@@ -51,6 +52,8 @@ module Lemmatizer
51
52
  ],
52
53
  :adv => [
53
54
  ],
55
+ :abbr => [
56
+ ],
54
57
  :unknown => [
55
58
  ]
56
59
  }
@@ -77,7 +80,7 @@ module Lemmatizer
77
80
 
78
81
  def lemma(form, pos = nil)
79
82
  unless pos
80
- [:verb, :noun, :adj, :adv].each do |p|
83
+ [:verb, :noun, :adj, :adv, :abbr].each do |p|
81
84
  result = lemma(form, p)
82
85
  return result unless result == form
83
86
  end
@@ -168,6 +171,8 @@ module Lemmatizer
168
171
  return :adj
169
172
  when "r", "adverb", "adv"
170
173
  return :adv
174
+ when "b", "abbrev", "abbr", "abr"
175
+ return :abbr
171
176
  else
172
177
  return :unknown
173
178
  end
@@ -178,15 +183,24 @@ module Lemmatizer
178
183
  open_file(dict) do |io|
179
184
  io.each_line do |line|
180
185
  # pos must be one of n|v|r|a|b or noun|verb|adverb|adjective|abbr (see str_to_pos)
181
- p, w, s = line.split(/\s+/)
186
+ p, w, s = line.split(/\s+/, 3)
182
187
  pos = str_to_pos(p)
188
+ word = w
189
+ substitute = s.strip
190
+ if /\A\"(.*)\"\z/ =~ substitute
191
+ substitute = $1
192
+ end
193
+ if /\A\'(.*)\'\z/ =~ substitute
194
+ substitute = $1
195
+ end
196
+ next unless (pos && word && substitute)
183
197
  if @wordlists[pos]
184
- @wordlists[pos][w] = s
198
+ @wordlists[pos][word] = substitute
185
199
  num_lex_added += 1
186
200
  end
187
201
  end
188
202
  end
189
- puts "#{num_lex_added} items added from #{File.basename dict}"
203
+ # puts "#{num_lex_added} items added from #{File.basename dict}"
190
204
  end
191
205
  end
192
206
  end
@@ -1,3 +1,3 @@
1
1
  module Lemmatizer
2
- VERSION = '0.2.1'
2
+ VERSION = '0.2.2'
3
3
  end
@@ -7,8 +7,9 @@ describe 'Lemmatizer' do
7
7
  @lemmatizer = Lemmatizer.new
8
8
  user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
9
9
  user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
10
+ user_data3 = File.join(File.dirname(__FILE__), "user.dict3.txt")
10
11
  @lemmatizer_single_userdict = Lemmatizer.new(user_data1)
11
- @lemmatizer_multiple_userdicts = Lemmatizer.new([user_data1, user_data2])
12
+ @lemmatizer_multiple_userdicts = Lemmatizer.new([user_data2, user_data3])
12
13
  end
13
14
 
14
15
  describe '#lemma' do
@@ -48,7 +49,7 @@ describe 'Lemmatizer' do
48
49
  end
49
50
 
50
51
  it 'gives a result when no pos is given' do
51
- # Order: :verb, :noun, :adv, or :adj
52
+ # Order: :verb, :noun, :adj, :adv, or :abbr
52
53
  result_1 = @lemmatizer.lemma('plays')
53
54
  expect(result_1).to eq('play')
54
55
 
@@ -81,23 +82,36 @@ describe 'Lemmatizer' do
81
82
  # 'MacBooks' -> 'MacBook'
82
83
  result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
83
84
  expect(result_u1).to eq('MacBook')
84
- result_u2 = @lemmatizer_single_userdict.lemma('crying', :verb)
85
- expect(result_u2).to eq('cry')
85
+ # 'iPhones' -> 'iPhone'
86
+ result_u2 = @lemmatizer_single_userdict.lemma('iPhones', :noun)
87
+ expect(result_u2).to eq('iPhone')
86
88
  end
87
89
 
88
90
  it 'can load uder dicts that override presets' do
89
- # 'MacBooks' -> 'MacBook'
90
- result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
91
- expect(result_ud1).to eq('MacBook')
92
91
  # 'higher' -> 'high'
93
- result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
94
- expect(result_ud2).to eq('high')
95
- # 'highest' -> 'high'
96
92
  result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
97
93
  expect(result_ud3).to eq('high')
98
94
  # check if (unoverridden) preset data is kept intact
99
95
  result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
100
96
  expect(result_ud4).to eq('cry')
97
+ # 'I'm' -> 'I am'
98
+ result_ud5 = @lemmatizer_multiple_userdicts.lemma("I'm", :abbr)
99
+ expect(result_ud5).to eq('I am')
100
+ # 'You're' -> 'you are'
101
+ result_ud6 = @lemmatizer_multiple_userdicts.lemma("You're", :abbr)
102
+ expect(result_ud6).to eq("you are")
103
+ # 'you're' -> 'you are'
104
+ result_ud7 = @lemmatizer_multiple_userdicts.lemma("you're", :abbr)
105
+ expect(result_ud7).to eq("you are")
106
+ # 'h2s' -> 'Hydrogen Sulphide'
107
+ result_ud8 = @lemmatizer_multiple_userdicts.lemma("h2s", :abbr)
108
+ expect(result_ud8).to eq("Hydrogen Sulphide")
109
+ # 'utexas' -> 'University of Texas'
110
+ result_ud9 = @lemmatizer_multiple_userdicts.lemma("utexas", :abbr)
111
+ expect(result_ud9).to eq("University of Texas")
112
+ # 'mit' -> 'Massachusetts Institute of Technology'
113
+ result_ud10 = @lemmatizer_multiple_userdicts.lemma("mit", :abbr)
114
+ expect(result_ud10).to eq("Massachusetts Institute of Technology")
101
115
  end
102
116
  end
103
117
  end
@@ -0,0 +1,6 @@
1
+ abbr I'm "I am"
2
+ abbr you're "you are"
3
+ abbr You're "you are"
4
+ abbr h2s "Hydrogen Sulphide"
5
+ abbr utexas University of Texas
6
+ abbr mit Massachusetts Institute of Technology
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-16 00:00:00.000000000 Z
11
+ date: 2019-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -55,6 +55,7 @@ files:
55
55
  - spec/spec_helper.rb
56
56
  - spec/user.dict1.txt
57
57
  - spec/user.dict2.txt
58
+ - spec/user.dict3.txt
58
59
  homepage: http://github.com/yohasebe/lemmatizer
59
60
  licenses:
60
61
  - MIT
@@ -83,3 +84,4 @@ test_files:
83
84
  - spec/spec_helper.rb
84
85
  - spec/user.dict1.txt
85
86
  - spec/user.dict2.txt
87
+ - spec/user.dict3.txt