lemmatizer 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9ffb812c39d3601bb86da6d6e44c8375cc8078bfb88089a8357c14c92473209a
4
- data.tar.gz: fe6a030e331e8138231dfaf48b136bf5dc7e4e8be40a8457664bdd8bf2b8f6b5
3
+ metadata.gz: 622ecac5f6d2ffa86a231a0bf2a1689bc5f6746d53a388c9efc4041c30f2e151
4
+ data.tar.gz: 7886e4ed919288df6216495f2f0b047e6555c6008c36dc1233c19925fb244501
5
5
  SHA512:
6
- metadata.gz: f2505986c5b0919c7691fbec86afc814df9567e23024f68e9c7f1daa0afd1485534fcd700aa131146d12f8d7dbfb95fbcbce798fc37e4408afd7976d077d750f
7
- data.tar.gz: 3e9e7f6289d906b2e4f984ccf285e24024de933b7aa9065f99559f057392299a69696ab82706624781f780d925eb6f1565f629462ec9a1f98fadfc3bea8810ba
6
+ metadata.gz: de93d602e0a85badd29aa47f9fa4e2970fc3f2b4feb24e596be1921820c57cc3a3d162127bfff817e5b06094e25612be01bbe4f5751bad8278cbd7d6b7bad38c
7
+ data.tar.gz: d80dd24c7cb03de971f5c1e8bb1cf3a3a19bb41c0a5dfdf0a079c4b205ba524a360296d28370d3d72328cc0ba91fc67da5862552d11baf80efdd1d263d8b3353
data/README.md CHANGED
@@ -51,14 +51,14 @@ Supplying with user dict
51
51
  # You can supply custom dict files consisting of lines in the format of <pos>\s+<form>\s+<lemma>.
52
52
  # The data in user supplied files overrides the preset data. Here's the sample.
53
53
 
54
- # --- sample.dict.txt (don't include hash symbol on the left) ---
54
+ # --- sample.dict1.txt (don't include hash symbol on the left) ---
55
55
  # adj higher high
56
56
  # adj highest high
57
57
  # noun MacBooks MacBook
58
58
  # ---------------------------------------------------------------
59
59
 
60
- lem = Lemmatizer.new("sample.dict.txt")
61
- # => 3 lexical items added from dict file provided
60
+ lem = Lemmatizer.new("sample.dict1.txt")
61
+ # => 3 items added from dict file provided
62
62
 
63
63
  p lem.lemma("higher", :adj) # => "high"
64
64
  p lem.lemma("highest", :adj) # => "high"
@@ -67,14 +67,38 @@ p lem.lemma("MacBooks", :noun) # => "MacBook"
67
67
  # The argument to Lemmatizer.new can be either of the following:
68
68
  # 1) a path string to a dict file (e.g. "/path/to/dict.txt")
69
69
  # 2) an array of paths to dict files (e.g. ["./dict/noun.txt", "./dict/verb.txt"])
70
+ ```
71
+
72
+ Resolving abbreviations
73
+ -----------
74
+ ```ruby
75
+ # You can use the 'abbr' tag in user dicts to resolve abbreviations in text.
76
+
77
+ # --- sample.dict2.txt (don't include hash symbol on the left) ---
78
+ # abbr utexas University of Texas
79
+ # abbr mit Massachusetts Institute of Technology
80
+ # ---------------------------------------------------------------
81
+
82
+ # <NOTE>
83
+ # 1. Expressions on the right (substitutes) can contain white spaces,
84
+ # while expressions in the middle (words to be replaced) cannot.
85
+ # 2. Double or single quotation marks can be used around substitute expressions,
86
+ # but not around original expressions (the words to be replaced).
87
+
88
+ lem = Lemmatizer.new("sample.dict2.txt")
89
+ # => 2 items added from dict file provided
70
90
 
91
+ p lem.lemma("utexas", :abbr) # => "University of Texas"
92
+ p lem.lemma("mit", :abbr) # => "Massachusetts Institute of Technology"
71
93
  ```
72
94
 
73
95
  Author
74
96
  ------
97
+
75
98
  * Yoichiro Hasebe <yohasebe@gmail.com>
76
99
 
77
100
  Thanks for assistance and contributions:
101
+
78
102
  * Vladimir Ivic <http://vladimirivic.com>
79
103
 
80
104
  License
@@ -1,3 +1,4 @@
1
+
1
2
  module Lemmatizer
2
3
  class Lemmatizer
3
4
  DATA_DIR = File.expand_path('..', File.dirname(__FILE__))
@@ -51,6 +52,8 @@ module Lemmatizer
51
52
  ],
52
53
  :adv => [
53
54
  ],
55
+ :abbr => [
56
+ ],
54
57
  :unknown => [
55
58
  ]
56
59
  }
@@ -77,7 +80,7 @@ module Lemmatizer
77
80
 
78
81
  def lemma(form, pos = nil)
79
82
  unless pos
80
- [:verb, :noun, :adj, :adv].each do |p|
83
+ [:verb, :noun, :adj, :adv, :abbr].each do |p|
81
84
  result = lemma(form, p)
82
85
  return result unless result == form
83
86
  end
@@ -168,6 +171,8 @@ module Lemmatizer
168
171
  return :adj
169
172
  when "r", "adverb", "adv"
170
173
  return :adv
174
+ when "b", "abbrev", "abbr", "abr"
175
+ return :abbr
171
176
  else
172
177
  return :unknown
173
178
  end
@@ -178,15 +183,24 @@ module Lemmatizer
178
183
  open_file(dict) do |io|
179
184
  io.each_line do |line|
180
185
  # pos must be a tag such as n|v|r|a|b or noun|verb|adverb|adjective|abbr
181
- p, w, s = line.split(/\s+/)
186
+ p, w, s = line.split(/\s+/, 3)
182
187
  pos = str_to_pos(p)
188
+ word = w
189
+ substitute = s.strip
190
+ if /\A\"(.*)\"\z/ =~ substitute
191
+ substitute = $1
192
+ end
193
+ if /\A\'(.*)\'\z/ =~ substitute
194
+ substitute = $1
195
+ end
196
+ next unless (pos && word && substitute)
183
197
  if @wordlists[pos]
184
- @wordlists[pos][w] = s
198
+ @wordlists[pos][word] = substitute
185
199
  num_lex_added += 1
186
200
  end
187
201
  end
188
202
  end
189
- puts "#{num_lex_added} items added from #{File.basename dict}"
203
+ # puts "#{num_lex_added} items added from #{File.basename dict}"
190
204
  end
191
205
  end
192
206
  end
@@ -1,3 +1,3 @@
1
1
  module Lemmatizer
2
- VERSION = '0.2.1'
2
+ VERSION = '0.2.2'
3
3
  end
@@ -7,8 +7,9 @@ describe 'Lemmatizer' do
7
7
  @lemmatizer = Lemmatizer.new
8
8
  user_data1 = File.join(File.dirname(__FILE__), "user.dict1.txt")
9
9
  user_data2 = File.join(File.dirname(__FILE__), "user.dict2.txt")
10
+ user_data3 = File.join(File.dirname(__FILE__), "user.dict3.txt")
10
11
  @lemmatizer_single_userdict = Lemmatizer.new(user_data1)
11
- @lemmatizer_multiple_userdicts = Lemmatizer.new([user_data1, user_data2])
12
+ @lemmatizer_multiple_userdicts = Lemmatizer.new([user_data2, user_data3])
12
13
  end
13
14
 
14
15
  describe '#lemma' do
@@ -48,7 +49,7 @@ describe 'Lemmatizer' do
48
49
  end
49
50
 
50
51
  it 'gives a result when no pos is given' do
51
- # Order: :verb, :noun, :adv, or :adj
52
+ # Order: :verb, :noun, :adj, :adv, or :abbr
52
53
  result_1 = @lemmatizer.lemma('plays')
53
54
  expect(result_1).to eq('play')
54
55
 
@@ -81,23 +82,36 @@ describe 'Lemmatizer' do
81
82
  # 'MacBooks' -> 'MacBook'
82
83
  result_u1 = @lemmatizer_single_userdict.lemma('MacBooks', :noun)
83
84
  expect(result_u1).to eq('MacBook')
84
- result_u2 = @lemmatizer_single_userdict.lemma('crying', :verb)
85
- expect(result_u2).to eq('cry')
85
+ # 'iPhones' -> 'iPhone'
86
+ result_u2 = @lemmatizer_single_userdict.lemma('iPhones', :noun)
87
+ expect(result_u2).to eq('iPhone')
86
88
  end
87
89
 
88
90
  it 'can load uder dicts that override presets' do
89
- # 'MacBooks' -> 'MacBook'
90
- result_ud1 = @lemmatizer_multiple_userdicts.lemma('MacBooks', :noun)
91
- expect(result_ud1).to eq('MacBook')
92
91
  # 'higher' -> 'high'
93
- result_ud2 = @lemmatizer_multiple_userdicts.lemma('higher', :adj)
94
- expect(result_ud2).to eq('high')
95
- # 'highest' -> 'high'
96
92
  result_ud3 = @lemmatizer_multiple_userdicts.lemma('higher')
97
93
  expect(result_ud3).to eq('high')
98
94
  # check if (unoverridden) preset data is kept intact
99
95
  result_ud4 = @lemmatizer_multiple_userdicts.lemma('crying', :verb)
100
96
  expect(result_ud4).to eq('cry')
97
+ # 'I'm' -> 'I am'
98
+ result_ud5 = @lemmatizer_multiple_userdicts.lemma("I'm", :abbr)
99
+ expect(result_ud5).to eq('I am')
100
+ # 'You're' -> 'you are'
101
+ result_ud6 = @lemmatizer_multiple_userdicts.lemma("You're", :abbr)
102
+ expect(result_ud6).to eq("you are")
103
+ # 'you're' -> 'you are'
104
+ result_ud7 = @lemmatizer_multiple_userdicts.lemma("you're", :abbr)
105
+ expect(result_ud7).to eq("you are")
106
+ # 'h2s' -> 'Hydrogen Sulphide'
107
+ result_ud8 = @lemmatizer_multiple_userdicts.lemma("h2s", :abbr)
108
+ expect(result_ud8).to eq("Hydrogen Sulphide")
109
+ # 'utexas' -> 'University of Texas'
110
+ result_ud9 = @lemmatizer_multiple_userdicts.lemma("utexas", :abbr)
111
+ expect(result_ud9).to eq("University of Texas")
112
+ # 'mit' -> 'Massachusetts Institute of Technology'
113
+ result_ud10 = @lemmatizer_multiple_userdicts.lemma("mit", :abbr)
114
+ expect(result_ud10).to eq("Massachusetts Institute of Technology")
101
115
  end
102
116
  end
103
117
  end
@@ -0,0 +1,6 @@
1
+ abbr I'm "I am"
2
+ abbr you're "you are"
3
+ abbr You're "you are"
4
+ abbr h2s "Hydrogen Sulphide"
5
+ abbr utexas University of Texas
6
+ abbr mit Massachusetts Institute of Technology
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lemmatizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-16 00:00:00.000000000 Z
11
+ date: 2019-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -55,6 +55,7 @@ files:
55
55
  - spec/spec_helper.rb
56
56
  - spec/user.dict1.txt
57
57
  - spec/user.dict2.txt
58
+ - spec/user.dict3.txt
58
59
  homepage: http://github.com/yohasebe/lemmatizer
59
60
  licenses:
60
61
  - MIT
@@ -83,3 +84,4 @@ test_files:
83
84
  - spec/spec_helper.rb
84
85
  - spec/user.dict1.txt
85
86
  - spec/user.dict2.txt
87
+ - spec/user.dict3.txt