nekoneko_gen 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -153,7 +153,6 @@ CODE
153
153
  <<CODE
154
154
  def self.classify(vec)
155
155
  input_y = []
156
- output_y = []
157
156
  HIDDEN_UNITS.times do |i|
158
157
  input_y[i] = sigmoid(INPUT_BIAS[i] +
159
158
  INPUT_W[i].values_at(*vec).compact.reduce(0.0, :+))
@@ -18,7 +18,6 @@ module NekonekoGen
18
18
  @name = safe_name(@filename).split("_").map(&:capitalize).join
19
19
  @labels = files.map {|file| "#{safe_name(file).upcase}"}
20
20
  end
21
-
22
21
  def train(iteration = nil)
23
22
  iteration ||= @classifier.default_iteration
24
23
  data = []
@@ -31,7 +30,7 @@ module NekonekoGen
31
30
  File.open(@files[i]) do |f|
32
31
  content = f.read
33
32
  end
34
- content = NKF.nkf('-w', content)
33
+ content = NKF.nkf('-wZX', content).downcase
35
34
  content.lines do |line|
36
35
  vec = fv(line.chomp)
37
36
  if (vec.size > 0)
@@ -40,6 +39,7 @@ module NekonekoGen
40
39
  end
41
40
  puts sprintf("%.4fs", Time.now - t)
42
41
  end
42
+
43
43
  samples = data.map{|v| v.size}.min
44
44
  iteration.times do |step|
45
45
  loss = 0.0
@@ -81,6 +81,7 @@ module NekonekoGen
81
81
  # -*- coding: utf-8 -*-
82
82
  require 'rubygems'
83
83
  require 'json'
84
+ require 'nkf'
84
85
  require 'bimyou_segmenter'
85
86
 
86
87
  class #{@name}
@@ -88,17 +89,17 @@ class #{@name}
88
89
  K
89
90
  end
90
91
  def self.predict(text)
91
- classify(fv(text))
92
+ classify(fv(NKF::nkf('-wZX', text).downcase))
92
93
  end
93
-
94
94
  #{labels}
95
95
  LABELS = #{@labels.inspect}
96
96
  K = #{@classifier.k}
97
-
98
97
  private
99
98
  def self.fv(text)
100
99
  prev = nil
101
- BimyouSegmenter.segment(text).map do |word|
100
+ BimyouSegmenter.segment(text,
101
+ :white_space => true,
102
+ :symbol => true).map do |word|
102
103
  if (prev)
103
104
  if (NGRAM_TARGET =~ word)
104
105
  nword = [prev + word, word]
@@ -114,10 +115,9 @@ class #{@name}
114
115
  end
115
116
  word
116
117
  end
117
- end.flatten(1)
118
+ end.flatten
118
119
  end
119
120
  #{@classifier.classify_method_code(:ruby)}
120
-
121
121
  NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\\-_a-zA-Z‐_0-90-9]+$)|' +
122
122
  '(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
123
123
  [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
@@ -142,10 +142,24 @@ MODEL
142
142
  word_id
143
143
  end
144
144
  end
145
+ def lemmatize(word)
146
+ if (word =~ /^[a-z\-_]+$/)
147
+ LEMMATIZATION_RULE.each do |reg|
148
+ if (word.match(reg[0]))
149
+ return reg[1].map{|b| word.gsub(reg[0], b)}
150
+ end
151
+ end
152
+ word
153
+ else
154
+ word
155
+ end
156
+ end
145
157
  def fv(text)
146
158
  vec = Hash.new(0)
147
159
  prev = nil
148
- words = BimyouSegmenter.segment(text, :white_space => true).map do |word|
160
+ BimyouSegmenter.segment(text,
161
+ :white_space => true,
162
+ :symbol => true).map do |word|
149
163
  if (prev)
150
164
  if (NGRAM_TARGET =~ word)
151
165
  nword = [prev + word, word]
@@ -161,8 +175,8 @@ MODEL
161
175
  end
162
176
  word
163
177
  end
164
- end.flatten(1).reject do |word|
165
- STOP_WORDS[word]
178
+ end.flatten.reject do |word|
179
+ word.empty? || STOP_WORDS[word] || word.match(WHITE_SPACE) || word.match(SYMBOL)
166
180
  end.each do |word|
167
181
  vec[word2id(word)] += 1
168
182
  end
@@ -208,7 +222,7 @@ MODEL
208
222
  end
209
223
  end
210
224
  def safe_name(filename)
211
- File.basename(filename, ".*").gsub('-','_').gsub(/[^a-zA-Z_0-9]/, '')
225
+ File.basename(filename, ".*").gsub(/[\-\.]/,'_').gsub(/[^a-zA-Z_0-9]/, '')
212
226
  end
213
227
  def puts(s)
214
228
  unless (@quiet)
@@ -220,10 +234,19 @@ MODEL
220
234
  Kernel.print s
221
235
  end
222
236
  end
237
+ SYMBOL = Regexp.new('^[^々〇' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
238
+ [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
239
+ [0x20000].pack('U') + '-' + [0x2FFFF].pack('U') +
240
+ '\s ぁ-ゞァ-ヾa-zA-Za-zA-Z0-90-9]+$')
241
+ WHITE_SPACE = /[\s ]/
223
242
  NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\-_a-zA-Z‐_0-90-9]+$)|' +
224
243
  '(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
225
244
  [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
226
245
  [0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
227
- STOP_WORDS = {"の"=>1, ""=>1, "て"=>1, ""=>1, "た"=>1, ""=>1, "で"=>1, ""=>1, "と"=>1, ""=>1, "も"=>1, "ない"=>1, "だ"=>1, ""=>1, "です"=>1, "から"=>1, "ます"=>1, ""=>1, "けど"=>1, "って"=>1, ""=>1, "よ"=>1, "まし"=>1, "たら"=>1, ""=>1, ""=>1, "なら"=>1, "でしょ"=>1, "とか"=>1, "じゃ"=>1, "まで"=>1, "ので"=>1, "ませ"=>1, "だけ"=>1, "へ"=>1, "なく"=>1, "という"=>1, "や"=>1, "でも"=>1, "ござい"=>1, "し"=>1, "たい"=>1, "だろ"=>1, "なかっ"=>1, "ある"=>1, "ず"=>1, "たり"=>1, "だっ"=>1, "しか"=>1, "くらい"=>1, "かも"=>1, "ながら"=>1, "でし"=>1, "また"=>1, "より"=>1, "のに"=>1, "わ"=>1, "など"=>1, "として"=>1, "ぬ"=>1, "あっ"=>1, "らしい"=>1, "ばかり"=>1, "ほど"=>1, "ぞ"=>1, "しかし"=>1, "なけれ"=>1, "ただ"=>1, "つ"=>1, "けれども"=>1, "んで"=>1, "ぐらい"=>1, "なんて"=>1, "について"=>1, "そうして"=>1, "ましょ"=>1, "さえ"=>1, "のみ"=>1, "たく"=>1, "あり"=>1, "る"=>1, "なんか"=>1, "べき"=>1, "だって"=>1, "それとも"=>1, "ちゃ"=>1, "なぁ"=>1, "それから"=>1, "さ"=>1, "ぜ"=>1, "によって"=>1, "ねえ"=>1, "っけ"=>1, "やら"=>1, "だから"=>1, "とも"=>1, "いや"=>1, "なり"=>1, "それでも"=>1, "なあ"=>1, "まい"=>1, "つつ"=>1, "そして"=>1, "それで"=>1, "かい"=>1, "すると"=>1, "しかも"=>1, "あろ"=>1, "らしく"=>1, "ずつ"=>1, "り"=>1, "たる"=>1, "又"=>1, "ねぇ"=>1, "に対して"=>1, "け"=>1, "こそ"=>1, "もしくは"=>1, "なきゃ"=>1, "だら"=>1, "そこで"=>1, "すら"=>1, "実は"=>1, "ところが"=>1, "なる"=>1, "による"=>1, "御座い"=>1, "じゃん"=>1, "つまり"=>1, "けれど"=>1, "ただし"=>1, "だの"=>1, "たかっ"=>1, "ざる"=>1, "ごとく"=>1, "に対する"=>1, "とかいう"=>1, "かしら"=>1, "なくっ"=>1, "そりゃ"=>1, "または"=>1, "べ"=>1, "にて"=>1, "において"=>1, "たろ"=>1, "無い"=>1, "あれ"=>1, "なぞ"=>1, "っと"=>1, "き"=>1, "にとって"=>1, "たって"=>1, "じ"=>1, "あるいは"=>1, "ど"=>1, "っす"=>1, "だり"=>1, "又は"=>1, "ばっかり"=>1, "てか"=>1, "けども"=>1, "と共に"=>1, "れ"=>1, "なかろ"=>1, "なお"=>1, "ものの"=>1, "に関する"=>1, "ばっか"=>1, "こうして"=>1, "程"=>1, "べし"=>1, "たとえば"=>1, "ども"=>1, "一方"=>1, "それでは"=>1, "かつ"=>1, "やし"=>1, "だけど"=>1, "なんぞ"=>1, "べく"=>1, "迄"=>1, "如く"=>1, "ってか"=>1, "すなわち"=>1, "さて"=>1, "どころか"=>1, "では"=>1, "を以て"=>1, "かぁ"=>1, "のう"=>1, "らしかっ"=>1, "そしたら"=>1, "にゃ"=>1, "まじ"=>1, "るる"=>1, "らし"=>1, "やん"=>1, "たけれ"=>1, "らしき"=>1, "しも"=>1, "べから"=>1, "或いは"=>1, "及び"=>1, "だが"=>1, "ごとき"=>1, "なし"=>1, "如き"=>1, "ねん"=>1, "但し"=>1, "ござる"=>1, "いえ"=>1, "故に"=>1, "即ち"=>1, "やっ"=>1, "なき"=>1, "無かっ"=>1, "なけりゃ"=>1, "即"=>1, "よって"=>1, "或は"=>1, "および"=>1, "尚"=>1, "否"=>1, "じゃろ"=>1, "っしょ"=>1, "尤も"=>1, "だに"=>1, "やす"=>1, "ござん"=>1, "ついで"=>1, "へん"=>1, "じゃっ"=>1, "わい"=>1, "次に"=>1, "之"=>1, "ける"=>1, "然し"=>1, "もっとも"=>1, "そうしたら"=>1, "無く"=>1, "やろ"=>1, "亦"=>1, "っし"=>1, "に対し"=>1, "乃至"=>1, "なれ"=>1, "御座る"=>1, "御座ん"=>1, "とう"=>1, "てえ"=>1, "但"=>1, "どし"=>1, "ざり"=>1, "といふ"=>1, "たれ"=>1, "したら"=>1, "もん"=>1, "やせ"=>1, "たくっ"=>1, "若しくは"=>1, "ずん"=>1, "あら"=>1, "ざれ"=>1, "無かろ"=>1, "無けれ"=>1, "ごとし"=>1, "たきゃ"=>1, "どす"=>1, "けり"=>1, "まじき"=>1, "ますれ"=>1, "たき"=>1, "てん"=>1, "たけりゃ"=>1, "無き"=>1, "無"=>1, "如し"=>1, "あん"=>1, "御座っ"=>1, "ありゃ"=>1, "かな"=>1, "ばかし"=>1}
246
+ LEMMATIZATION_RULE = [[/shes$/, ["sh"]], [/ches$/, ["ch"]], [/zed$/, ["zed"]], [/zes$/, ["z"]], [/ses$/, ["s"]], [/ing$/, [""]], [/ves$/, ["f"]], [/xes$/, ["x"]], [/ies$/, ["y"]], [/est$/, ["e", ""]], [/men$/, ["man"]], [/es$/, ["e", ""]], [/ed$/, ["e", ""]], [/er$/, ["e", ""]], [/s$/, [""]]]
247
+ STOP_WORDS = {"の"=>1, "に"=>1, "て"=>1, "が"=>1, "た"=>1, "は"=>1, "で"=>1, "を"=>1, "と"=>1, "か"=>1, "も"=>1, "ない"=>1, "だ"=>1, "な"=>1, "です"=>1, "から"=>1, "ます"=>1, "う"=>1, "けど"=>1, "って"=>1, "ば"=>1, "よ"=>1, "まし"=>1, "たら"=>1, "ね"=>1, "ん"=>1, "なら"=>1, "でしょ"=>1, "とか"=>1, "じゃ"=>1, "まで"=>1, "ので"=>1, "ませ"=>1, "だけ"=>1, "へ"=>1, "なく"=>1, "という"=>1, "や"=>1, "でも"=>1, "ござい"=>1, "し"=>1, "たい"=>1, "だろ"=>1, "なかっ"=>1, "ある"=>1, "ず"=>1, "たり"=>1, "だっ"=>1, "しか"=>1, "くらい"=>1, "かも"=>1, "ながら"=>1, "でし"=>1, "また"=>1, "より"=>1, "のに"=>1, "わ"=>1, "など"=>1, "として"=>1, "ぬ"=>1, "あっ"=>1, "らしい"=>1, "ばかり"=>1, "ほど"=>1, "ぞ"=>1, "しかし"=>1, "なけれ"=>1, "ただ"=>1, "つ"=>1, "けれども"=>1, "んで"=>1, "ぐらい"=>1, "なんて"=>1, "について"=>1, "そうして"=>1, "ましょ"=>1, "さえ"=>1, "のみ"=>1, "たく"=>1, "あり"=>1, "る"=>1, "なんか"=>1, "べき"=>1, "だって"=>1, "それとも"=>1, "ちゃ"=>1, "なぁ"=>1, "それから"=>1, "さ"=>1, "ぜ"=>1, "によって"=>1, "ねえ"=>1, "っけ"=>1, "やら"=>1, "だから"=>1, "とも"=>1, "いや"=>1, "なり"=>1, "それでも"=>1, "なあ"=>1, "まい"=>1, "つつ"=>1, "そして"=>1, "それで"=>1, "かい"=>1, "すると"=>1, "しかも"=>1, "あろ"=>1, "らしく"=>1, "ずつ"=>1, "り"=>1, "たる"=>1, "又"=>1, "ねぇ"=>1, "に対して"=>1, "け"=>1, "こそ"=>1, "もしくは"=>1, "なきゃ"=>1, "だら"=>1, "そこで"=>1, "すら"=>1, "実は"=>1, "ところが"=>1, "なる"=>1, "による"=>1, "御座い"=>1, "じゃん"=>1, "つまり"=>1, "けれど"=>1, "ただし"=>1, "だの"=>1, "たかっ"=>1, "ざる"=>1, "ごとく"=>1, "に対する"=>1, "とかいう"=>1, "かしら"=>1, "なくっ"=>1, "そりゃ"=>1, "または"=>1, "べ"=>1, "にて"=>1, "において"=>1, "たろ"=>1, "無い"=>1, "あれ"=>1, "なぞ"=>1, "っと"=>1, "き"=>1, "にとって"=>1, "たって"=>1, "じ"=>1, "あるいは"=>1, "ど"=>1, "っす"=>1, "だり"=>1, "又は"=>1, "ばっかり"=>1, "てか"=>1, "けども"=>1, "と共に"=>1, "れ"=>1, "なかろ"=>1, "なお"=>1, "ものの"=>1, "に関する"=>1, "ばっか"=>1, "こうして"=>1, "程"=>1, "べし"=>1, "たとえば"=>1, "ども"=>1, "一方"=>1, "それでは"=>1, "かつ"=>1, "やし"=>1, "だけど"=>1, "なんぞ"=>1, "べく"=>1, "迄"=>1, "如く"=>1, "ってか"=>1, "すなわち"=>1, "さて"=>1, "どころか"=>1, "では"=>1, "を以て"=>1, "かぁ"=>1, "のう"=>1, "らしかっ"=>1, "そしたら"=>1, "にゃ"=>1, "まじ"=>1, "るる"=>1, "らし"=>1, "やん"=>1, "たけれ"=>1, "らしき"=>1, "しも"=>1, "べから"=>1, "或いは"=>1, "及び"=>1, "だが"=>1, "ごとき"=>1, "なし"=>1, "如き"=>1, "ねん"=>1, "但し"=>1, "ござる"=>1, "いえ"=>1, "故に"=>1, "即ち"=>1, "やっ"=>1, "なき"=>1, "無かっ"=>1, "なけりゃ"=>1, "即"=>1, "よって"=>1, "或は"=>1, "および"=>1, "尚"=>1, "否"=>1, "じゃろ"=>1, "っしょ"=>1, "尤も"=>1, "だに"=>1, "やす"=>1, "ござん"=>1, "ついで"=>1, "へん"=>1, "じゃっ"=>1, "わい"=>1, "次に"=>1, "之"=>1, "ける"=>1, "然し"=>1, "もっとも"=>1, "そうしたら"=>1, "無く"=>1, "やろ"=>1, "亦"=>1, "っし"=>1, "に対し"=>1, "乃至"=>1, "なれ"=>1, "御座る"=>1, "御座ん"=>1, "とう"=>1, "てえ"=>1, "但"=>1, "どし"=>1, "ざり"=>1, "といふ"=>1, "たれ"=>1, "したら"=>1, "もん"=>1, "やせ"=>1, "たくっ"=>1, "若しくは"=>1, "ずん"=>1, "あら"=>1, "ざれ"=>1, "無かろ"=>1, "無けれ"=>1, "ごとし"=>1, "たきゃ"=>1, "どす"=>1, "けり"=>1, "まじき"=>1, "ますれ"=>1, "たき"=>1, "てん"=>1, "たけりゃ"=>1, "無き"=>1, "無"=>1, "如し"=>1, "あん"=>1, "御座っ"=>1, "ありゃ"=>1, "かな"=>1, "ばかし"=>1,
248
+ "a"=>1, "about"=>1, "after"=>1, "against"=>1, "all"=>1, "also"=>1, "although"=>1, "am"=>1, "among"=>1, "an"=>1, "and"=>1, "any"=>1, "anyone"=>1, "are"=>1, "as"=>1, "at"=>1, "ax"=>1, "be"=>1, "became"=>1, "because"=>1, "been"=>1, "being"=>1, "between"=>1, "but"=>1, "by"=>1, "c"=>1, "ca"=>1, "can"=>1, "come"=>1, "could"=>1, "cs"=>1, "did"=>1, "do"=>1, "does"=>1, "don"=>1, "during"=>1, "each"=>1, "early"=>1, "even"=>1, "for"=>1, "form"=>1, "found"=>1, "from"=>1, "get"=>1, "good"=>1, "had"=>1, "has"=>1, "have"=>1, "he"=>1, "her"=>1, "here"=>1, "him"=>1, "his"=>1, "how"=>1, "however"=>1, "i"=>1, "if"=>1, "in"=>1, "include"=>1, "including"=>1, "into"=>1, "is"=>1, "it"=>1, "its"=>1, "just"=>1, "know"=>1, "late"=>1, "later"=>1, "like"=>1, "made"=>1, "many"=>1, "may"=>1, "me"=>1, "more"=>1, "most"=>1, "much"=>1, "my"=>1, "near"=>1, "need"=>1, "new"=>1, "no"=>1, "non"=>1, "not"=>1, "now"=>1, "of"=>1, "off"=>1, "on"=>1, "one"=>1, "only"=>1, "or"=>1, "other"=>1, "our"=>1, "out"=>1, "over"=>1, "people"=>1, "please"=>1, "r"=>1, "right"=>1, "s"=>1, "same"=>1, "see"=>1, "several"=>1, "she"=>1, "should"=>1, "so"=>1, "some"=>1, "something"=>1, "such"=>1, "t"=>1, "than"=>1, "that"=>1, "the"=>1, "their"=>1, "them"=>1, "then"=>1, "there"=>1, "these"=>1, "they"=>1, "think"=>1, "this"=>1, "those"=>1, "through"=>1, "time"=>1, "to"=>1, "too"=>1, "u"=>1, "under"=>1, "until"=>1, "up"=>1, "us"=>1, "use"=>1, "used"=>1, "ve"=>1, "very"=>1, "want"=>1, "was"=>1, "way"=>1, "we"=>1, "well"=>1, "were"=>1, "what"=>1, "when"=>1, "where"=>1, "which"=>1, "who"=>1, "why"=>1, "will"=>1, "with"=>1, "would"=>1, "you"=>1, "your"=>1,
249
+ "q" => 1, "p" => 1, "b" => 1, "d" => 1, "o" => 1, "8" => 1, "1" => 1, "0" => 1, "z"=>1, "w"=>1, "v" => 1, "7" => 1, "x" => 1, "e" => 1, "f" => 1, "c" => 1 ,"3" => 1
250
+ }
228
251
  end
229
252
  end
@@ -1,4 +1,4 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  module NekonekoGen
3
- VERSION = "0.2.1"
3
+ VERSION = "0.3.0"
4
4
  end
data/nekoneko_gen.gemspec CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
15
15
  gem.require_paths = ["lib"]
16
16
  gem.version = NekonekoGen::VERSION
17
17
 
18
- gem.add_dependency 'bimyou_segmenter'
18
+ gem.add_dependency 'bimyou_segmenter', '>= 1.2.0'
19
19
  gem.add_dependency 'json'
20
20
  gem.add_development_dependency 'test-unit'
21
21
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nekoneko_gen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-01 00:00:00.000000000Z
12
+ date: 2012-06-02 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bimyou_segmenter
16
- requirement: &14306440 !ruby/object:Gem::Requirement
16
+ requirement: &11266280 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: 1.2.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *14306440
24
+ version_requirements: *11266280
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: json
27
- requirement: &14304220 !ruby/object:Gem::Requirement
27
+ requirement: &11527320 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *14304220
35
+ version_requirements: *11527320
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: test-unit
38
- requirement: &14303060 !ruby/object:Gem::Requirement
38
+ requirement: &11890980 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *14303060
46
+ version_requirements: *11890980
47
47
  description: Japanese Text Classifier Generator
48
48
  email:
49
49
  - nagadomi@nurs.or.jp