nekoneko_gen 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -153,7 +153,6 @@ CODE
153
153
  <<CODE
154
154
  def self.classify(vec)
155
155
  input_y = []
156
- output_y = []
157
156
  HIDDEN_UNITS.times do |i|
158
157
  input_y[i] = sigmoid(INPUT_BIAS[i] +
159
158
  INPUT_W[i].values_at(*vec).compact.reduce(0.0, :+))
@@ -18,7 +18,6 @@ module NekonekoGen
18
18
  @name = safe_name(@filename).split("_").map(&:capitalize).join
19
19
  @labels = files.map {|file| "#{safe_name(file).upcase}"}
20
20
  end
21
-
22
21
  def train(iteration = nil)
23
22
  iteration ||= @classifier.default_iteration
24
23
  data = []
@@ -31,7 +30,7 @@ module NekonekoGen
31
30
  File.open(@files[i]) do |f|
32
31
  content = f.read
33
32
  end
34
- content = NKF.nkf('-w', content)
33
+ content = NKF.nkf('-wZX', content).downcase
35
34
  content.lines do |line|
36
35
  vec = fv(line.chomp)
37
36
  if (vec.size > 0)
@@ -40,6 +39,7 @@ module NekonekoGen
40
39
  end
41
40
  puts sprintf("%.4fs", Time.now - t)
42
41
  end
42
+
43
43
  samples = data.map{|v| v.size}.min
44
44
  iteration.times do |step|
45
45
  loss = 0.0
@@ -81,6 +81,7 @@ module NekonekoGen
81
81
  # -*- coding: utf-8 -*-
82
82
  require 'rubygems'
83
83
  require 'json'
84
+ require 'nkf'
84
85
  require 'bimyou_segmenter'
85
86
 
86
87
  class #{@name}
@@ -88,17 +89,17 @@ class #{@name}
88
89
  K
89
90
  end
90
91
  def self.predict(text)
91
- classify(fv(text))
92
+ classify(fv(NKF::nkf('-wZX', text).downcase))
92
93
  end
93
-
94
94
  #{labels}
95
95
  LABELS = #{@labels.inspect}
96
96
  K = #{@classifier.k}
97
-
98
97
  private
99
98
  def self.fv(text)
100
99
  prev = nil
101
- BimyouSegmenter.segment(text).map do |word|
100
+ BimyouSegmenter.segment(text,
101
+ :white_space => true,
102
+ :symbol => true).map do |word|
102
103
  if (prev)
103
104
  if (NGRAM_TARGET =~ word)
104
105
  nword = [prev + word, word]
@@ -114,10 +115,9 @@ class #{@name}
114
115
  end
115
116
  word
116
117
  end
117
- end.flatten(1)
118
+ end.flatten
118
119
  end
119
120
  #{@classifier.classify_method_code(:ruby)}
120
-
121
121
  NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\\-_a-zA-Z‐_0-90-9]+$)|' +
122
122
  '(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
123
123
  [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
@@ -142,10 +142,24 @@ MODEL
142
142
  word_id
143
143
  end
144
144
  end
145
+ def lemmatize(word)
146
+ if (word =~ /^[a-z\-_]+$/)
147
+ LEMMATIZATION_RULE.each do |reg|
148
+ if (word.match(reg[0]))
149
+ return reg[1].map{|b| word.gsub(reg[0], b)}
150
+ end
151
+ end
152
+ word
153
+ else
154
+ word
155
+ end
156
+ end
145
157
  def fv(text)
146
158
  vec = Hash.new(0)
147
159
  prev = nil
148
- words = BimyouSegmenter.segment(text, :white_space => true).map do |word|
160
+ BimyouSegmenter.segment(text,
161
+ :white_space => true,
162
+ :symbol => true).map do |word|
149
163
  if (prev)
150
164
  if (NGRAM_TARGET =~ word)
151
165
  nword = [prev + word, word]
@@ -161,8 +175,8 @@ MODEL
161
175
  end
162
176
  word
163
177
  end
164
- end.flatten(1).reject do |word|
165
- STOP_WORDS[word]
178
+ end.flatten.reject do |word|
179
+ word.empty? || STOP_WORDS[word] || word.match(WHITE_SPACE) || word.match(SYMBOL)
166
180
  end.each do |word|
167
181
  vec[word2id(word)] += 1
168
182
  end
@@ -208,7 +222,7 @@ MODEL
208
222
  end
209
223
  end
210
224
  def safe_name(filename)
211
- File.basename(filename, ".*").gsub('-','_').gsub(/[^a-zA-Z_0-9]/, '')
225
+ File.basename(filename, ".*").gsub(/[\-\.]/,'_').gsub(/[^a-zA-Z_0-9]/, '')
212
226
  end
213
227
  def puts(s)
214
228
  unless (@quiet)
@@ -220,10 +234,19 @@ MODEL
220
234
  Kernel.print s
221
235
  end
222
236
  end
237
+ SYMBOL = Regexp.new('^[^々〇' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
238
+ [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
239
+ [0x20000].pack('U') + '-' + [0x2FFFF].pack('U') +
240
+ '\s ぁ-ゞァ-ヾa-zA-Za-zA-Z0-90-9]+$')
241
+ WHITE_SPACE = /[\s ]/
223
242
  NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\-_a-zA-Z‐_0-90-9]+$)|' +
224
243
  '(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
225
244
  [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
226
245
  [0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
227
- STOP_WORDS = {"の"=>1, ""=>1, "て"=>1, ""=>1, "た"=>1, ""=>1, "で"=>1, ""=>1, "と"=>1, ""=>1, "も"=>1, "ない"=>1, "だ"=>1, ""=>1, "です"=>1, "から"=>1, "ます"=>1, ""=>1, "けど"=>1, "って"=>1, ""=>1, "よ"=>1, "まし"=>1, "たら"=>1, ""=>1, ""=>1, "なら"=>1, "でしょ"=>1, "とか"=>1, "じゃ"=>1, "まで"=>1, "ので"=>1, "ませ"=>1, "だけ"=>1, "へ"=>1, "なく"=>1, "という"=>1, "や"=>1, "でも"=>1, "ござい"=>1, "し"=>1, "たい"=>1, "だろ"=>1, "なかっ"=>1, "ある"=>1, "ず"=>1, "たり"=>1, "だっ"=>1, "しか"=>1, "くらい"=>1, "かも"=>1, "ながら"=>1, "でし"=>1, "また"=>1, "より"=>1, "のに"=>1, "わ"=>1, "など"=>1, "として"=>1, "ぬ"=>1, "あっ"=>1, "らしい"=>1, "ばかり"=>1, "ほど"=>1, "ぞ"=>1, "しかし"=>1, "なけれ"=>1, "ただ"=>1, "つ"=>1, "けれども"=>1, "んで"=>1, "ぐらい"=>1, "なんて"=>1, "について"=>1, "そうして"=>1, "ましょ"=>1, "さえ"=>1, "のみ"=>1, "たく"=>1, "あり"=>1, "る"=>1, "なんか"=>1, "べき"=>1, "だって"=>1, "それとも"=>1, "ちゃ"=>1, "なぁ"=>1, "それから"=>1, "さ"=>1, "ぜ"=>1, "によって"=>1, "ねえ"=>1, "っけ"=>1, "やら"=>1, "だから"=>1, "とも"=>1, "いや"=>1, "なり"=>1, "それでも"=>1, "なあ"=>1, "まい"=>1, "つつ"=>1, "そして"=>1, "それで"=>1, "かい"=>1, "すると"=>1, "しかも"=>1, "あろ"=>1, "らしく"=>1, "ずつ"=>1, "り"=>1, "たる"=>1, "又"=>1, "ねぇ"=>1, "に対して"=>1, "け"=>1, "こそ"=>1, "もしくは"=>1, "なきゃ"=>1, "だら"=>1, "そこで"=>1, "すら"=>1, "実は"=>1, "ところが"=>1, "なる"=>1, "による"=>1, "御座い"=>1, "じゃん"=>1, "つまり"=>1, "けれど"=>1, "ただし"=>1, "だの"=>1, "たかっ"=>1, "ざる"=>1, "ごとく"=>1, "に対する"=>1, "とかいう"=>1, "かしら"=>1, "なくっ"=>1, "そりゃ"=>1, "または"=>1, "べ"=>1, "にて"=>1, "において"=>1, "たろ"=>1, "無い"=>1, "あれ"=>1, "なぞ"=>1, "っと"=>1, "き"=>1, "にとって"=>1, "たって"=>1, "じ"=>1, "あるいは"=>1, "ど"=>1, "っす"=>1, "だり"=>1, "又は"=>1, "ばっかり"=>1, "てか"=>1, "けども"=>1, "と共に"=>1, "れ"=>1, "なかろ"=>1, "なお"=>1, "ものの"=>1, "に関する"=>1, "ばっか"=>1, "こうして"=>1, "程"=>1, "べし"=>1, "たとえば"=>1, "ども"=>1, "一方"=>1, "それでは"=>1, "かつ"=>1, "やし"=>1, "だけど"=>1, "なんぞ"=>1, "べく"=>1, "迄"=>1, "如く"=>1, "ってか"=>1, "すなわち"=>1, "さて"=>1, "どころか"=>1, "では"=>1, "を以て"=>1, "かぁ"=>1, "のう"=>1, "らしかっ"=>1, "そしたら"=>1, "にゃ"=>1, "まじ"=>1, "るる"=>1, "らし"=>1, "やん"=>1, "たけれ"=>1, "らしき"=>1, "しも"=>1, "べから"=>1, "或いは"=>1, "及び"=>1, "だが"=>1, "ごとき"=>1, "なし"=>1, "如き"=>1, "ねん"=>1, "但し"=>1, "ござる"=>1, "いえ"=>1, "故に"=>1, "即ち"=>1, "やっ"=>1, "なき"=>1, "無かっ"=>1, "なけりゃ"=>1, "即"=>1, "よって"=>1, "或は"=>1, "および"=>1, "尚"=>1, "否"=>1, "じゃろ"=>1, "っしょ"=>1, "尤も"=>1, "だに"=>1, "やす"=>1, "ござん"=>1, "ついで"=>1, "へん"=>1, "じゃっ"=>1, "わい"=>1, "次に"=>1, "之"=>1, "ける"=>1, "然し"=>1, "もっとも"=>1, "そうしたら"=>1, "無く"=>1, "やろ"=>1, "亦"=>1, "っし"=>1, "に対し"=>1, "乃至"=>1, "なれ"=>1, "御座る"=>1, "御座ん"=>1, "とう"=>1, "てえ"=>1, "但"=>1, "どし"=>1, "ざり"=>1, "といふ"=>1, "たれ"=>1, "したら"=>1, "もん"=>1, "やせ"=>1, "たくっ"=>1, "若しくは"=>1, "ずん"=>1, "あら"=>1, "ざれ"=>1, "無かろ"=>1, "無けれ"=>1, "ごとし"=>1, "たきゃ"=>1, "どす"=>1, "けり"=>1, "まじき"=>1, "ますれ"=>1, "たき"=>1, "てん"=>1, "たけりゃ"=>1, "無き"=>1, "無"=>1, "如し"=>1, "あん"=>1, "御座っ"=>1, "ありゃ"=>1, "かな"=>1, "ばかし"=>1}
246
+ LEMMATIZATION_RULE = [[/shes$/, ["sh"]], [/ches$/, ["ch"]], [/zed$/, ["zed"]], [/zes$/, ["z"]], [/ses$/, ["s"]], [/ing$/, [""]], [/ves$/, ["f"]], [/xes$/, ["x"]], [/ies$/, ["y"]], [/est$/, ["e", ""]], [/men$/, ["man"]], [/es$/, ["e", ""]], [/ed$/, ["e", ""]], [/er$/, ["e", ""]], [/s$/, [""]]]
247
+ STOP_WORDS = {"の"=>1, "に"=>1, "て"=>1, "が"=>1, "た"=>1, "は"=>1, "で"=>1, "を"=>1, "と"=>1, "か"=>1, "も"=>1, "ない"=>1, "だ"=>1, "な"=>1, "です"=>1, "から"=>1, "ます"=>1, "う"=>1, "けど"=>1, "って"=>1, "ば"=>1, "よ"=>1, "まし"=>1, "たら"=>1, "ね"=>1, "ん"=>1, "なら"=>1, "でしょ"=>1, "とか"=>1, "じゃ"=>1, "まで"=>1, "ので"=>1, "ませ"=>1, "だけ"=>1, "へ"=>1, "なく"=>1, "という"=>1, "や"=>1, "でも"=>1, "ござい"=>1, "し"=>1, "たい"=>1, "だろ"=>1, "なかっ"=>1, "ある"=>1, "ず"=>1, "たり"=>1, "だっ"=>1, "しか"=>1, "くらい"=>1, "かも"=>1, "ながら"=>1, "でし"=>1, "また"=>1, "より"=>1, "のに"=>1, "わ"=>1, "など"=>1, "として"=>1, "ぬ"=>1, "あっ"=>1, "らしい"=>1, "ばかり"=>1, "ほど"=>1, "ぞ"=>1, "しかし"=>1, "なけれ"=>1, "ただ"=>1, "つ"=>1, "けれども"=>1, "んで"=>1, "ぐらい"=>1, "なんて"=>1, "について"=>1, "そうして"=>1, "ましょ"=>1, "さえ"=>1, "のみ"=>1, "たく"=>1, "あり"=>1, "る"=>1, "なんか"=>1, "べき"=>1, "だって"=>1, "それとも"=>1, "ちゃ"=>1, "なぁ"=>1, "それから"=>1, "さ"=>1, "ぜ"=>1, "によって"=>1, "ねえ"=>1, "っけ"=>1, "やら"=>1, "だから"=>1, "とも"=>1, "いや"=>1, "なり"=>1, "それでも"=>1, "なあ"=>1, "まい"=>1, "つつ"=>1, "そして"=>1, "それで"=>1, "かい"=>1, "すると"=>1, "しかも"=>1, "あろ"=>1, "らしく"=>1, "ずつ"=>1, "り"=>1, "たる"=>1, "又"=>1, "ねぇ"=>1, "に対して"=>1, "け"=>1, "こそ"=>1, "もしくは"=>1, "なきゃ"=>1, "だら"=>1, "そこで"=>1, "すら"=>1, "実は"=>1, "ところが"=>1, "なる"=>1, "による"=>1, "御座い"=>1, "じゃん"=>1, "つまり"=>1, "けれど"=>1, "ただし"=>1, "だの"=>1, "たかっ"=>1, "ざる"=>1, "ごとく"=>1, "に対する"=>1, "とかいう"=>1, "かしら"=>1, "なくっ"=>1, "そりゃ"=>1, "または"=>1, "べ"=>1, "にて"=>1, "において"=>1, "たろ"=>1, "無い"=>1, "あれ"=>1, "なぞ"=>1, "っと"=>1, "き"=>1, "にとって"=>1, "たって"=>1, "じ"=>1, "あるいは"=>1, "ど"=>1, "っす"=>1, "だり"=>1, "又は"=>1, "ばっかり"=>1, "てか"=>1, "けども"=>1, "と共に"=>1, "れ"=>1, "なかろ"=>1, "なお"=>1, "ものの"=>1, "に関する"=>1, "ばっか"=>1, "こうして"=>1, "程"=>1, "べし"=>1, "たとえば"=>1, "ども"=>1, "一方"=>1, "それでは"=>1, "かつ"=>1, "やし"=>1, "だけど"=>1, "なんぞ"=>1, "べく"=>1, "迄"=>1, "如く"=>1, "ってか"=>1, "すなわち"=>1, "さて"=>1, "どころか"=>1, "では"=>1, "を以て"=>1, "かぁ"=>1, "のう"=>1, "らしかっ"=>1, "そしたら"=>1, "にゃ"=>1, "まじ"=>1, "るる"=>1, "らし"=>1, "やん"=>1, "たけれ"=>1, "らしき"=>1, "しも"=>1, "べから"=>1, "或いは"=>1, "及び"=>1, "だが"=>1, "ごとき"=>1, "なし"=>1, "如き"=>1, "ねん"=>1, "但し"=>1, "ござる"=>1, "いえ"=>1, "故に"=>1, "即ち"=>1, "やっ"=>1, "なき"=>1, "無かっ"=>1, "なけりゃ"=>1, "即"=>1, "よって"=>1, "或は"=>1, "および"=>1, "尚"=>1, "否"=>1, "じゃろ"=>1, "っしょ"=>1, "尤も"=>1, "だに"=>1, "やす"=>1, "ござん"=>1, "ついで"=>1, "へん"=>1, "じゃっ"=>1, "わい"=>1, "次に"=>1, "之"=>1, "ける"=>1, "然し"=>1, "もっとも"=>1, "そうしたら"=>1, "無く"=>1, "やろ"=>1, "亦"=>1, "っし"=>1, "に対し"=>1, "乃至"=>1, "なれ"=>1, "御座る"=>1, "御座ん"=>1, "とう"=>1, "てえ"=>1, "但"=>1, "どし"=>1, "ざり"=>1, "といふ"=>1, "たれ"=>1, "したら"=>1, "もん"=>1, "やせ"=>1, "たくっ"=>1, "若しくは"=>1, "ずん"=>1, "あら"=>1, "ざれ"=>1, "無かろ"=>1, "無けれ"=>1, "ごとし"=>1, "たきゃ"=>1, "どす"=>1, "けり"=>1, "まじき"=>1, "ますれ"=>1, "たき"=>1, "てん"=>1, "たけりゃ"=>1, "無き"=>1, "無"=>1, "如し"=>1, "あん"=>1, "御座っ"=>1, "ありゃ"=>1, "かな"=>1, "ばかし"=>1,
248
+ "a"=>1, "about"=>1, "after"=>1, "against"=>1, "all"=>1, "also"=>1, "although"=>1, "am"=>1, "among"=>1, "an"=>1, "and"=>1, "any"=>1, "anyone"=>1, "are"=>1, "as"=>1, "at"=>1, "ax"=>1, "be"=>1, "became"=>1, "because"=>1, "been"=>1, "being"=>1, "between"=>1, "but"=>1, "by"=>1, "c"=>1, "ca"=>1, "can"=>1, "come"=>1, "could"=>1, "cs"=>1, "did"=>1, "do"=>1, "does"=>1, "don"=>1, "during"=>1, "each"=>1, "early"=>1, "even"=>1, "for"=>1, "form"=>1, "found"=>1, "from"=>1, "get"=>1, "good"=>1, "had"=>1, "has"=>1, "have"=>1, "he"=>1, "her"=>1, "here"=>1, "him"=>1, "his"=>1, "how"=>1, "however"=>1, "i"=>1, "if"=>1, "in"=>1, "include"=>1, "including"=>1, "into"=>1, "is"=>1, "it"=>1, "its"=>1, "just"=>1, "know"=>1, "late"=>1, "later"=>1, "like"=>1, "made"=>1, "many"=>1, "may"=>1, "me"=>1, "more"=>1, "most"=>1, "much"=>1, "my"=>1, "near"=>1, "need"=>1, "new"=>1, "no"=>1, "non"=>1, "not"=>1, "now"=>1, "of"=>1, "off"=>1, "on"=>1, "one"=>1, "only"=>1, "or"=>1, "other"=>1, "our"=>1, "out"=>1, "over"=>1, "people"=>1, "please"=>1, "r"=>1, "right"=>1, "s"=>1, "same"=>1, "see"=>1, "several"=>1, "she"=>1, "should"=>1, "so"=>1, "some"=>1, "something"=>1, "such"=>1, "t"=>1, "than"=>1, "that"=>1, "the"=>1, "their"=>1, "them"=>1, "then"=>1, "there"=>1, "these"=>1, "they"=>1, "think"=>1, "this"=>1, "those"=>1, "through"=>1, "time"=>1, "to"=>1, "too"=>1, "u"=>1, "under"=>1, "until"=>1, "up"=>1, "us"=>1, "use"=>1, "used"=>1, "ve"=>1, "very"=>1, "want"=>1, "was"=>1, "way"=>1, "we"=>1, "well"=>1, "were"=>1, "what"=>1, "when"=>1, "where"=>1, "which"=>1, "who"=>1, "why"=>1, "will"=>1, "with"=>1, "would"=>1, "you"=>1, "your"=>1,
249
+ "q" => 1, "p" => 1, "b" => 1, "d" => 1, "o" => 1, "8" => 1, "1" => 1, "0" => 1, "z"=>1, "w"=>1, "v" => 1, "7" => 1, "x" => 1, "e" => 1, "f" => 1, "c" => 1 ,"3" => 1
250
+ }
228
251
  end
229
252
  end
@@ -1,4 +1,4 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  module NekonekoGen
3
- VERSION = "0.2.1"
3
+ VERSION = "0.3.0"
4
4
  end
data/nekoneko_gen.gemspec CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
15
15
  gem.require_paths = ["lib"]
16
16
  gem.version = NekonekoGen::VERSION
17
17
 
18
- gem.add_dependency 'bimyou_segmenter'
18
+ gem.add_dependency 'bimyou_segmenter', '>= 1.2.0'
19
19
  gem.add_dependency 'json'
20
20
  gem.add_development_dependency 'test-unit'
21
21
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nekoneko_gen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-01 00:00:00.000000000Z
12
+ date: 2012-06-02 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bimyou_segmenter
16
- requirement: &14306440 !ruby/object:Gem::Requirement
16
+ requirement: &11266280 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: 1.2.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *14306440
24
+ version_requirements: *11266280
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: json
27
- requirement: &14304220 !ruby/object:Gem::Requirement
27
+ requirement: &11527320 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *14304220
35
+ version_requirements: *11527320
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: test-unit
38
- requirement: &14303060 !ruby/object:Gem::Requirement
38
+ requirement: &11890980 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *14303060
46
+ version_requirements: *11890980
47
47
  description: Japanese Text Classifier Generator
48
48
  email:
49
49
  - nagadomi@nurs.or.jp