nekoneko_gen 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/nekoneko_gen/mlp.rb +0 -1
- data/lib/nekoneko_gen/text_classifier_generator.rb +36 -13
- data/lib/nekoneko_gen/version.rb +1 -1
- data/nekoneko_gen.gemspec +1 -1
- metadata +9 -9
data/lib/nekoneko_gen/mlp.rb
CHANGED
|
@@ -18,7 +18,6 @@ module NekonekoGen
|
|
|
18
18
|
@name = safe_name(@filename).split("_").map(&:capitalize).join
|
|
19
19
|
@labels = files.map {|file| "#{safe_name(file).upcase}"}
|
|
20
20
|
end
|
|
21
|
-
|
|
22
21
|
def train(iteration = nil)
|
|
23
22
|
iteration ||= @classifier.default_iteration
|
|
24
23
|
data = []
|
|
@@ -31,7 +30,7 @@ module NekonekoGen
|
|
|
31
30
|
File.open(@files[i]) do |f|
|
|
32
31
|
content = f.read
|
|
33
32
|
end
|
|
34
|
-
content = NKF.nkf('-
|
|
33
|
+
content = NKF.nkf('-wZX', content).downcase
|
|
35
34
|
content.lines do |line|
|
|
36
35
|
vec = fv(line.chomp)
|
|
37
36
|
if (vec.size > 0)
|
|
@@ -40,6 +39,7 @@ module NekonekoGen
|
|
|
40
39
|
end
|
|
41
40
|
puts sprintf("%.4fs", Time.now - t)
|
|
42
41
|
end
|
|
42
|
+
|
|
43
43
|
samples = data.map{|v| v.size}.min
|
|
44
44
|
iteration.times do |step|
|
|
45
45
|
loss = 0.0
|
|
@@ -81,6 +81,7 @@ module NekonekoGen
|
|
|
81
81
|
# -*- coding: utf-8 -*-
|
|
82
82
|
require 'rubygems'
|
|
83
83
|
require 'json'
|
|
84
|
+
require 'nkf'
|
|
84
85
|
require 'bimyou_segmenter'
|
|
85
86
|
|
|
86
87
|
class #{@name}
|
|
@@ -88,17 +89,17 @@ class #{@name}
|
|
|
88
89
|
K
|
|
89
90
|
end
|
|
90
91
|
def self.predict(text)
|
|
91
|
-
classify(fv(text))
|
|
92
|
+
classify(fv(NKF::nkf('-wZX', text).downcase))
|
|
92
93
|
end
|
|
93
|
-
|
|
94
94
|
#{labels}
|
|
95
95
|
LABELS = #{@labels.inspect}
|
|
96
96
|
K = #{@classifier.k}
|
|
97
|
-
|
|
98
97
|
private
|
|
99
98
|
def self.fv(text)
|
|
100
99
|
prev = nil
|
|
101
|
-
BimyouSegmenter.segment(text
|
|
100
|
+
BimyouSegmenter.segment(text,
|
|
101
|
+
:white_space => true,
|
|
102
|
+
:symbol => true).map do |word|
|
|
102
103
|
if (prev)
|
|
103
104
|
if (NGRAM_TARGET =~ word)
|
|
104
105
|
nword = [prev + word, word]
|
|
@@ -114,10 +115,9 @@ class #{@name}
|
|
|
114
115
|
end
|
|
115
116
|
word
|
|
116
117
|
end
|
|
117
|
-
end.flatten
|
|
118
|
+
end.flatten
|
|
118
119
|
end
|
|
119
120
|
#{@classifier.classify_method_code(:ruby)}
|
|
120
|
-
|
|
121
121
|
NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\\-_a-zA-Z‐_0-90-9]+$)|' +
|
|
122
122
|
'(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
|
|
123
123
|
[0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
|
|
@@ -142,10 +142,24 @@ MODEL
|
|
|
142
142
|
word_id
|
|
143
143
|
end
|
|
144
144
|
end
|
|
145
|
+
def lemmatize(word)
|
|
146
|
+
if (word =~ /^[a-z\-_]+$/)
|
|
147
|
+
LEMMATIZATION_RULE.each do |reg|
|
|
148
|
+
if (word.match(reg[0]))
|
|
149
|
+
return reg[1].map{|b| word.gsub(reg[0], b)}
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
word
|
|
153
|
+
else
|
|
154
|
+
word
|
|
155
|
+
end
|
|
156
|
+
end
|
|
145
157
|
def fv(text)
|
|
146
158
|
vec = Hash.new(0)
|
|
147
159
|
prev = nil
|
|
148
|
-
|
|
160
|
+
BimyouSegmenter.segment(text,
|
|
161
|
+
:white_space => true,
|
|
162
|
+
:symbol => true).map do |word|
|
|
149
163
|
if (prev)
|
|
150
164
|
if (NGRAM_TARGET =~ word)
|
|
151
165
|
nword = [prev + word, word]
|
|
@@ -161,8 +175,8 @@ MODEL
|
|
|
161
175
|
end
|
|
162
176
|
word
|
|
163
177
|
end
|
|
164
|
-
end.flatten
|
|
165
|
-
STOP_WORDS[word]
|
|
178
|
+
end.flatten.reject do |word|
|
|
179
|
+
word.empty? || STOP_WORDS[word] || word.match(WHITE_SPACE) || word.match(SYMBOL)
|
|
166
180
|
end.each do |word|
|
|
167
181
|
vec[word2id(word)] += 1
|
|
168
182
|
end
|
|
@@ -208,7 +222,7 @@ MODEL
|
|
|
208
222
|
end
|
|
209
223
|
end
|
|
210
224
|
def safe_name(filename)
|
|
211
|
-
File.basename(filename, ".*").gsub('
|
|
225
|
+
File.basename(filename, ".*").gsub(/[\-\.]/,'_').gsub(/[^a-zA-Z_0-9]/, '')
|
|
212
226
|
end
|
|
213
227
|
def puts(s)
|
|
214
228
|
unless (@quiet)
|
|
@@ -220,10 +234,19 @@ MODEL
|
|
|
220
234
|
Kernel.print s
|
|
221
235
|
end
|
|
222
236
|
end
|
|
237
|
+
SYMBOL = Regexp.new('^[^々〇' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
|
|
238
|
+
[0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
|
|
239
|
+
[0x20000].pack('U') + '-' + [0x2FFFF].pack('U') +
|
|
240
|
+
'\s ぁ-ゞァ-ヾa-zA-Za-zA-Z0-90-9]+$')
|
|
241
|
+
WHITE_SPACE = /[\s ]/
|
|
223
242
|
NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\-_a-zA-Z‐_0-90-9]+$)|' +
|
|
224
243
|
'(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
|
|
225
244
|
[0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
|
|
226
245
|
[0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
|
|
227
|
-
|
|
246
|
+
LEMMATIZATION_RULE = [[/shes$/, ["sh"]], [/ches$/, ["ch"]], [/zed$/, ["zed"]], [/zes$/, ["z"]], [/ses$/, ["s"]], [/ing$/, [""]], [/ves$/, ["f"]], [/xes$/, ["x"]], [/ies$/, ["y"]], [/est$/, ["e", ""]], [/men$/, ["man"]], [/es$/, ["e", ""]], [/ed$/, ["e", ""]], [/er$/, ["e", ""]], [/s$/, [""]]]
|
|
247
|
+
STOP_WORDS = {"の"=>1, "に"=>1, "て"=>1, "が"=>1, "た"=>1, "は"=>1, "で"=>1, "を"=>1, "と"=>1, "か"=>1, "も"=>1, "ない"=>1, "だ"=>1, "な"=>1, "です"=>1, "から"=>1, "ます"=>1, "う"=>1, "けど"=>1, "って"=>1, "ば"=>1, "よ"=>1, "まし"=>1, "たら"=>1, "ね"=>1, "ん"=>1, "なら"=>1, "でしょ"=>1, "とか"=>1, "じゃ"=>1, "まで"=>1, "ので"=>1, "ませ"=>1, "だけ"=>1, "へ"=>1, "なく"=>1, "という"=>1, "や"=>1, "でも"=>1, "ござい"=>1, "し"=>1, "たい"=>1, "だろ"=>1, "なかっ"=>1, "ある"=>1, "ず"=>1, "たり"=>1, "だっ"=>1, "しか"=>1, "くらい"=>1, "かも"=>1, "ながら"=>1, "でし"=>1, "また"=>1, "より"=>1, "のに"=>1, "わ"=>1, "など"=>1, "として"=>1, "ぬ"=>1, "あっ"=>1, "らしい"=>1, "ばかり"=>1, "ほど"=>1, "ぞ"=>1, "しかし"=>1, "なけれ"=>1, "ただ"=>1, "つ"=>1, "けれども"=>1, "んで"=>1, "ぐらい"=>1, "なんて"=>1, "について"=>1, "そうして"=>1, "ましょ"=>1, "さえ"=>1, "のみ"=>1, "たく"=>1, "あり"=>1, "る"=>1, "なんか"=>1, "べき"=>1, "だって"=>1, "それとも"=>1, "ちゃ"=>1, "なぁ"=>1, "それから"=>1, "さ"=>1, "ぜ"=>1, "によって"=>1, "ねえ"=>1, "っけ"=>1, "やら"=>1, "だから"=>1, "とも"=>1, "いや"=>1, "なり"=>1, "それでも"=>1, "なあ"=>1, "まい"=>1, "つつ"=>1, "そして"=>1, "それで"=>1, "かい"=>1, "すると"=>1, "しかも"=>1, "あろ"=>1, "らしく"=>1, "ずつ"=>1, "り"=>1, "たる"=>1, "又"=>1, "ねぇ"=>1, "に対して"=>1, "け"=>1, "こそ"=>1, "もしくは"=>1, "なきゃ"=>1, "だら"=>1, "そこで"=>1, "すら"=>1, "実は"=>1, "ところが"=>1, "なる"=>1, "による"=>1, "御座い"=>1, "じゃん"=>1, "つまり"=>1, "けれど"=>1, "ただし"=>1, "だの"=>1, "たかっ"=>1, "ざる"=>1, "ごとく"=>1, "に対する"=>1, "とかいう"=>1, "かしら"=>1, "なくっ"=>1, "そりゃ"=>1, "または"=>1, "べ"=>1, "にて"=>1, "において"=>1, "たろ"=>1, "無い"=>1, "あれ"=>1, "なぞ"=>1, "っと"=>1, "き"=>1, "にとって"=>1, "たって"=>1, "じ"=>1, "あるいは"=>1, "ど"=>1, "っす"=>1, "だり"=>1, "又は"=>1, "ばっかり"=>1, "てか"=>1, "けども"=>1, "と共に"=>1, "れ"=>1, "なかろ"=>1, "なお"=>1, "ものの"=>1, "に関する"=>1, "ばっか"=>1, "こうして"=>1, "程"=>1, "べし"=>1, "たとえば"=>1, "ども"=>1, "一方"=>1, "それでは"=>1, "かつ"=>1, "やし"=>1, "だけど"=>1, "なんぞ"=>1, "べく"=>1, "迄"=>1, "如く"=>1, "ってか"=>1, "すなわち"=>1, "さて"=>1, "どころか"=>1, "では"=>1, "を以て"=>1, "かぁ"=>1, "のう"=>1, "らしかっ"=>1, "そしたら"=>1, "にゃ"=>1, "まじ"=>1, "るる"=>1, "らし"=>1, "やん"=>1, "たけれ"=>1, "らしき"=>1, "しも"=>1, "べから"=>1, "或いは"=>1, "及び"=>1, "だが"=>1, "ごとき"=>1, "なし"=>1, "如き"=>1, "ねん"=>1, "但し"=>1, "ござる"=>1, "いえ"=>1, "故に"=>1, "即ち"=>1, 
"やっ"=>1, "なき"=>1, "無かっ"=>1, "なけりゃ"=>1, "即"=>1, "よって"=>1, "或は"=>1, "および"=>1, "尚"=>1, "否"=>1, "じゃろ"=>1, "っしょ"=>1, "尤も"=>1, "だに"=>1, "やす"=>1, "ござん"=>1, "ついで"=>1, "へん"=>1, "じゃっ"=>1, "わい"=>1, "次に"=>1, "之"=>1, "ける"=>1, "然し"=>1, "もっとも"=>1, "そうしたら"=>1, "無く"=>1, "やろ"=>1, "亦"=>1, "っし"=>1, "に対し"=>1, "乃至"=>1, "なれ"=>1, "御座る"=>1, "御座ん"=>1, "とう"=>1, "てえ"=>1, "但"=>1, "どし"=>1, "ざり"=>1, "といふ"=>1, "たれ"=>1, "したら"=>1, "もん"=>1, "やせ"=>1, "たくっ"=>1, "若しくは"=>1, "ずん"=>1, "あら"=>1, "ざれ"=>1, "無かろ"=>1, "無けれ"=>1, "ごとし"=>1, "たきゃ"=>1, "どす"=>1, "けり"=>1, "まじき"=>1, "ますれ"=>1, "たき"=>1, "てん"=>1, "たけりゃ"=>1, "無き"=>1, "無"=>1, "如し"=>1, "あん"=>1, "御座っ"=>1, "ありゃ"=>1, "かな"=>1, "ばかし"=>1,
|
|
248
|
+
"a"=>1, "about"=>1, "after"=>1, "against"=>1, "all"=>1, "also"=>1, "although"=>1, "am"=>1, "among"=>1, "an"=>1, "and"=>1, "any"=>1, "anyone"=>1, "are"=>1, "as"=>1, "at"=>1, "ax"=>1, "be"=>1, "became"=>1, "because"=>1, "been"=>1, "being"=>1, "between"=>1, "but"=>1, "by"=>1, "c"=>1, "ca"=>1, "can"=>1, "come"=>1, "could"=>1, "cs"=>1, "did"=>1, "do"=>1, "does"=>1, "don"=>1, "during"=>1, "each"=>1, "early"=>1, "even"=>1, "for"=>1, "form"=>1, "found"=>1, "from"=>1, "get"=>1, "good"=>1, "had"=>1, "has"=>1, "have"=>1, "he"=>1, "her"=>1, "here"=>1, "him"=>1, "his"=>1, "how"=>1, "however"=>1, "i"=>1, "if"=>1, "in"=>1, "include"=>1, "including"=>1, "into"=>1, "is"=>1, "it"=>1, "its"=>1, "just"=>1, "know"=>1, "late"=>1, "later"=>1, "like"=>1, "made"=>1, "many"=>1, "may"=>1, "me"=>1, "more"=>1, "most"=>1, "much"=>1, "my"=>1, "near"=>1, "need"=>1, "new"=>1, "no"=>1, "non"=>1, "not"=>1, "now"=>1, "of"=>1, "off"=>1, "on"=>1, "one"=>1, "only"=>1, "or"=>1, "other"=>1, "our"=>1, "out"=>1, "over"=>1, "people"=>1, "please"=>1, "r"=>1, "right"=>1, "s"=>1, "same"=>1, "see"=>1, "several"=>1, "she"=>1, "should"=>1, "so"=>1, "some"=>1, "something"=>1, "such"=>1, "t"=>1, "than"=>1, "that"=>1, "the"=>1, "their"=>1, "them"=>1, "then"=>1, "there"=>1, "these"=>1, "they"=>1, "think"=>1, "this"=>1, "those"=>1, "through"=>1, "time"=>1, "to"=>1, "too"=>1, "u"=>1, "under"=>1, "until"=>1, "up"=>1, "us"=>1, "use"=>1, "used"=>1, "ve"=>1, "very"=>1, "want"=>1, "was"=>1, "way"=>1, "we"=>1, "well"=>1, "were"=>1, "what"=>1, "when"=>1, "where"=>1, "which"=>1, "who"=>1, "why"=>1, "will"=>1, "with"=>1, "would"=>1, "you"=>1, "your"=>1,
|
|
249
|
+
"q" => 1, "p" => 1, "b" => 1, "d" => 1, "o" => 1, "8" => 1, "1" => 1, "0" => 1, "z"=>1, "w"=>1, "v" => 1, "7" => 1, "x" => 1, "e" => 1, "f" => 1, "c" => 1 ,"3" => 1
|
|
250
|
+
}
|
|
228
251
|
end
|
|
229
252
|
end
|
data/lib/nekoneko_gen/version.rb
CHANGED
data/nekoneko_gen.gemspec
CHANGED
|
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
|
|
|
15
15
|
gem.require_paths = ["lib"]
|
|
16
16
|
gem.version = NekonekoGen::VERSION
|
|
17
17
|
|
|
18
|
-
gem.add_dependency 'bimyou_segmenter'
|
|
18
|
+
gem.add_dependency 'bimyou_segmenter', '>= 1.2.0'
|
|
19
19
|
gem.add_dependency 'json'
|
|
20
20
|
gem.add_development_dependency 'test-unit'
|
|
21
21
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: nekoneko_gen
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.1
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,22 +9,22 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2012-06-
|
|
12
|
+
date: 2012-06-02 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: bimyou_segmenter
|
|
16
|
-
requirement: &
|
|
16
|
+
requirement: &11266280 !ruby/object:Gem::Requirement
|
|
17
17
|
none: false
|
|
18
18
|
requirements:
|
|
19
19
|
- - ! '>='
|
|
20
20
|
- !ruby/object:Gem::Version
|
|
21
|
-
version: '0'
|
|
21
|
+
version: 1.2.0
|
|
22
22
|
type: :runtime
|
|
23
23
|
prerelease: false
|
|
24
|
-
version_requirements: *
|
|
24
|
+
version_requirements: *11266280
|
|
25
25
|
- !ruby/object:Gem::Dependency
|
|
26
26
|
name: json
|
|
27
|
-
requirement: &
|
|
27
|
+
requirement: &11527320 !ruby/object:Gem::Requirement
|
|
28
28
|
none: false
|
|
29
29
|
requirements:
|
|
30
30
|
- - ! '>='
|
|
@@ -32,10 +32,10 @@ dependencies:
|
|
|
32
32
|
version: '0'
|
|
33
33
|
type: :runtime
|
|
34
34
|
prerelease: false
|
|
35
|
-
version_requirements: *
|
|
35
|
+
version_requirements: *11527320
|
|
36
36
|
- !ruby/object:Gem::Dependency
|
|
37
37
|
name: test-unit
|
|
38
|
-
requirement: &
|
|
38
|
+
requirement: &11890980 !ruby/object:Gem::Requirement
|
|
39
39
|
none: false
|
|
40
40
|
requirements:
|
|
41
41
|
- - ! '>='
|
|
@@ -43,7 +43,7 @@ dependencies:
|
|
|
43
43
|
version: '0'
|
|
44
44
|
type: :development
|
|
45
45
|
prerelease: false
|
|
46
|
-
version_requirements: *
|
|
46
|
+
version_requirements: *11890980
|
|
47
47
|
description: Japanese Text Classifier Generator
|
|
48
48
|
email:
|
|
49
49
|
- nagadomi@nurs.or.jp
|