tiny-classifier 1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +3 -0
- data/README.md +58 -0
- data/Rakefile +28 -0
- data/bin/tc-classify +20 -0
- data/bin/tc-train +20 -0
- data/lib/tiny-classifier/base.rb +129 -0
- data/lib/tiny-classifier/classifier.rb +41 -0
- data/lib/tiny-classifier/trainer.rb +50 -0
- data/tiny-classifier.gemspec +42 -0
- metadata +83 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ef03bed349d3c266bab512446b6dff8946d5c14f
|
4
|
+
data.tar.gz: 768126f03759f1fe38ef2e155dad3484836f4422
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9c59f06bef1b3fcc30914a410c29ac52444ea77790f5f4f588515c268bb181a2f353083bfe1ab3ceeba94964052807a6cb556ded863712b09f6488c7d1feff07
|
7
|
+
data.tar.gz: aa188deccf010883b6d04f3ae63ba0124d9ff4f586690cfccf921be5e1466deed11e72b021083142d0e37134f80a1b462af4e7989af0d3c69c830a19391000c9
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# README
|
2
|
+
|
3
|
+
## Name
|
4
|
+
|
5
|
+
tiny-classifier
|
6
|
+
|
7
|
+
## Description
|
8
|
+
|
9
|
+
Command line tool to run text classifier based on naive bayes.
|
10
|
+
|
11
|
+
## Install
|
12
|
+
|
13
|
+
```
|
14
|
+
% gem install tiny-classifier
|
15
|
+
```
|
16
|
+
|
17
|
+
## Basic usage
|
18
|
+
|
19
|
+
Training:
|
20
|
+
|
21
|
+
```
|
22
|
+
% echo "Hello, world!" | tc-train --labels=positive,negative positive
|
23
|
+
% echo "I'm very very happy!" | tc-train --labels=positive,negative positive
|
24
|
+
% echo "I'm so bad..." | tc-train --labels=positive,negative negative
|
25
|
+
% echo "Oh my god!" | tc-train --labels=positive,negative negative
|
26
|
+
```
|
27
|
+
|
28
|
+
Classifying:
|
29
|
+
|
30
|
+
~~~
|
31
|
+
% echo "Happy day?" | tc-classify --labels=positive,negative
|
32
|
+
positive
|
33
|
+
~~~
|
34
|
+
|
35
|
+
## Command line parameters
|
36
|
+
|
37
|
+
### Common
|
38
|
+
|
39
|
+
`--labels=LABELS` (required)
|
40
|
+
: A comma-separated list of labels. Use only alphabetic characters; non-alphabetic characters will cause problems.
|
41
|
+
|
42
|
+
`--data-dir=PATH` (optional)
|
43
|
+
: The path to the directory where the training data is saved. Defaults to the current directory.
|
44
|
+
|
45
|
+
`--tokenizer=TOKENIZER` (optional)
|
46
|
+
: Tokenizer for input that is not separated by whitespace (e.g. Japanese text). Currently the only supported value is `mecab`.
|
47
|
+
|
48
|
+
### Trainer
|
49
|
+
|
50
|
+
The `tc-train` command requires one command line argument: the label. It must be one of the labels given via the `--labels` parameter.
|
51
|
+
|
52
|
+
## Copyright
|
53
|
+
|
54
|
+
Copyright (c) 2017 YUKI "Piro" Hiroshi
|
55
|
+
|
56
|
+
## License
|
57
|
+
|
58
|
+
GPLv3 or later. See LICENSE.txt for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
# Running plain `rake` builds the gem.
task :default => :build

require "bundler/gem_helper"

# The gemspec lives next to this Rakefile.
helper = Bundler::GemHelper.new(File.dirname(__FILE__))

# Tag releases with the bare version string (e.g. "1.0") rather than the
# helper's own tag format.
def helper.version_tag
  version
end

# Defines the build/install/release rake tasks.
helper.install
|
data/bin/tc-classify
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
|
18
|
+
require "tiny-classifier/classifier"
|
19
|
+
|
20
|
+
Classifier.run
|
data/bin/tc-train
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
|
18
|
+
require "tiny-classifier/trainer"
|
19
|
+
|
20
|
+
Trainer.run
|
@@ -0,0 +1,129 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "pathname"
|
17
|
+
require "optparse"
|
18
|
+
require "classifier-reborn"
|
19
|
+
|
20
|
+
# Shared plumbing for the tc-train / tc-classify commands: command line
# option parsing, label normalization, locating and (de)serializing the
# training data file, and optional tokenization of the input text.
class TinyClassifierBase
  # Tokenizer names this tool knows about. Only :mecab triggers any
  # processing; every other value leaves the input untouched.
  TOKENIZERS = [:none, :mecab].freeze

  def initialize
    @tokenizer = :none
    @data_dir = Dir.pwd
  end

  # Parses (and removes) the recognized options from command_line_options.
  #
  # Returns what OptionParser#parse! returns: the remaining non-option
  # arguments.
  def parse_command_line_options(command_line_options)
    create_option_parser.parse!(command_line_options)
  end

  private

  # Builds the OptionParser handling --data-dir, --labels and --tokenizer.
  def create_option_parser
    parser = OptionParser.new

    parser.on("--data-dir=PATH",
              "Path to the directory to store training data file (default=current directory)") do |data_dir|
      @data_dir = data_dir
    end

    parser.on("--labels=LABELS",
              "List of labels (comma-separated)") do |labels|
      @labels = normalize_labels(labels)
    end

    parser.on("--tokenizer=TOKENIZER",
              "Tokenizer (default=#{@tokenizer})") do |tokenizer|
      @tokenizer = tokenizer.to_sym
    end

    parser
  end

  # Turns "b, A,,c" into ["A", "B", "C"]: entries are trimmed, blank entries
  # dropped, and the result is sorted and capitalized so that the same set of
  # labels always maps to the same data file regardless of order or case.
  def normalize_labels(labels)
    labels
      .downcase
      .split(",")
      .map(&:strip)
      .reject(&:empty?)
      .sort
      .map(&:capitalize)
  end

  # File name (without directory) of the training data, memoized.
  def data_file_name
    @data_file_basename ||= prepare_data_file_name
  end

  # Raises ArgumentError when no labels were given, instead of failing later
  # with a NoMethodError on nil.
  def prepare_data_file_name
    if @labels.nil? || @labels.empty?
      raise ArgumentError, "no labels given; specify them with --labels=LABELS"
    end

    "tc.#{@labels.join("-").downcase}.dat"
  end

  # Full path (Pathname) of the training data file, memoized.
  def data_file_path
    @data_file_path ||= prepare_data_file_path
  end

  def prepare_data_file_path
    Pathname(@data_dir) + data_file_name
  end

  # The classifier instance, memoized.
  def classifier
    @classifier ||= prepare_classifier
  end

  # Restores the classifier from the data file when it exists; otherwise
  # creates a fresh Bayes classifier for the configured labels.
  def prepare_classifier
    if data_file_path.exist?
      # Marshal data is binary, so it must be read in binary mode.
      # NOTE(review): Marshal.load can instantiate arbitrary objects; the data
      # file must come from a trusted location.
      Marshal.load(File.binread(data_file_path.to_s))
    else
      ClassifierReborn::Bayes.new(*@labels)
    end
  end

  # Tokenizes @input (when a tokenizer is selected) and trims it.
  def prepare_input
    tokenize
    # Non-destructive strip so that a frozen input string is accepted too.
    @input = @input.strip
  end

  def tokenize
    tokenize_by_mecab if @tokenizer == :mecab
  end

  # Replaces @input with a space-separated list of the nouns, adjectives and
  # verbs that MeCab finds in it.
  def tokenize_by_mecab
    # Loaded lazily: natto is only needed when the mecab tokenizer is used.
    require "natto"
    natto = Natto::MeCab.new
    terms = []
    natto.parse(@input) do |term|
      # 名詞 = noun, 形容詞 = adjective, 動詞 = verb
      terms << term.surface if term.feature =~ /名詞|形容詞|動詞/
    end
    @input = terms.join(" ").strip
  end

  # Serializes the classifier to the data file (binary mode, matching the
  # binary read in prepare_classifier).
  def save
    File.binwrite(data_file_path.to_s, Marshal.dump(classifier))
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "tiny-classifier/base"
|
17
|
+
|
18
|
+
# Implements the tc-classify command: reads text from standard input and
# prints the label the trained classifier assigns to it.
class Classifier < TinyClassifierBase
  class << self
    # Command line entry point.
    #
    # argv defaults to a copy of ARGV. The text to classify is read from
    # standard input.
    def run(argv=nil)
      argv ||= ARGV.dup
      classifier = new
      classifier.parse_command_line_options(argv)
      # Read stdin verbatim. readlines.join("\n") would double every newline
      # because readlines keeps the line terminators.
      classifier.run(input: $stdin.read)
    end
  end

  # Classifies params[:input] and prints the resulting label in lower case.
  #
  # Terminates the process: exit status 1 when the prepared input is empty,
  # 0 after printing the label.
  def run(params)
    @input = params[:input]
    prepare_input
    exit(1) if @input.empty?

    puts classifier.classify(@input).downcase
    exit(0)
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "tiny-classifier/base"
|
17
|
+
|
18
|
+
# Implements the tc-train command: reads text from standard input, trains
# the classifier with it under the given label and saves the result.
class Trainer < TinyClassifierBase
  class << self
    # Command line entry point.
    #
    # argv defaults to a copy of ARGV. The first non-option argument is the
    # label to train; the training text is read from standard input.
    def run(argv=nil)
      argv ||= ARGV.dup
      trainer = new
      rest = trainer.parse_command_line_options(argv)
      # Read stdin verbatim. readlines.join("\n") would double every newline
      # because readlines keeps the line terminators.
      trainer.run(label: rest.first,
                  input: $stdin.read)
    end
  end

  # Trains the classifier with params[:input] under params[:label] and
  # persists it.
  #
  # Terminates the process: exit status 1 when the prepared input is empty,
  # 0 after the training data has been saved.
  # Raises ArgumentError when there is input but no label.
  def run(params)
    @label = params[:label]
    @input = params[:input]
    prepare_input
    exit(1) if @input.empty?

    if @label.nil? || @label.strip.empty?
      raise ArgumentError,
            "label is required: pass one of the --labels values as the first argument"
    end

    # Use the public training API rather than building a method name from
    # user input and dispatching it through send.
    classifier.train(@label, @input)
    save
    exit(0)
  end

  private

  # Serializes the classifier to the data file. Marshal output is binary, so
  # it is written in binary mode.
  def save
    File.binwrite(data_file_path.to_s, Marshal.dump(classifier))
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
# Strips leading and trailing blank lines from a README section and ends it
# with exactly one newline.
clean_white_space = lambda do |entry|
  entry.gsub(/(\A\n+|\n+\z)/, '') + "\n"
end

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))

Gem::Specification.new do |spec|
  spec.name = "tiny-classifier"
  spec.version = "1.0"
  spec.homepage = "https://github.com/piroor/tiny-classifier"
  spec.authors = ["YUKI \"Piro\" Hiroshi"]
  spec.email = ["piro.outsider.reflex@gmail.com"]

  # The summary and description come from the "## Description" section of the
  # README: its first paragraph is the summary, the second (if any) the
  # description.
  readme = File.read("README.md", encoding: "UTF-8")
  entries = readme.split(/^\#\#\s(.*)$/)
  description = clean_white_space.call(entries[entries.index("Description") + 1])
  spec.summary, spec.description, = description.split(/\n\n+/, 3)

  # SPDX identifier for "GPLv3 or later".
  spec.license = "GPL-3.0-or-later"

  spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
  spec.files += Dir.glob("lib/**/*.rb")
  # List the executables without changing the process-wide working directory.
  spec.executables = Dir.glob("bin/*").map { |path| File.basename(path) }

  spec.add_runtime_dependency("classifier-reborn")
  spec.add_runtime_dependency("natto")
end
|
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tiny-classifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '1.0'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- YUKI "Piro" Hiroshi
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-06-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: classifier-reborn
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: natto
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: ''
|
42
|
+
email:
|
43
|
+
- piro.outsider.reflex@gmail.com
|
44
|
+
executables:
|
45
|
+
- tc-train
|
46
|
+
- tc-classify
|
47
|
+
extensions: []
|
48
|
+
extra_rdoc_files: []
|
49
|
+
files:
|
50
|
+
- Gemfile
|
51
|
+
- README.md
|
52
|
+
- Rakefile
|
53
|
+
- bin/tc-classify
|
54
|
+
- bin/tc-train
|
55
|
+
- lib/tiny-classifier/base.rb
|
56
|
+
- lib/tiny-classifier/classifier.rb
|
57
|
+
- lib/tiny-classifier/trainer.rb
|
58
|
+
- tiny-classifier.gemspec
|
59
|
+
homepage: https://github.com/piroor/tiny-classifier
|
60
|
+
licenses:
|
61
|
+
- GPLv3 or later
|
62
|
+
metadata: {}
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options: []
|
65
|
+
require_paths:
|
66
|
+
- lib
|
67
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
requirements: []
|
78
|
+
rubyforge_project:
|
79
|
+
rubygems_version: 2.5.1
|
80
|
+
signing_key:
|
81
|
+
specification_version: 4
|
82
|
+
summary: Command line tool to run text classifier based on naive bayes.
|
83
|
+
test_files: []
|