tiny-classifier 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +3 -0
- data/README.md +58 -0
- data/Rakefile +28 -0
- data/bin/tc-classify +20 -0
- data/bin/tc-train +20 -0
- data/lib/tiny-classifier/base.rb +129 -0
- data/lib/tiny-classifier/classifier.rb +41 -0
- data/lib/tiny-classifier/trainer.rb +50 -0
- data/tiny-classifier.gemspec +42 -0
- metadata +83 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ef03bed349d3c266bab512446b6dff8946d5c14f
|
4
|
+
data.tar.gz: 768126f03759f1fe38ef2e155dad3484836f4422
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9c59f06bef1b3fcc30914a410c29ac52444ea77790f5f4f588515c268bb181a2f353083bfe1ab3ceeba94964052807a6cb556ded863712b09f6488c7d1feff07
|
7
|
+
data.tar.gz: aa188deccf010883b6d04f3ae63ba0124d9ff4f586690cfccf921be5e1466deed11e72b021083142d0e37134f80a1b462af4e7989af0d3c69c830a19391000c9
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# README
|
2
|
+
|
3
|
+
## Name
|
4
|
+
|
5
|
+
tiny-classifier
|
6
|
+
|
7
|
+
## Description
|
8
|
+
|
9
|
+
Command line tool to run text classifier based on naive bayes.
|
10
|
+
|
11
|
+
## Install
|
12
|
+
|
13
|
+
```
|
14
|
+
% gem install tiny-classifier
|
15
|
+
```
|
16
|
+
|
17
|
+
## Basic usage
|
18
|
+
|
19
|
+
Training:
|
20
|
+
|
21
|
+
```
|
22
|
+
% echo "Hello, world!" | tc-train --labels=positive,negative positive
|
23
|
+
% echo "I'm very very happy!" | tc-train --labels=positive,negative positive
|
24
|
+
% echo "I'm so bad..." | tc-train --labels=positive,negative negative
|
25
|
+
% echo "Oh my god!" | tc-train --labels=positive,negative negative
|
26
|
+
```
|
27
|
+
|
28
|
+
Classifying:
|
29
|
+
|
30
|
+
~~~
|
31
|
+
% echo "Happy day?" | tc-classify --labels=positive,negative
|
32
|
+
positive
|
33
|
+
~~~
|
34
|
+
|
35
|
+
## Command line parameters
|
36
|
+
|
37
|
+
### Common
|
38
|
+
|
39
|
+
`--labels=LABELS` (required)
|
40
|
+
: A comma-separated list of labels. You should use only alphabetic characters. (Non-alphabetical characters will cause problems.)
|
41
|
+
|
42
|
+
`--data-dir=PATH` (optional)
|
43
|
+
: The path to the directory where the training data is saved. Defaults to the current directory.
|
44
|
+
|
45
|
+
`--tokenizer=TOKENIZER` (optional)
|
46
|
+
: Tokenizer for input that is not separated by whitespace. Currently the only supported value is `mecab`.
|
47
|
+
|
48
|
+
### Trainer
|
49
|
+
|
50
|
+
`tc-train` requires one command line argument: the label. It must be one of the labels given via the `--labels` parameter.
|
51
|
+
|
52
|
+
## Copyright
|
53
|
+
|
54
|
+
Copyright (c) 2017 YUKI "Piro" Hiroshi
|
55
|
+
|
56
|
+
## License
|
57
|
+
|
58
|
+
GPLv3 or later. See LICENSE.txt for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
# Running `rake` without a task name runs the :build task.
# NOTE(review): :build is presumably defined by helper.install below — confirm
# against the Bundler::GemHelper documentation.
task :default => :build

require "bundler/gem_helper"

# Directory that contains this Rakefile.
base_dir = File.dirname(__FILE__)

helper = Bundler::GemHelper.new(base_dir)

# Override the tag name on this helper instance only, so that it is the bare
# version string.
def helper.version_tag
  version
end

helper.install
|
data/bin/tc-classify
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
|
18
|
+
require "tiny-classifier/classifier"
|
19
|
+
|
20
|
+
# Hand control to the classifier command; with no argument it works on ARGV.
Classifier.run
|
data/bin/tc-train
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
|
18
|
+
require "tiny-classifier/trainer"
|
19
|
+
|
20
|
+
# Hand control to the trainer command; with no argument it works on ARGV.
Trainer.run
|
@@ -0,0 +1,129 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "pathname"
|
17
|
+
require "optparse"
|
18
|
+
require "classifier-reborn"
|
19
|
+
|
20
|
+
# Shared foundation of the tc-train / tc-classify commands: parses the common
# command line options, prepares the input text, and loads or saves the
# classifier persisted under the data directory.
class TinyClassifierBase
  # Known tokenizer names. Only :mecab triggers any processing in #tokenize;
  # every other value leaves the input as it is.
  TOKENIZERS = [:none, :mecab].freeze

  def initialize
    @tokenizer = :none
    @data_dir = Dir.pwd
  end

  # Parses the common options out of +command_line_options+ (the array is
  # modified in place) and returns the remaining, non-option arguments.
  def parse_command_line_options(command_line_options)
    create_option_parser.parse!(command_line_options)
  end

  private

  # Builds the OptionParser for --data-dir, --labels and --tokenizer. Each
  # handler stores its value in the matching instance variable.
  def create_option_parser
    parser = OptionParser.new

    parser.on("--data-dir=PATH",
              "Path to the directory to store training data file (default=current directory)") do |data_dir|
      @data_dir = data_dir
    end

    parser.on("--labels=LABELS",
              "List of labels (comma-separated)") do |labels|
      @labels = normalize_labels(labels)
    end

    parser.on("--tokenizer=TOKENIZER",
              "Tokenizer (default=#{@tokenizer})") do |tokenizer|
      @tokenizer = tokenizer.to_sym
    end

    parser
  end

  # Turns a comma-separated string into a sorted array of capitalized labels,
  # dropping blank entries, e.g. " negative,Positive ," => ["Negative", "Positive"].
  # Sorting makes the result independent of the order given on the command line.
  def normalize_labels(labels)
    labels
      .strip
      .downcase
      .split(",")
      .map(&:strip)
      .reject(&:empty?)
      .sort
      .map(&:capitalize)
  end

  # Memoized file name of the training data.
  def data_file_name
    @data_file_name ||= prepare_data_file_name
  end

  # Derives the data file name from the labels ("tc.<label>-<label>.dat").
  #
  # Raises ArgumentError when no label was given via --labels; without this
  # check the failure surfaced as a NoMethodError on nil.
  def prepare_data_file_name
    if @labels.nil? || @labels.empty?
      raise ArgumentError, "no labels given; specify them via --labels=LABELS"
    end

    labels = @labels.join("-").downcase
    "tc.#{labels}.dat"
  end

  # Memoized Pathname of the training data file.
  def data_file_path
    @data_file_path ||= prepare_data_file_path
  end

  def prepare_data_file_path
    Pathname(@data_dir) + data_file_name
  end

  # Memoized classifier instance.
  def classifier
    @classifier ||= prepare_classifier
  end

  # Restores the classifier from the data file when it exists, otherwise
  # creates a fresh ClassifierReborn::Bayes for the configured labels.
  def prepare_classifier
    if data_file_path.exist?
      # Marshal data is binary, so it must be read without newline or encoding
      # translation.
      # SECURITY: Marshal.load can instantiate arbitrary objects; only point
      # --data-dir at data files you trust.
      Marshal.load(File.binread(data_file_path.to_s))
    else
      ClassifierReborn::Bayes.new(*@labels)
    end
  end

  # Tokenizes @input if a tokenizer is selected, then strips surrounding
  # whitespace from it in place.
  def prepare_input
    tokenize
    @input.strip!
  end

  def tokenize
    case @tokenizer
    when :mecab
      tokenize_by_mecab
    end
  end

  # Replaces @input with the space-separated surface forms of the terms whose
  # MeCab feature mentions a noun, adjective or verb.
  def tokenize_by_mecab
    # Loaded lazily so that natto is only needed when this tokenizer is used.
    require "natto"
    natto = Natto::MeCab.new
    terms = []
    natto.parse(@input) do |term|
      terms << term.surface if term.feature =~ /名詞|形容詞|動詞/
    end
    @input = terms.join(" ").strip
  end

  # Serializes the classifier with Marshal and writes it to the data file.
  # The file is opened in binary mode so the Marshal payload is stored verbatim.
  def save
    data = Marshal.dump(classifier)
    File.open(data_file_path, "wb") do |file|
      file.write(data)
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "tiny-classifier/base"
|
17
|
+
|
18
|
+
# Command behind tc-classify: reads text from standard input and prints the
# label chosen by the trained classifier.
class Classifier < TinyClassifierBase
  class << self
    # Command line entry point.
    #
    # argv - option array to parse; defaults to a copy of ARGV.
    #
    # Does not return normally: the instance-level #run terminates the process.
    def run(argv=nil)
      argv ||= ARGV.dup
      classifier = new
      classifier.parse_command_line_options(argv)
      # IO#readlines keeps each line terminator, so joining the lines with
      # "\n" doubled every line break; read the stream verbatim instead.
      input = $stdin.read
      classifier.run(input: input)
    end
  end

  # Classifies params[:input] and prints the label in lower case.
  #
  # Exits with status 1 when the prepared input is empty, 0 after printing.
  def run(params)
    @input = params[:input]
    prepare_input
    exit(1) if @input.empty?

    label = classifier.classify(@input)
    puts label.downcase
    exit(0)
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "tiny-classifier/base"
|
17
|
+
|
18
|
+
# Command behind tc-train: reads text from standard input and trains the
# classifier with it under the label given as the command line argument.
class Trainer < TinyClassifierBase
  class << self
    # Command line entry point.
    #
    # argv - option array to parse; defaults to a copy of ARGV. The first
    #        non-option argument is used as the label to train.
    #
    # Does not return normally: the instance-level #run terminates the process.
    def run(argv=nil)
      argv ||= ARGV.dup
      trainer = new
      labels = trainer.parse_command_line_options(argv)
      # IO#readlines keeps each line terminator, so joining the lines with
      # "\n" doubled every line break; read the stream verbatim instead.
      input = $stdin.read
      trainer.run(label: labels.first,
                  input: input)
    end
  end

  # Trains the classifier with params[:input] under params[:label] and
  # persists the result with the inherited TinyClassifierBase#save.
  #
  # Exits with status 1 when the label is missing or the prepared input is
  # empty, 0 after the data file has been written.
  def run(params)
    @label = params[:label]
    @input = params[:input]

    # Without a label the dynamic train_* call below would fail with a
    # NoMethodError on nil; report the actual problem instead.
    if @label.to_s.strip.empty?
      $stderr.puts("tc-train: a label is required as the command line argument")
      exit(1)
    end

    prepare_input
    exit(1) if @input.empty?

    # NOTE(review): the method name is built from user input; this presumably
    # relies on classifier-reborn resolving train_<label> dynamically — confirm.
    classifier.send("train_#{@label.downcase}", @input)
    save
    exit(0)
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Copyright (C) 2017 YUKI "Piro" Hiroshi
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
# Strips leading and trailing blank lines from a README section and
# terminates it with exactly one newline.
clean_white_space = lambda do |entry|
  entry.gsub(/(\A\n+|\n+\z)/, '') + "\n"
end

# Resolve everything relative to this file so the gemspec also loads
# correctly when the current directory is somewhere else.
base_dir = File.expand_path(File.dirname(__FILE__))

$LOAD_PATH.unshift(File.join(base_dir, "lib"))

Gem::Specification.new do |spec|
  spec.name = "tiny-classifier"
  spec.version = "1.0"
  spec.homepage = "https://github.com/piroor/tiny-classifier"
  spec.authors = ["YUKI \"Piro\" Hiroshi"]
  spec.email = ["piro.outsider.reflex@gmail.com"]

  # Summary and description come from the "## Description" section of the
  # README: first paragraph => summary, second paragraph => description.
  readme = File.read(File.join(base_dir, "README.md"))
  readme.force_encoding("UTF-8")
  entries = readme.split(/^\#\#\s(.*)$/)
  description = clean_white_space.call(entries[entries.index("Description") + 1])
  summary, long_description = description.split(/\n\n+/, 3)
  spec.summary = summary
  # A single-paragraph section has no second paragraph, which used to leave
  # the description empty; fall back to the summary text.
  spec.description = long_description || summary.strip

  # SPDX identifier for "GPLv3 or later".
  spec.license = "GPL-3.0-or-later"

  spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
  Dir.chdir(base_dir) do
    spec.files += Dir.glob("lib/**/*.rb")
    # Executables are listed by bare file name, relative to bin/.
    spec.executables = Dir.glob("bin/*").map { |path| File.basename(path) }
  end

  spec.add_runtime_dependency("classifier-reborn")
  spec.add_runtime_dependency("natto")
end
|
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tiny-classifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '1.0'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- YUKI "Piro" Hiroshi
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-06-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: classifier-reborn
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: natto
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: ''
|
42
|
+
email:
|
43
|
+
- piro.outsider.reflex@gmail.com
|
44
|
+
executables:
|
45
|
+
- tc-train
|
46
|
+
- tc-classify
|
47
|
+
extensions: []
|
48
|
+
extra_rdoc_files: []
|
49
|
+
files:
|
50
|
+
- Gemfile
|
51
|
+
- README.md
|
52
|
+
- Rakefile
|
53
|
+
- bin/tc-classify
|
54
|
+
- bin/tc-train
|
55
|
+
- lib/tiny-classifier/base.rb
|
56
|
+
- lib/tiny-classifier/classifier.rb
|
57
|
+
- lib/tiny-classifier/trainer.rb
|
58
|
+
- tiny-classifier.gemspec
|
59
|
+
homepage: https://github.com/piroor/tiny-classifier
|
60
|
+
licenses:
|
61
|
+
- GPLv3 or later
|
62
|
+
metadata: {}
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options: []
|
65
|
+
require_paths:
|
66
|
+
- lib
|
67
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
requirements: []
|
78
|
+
rubyforge_project:
|
79
|
+
rubygems_version: 2.5.1
|
80
|
+
signing_key:
|
81
|
+
specification_version: 4
|
82
|
+
summary: Command line tool to run text classifier based on naive bayes.
|
83
|
+
test_files: []
|