tiny-classifier 1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ef03bed349d3c266bab512446b6dff8946d5c14f
4
+ data.tar.gz: 768126f03759f1fe38ef2e155dad3484836f4422
5
+ SHA512:
6
+ metadata.gz: 9c59f06bef1b3fcc30914a410c29ac52444ea77790f5f4f588515c268bb181a2f353083bfe1ab3ceeba94964052807a6cb556ded863712b09f6488c7d1feff07
7
+ data.tar.gz: aa188deccf010883b6d04f3ae63ba0124d9ff4f586690cfccf921be5e1466deed11e72b021083142d0e37134f80a1b462af4e7989af0d3c69c830a19391000c9
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,58 @@
1
+ # README
2
+
3
+ ## Name
4
+
5
+ tiny-classifier
6
+
7
+ ## Description
8
+
9
+ Command line tool to run text classifier based on naive bayes.
10
+
11
+ ## Install
12
+
13
+ ```
14
+ % gem install tiny-classifier
15
+ ```
16
+
17
+ ## Basic usage
18
+
19
+ Training:
20
+
21
+ ```
22
+ % echo "Hello, world!" | tc-train --labels=positive,negative positive
23
+ % echo "I'm very very happy!" | tc-train --labels=positive,negative positive
24
+ % echo "I'm so bad..." | tc-train --labels=positive,negative negative
25
+ % echo "Oh my god!" | tc-train --labels=positive,negative negative
26
+ ```
27
+
28
+ Classifying:
29
+
30
+ ~~~
31
+ % echo "Happy day?" | tc-classify --labels=positive,negative
32
+ positive
33
+ ~~~
34
+
35
+ ## Command line parameters
36
+
37
+ ### Common
38
+
39
+ `--labels=LABELS` (required)
40
+ : A comma-separated list of labels. You should use only alphabetic characters. (Non-alphabetic characters will cause problems.)
41
+
42
+ `--data-dir=PATH` (optional)
43
+ : The path to the directory where the training data is saved. Defaults to the current directory.
44
+
45
+ `--tokenizer=TOKENIZER` (optional)
46
+ : Tokenizer for input that is not separated by whitespace. Currently the only possible value is `mecab`.
47
+
48
+ ### Trainer
49
+
50
+ `tc-train` requires one command line argument: the label. It must be one of the labels given via the `--labels` parameter.
51
+
52
+ ## Copyright
53
+
54
+ Copyright (c) 2017 YUKI "Piro" Hiroshi
55
+
56
+ ## License
57
+
58
+ GPLv3 or later. See LICENSE.txt for details.
data/Rakefile ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
# Running plain `rake` builds the gem.
task :default => :build

require "bundler/gem_helper"

# The gemspec is looked up in the directory that contains this Rakefile.
base_dir = File.dirname(__FILE__)

helper = Bundler::GemHelper.new(base_dir)

# Use the bare version string as the release tag name for this helper.
def helper.version_tag
  version
end

# Register the gem tasks provided by Bundler::GemHelper (the default task
# above depends on its :build task).
helper.install
data/bin/tc-classify ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+
18
require "tiny-classifier/classifier"

# Hand a copy of the command line to the classifier; the text to classify is
# read from standard input by Classifier.run.
Classifier.run(ARGV.dup)
data/bin/tc-train ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+
18
require "tiny-classifier/trainer"

# Hand a copy of the command line to the trainer; the training text is read
# from standard input by Trainer.run.
Trainer.run(ARGV.dup)
@@ -0,0 +1,129 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "pathname"
17
+ require "optparse"
18
+ require "classifier-reborn"
19
+
20
# Shared plumbing for the tiny-classifier command line tools: parsing of the
# common options, location of the persisted training data, tokenization of
# the input text and (de)serialization of the classifier object.
#
# Subclasses are expected to set @input before calling prepare_input.
class TinyClassifierBase
  # Values accepted by the --tokenizer option.
  TOKENIZERS = [:none, :mecab].freeze

  def initialize
    @tokenizer = :none
    @data_dir = Dir.pwd
  end

  # Parses the common options out of +command_line_options+.
  #
  # The given array is modified in place (recognized options are removed).
  #
  # @param command_line_options [Array<String>] raw arguments, e.g. ARGV.dup
  # @return [Array<String>] the remaining non-option arguments
  # @raise [OptionParser::ParseError] for unknown options or an unsupported
  #   tokenizer name
  def parse_command_line_options(command_line_options)
    create_option_parser.parse!(command_line_options)
  end

  private

  # Builds the parser for --data-dir, --labels and --tokenizer. The option
  # handlers store their values in instance variables of this object.
  def create_option_parser
    parser = OptionParser.new

    parser.on("--data-dir=PATH",
              "Path to the directory to store training data file (default=current directory)") do |data_dir|
      @data_dir = data_dir
    end

    parser.on("--labels=LABELS",
              "List of labels (comma-separated)") do |labels|
      @labels = normalize_labels(labels)
    end

    # Passing TOKENIZERS lets OptionParser reject any other value instead of
    # silently running without tokenization, and hands us the matching
    # symbol so no to_sym is performed on arbitrary user input.
    parser.on("--tokenizer=TOKENIZER", TOKENIZERS,
              "Tokenizer (default=#{@tokenizer})") do |tokenizer|
      @tokenizer = tokenizer
    end

    parser
  end

  # Turns "b, A,," into ["A", "B"]: entries are trimmed, blank entries are
  # dropped, and the result is sorted and capitalized so that the same set
  # of labels always produces the same list regardless of order or case.
  #
  # @param labels [String] comma-separated label names
  # @return [Array<String>]
  def normalize_labels(labels)
    labels
      .strip
      .downcase
      .split(",")
      .collect(&:strip)
      .reject(&:empty?)
      .sort
      .collect(&:capitalize)
  end

  # @return [String] memoized name of the data file for the current labels
  def data_file_name
    @data_file_basename ||= prepare_data_file_name
  end

  # @raise [ArgumentError] when no labels were given via --labels
  def prepare_data_file_name
    if @labels.nil? || @labels.empty?
      raise ArgumentError, "no labels given: the --labels option is required"
    end
    "tc.#{@labels.join('-').downcase}.dat"
  end

  # @return [Pathname] memoized location of the data file inside @data_dir
  def data_file_path
    @data_file_path ||= prepare_data_file_path
  end

  def prepare_data_file_path
    Pathname(@data_dir) + data_file_name
  end

  # @return [Object] memoized classifier, see prepare_classifier
  def classifier
    @classifier ||= prepare_classifier
  end

  # Restores the classifier from the data file when it exists, otherwise
  # creates a fresh ClassifierReborn::Bayes for the configured labels.
  def prepare_classifier
    if data_file_path.exist?
      # Marshal data is binary, so it must be read without newline or
      # encoding conversion.
      # NOTE(review): Marshal.load can instantiate arbitrary objects; the
      # data directory must only contain files written by this tool.
      Marshal.load(File.binread(data_file_path.to_s))
    else
      ClassifierReborn::Bayes.new(*@labels)
    end
  end

  # Tokenizes @input (when a tokenizer is selected) and trims surrounding
  # whitespace. A new string is assigned so the caller's object is not
  # mutated.
  def prepare_input
    tokenize
    @input = @input.strip
  end

  # Applies the selected tokenizer to @input; :none leaves it untouched.
  def tokenize
    tokenize_by_mecab if @tokenizer == :mecab
  end

  # Replaces @input with the space-separated surfaces of the terms whose
  # MeCab feature string mentions a noun, an adjective or a verb.
  def tokenize_by_mecab
    # Required lazily so that natto is only loaded when this tokenizer is
    # actually selected.
    require "natto"
    terms = []
    Natto::MeCab.new.parse(@input) do |term|
      terms << term.surface if term.feature =~ /名詞|形容詞|動詞/
    end
    @input = terms.join(" ").strip
  end

  # Persists the classifier to the data file, in binary mode to match the
  # binary read in prepare_classifier.
  def save
    File.binwrite(data_file_path.to_s, Marshal.dump(classifier))
  end
end
@@ -0,0 +1,41 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/base"
17
+
18
# Command line front end that prints the predicted label for the text given
# on standard input.
class Classifier < TinyClassifierBase
  class << self
    # Parses the options, reads the text from standard input and classifies
    # it. Terminates the process via #run.
    #
    # @param argv [Array<String>, nil] command line arguments; a copy of ARGV
    #   is used when omitted
    def run(argv=nil)
      argv ||= ARGV.dup
      classifier = new
      classifier.parse_command_line_options(argv)
      # Read stdin verbatim: readlines keeps each line terminator, so joining
      # the lines with "\n" would double every line break.
      classifier.run(input: $stdin.read)
    end
  end

  # Prints the lower-cased label chosen for the input and exits with status
  # 0, or exits with status 1 without output when the prepared input is
  # empty.
  #
  # @param params [Hash] :input is the text to classify
  def run(params)
    @input = params[:input]
    prepare_input
    exit(1) if @input.empty?

    puts classifier.classify(@input).downcase
    exit(0)
  end
end
@@ -0,0 +1,50 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/base"
17
+
18
# Command line front end that trains the classifier with the text given on
# standard input, for the label given as the first non-option argument.
class Trainer < TinyClassifierBase
  class << self
    # Parses the options, reads the training text from standard input and
    # trains. Terminates the process via #run.
    #
    # @param argv [Array<String>, nil] command line arguments; a copy of ARGV
    #   is used when omitted
    def run(argv=nil)
      argv ||= ARGV.dup
      trainer = new
      rest = trainer.parse_command_line_options(argv)
      # Read stdin verbatim: readlines keeps each line terminator, so joining
      # the lines with "\n" would double every line break.
      trainer.run(label: rest.first, input: $stdin.read)
    end
  end

  # Trains the classifier, saves it (using the inherited save) and exits
  # with status 0, or exits with status 1 when the prepared input is empty.
  #
  # @param params [Hash] :label is the label to train, :input the text
  # @raise [ArgumentError] when the label is missing or is not one of the
  #   labels given via --labels
  def run(params)
    @label = params[:label]
    @input = params[:input]
    prepare_input
    exit(1) if @input.empty?

    # The method name is built from user input, so only a label that was
    # declared via --labels is allowed to reach send.
    classifier.send("train_#{validated_label}", @input)
    save
    exit(0)
  end

  private

  # @return [String] the lower-cased label, guaranteed to be a known one
  def validated_label
    name = @label.to_s.strip.downcase
    known = Array(@labels).map(&:downcase)
    return name if known.include?(name)

    raise ArgumentError,
          "label must be one of the labels given via --labels " \
          "(#{known.join(', ')}), but got #{@label.inspect}"
  end
end
@@ -0,0 +1,42 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
# Drops the blank lines around a README section and terminates it with a
# single newline.
clean_white_space = ->(entry) { entry.gsub(/(\A\n+|\n+\z)/, '') + "\n" }

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))

Gem::Specification.new do |spec|
  spec.name = "tiny-classifier"
  spec.version = "1.0"
  spec.homepage = "https://github.com/piroor/tiny-classifier"
  spec.authors = ["YUKI \"Piro\" Hiroshi"]
  spec.email = ["piro.outsider.reflex@gmail.com"]

  # The summary and the description are taken from the first and the second
  # paragraph of the "Description" section of the README. Splitting on the
  # "## " headings yields alternating heading titles and section bodies, so
  # the body follows its title.
  readme = File.read("README.md").force_encoding("UTF-8")
  sections = readme.split(/^\#\#\s(.*)$/)
  body_position = sections.index("Description") + 1
  paragraphs = clean_white_space.call(sections[body_position]).split(/\n\n+/, 3)
  spec.summary = paragraphs[0]
  spec.description = paragraphs[1]

  spec.license = "GPLv3 or later"

  packaged = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
  spec.files = packaged + Dir.glob("lib/**/*.rb")
  # Globbing from inside bin/ yields bare file names, without the directory.
  spec.executables = Dir.chdir("bin") { Dir.glob("*") }

  spec.add_runtime_dependency("classifier-reborn")
  spec.add_runtime_dependency("natto")
end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tiny-classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: '1.0'
5
+ platform: ruby
6
+ authors:
7
+ - YUKI "Piro" Hiroshi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-06-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: classifier-reborn
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: natto
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: ''
42
+ email:
43
+ - piro.outsider.reflex@gmail.com
44
+ executables:
45
+ - tc-train
46
+ - tc-classify
47
+ extensions: []
48
+ extra_rdoc_files: []
49
+ files:
50
+ - Gemfile
51
+ - README.md
52
+ - Rakefile
53
+ - bin/tc-classify
54
+ - bin/tc-train
55
+ - lib/tiny-classifier/base.rb
56
+ - lib/tiny-classifier/classifier.rb
57
+ - lib/tiny-classifier/trainer.rb
58
+ - tiny-classifier.gemspec
59
+ homepage: https://github.com/piroor/tiny-classifier
60
+ licenses:
61
+ - GPLv3 or later
62
+ metadata: {}
63
+ post_install_message:
64
+ rdoc_options: []
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ requirements: []
78
+ rubyforge_project:
79
+ rubygems_version: 2.5.1
80
+ signing_key:
81
+ specification_version: 4
82
+ summary: Command line tool to run text classifier based on naive bayes.
83
+ test_files: []