tiny-classifier 2.1 → 2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7eb179e8a3a17711921f6bd0af6132909ace2f90
4
- data.tar.gz: 2f029f934ddb3f5277980228800fa24093491bab
3
+ metadata.gz: 9e7e246aa9446d56c68ea631b0c2dda4bd0fb506
4
+ data.tar.gz: 42f9c4ca23ca91a6d2e9e2121d7eb860ded738c5
5
5
  SHA512:
6
- metadata.gz: 9ecc0831b783aed6891e09fe73260f0eef63f021e450bf4db89cea267b4797cffade8f74b05530a058646d77ca1265a24ea7287e6754e132ec1e724d0ae21611
7
- data.tar.gz: aa5e0822ebcbc19b5497f1596be44b3b01e0fe184903d8b111fa9eae635f5bfd8c94ed34bda5a98658514cf10d185806abbd0eb12a124beba92943750bf107a0
6
+ metadata.gz: 3964e9cc15c6c7a4e6f49a6a7e945f3596c863169c812303293ca1c92a66407a0d96bd6abc3e4a521fa40eca524eece08bb98315521cedfe1fc816e599af80fe
7
+ data.tar.gz: 0fb9acbac0b1fc12fa72381af63085f7bf2f1aef6674ecfced6ebf6d0645a7ca274d27ae708ce6ea26f85f54fefb66ccd6b1fe2b5e0ff6c3308d30257b489c75
data/Gemfile CHANGED
File without changes
data/README.md CHANGED
File without changes
data/Rakefile CHANGED
@@ -26,3 +26,8 @@ end
26
26
 
27
27
  helper.install
28
28
  spec = helper.gemspec
29
+
30
+ desc "Run tests"
31
+ task :test do
32
+ ruby("test/run-test.rb")
33
+ end
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/classifier"
18
+ require "tiny-classifier/command/classify"
19
19
 
20
- TinyClassifier::Classifier.run
20
+ TinyClassifier::Command::Classify.run
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/classifier-generator"
18
+ require "tiny-classifier/command/generate-classifier"
19
19
 
20
- TinyClassifier::ClassifierGenerator.run
20
+ TinyClassifier::Command::GenerateClassifier.run
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/retrainer"
18
+ require "tiny-classifier/command/retrain"
19
19
 
20
- TinyClassifier::Retrainer.run
20
+ TinyClassifier::Command::Retrain.run
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/trainer"
18
+ require "tiny-classifier/command/train"
19
19
 
20
- TinyClassifier::Trainer.run
20
+ TinyClassifier::Command::Train.run
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/untrainer"
18
+ require "tiny-classifier/command/untrain"
19
19
 
20
- TinyClassifier::Untrainer.run
20
+ TinyClassifier::Command::Untrain.run
@@ -0,0 +1,61 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module TinyClassifier
  # Keeps the list of category names given as one comma-separated string.
  #
  # Every name is normalized (surrounding blanks removed, lower-cased, then
  # capitalized); empty names are dropped, duplicates removed and the
  # remaining names sorted.
  class CategoryManager
    # NOTE(review): nothing in this class assigns @chosen, so this reader
    # returns nil here. It is kept because it is part of the public interface.
    attr_reader :chosen

    # @param categories [String] comma-separated category names
    def initialize(categories)
      @categories = categories.strip.split(",")
      normalize_all
      cleanup
    end

    # @return [Array<String>] the normalized, unique, sorted category names
    def all
      @categories
    end

    # @param category [String] a category name in any letter case
    # @return [Boolean] whether the normalized name is a known category
    def valid?(category)
      @categories.include?(normalize(category))
    end

    # @return [String] the lower-cased names joined with "-"
    def basename
      @categories.join("-").downcase
    end

    # @param category [String] a raw category name
    # @return [String] the name stripped and capitalized ("  sPaM " => "Spam")
    def normalize(category)
      category.downcase.strip.capitalize
    end

    private

    # Normalizes every stored name in place.
    def normalize_all
      @categories.map! { |category| normalize(category) }
    end

    # Drops empty names, removes duplicates and sorts the list in place.
    def cleanup
      @categories.reject!(&:empty?)
      @categories.uniq!
      @categories.sort!
    end
  end
end
@@ -0,0 +1,162 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "pathname"
17
+ require "optparse"
18
+ require "classifier-reborn"
19
+ require "tiny-classifier/tokenizer"
20
+ require "tiny-classifier/category-manager"
21
+ require "tiny-classifier/input"
22
+ require "tiny-classifier/errors"
23
+
24
module TinyClassifier
  module Command
    # Shared foundation of the tc-* commands.
    #
    # It defines the common command line options (data directory, categories,
    # tokenizer, verbosity), lazily loads or creates the classifier, and
    # provides helpers to persist the classifier, read the tokenized input
    # and report errors.
    class Base
      class << self
        # Instantiates the command and runs it.
        #
        # @param argv [Array<String>, nil] arguments; a copy of ARGV when nil
        # @return the result of the instance-level #run
        def run(argv=nil)
          argv ||= ARGV.dup
          new(argv).run
        end
      end

      attr_reader :tokenizer
      attr_writer :classifier

      # Sets the defaults only. +argv+ is accepted but not parsed here;
      # parsing happens through #parse_command_line_options.
      def initialize(argv=[])
        @categories = nil
        @tokenizer = Tokenizer.new
        @data_dir = Dir.pwd
        @verbose = false
      end

      # Verifies the precondition shared by all commands.
      #
      # @raise [NoCategories] when no categories option has been parsed
      def run
        raise NoCategories.new unless @categories
      end

      # Parses the options destructively.
      #
      # @param command_line_options [Array<String>] arguments to parse
      # @return [Array<String>] the remaining non-option arguments
      def parse_command_line_options(command_line_options)
        option_parser.parse!(command_line_options)
      end

      # @return the classifier assigned via #classifier=, or one loaded from
      #   (or created for) the data file on first access
      def classifier
        @classifier ||= prepare_classifier
      end

      # @return [String] the data file name derived from the categories
      def data_file_name
        "tc.#{@categories.basename}.dat"
      end

      # @return [Pathname] the memoized location of the data file
      def data_file_path
        @data_file_path ||= prepare_data_file_path
      end

      private

      def option_parser
        @option_parser ||= create_option_parser
      end

      # Builds the parser with the options common to every command.
      def create_option_parser
        parser = OptionParser.new

        parser.on("-d PATH", "--data-dir=PATH",
                  "Path to the directory to store training data file (default=current directory)") do |data_dir|
          @data_dir = data_dir
        end

        parser.on("-c CATEGORIES", "--categories=CATEGORIES",
                  "List of categories (comma-separated)") do |categories|
          @categories = CategoryManager.new(categories)
        end

        # NOTE(review): the help text interpolates the tokenizer object
        # itself, not its type; confirm that Tokenizer#to_s yields a
        # readable default.
        parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
                  "Tokenizer (default=#{@tokenizer})") do |tokenizer|
          @tokenizer.type = tokenizer
        end

        parser.on("-v", "--verbose",
                  "Output internal information (for debugging)") do |verbose|
          @verbose = verbose
        end

        parser
      end

      def prepare_data_file_path
        path = Pathname(@data_dir) + data_file_name
        log("file: #{path}")
        path
      end

      # Loads the classifier from the data file when it exists, otherwise
      # creates a fresh Bayes classifier for the configured categories.
      def prepare_classifier
        if data_file_path.exist?
          # Marshal data is binary, so it is read without newline or encoding
          # translation.
          # NOTE(review): Marshal.load can instantiate arbitrary objects; the
          # data file must come from a trusted location.
          Marshal.load(File.binread(data_file_path.to_s))
        else
          ClassifierReborn::Bayes.new(*@categories.all)
        end
      end

      # Serializes the classifier to the data file, replacing its contents.
      def save
        # Written in binary mode for the same reason it is read in binary mode.
        File.binwrite(data_file_path.to_s, Marshal.dump(classifier))
      end

      def input
        @input ||= prepare_input
      end

      # Reads the input and tokenizes it.
      #
      # @raise [NoInput] when no input was given
      def prepare_input
        input = Input.new
        raise NoInput.new unless input.given?
        tokenized = @tokenizer.tokenize(input.read)
        log("tokenizer: #{@tokenizer.type}")
        log("tokenized: #{tokenized}")
        tokenized
      end

      # Normalizes +category+ and checks it against the known categories.
      #
      # @raise [NoCategory] when +category+ is nil or false
      # @raise [InvalidCategory] when it is not one of the known categories
      # @return [String] the normalized category
      def prepare_category(category)
        raise NoCategory.new unless category

        category = @categories.normalize(category)

        unless @categories.valid?(category)
          raise InvalidCategory.new(category, @categories.all)
        end
        category
      end

      # Reports +error+ on the standard error stream: the message for the
      # errors of this library, the inspected form for anything else.
      #
      # @return [false] so that callers can use it as the command result
      def handle_error(error)
        case error
        when TinyClassifierError
          error(error.message)
        else
          error(error.inspect)
        end
        false
      end

      def error(message)
        $stderr.puts(message)
      end

      # Prints +message+ to the standard error stream in verbose mode only.
      def log(message)
        $stderr.puts(message) if @verbose
      end
    end
  end
end
@@ -13,29 +13,26 @@
13
13
  # You should have received a copy of the GNU General Public License
14
14
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
- require "tiny-classifier/base"
16
+ require "tiny-classifier/command/base"
17
17
 
18
18
  module TinyClassifier
19
- class Classifier < Base
20
- class << self
21
- def run(argv=nil)
22
- argv ||= ARGV.dup
23
- classifier = new
24
- classifier.parse_command_line_options(argv)
25
- classifier.run
19
+ module Command
20
+ class Classify < Base
21
+ def initialize(argv=[])
22
+ super
23
+ parse_command_line_options(argv)
26
24
  end
27
- end
28
25
 
29
- attr_writer :classifier
26
+ def run
27
+ super
28
+ raise NoEffectiveInput.new if input.empty?
29
+ raise NoTrainingData.new(data_file_path) unless data_file_path.exist?
30
30
 
31
- def run
32
- if input.empty?
33
- error("Error: No effective input.")
34
- false
35
- else
36
31
  category = classifier.classify(input)
37
- puts category.downcase
32
+ $stdout.puts(category.downcase)
38
33
  true
34
+ rescue StandardError => error
35
+ handle_error(error)
39
36
  end
40
37
  end
41
38
  end
@@ -0,0 +1,88 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/command/base"
17
+ require "fileutils"
18
+ require "base64"
19
+
20
module TinyClassifier
  module Command
    # Generates a standalone executable script that embeds the trained
    # classifier (Base64-encoded Marshal data) and classifies its input
    # through TinyClassifier::Command::Classify.
    class GenerateClassifier < Base
      # Adds the --output-dir option to the common ones and parses +argv+.
      def initialize(argv=[])
        super

        @output_dir = Dir.pwd
        option_parser.on("-o PATH", "--output-dir=PATH",
                         "Path to the classifier command to be saved (default=current directory)") do |output_dir|
          @output_dir = output_dir
        end

        parse_command_line_options(argv)
      end

      # Writes the classifier script and marks it executable.
      #
      # Errors are reported through handle_error instead of being raised.
      #
      # @return [Boolean] true on success, the result of handle_error otherwise
      def run
        super
        raise NoTrainingData.new(data_file_path) unless data_file_path.exist?

        # The output directory must already exist, so there is nothing to
        # create here.
        output_dir = output_file_path.parent
        raise InvalidOutputDir.new(output_dir) unless output_dir.exist?

        # The script is built before the file is opened, so a failure while
        # serializing the classifier cannot leave a truncated file behind.
        lines = script_lines
        File.open(output_file_path, "w") do |file|
          file.puts(lines)
        end
        FileUtils.chmod("a+x", output_file_path)
        true
      rescue StandardError => error
        handle_error(error)
      end

      # @return [String] the file name of the generated command
      def classifier_name
        @classifier_name ||= "tc-classify-#{@categories.basename}"
      end

      # @return [Pathname] the memoized location of the generated command
      def output_file_path
        @output_file_path ||= prepare_output_file_path
      end

      private

      # Builds the source lines of the generated script.
      #
      # Option values are emitted with String#inspect so that quotes,
      # backslashes or "#{" sequences in category or tokenizer names cannot
      # break, or inject code into, the generated Ruby source.
      def script_lines
        categories_option = "--categories=#{@categories.all.join(",")}"
        tokenizer_option = "--tokenizer=#{@tokenizer.type}"
        [
          "#!/usr/bin/env ruby",
          "require \"base64\"",
          "require \"classifier-reborn\"",
          "require \"tiny-classifier/command/classify\"",
          "classifier_code = Base64.strict_decode64(\"#{encoded_classifier}\")",
          "command = TinyClassifier::Command::Classify.new([",
          " #{categories_option.inspect},",
          " #{tokenizer_option.inspect},",
          "])",
          "command.classifier = Marshal.load(classifier_code)",
          "command.run",
        ]
      end

      def encoded_classifier
        @encoded_classifier ||= prepare_encoded_classifier
      end

      # Serializes the classifier with Marshal and encodes it as strict
      # Base64, the form embedded in the generated script.
      def prepare_encoded_classifier
        Base64.strict_encode64(Marshal.dump(classifier))
      end

      def prepare_output_file_path
        Pathname(@output_dir) + classifier_name
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/command/train"
17
+
18
module TinyClassifier
  module Command
    # Moves the input from a wrongly assigned category to the correct one:
    # the input is untrained from WRONG and trained as CORRECT.
    class Retrain < Base
      # Parses +argv+; the first two non-option arguments are the wrong and
      # the correct category, in that order.
      def initialize(argv=[])
        super
        option_parser.banner += " WRONG CORRECT"
        @wrong_category, @correct_category = parse_command_line_options(argv)
      end

      # Retrains the classifier with the input and saves it.
      #
      # Errors are reported through handle_error instead of being raised.
      #
      # @return [Boolean] true on success, the result of handle_error otherwise
      def run
        super
        prepare_categories
        raise NoEffectiveInput.new if input.empty?
        raise NoTrainingData.new(data_file_path) unless data_file_path.exist?

        classifier.untrain(@wrong_category, input)
        classifier.train(@correct_category, input)
        save
        true
      rescue StandardError => error
        handle_error(error)
      end

      private

      # Validates both categories, the wrong one first.
      def prepare_categories
        @wrong_category = checked_category(@wrong_category,
                                           NoWrongCategory,
                                           InvalidWrongCategory)
        @correct_category = checked_category(@correct_category,
                                             NoCorrectCategory,
                                             InvalidCorrectCategory)

        log("training as: #{@wrong_category} => #{@correct_category}")
      end

      # Validates +category+ and translates the generic category errors into
      # the role-specific ones; any other error propagates unchanged.
      #
      # @param category [String, nil] the raw category given by the user
      # @param missing_error [Class] raised when no category was given
      # @param invalid_error [Class] raised with the raw category and the
      #   list of known categories when the category is unknown
      # @return [String] the normalized category
      def checked_category(category, missing_error, invalid_error)
        prepare_category(category)
      rescue NoCategory
        raise missing_error.new
      rescue InvalidCategory
        raise invalid_error.new(category, @categories.all)
      end
    end
  end
end
@@ -13,33 +13,29 @@
13
13
  # You should have received a copy of the GNU General Public License
14
14
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
- require "tiny-classifier/trainer"
16
+ require "tiny-classifier/command/base"
17
17
 
18
18
  module TinyClassifier
19
- class Retrainer < Trainer
20
- class << self
21
- def run(argv=nil)
22
- argv ||= ARGV.dup
23
- retrainer = new
24
- *categories = retrainer.parse_command_line_options(argv)
25
- retrainer.run(wrong: categories[0],
26
- correct: categories[1])
19
+ module Command
20
+ class Train < Base
21
+ def initialize(argv=[])
22
+ super
23
+ option_parser.banner += " CATEGORY"
24
+ *categories = parse_command_line_options(argv)
25
+ @category = categories.first
27
26
  end
28
- end
29
27
 
30
- def run(params)
31
- if input.empty?
32
- error("Error: No effective input.")
33
- false
34
- else
35
- @category = params[:wrong]
36
- prepare_category
37
- classifier.send("untrain_#{@category}", input)
38
- @category = params[:correct]
39
- prepare_category
40
- classifier.send("train_#{@category}", input)
28
+ def run
29
+ super
30
+ @category = prepare_category(@category)
31
+ log("training as: #{@category}")
32
+ raise NoEffectiveInput.new if input.empty?
33
+
34
+ classifier.train(@category, input)
41
35
  save
42
36
  true
37
+ rescue StandardError => error
38
+ handle_error(error)
43
39
  end
44
40
  end
45
41
  end
@@ -0,0 +1,43 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/command/base"
17
+
18
module TinyClassifier
  module Command
    # Removes the input from the training data of one category.
    class Untrain < Base
      # Parses +argv+; the first non-option argument is the category.
      def initialize(argv=[])
        super
        option_parser.banner += " CATEGORY"
        @category, = parse_command_line_options(argv)
      end

      # Untrains the classifier with the input and saves it.
      #
      # Errors are reported through handle_error instead of being raised.
      #
      # @return [Boolean] true on success, the result of handle_error otherwise
      def run
        super
        @category = prepare_category(@category)
        log("untraining as: #{@category}")

        if input.empty?
          raise NoEffectiveInput.new
        end
        unless data_file_path.exist?
          raise NoTrainingData.new(data_file_path)
        end

        classifier.untrain(@category, input)
        save
        true
      rescue StandardError => failure
        handle_error(failure)
      end
    end
  end
end
@@ -0,0 +1,104 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module TinyClassifier
  # Root of all errors raised by this library.
  #
  # The subclasses override #to_s instead of #message. Exception#message
  # delegates to #to_s, so #message returns the same text, and string
  # interpolation and #inspect now carry it as well.
  class TinyClassifierError < StandardError
  end

  # Raised when no input was given.
  class NoInput < TinyClassifierError
    def to_s
      "No input. You need to give any input via the STDIN."
    end
  end

  # Raised when the input is empty.
  class NoEffectiveInput < TinyClassifierError
    def to_s
      "No effective input."
    end
  end

  # Raised when no list of categories was specified.
  class NoCategories < TinyClassifierError
    def to_s
      "You need to specify categories."
    end
  end

  # Raised when the category for the input is missing.
  class NoCategory < TinyClassifierError
    def to_s
      "You need to specify a category for the input."
    end
  end

  # Raised when the category to untrain is missing.
  class NoWrongCategory < NoCategory
    def to_s
      "You need to specify a category to untrain the input."
    end
  end

  # Raised when the category to retrain is missing.
  class NoCorrectCategory < NoCategory
    def to_s
      "You need to specify a category to retrain the input."
    end
  end

  # Raised when the given category is not one of the known categories.
  class InvalidCategory < TinyClassifierError
    attr_reader :category, :categories

    # @param category [String] the rejected category
    # @param categories [Array<String>] the valid categories
    def initialize(category, categories)
      super()
      @category = category
      @categories = categories
    end

    def to_s
      "You need to specify one of valid categories: #{@categories.join(", ")}"
    end
  end

  # Raised when the category to untrain is not a known category.
  class InvalidWrongCategory < InvalidCategory
    def to_s
      "You need to specify one of valid categories to untrain: #{@categories.join(", ")}"
    end
  end

  # Raised when the category to retrain is not a known category.
  class InvalidCorrectCategory < InvalidCategory
    def to_s
      "You need to specify one of valid categories to retrain: #{@categories.join(", ")}"
    end
  end

  # Raised when the training data is missing.
  class NoTrainingData < TinyClassifierError
    attr_reader :data_dir

    # @param data_dir the location reported in the message
    def initialize(data_dir)
      super()
      @data_dir = data_dir
    end

    def to_s
      "There is no training data at #{@data_dir}."
    end
  end

  # Raised when the output directory cannot be used.
  class InvalidOutputDir < TinyClassifierError
    attr_reader :output_dir

    # @param output_dir the rejected output directory
    def initialize(output_dir)
      super()
      @output_dir = output_dir
    end

    def to_s
      "#{@output_dir} is not available as the output directory."
    end
  end
end
@@ -13,21 +13,19 @@
13
13
  # You should have received a copy of the GNU General Public License
14
14
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
- require "tiny-classifier/trainer"
17
-
18
16
  module TinyClassifier
19
- class Untrainer < Trainer
20
- def run(params)
21
- @category = params[:category]
22
- prepare_category
23
- if input.empty?
24
- error("Error: No effective input.")
25
- false
26
- else
27
- classifier.send("untrain_#{@category}", input)
28
- save
29
- true
30
- end
17
+ class Input
18
+ def initialize(data = nil)
19
+ @data = data
20
+ end
21
+
22
+ def given?
23
+ return true if @data or $stdin.is_a?(StringIO)
24
+ File.pipe?(STDIN)
25
+ end
26
+
27
+ def read
28
+ @data ||= $stdin.readlines.join(" ").strip
31
29
  end
32
30
  end
33
31
  end
File without changes
@@ -21,7 +21,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
21
21
 
22
22
  Gem::Specification.new do |spec|
23
23
  spec.name = "tiny-classifier"
24
- spec.version = "2.1"
24
+ spec.version = "2.2"
25
25
  spec.homepage = "https://github.com/piroor/tiny-classifier"
26
26
  spec.authors = ["YUKI \"Piro\" Hiroshi"]
27
27
  spec.email = ["piro.outsider.reflex@gmail.com"]
@@ -39,4 +39,8 @@ Gem::Specification.new do |spec|
39
39
 
40
40
  spec.add_runtime_dependency("classifier-reborn")
41
41
  spec.add_runtime_dependency("natto")
42
+
43
+ spec.add_development_dependency("bundler")
44
+ spec.add_development_dependency("rake")
45
+ spec.add_development_dependency("test-unit")
42
46
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiny-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: '2.1'
4
+ version: '2.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - YUKI "Piro" Hiroshi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-22 00:00:00.000000000 Z
11
+ date: 2017-06-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: classifier-reborn
@@ -38,15 +38,57 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: test-unit
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
41
83
  description: ''
42
84
  email:
43
85
  - piro.outsider.reflex@gmail.com
44
86
  executables:
45
- - tc-classify
46
87
  - tc-generate-classifier
88
+ - tc-untrain
47
89
  - tc-retrain
48
90
  - tc-train
49
- - tc-untrain
91
+ - tc-classify
50
92
  extensions: []
51
93
  extra_rdoc_files: []
52
94
  files:
@@ -58,13 +100,16 @@ files:
58
100
  - bin/tc-retrain
59
101
  - bin/tc-train
60
102
  - bin/tc-untrain
61
- - lib/tiny-classifier/base.rb
62
- - lib/tiny-classifier/classifier-generator.rb
63
- - lib/tiny-classifier/classifier.rb
64
- - lib/tiny-classifier/retrainer.rb
103
+ - lib/tiny-classifier/category-manager.rb
104
+ - lib/tiny-classifier/command/base.rb
105
+ - lib/tiny-classifier/command/classify.rb
106
+ - lib/tiny-classifier/command/generate-classifier.rb
107
+ - lib/tiny-classifier/command/retrain.rb
108
+ - lib/tiny-classifier/command/train.rb
109
+ - lib/tiny-classifier/command/untrain.rb
110
+ - lib/tiny-classifier/errors.rb
111
+ - lib/tiny-classifier/input.rb
65
112
  - lib/tiny-classifier/tokenizer.rb
66
- - lib/tiny-classifier/trainer.rb
67
- - lib/tiny-classifier/untrainer.rb
68
113
  - tiny-classifier.gemspec
69
114
  homepage: https://github.com/piroor/tiny-classifier
70
115
  licenses:
@@ -1,136 +0,0 @@
1
- # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
- #
3
- # This program is free software: you can redistribute it and/or modify
4
- # it under the terms of the GNU General Public License as published by
5
- # the Free Software Foundation, either version 3 of the License, or
6
- # (at your option) any later version.
7
- #
8
- # This program is distributed in the hope that it will be useful,
9
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
- # GNU General Public License for more details.
12
- #
13
- # You should have received a copy of the GNU General Public License
14
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
-
16
- require "pathname"
17
- require "optparse"
18
- require "classifier-reborn"
19
- require "tiny-classifier/tokenizer"
20
-
21
- module TinyClassifier
22
- class Base
23
- attr_reader :tokenizer
24
-
25
- def initialize
26
- @tokenizer = Tokenizer.new
27
- @data_dir = Dir.pwd
28
- @verbose = false
29
- end
30
-
31
- def parse_command_line_options(command_line_options)
32
- option_parser.parse!(command_line_options)
33
- end
34
-
35
- def classifier
36
- @classifier ||= prepare_classifier
37
- end
38
-
39
- private
40
- def option_parser
41
- @option_parser ||= create_option_parser
42
- end
43
-
44
- def create_option_parser
45
- parser = OptionParser.new
46
-
47
- parser.on("-d PATH", "--data-dir=PATH",
48
- "Path to the directory to store training data file (default=current directory)") do |data_dir|
49
- @data_dir = data_dir
50
- end
51
-
52
- parser.on("-c CATEGORIES", "--categories=CATEGORIES",
53
- "List of categories (comma-separated)") do |categories|
54
- @categories = normalize_categories(categories)
55
- log("categories: #{@categories}")
56
- end
57
-
58
- parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
59
- "Tokenizer (default=#{@tokenizer})") do |tokenizer|
60
- @tokenizer.type = tokenizer
61
- end
62
-
63
- parser.on("-v", "--verbose",
64
- "Output internal information (for debugging)") do |verbose|
65
- @verbose = verbose
66
- end
67
-
68
- parser
69
- end
70
-
71
- def normalize_categories(categories)
72
- categories
73
- .strip
74
- .downcase
75
- .split(",")
76
- .collect(&:strip)
77
- .reject do |category|
78
- category.empty?
79
- end
80
- .sort
81
- .collect(&:capitalize)
82
- end
83
-
84
- def data_file_name
85
- @data_file_basename ||= prepare_data_file_name
86
- end
87
-
88
- def prepare_data_file_name
89
- categories = @categories.join("-").downcase
90
- "tc.#{categories}.dat"
91
- end
92
-
93
- def data_file_path
94
- @data_file_path ||= prepare_data_file_path
95
- end
96
-
97
- def prepare_data_file_path
98
- path = Pathname(@data_dir)
99
- path + data_file_name
100
- end
101
-
102
- def prepare_classifier
103
- if data_file_path.exist?
104
- data = File.read(data_file_path.to_s)
105
- Marshal.load(data)
106
- else
107
- ClassifierReborn::Bayes.new(*@categories)
108
- end
109
- end
110
-
111
- def input
112
- @input ||= prepare_input
113
- end
114
-
115
- def prepare_input
116
- unless File.pipe?(STDIN)
117
- error("Error: No effective input. You need to give any input via the STDIN.")
118
- exit(false)
119
- end
120
- @input = $stdin.readlines.join(" ")
121
- @input = @tokenizer.tokenize(@input)
122
- log("tokenizer: #{@tokenizer.type}")
123
- @input.strip!
124
- log("input: #{@input}")
125
- @input
126
- end
127
-
128
- def error(message)
129
- STDERR.puts(message)
130
- end
131
-
132
- def log(message)
133
- STDERR.puts(message) if @verbose
134
- end
135
- end
136
- end
@@ -1,88 +0,0 @@
1
- # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
- #
3
- # This program is free software: you can redistribute it and/or modify
4
- # it under the terms of the GNU General Public License as published by
5
- # the Free Software Foundation, either version 3 of the License, or
6
- # (at your option) any later version.
7
- #
8
- # This program is distributed in the hope that it will be useful,
9
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
- # GNU General Public License for more details.
12
- #
13
- # You should have received a copy of the GNU General Public License
14
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
-
16
- require "tiny-classifier/base"
17
- require "tiny-classifier/classifier"
18
- require "fileutils"
19
- require "base64"
20
-
21
- module TinyClassifier
22
- class ClassifierGenerator < Base
23
- class << self
24
- def run(argv=nil)
25
- argv ||= ARGV.dup
26
- generator = new
27
- generator.parse_command_line_options(argv)
28
- generator.run
29
- end
30
- end
31
-
32
- def initialize
33
- super
34
- @output_dir = Dir.pwd
35
- option_parser.on("-o PATH", "--output-dir=PATH",
36
- "Path to the classifier command to be saved (default=current directory)") do |output_dir|
37
- @output_dir = output_dir
38
- end
39
- end
40
-
41
- def run
42
- File.open(output_file_path, "w") do |file|
43
- file.puts("#!/usr/bin/env ruby")
44
- file.puts("require \"base64\"")
45
- file.puts("require \"classifier-reborn\"")
46
- file.puts("require \"tiny-classifier/classifier\"")
47
- file.puts("classifier_code = Base64.strict_decode64(\"#{encoded_classifier}\")")
48
- file.puts("classifier = TinyClassifier::Classifier.new")
49
- file.puts("classifier.classifier = Marshal.load(classifier_code)")
50
- file.puts("classifier.tokenizer.type = \"#{@tokenizer.type}\"")
51
- file.puts("classifier.run")
52
- end
53
- FileUtils.chmod("a+x", output_file_path)
54
- end
55
-
56
- private
57
- def encoded_classifier
58
- @encoded_classifier ||= prepare_encoded_classifier
59
- end
60
-
61
- def prepare_encoded_classifier
62
- classifier = Classifier.new
63
- classifier.parse_command_line_options(ARGV.dup)
64
- FileUtils.mkdir_p(output_file_path.parent)
65
-
66
- classifier_code = Marshal.dump(classifier.classifier)
67
- Base64.strict_encode64(classifier_code)
68
- end
69
-
70
- def classifier_name
71
- @classifier_name ||= prepare_classifier_name
72
- end
73
-
74
- def prepare_classifier_name
75
- categories = @categories.join("-").downcase
76
- "tc-classify-#{categories}"
77
- end
78
-
79
- def output_file_path
80
- @output_file_path ||= prepare_output_file_path
81
- end
82
-
83
- def prepare_output_file_path
84
- path = Pathname(@output_dir)
85
- path + classifier_name
86
- end
87
- end
88
- end
@@ -1,74 +0,0 @@
1
- # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
- #
3
- # This program is free software: you can redistribute it and/or modify
4
- # it under the terms of the GNU General Public License as published by
5
- # the Free Software Foundation, either version 3 of the License, or
6
- # (at your option) any later version.
7
- #
8
- # This program is distributed in the hope that it will be useful,
9
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
- # GNU General Public License for more details.
12
- #
13
- # You should have received a copy of the GNU General Public License
14
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
-
16
- require "tiny-classifier/base"
17
-
18
- module TinyClassifier
19
- class Trainer < Base
20
- class << self
21
- def run(argv=nil)
22
- argv ||= ARGV.dup
23
- trainer = new
24
- *categories = trainer.parse_command_line_options(argv)
25
- trainer.run(category: categories.first)
26
- end
27
- end
28
-
29
- def initialize
30
- super
31
- option_parser.banner += " CATEGORY"
32
- end
33
-
34
- def run(params)
35
- @category = params[:category]
36
- prepare_category
37
- if input.empty?
38
- error("Error: No effective input.")
39
- false
40
- else
41
- classifier.send("train_#{@category}", input)
42
- save
43
- true
44
- end
45
- end
46
-
47
- private
48
- def prepare_category
49
- unless @category
50
- error("Error: You need to specify the category for the input.")
51
- exit(false)
52
- end
53
-
54
- @category = @category.downcase.strip
55
-
56
- if @category.empty?
57
- error("Error: You need to specify the category for the input.")
58
- exit(false)
59
- end
60
-
61
- unless @categories.include?(@category.capitalize)
62
- error("Error: You need to specify one of valid categories: #{@categories.join(', ')}")
63
- exit(false)
64
- end
65
- end
66
-
67
- def save
68
- data = Marshal.dump(classifier)
69
- File.open(data_file_path, "w") do |file|
70
- file.write(data)
71
- end
72
- end
73
- end
74
- end