tiny-classifier 2.1 → 2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7eb179e8a3a17711921f6bd0af6132909ace2f90
4
- data.tar.gz: 2f029f934ddb3f5277980228800fa24093491bab
3
+ metadata.gz: 9e7e246aa9446d56c68ea631b0c2dda4bd0fb506
4
+ data.tar.gz: 42f9c4ca23ca91a6d2e9e2121d7eb860ded738c5
5
5
  SHA512:
6
- metadata.gz: 9ecc0831b783aed6891e09fe73260f0eef63f021e450bf4db89cea267b4797cffade8f74b05530a058646d77ca1265a24ea7287e6754e132ec1e724d0ae21611
7
- data.tar.gz: aa5e0822ebcbc19b5497f1596be44b3b01e0fe184903d8b111fa9eae635f5bfd8c94ed34bda5a98658514cf10d185806abbd0eb12a124beba92943750bf107a0
6
+ metadata.gz: 3964e9cc15c6c7a4e6f49a6a7e945f3596c863169c812303293ca1c92a66407a0d96bd6abc3e4a521fa40eca524eece08bb98315521cedfe1fc816e599af80fe
7
+ data.tar.gz: 0fb9acbac0b1fc12fa72381af63085f7bf2f1aef6674ecfced6ebf6d0645a7ca274d27ae708ce6ea26f85f54fefb66ccd6b1fe2b5e0ff6c3308d30257b489c75
data/Gemfile CHANGED
File without changes
data/README.md CHANGED
File without changes
data/Rakefile CHANGED
@@ -26,3 +26,8 @@ end
26
26
 
27
27
  helper.install
28
28
  spec = helper.gemspec
29
+
30
+ desc "Run tests"
31
+ task :test do
32
+ ruby("test/run-test.rb")
33
+ end
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/classifier"
18
+ require "tiny-classifier/command/classify"
19
19
 
20
- TinyClassifier::Classifier.run
20
+ TinyClassifier::Command::Classify.run
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/classifier-generator"
18
+ require "tiny-classifier/command/generate-classifier"
19
19
 
20
- TinyClassifier::ClassifierGenerator.run
20
+ TinyClassifier::Command::GenerateClassifier.run
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/retrainer"
18
+ require "tiny-classifier/command/retrain"
19
19
 
20
- TinyClassifier::Retrainer.run
20
+ TinyClassifier::Command::Retrain.run
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/trainer"
18
+ require "tiny-classifier/command/train"
19
19
 
20
- TinyClassifier::Trainer.run
20
+ TinyClassifier::Command::Train.run
@@ -15,6 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License
16
16
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "tiny-classifier/untrainer"
18
+ require "tiny-classifier/command/untrain"
19
19
 
20
- TinyClassifier::Untrainer.run
20
+ TinyClassifier::Command::Untrain.run
@@ -0,0 +1,61 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ module TinyClassifier
17
+ class CategoryManager
18
+ attr_reader :chosen
19
+
20
+ def initialize(categories)
21
+ @categories = categories.strip.split(",")
22
+ normalize_all
23
+ clanup
24
+ end
25
+
26
+ def all
27
+ @categories
28
+ end
29
+
30
+ def valid?(category)
31
+ category = normalize(category)
32
+ @categories.include?(category)
33
+ end
34
+
35
+ def basename
36
+ @categories.join("-").downcase
37
+ end
38
+
39
+ def normalize(category)
40
+ category
41
+ .downcase
42
+ .strip
43
+ .capitalize
44
+ end
45
+
46
+ private
47
+ def normalize_all
48
+ @categories.collect! do |category|
49
+ normalize(category)
50
+ end
51
+ end
52
+
53
+ def clanup
54
+ @categories.reject! do |category|
55
+ category.empty?
56
+ end
57
+ @categories.uniq!
58
+ @categories.sort!
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,162 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "pathname"
17
+ require "optparse"
18
+ require "classifier-reborn"
19
+ require "tiny-classifier/tokenizer"
20
+ require "tiny-classifier/category-manager"
21
+ require "tiny-classifier/input"
22
+ require "tiny-classifier/errors"
23
+
24
+ module TinyClassifier
25
+ module Command
26
+ class Base
27
+ class << self
28
+ def run(argv=nil)
29
+ argv ||= ARGV.dup
30
+ command = new(argv)
31
+ command.run
32
+ end
33
+ end
34
+
35
+ attr_reader :tokenizer
36
+ attr_writer :classifier
37
+
38
+ def initialize(argv=[])
39
+ @categories = nil
40
+ @tokenizer = Tokenizer.new
41
+ @data_dir = Dir.pwd
42
+ @verbose = false
43
+ end
44
+
45
+ def run
46
+ raise NoCategories.new unless @categories
47
+ end
48
+
49
+ def parse_command_line_options(command_line_options)
50
+ option_parser.parse!(command_line_options)
51
+ end
52
+
53
+ def classifier
54
+ @classifier ||= prepare_classifier
55
+ end
56
+
57
+ def data_file_name
58
+ "tc.#{@categories.basename}.dat"
59
+ end
60
+
61
+ def data_file_path
62
+ @data_file_path ||= prepare_data_file_path
63
+ end
64
+
65
+ private
66
+ def option_parser
67
+ @option_parser ||= create_option_parser
68
+ end
69
+
70
+ def create_option_parser
71
+ parser = OptionParser.new
72
+
73
+ parser.on("-d PATH", "--data-dir=PATH",
74
+ "Path to the directory to store training data file (default=current directory)") do |data_dir|
75
+ @data_dir = data_dir
76
+ end
77
+
78
+ parser.on("-c CATEGORIES", "--categories=CATEGORIES",
79
+ "List of categories (comma-separated)") do |categories|
80
+ @categories = CategoryManager.new(categories)
81
+ end
82
+
83
+ parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
84
+ "Tokenizer (default=#{@tokenizer})") do |tokenizer|
85
+ @tokenizer.type = tokenizer
86
+ end
87
+
88
+ parser.on("-v", "--verbose",
89
+ "Output internal information (for debugging)") do |verbose|
90
+ @verbose = verbose
91
+ end
92
+
93
+ parser
94
+ end
95
+
96
+ def prepare_data_file_path
97
+ path = Pathname(@data_dir)
98
+ path += data_file_name
99
+ log("file: #{path}")
100
+ path
101
+ end
102
+
103
+ def prepare_classifier
104
+ if data_file_path.exist?
105
+ data = File.read(data_file_path.to_s)
106
+ Marshal.load(data)
107
+ else
108
+ ClassifierReborn::Bayes.new(*@categories.all)
109
+ end
110
+ end
111
+
112
+ def save
113
+ data = Marshal.dump(classifier)
114
+ File.open(data_file_path, "w") do |file|
115
+ file.write(data)
116
+ end
117
+ end
118
+
119
+ def input
120
+ @input ||= prepare_input
121
+ end
122
+
123
+ def prepare_input
124
+ input = Input.new
125
+ raise NoInput.new unless input.given?
126
+ tokenized = @tokenizer.tokenize(input.read)
127
+ log("tokenizer: #{@tokenizer.type}")
128
+ log("tokenized: #{tokenized}")
129
+ tokenized
130
+ end
131
+
132
+ def prepare_category(category)
133
+ raise NoCategory.new unless category
134
+
135
+ category = @categories.normalize(category)
136
+
137
+ unless @categories.valid?(category)
138
+ raise InvalidCategory.new(category, @categories.all)
139
+ end
140
+ category
141
+ end
142
+
143
+ def handle_error(error)
144
+ case error
145
+ when TinyClassifierError
146
+ error(error.message)
147
+ else
148
+ error(error.inspect)
149
+ end
150
+ false
151
+ end
152
+
153
+ def error(message)
154
+ $stderr.puts(message)
155
+ end
156
+
157
+ def log(message)
158
+ $stderr.puts(message) if @verbose
159
+ end
160
+ end
161
+ end
162
+ end
@@ -13,29 +13,26 @@
13
13
  # You should have received a copy of the GNU General Public License
14
14
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
- require "tiny-classifier/base"
16
+ require "tiny-classifier/command/base"
17
17
 
18
18
  module TinyClassifier
19
- class Classifier < Base
20
- class << self
21
- def run(argv=nil)
22
- argv ||= ARGV.dup
23
- classifier = new
24
- classifier.parse_command_line_options(argv)
25
- classifier.run
19
+ module Command
20
+ class Classify < Base
21
+ def initialize(argv=[])
22
+ super
23
+ parse_command_line_options(argv)
26
24
  end
27
- end
28
25
 
29
- attr_writer :classifier
26
+ def run
27
+ super
28
+ raise NoEffectiveInput.new if input.empty?
29
+ raise NoTrainingData.new(data_file_path) unless data_file_path.exist?
30
30
 
31
- def run
32
- if input.empty?
33
- error("Error: No effective input.")
34
- false
35
- else
36
31
  category = classifier.classify(input)
37
- puts category.downcase
32
+ $stdout.puts(category.downcase)
38
33
  true
34
+ rescue StandardError => error
35
+ handle_error(error)
39
36
  end
40
37
  end
41
38
  end
@@ -0,0 +1,88 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/command/base"
17
+ require "fileutils"
18
+ require "base64"
19
+
20
+ module TinyClassifier
21
+ module Command
22
+ class GenerateClassifier < Base
23
+ def initialize(argv=[])
24
+ super
25
+
26
+ @output_dir = Dir.pwd
27
+ option_parser.on("-o PATH", "--output-dir=PATH",
28
+ "Path to the classifier command to be saved (default=current directory)") do |output_dir|
29
+ @output_dir = output_dir
30
+ end
31
+
32
+ parse_command_line_options(argv)
33
+ end
34
+
35
+ def run
36
+ super
37
+ unless data_file_path.exist?
38
+ raise NoTrainingData.new(data_file_path)
39
+ end
40
+ unless prepare_output_file_path.parent.exist?
41
+ raise InvalidOutputDir.new(prepare_output_file_path.parent)
42
+ end
43
+
44
+ FileUtils.mkdir_p(output_file_path.parent)
45
+ File.open(output_file_path, "w") do |file|
46
+ file.puts("#!/usr/bin/env ruby")
47
+ file.puts("require \"base64\"")
48
+ file.puts("require \"classifier-reborn\"")
49
+ file.puts("require \"tiny-classifier/command/classify\"")
50
+ file.puts("classifier_code = Base64.strict_decode64(\"#{encoded_classifier}\")")
51
+ file.puts("command = TinyClassifier::Command::Classify.new([")
52
+ file.puts(" \"--categories=#{@categories.all.join(",")}\",")
53
+ file.puts(" \"--tokenizer=#{@tokenizer.type}\",")
54
+ file.puts("])")
55
+ file.puts("command.classifier = Marshal.load(classifier_code)")
56
+ file.puts("command.run")
57
+ end
58
+ FileUtils.chmod("a+x", output_file_path)
59
+ true
60
+ rescue StandardError => error
61
+ handle_error(error)
62
+ end
63
+
64
+ def classifier_name
65
+ @classifier_name ||= "tc-classify-#{@categories.basename}"
66
+ end
67
+
68
+ def output_file_path
69
+ @output_file_path ||= prepare_output_file_path
70
+ end
71
+
72
+ private
73
+ def encoded_classifier
74
+ @encoded_classifier ||= prepare_encoded_classifier
75
+ end
76
+
77
+ def prepare_encoded_classifier
78
+ classifier_code = Marshal.dump(classifier)
79
+ Base64.strict_encode64(classifier_code)
80
+ end
81
+
82
+ def prepare_output_file_path
83
+ path = Pathname(@output_dir)
84
+ path + classifier_name
85
+ end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,75 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/command/train"
17
+
18
+ module TinyClassifier
19
+ module Command
20
+ class Retrain < Base
21
+ def initialize(argv=[])
22
+ super
23
+ option_parser.banner += " WRONG CORRECT"
24
+ *categories = parse_command_line_options(argv)
25
+ @wrong_category = categories.shift
26
+ @correct_category = categories.shift
27
+ end
28
+
29
+ def run
30
+ super
31
+ prepare_categories
32
+ raise NoEffectiveInput.new if input.empty?
33
+ raise NoTrainingData.new(data_file_path) unless data_file_path.exist?
34
+
35
+ classifier.untrain(@wrong_category, input)
36
+ classifier.train(@correct_category, input)
37
+ save
38
+ true
39
+ rescue StandardError => error
40
+ handle_error(error)
41
+ end
42
+
43
+ private
44
+ def prepare_categories
45
+ begin
46
+ @wrong_category = prepare_category(@wrong_category)
47
+ rescue StandardError => error
48
+ case error
49
+ when NoCategory
50
+ raise NoWrongCategory.new
51
+ when InvalidCategory
52
+ raise InvalidWrongCategory.new(@wrong_category, @categories.all)
53
+ else
54
+ raise error
55
+ end
56
+ end
57
+
58
+ begin
59
+ @correct_category = prepare_category(@correct_category)
60
+ rescue StandardError => error
61
+ case error
62
+ when NoCategory
63
+ raise NoCorrectCategory.new
64
+ when InvalidCategory
65
+ raise InvalidCorrectCategory.new(@correct_category, @categories.all)
66
+ else
67
+ raise error
68
+ end
69
+ end
70
+
71
+ log("training as: #{@wrong_category} => #{@correct_category}")
72
+ end
73
+ end
74
+ end
75
+ end
@@ -13,33 +13,29 @@
13
13
  # You should have received a copy of the GNU General Public License
14
14
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
- require "tiny-classifier/trainer"
16
+ require "tiny-classifier/command/base"
17
17
 
18
18
  module TinyClassifier
19
- class Retrainer < Trainer
20
- class << self
21
- def run(argv=nil)
22
- argv ||= ARGV.dup
23
- retrainer = new
24
- *categories = retrainer.parse_command_line_options(argv)
25
- retrainer.run(wrong: categories[0],
26
- correct: categories[1])
19
+ module Command
20
+ class Train < Base
21
+ def initialize(argv=[])
22
+ super
23
+ option_parser.banner += " CATEGORY"
24
+ *categories = parse_command_line_options(argv)
25
+ @category = categories.first
27
26
  end
28
- end
29
27
 
30
- def run(params)
31
- if input.empty?
32
- error("Error: No effective input.")
33
- false
34
- else
35
- @category = params[:wrong]
36
- prepare_category
37
- classifier.send("untrain_#{@category}", input)
38
- @category = params[:correct]
39
- prepare_category
40
- classifier.send("train_#{@category}", input)
28
+ def run
29
+ super
30
+ @category = prepare_category(@category)
31
+ log("training as: #{@category}")
32
+ raise NoEffectiveInput.new if input.empty?
33
+
34
+ classifier.train(@category, input)
41
35
  save
42
36
  true
37
+ rescue StandardError => error
38
+ handle_error(error)
43
39
  end
44
40
  end
45
41
  end
@@ -0,0 +1,43 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/command/base"
17
+
18
+ module TinyClassifier
19
+ module Command
20
+ class Untrain < Base
21
+ def initialize(argv=[])
22
+ super
23
+ option_parser.banner += " CATEGORY"
24
+ *categories = parse_command_line_options(argv)
25
+ @category = categories.first
26
+ end
27
+
28
+ def run
29
+ super
30
+ @category = prepare_category(@category)
31
+ log("untraining as: #{@category}")
32
+ raise NoEffectiveInput.new if input.empty?
33
+ raise NoTrainingData.new(data_file_path) unless data_file_path.exist?
34
+
35
+ classifier.untrain(@category, input)
36
+ save
37
+ true
38
+ rescue StandardError => error
39
+ handle_error(error)
40
+ end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,104 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ module TinyClassifier
17
+ class TinyClassifierError < StandardError
18
+ end
19
+
20
+ class NoInput < TinyClassifierError
21
+ def message
22
+ "No input. You need to give any input via the STDIN."
23
+ end
24
+ end
25
+
26
+ class NoEffectiveInput < TinyClassifierError
27
+ def message
28
+ "No effective input."
29
+ end
30
+ end
31
+
32
+ class NoCategories < TinyClassifierError
33
+ def message
34
+ "You need to specify categories."
35
+ end
36
+ end
37
+
38
+ class NoCategory < TinyClassifierError
39
+ def message
40
+ "You need to specify a category for the input."
41
+ end
42
+ end
43
+
44
+ class NoWrongCategory < NoCategory
45
+ def message
46
+ "You need to specify a category to untrain the input."
47
+ end
48
+ end
49
+
50
+ class NoCorrectCategory < NoCategory
51
+ def message
52
+ "You need to specify a category to retrain the input."
53
+ end
54
+ end
55
+
56
+ class InvalidCategory < TinyClassifierError
57
+ attr_reader :category, :categories
58
+
59
+ def initialize(category, categories)
60
+ @category = category
61
+ @categories = categories
62
+ end
63
+
64
+ def message
65
+ "You need to specify one of valid categories: #{@categories.join(", ")}"
66
+ end
67
+ end
68
+
69
+ class InvalidWrongCategory < InvalidCategory
70
+ def message
71
+ "You need to specify one of valid categories to untrain: #{@categories.join(", ")}"
72
+ end
73
+ end
74
+
75
+ class InvalidCorrectCategory < InvalidCategory
76
+ def message
77
+ "You need to specify one of valid categories to retrain: #{@categories.join(", ")}"
78
+ end
79
+ end
80
+
81
+ class NoTrainingData < TinyClassifierError
82
+ attr_reader :data_dir
83
+
84
+ def initialize(data_dir)
85
+ @data_dir = data_dir
86
+ end
87
+
88
+ def message
89
+ "There is no training data at #{@data_dir}."
90
+ end
91
+ end
92
+
93
+ class InvalidOutputDir < TinyClassifierError
94
+ attr_reader :output_dir
95
+
96
+ def initialize(output_dir)
97
+ @output_dir = output_dir
98
+ end
99
+
100
+ def message
101
+ "#{@output_dir} is not available as the output directory."
102
+ end
103
+ end
104
+ end
@@ -13,21 +13,19 @@
13
13
  # You should have received a copy of the GNU General Public License
14
14
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
- require "tiny-classifier/trainer"
17
-
18
16
  module TinyClassifier
19
- class Untrainer < Trainer
20
- def run(params)
21
- @category = params[:category]
22
- prepare_category
23
- if input.empty?
24
- error("Error: No effective input.")
25
- false
26
- else
27
- classifier.send("untrain_#{@category}", input)
28
- save
29
- true
30
- end
17
+ class Input
18
+ def initialize(data = nil)
19
+ @data = data
20
+ end
21
+
22
+ def given?
23
+ return true if @data or $stdin.is_a?(StringIO)
24
+ File.pipe?(STDIN)
25
+ end
26
+
27
+ def read
28
+ @data ||= $stdin.readlines.join(" ").strip
31
29
  end
32
30
  end
33
31
  end
File without changes
@@ -21,7 +21,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
21
21
 
22
22
  Gem::Specification.new do |spec|
23
23
  spec.name = "tiny-classifier"
24
- spec.version = "2.1"
24
+ spec.version = "2.2"
25
25
  spec.homepage = "https://github.com/piroor/tiny-classifier"
26
26
  spec.authors = ["YUKI \"Piro\" Hiroshi"]
27
27
  spec.email = ["piro.outsider.reflex@gmail.com"]
@@ -39,4 +39,8 @@ Gem::Specification.new do |spec|
39
39
 
40
40
  spec.add_runtime_dependency("classifier-reborn")
41
41
  spec.add_runtime_dependency("natto")
42
+
43
+ spec.add_development_dependency("bundler")
44
+ spec.add_development_dependency("rake")
45
+ spec.add_development_dependency("test-unit")
42
46
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiny-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: '2.1'
4
+ version: '2.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - YUKI "Piro" Hiroshi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-22 00:00:00.000000000 Z
11
+ date: 2017-06-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: classifier-reborn
@@ -38,15 +38,57 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: test-unit
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
41
83
  description: ''
42
84
  email:
43
85
  - piro.outsider.reflex@gmail.com
44
86
  executables:
45
- - tc-classify
46
87
  - tc-generate-classifier
88
+ - tc-untrain
47
89
  - tc-retrain
48
90
  - tc-train
49
- - tc-untrain
91
+ - tc-classify
50
92
  extensions: []
51
93
  extra_rdoc_files: []
52
94
  files:
@@ -58,13 +100,16 @@ files:
58
100
  - bin/tc-retrain
59
101
  - bin/tc-train
60
102
  - bin/tc-untrain
61
- - lib/tiny-classifier/base.rb
62
- - lib/tiny-classifier/classifier-generator.rb
63
- - lib/tiny-classifier/classifier.rb
64
- - lib/tiny-classifier/retrainer.rb
103
+ - lib/tiny-classifier/category-manager.rb
104
+ - lib/tiny-classifier/command/base.rb
105
+ - lib/tiny-classifier/command/classify.rb
106
+ - lib/tiny-classifier/command/generate-classifier.rb
107
+ - lib/tiny-classifier/command/retrain.rb
108
+ - lib/tiny-classifier/command/train.rb
109
+ - lib/tiny-classifier/command/untrain.rb
110
+ - lib/tiny-classifier/errors.rb
111
+ - lib/tiny-classifier/input.rb
65
112
  - lib/tiny-classifier/tokenizer.rb
66
- - lib/tiny-classifier/trainer.rb
67
- - lib/tiny-classifier/untrainer.rb
68
113
  - tiny-classifier.gemspec
69
114
  homepage: https://github.com/piroor/tiny-classifier
70
115
  licenses:
@@ -1,136 +0,0 @@
1
- # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
- #
3
- # This program is free software: you can redistribute it and/or modify
4
- # it under the terms of the GNU General Public License as published by
5
- # the Free Software Foundation, either version 3 of the License, or
6
- # (at your option) any later version.
7
- #
8
- # This program is distributed in the hope that it will be useful,
9
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
- # GNU General Public License for more details.
12
- #
13
- # You should have received a copy of the GNU General Public License
14
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
-
16
- require "pathname"
17
- require "optparse"
18
- require "classifier-reborn"
19
- require "tiny-classifier/tokenizer"
20
-
21
- module TinyClassifier
22
- class Base
23
- attr_reader :tokenizer
24
-
25
- def initialize
26
- @tokenizer = Tokenizer.new
27
- @data_dir = Dir.pwd
28
- @verbose = false
29
- end
30
-
31
- def parse_command_line_options(command_line_options)
32
- option_parser.parse!(command_line_options)
33
- end
34
-
35
- def classifier
36
- @classifier ||= prepare_classifier
37
- end
38
-
39
- private
40
- def option_parser
41
- @option_parser ||= create_option_parser
42
- end
43
-
44
- def create_option_parser
45
- parser = OptionParser.new
46
-
47
- parser.on("-d PATH", "--data-dir=PATH",
48
- "Path to the directory to store training data file (default=current directory)") do |data_dir|
49
- @data_dir = data_dir
50
- end
51
-
52
- parser.on("-c CATEGORIES", "--categories=CATEGORIES",
53
- "List of categories (comma-separated)") do |categories|
54
- @categories = normalize_categories(categories)
55
- log("categories: #{@categories}")
56
- end
57
-
58
- parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
59
- "Tokenizer (default=#{@tokenizer})") do |tokenizer|
60
- @tokenizer.type = tokenizer
61
- end
62
-
63
- parser.on("-v", "--verbose",
64
- "Output internal information (for debugging)") do |verbose|
65
- @verbose = verbose
66
- end
67
-
68
- parser
69
- end
70
-
71
- def normalize_categories(categories)
72
- categories
73
- .strip
74
- .downcase
75
- .split(",")
76
- .collect(&:strip)
77
- .reject do |category|
78
- category.empty?
79
- end
80
- .sort
81
- .collect(&:capitalize)
82
- end
83
-
84
- def data_file_name
85
- @data_file_basename ||= prepare_data_file_name
86
- end
87
-
88
- def prepare_data_file_name
89
- categories = @categories.join("-").downcase
90
- "tc.#{categories}.dat"
91
- end
92
-
93
- def data_file_path
94
- @data_file_path ||= prepare_data_file_path
95
- end
96
-
97
- def prepare_data_file_path
98
- path = Pathname(@data_dir)
99
- path + data_file_name
100
- end
101
-
102
- def prepare_classifier
103
- if data_file_path.exist?
104
- data = File.read(data_file_path.to_s)
105
- Marshal.load(data)
106
- else
107
- ClassifierReborn::Bayes.new(*@categories)
108
- end
109
- end
110
-
111
- def input
112
- @input ||= prepare_input
113
- end
114
-
115
- def prepare_input
116
- unless File.pipe?(STDIN)
117
- error("Error: No effective input. You need to give any input via the STDIN.")
118
- exit(false)
119
- end
120
- @input = $stdin.readlines.join(" ")
121
- @input = @tokenizer.tokenize(@input)
122
- log("tokenizer: #{@tokenizer.type}")
123
- @input.strip!
124
- log("input: #{@input}")
125
- @input
126
- end
127
-
128
- def error(message)
129
- STDERR.puts(message)
130
- end
131
-
132
- def log(message)
133
- STDERR.puts(message) if @verbose
134
- end
135
- end
136
- end
@@ -1,88 +0,0 @@
1
- # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
- #
3
- # This program is free software: you can redistribute it and/or modify
4
- # it under the terms of the GNU General Public License as published by
5
- # the Free Software Foundation, either version 3 of the License, or
6
- # (at your option) any later version.
7
- #
8
- # This program is distributed in the hope that it will be useful,
9
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
- # GNU General Public License for more details.
12
- #
13
- # You should have received a copy of the GNU General Public License
14
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
-
16
- require "tiny-classifier/base"
17
- require "tiny-classifier/classifier"
18
- require "fileutils"
19
- require "base64"
20
-
21
- module TinyClassifier
22
- class ClassifierGenerator < Base
23
- class << self
24
- def run(argv=nil)
25
- argv ||= ARGV.dup
26
- generator = new
27
- generator.parse_command_line_options(argv)
28
- generator.run
29
- end
30
- end
31
-
32
- def initialize
33
- super
34
- @output_dir = Dir.pwd
35
- option_parser.on("-o PATH", "--output-dir=PATH",
36
- "Path to the classifier command to be saved (default=current directory)") do |output_dir|
37
- @output_dir = output_dir
38
- end
39
- end
40
-
41
- def run
42
- File.open(output_file_path, "w") do |file|
43
- file.puts("#!/usr/bin/env ruby")
44
- file.puts("require \"base64\"")
45
- file.puts("require \"classifier-reborn\"")
46
- file.puts("require \"tiny-classifier/classifier\"")
47
- file.puts("classifier_code = Base64.strict_decode64(\"#{encoded_classifier}\")")
48
- file.puts("classifier = TinyClassifier::Classifier.new")
49
- file.puts("classifier.classifier = Marshal.load(classifier_code)")
50
- file.puts("classifier.tokenizer.type = \"#{@tokenizer.type}\"")
51
- file.puts("classifier.run")
52
- end
53
- FileUtils.chmod("a+x", output_file_path)
54
- end
55
-
56
- private
57
- def encoded_classifier
58
- @encoded_classifier ||= prepare_encoded_classifier
59
- end
60
-
61
- def prepare_encoded_classifier
62
- classifier = Classifier.new
63
- classifier.parse_command_line_options(ARGV.dup)
64
- FileUtils.mkdir_p(output_file_path.parent)
65
-
66
- classifier_code = Marshal.dump(classifier.classifier)
67
- Base64.strict_encode64(classifier_code)
68
- end
69
-
70
- def classifier_name
71
- @classifier_name ||= prepare_classifier_name
72
- end
73
-
74
- def prepare_classifier_name
75
- categories = @categories.join("-").downcase
76
- "tc-classify-#{categories}"
77
- end
78
-
79
- def output_file_path
80
- @output_file_path ||= prepare_output_file_path
81
- end
82
-
83
- def prepare_output_file_path
84
- path = Pathname(@output_dir)
85
- path + classifier_name
86
- end
87
- end
88
- end
@@ -1,74 +0,0 @@
1
- # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
- #
3
- # This program is free software: you can redistribute it and/or modify
4
- # it under the terms of the GNU General Public License as published by
5
- # the Free Software Foundation, either version 3 of the License, or
6
- # (at your option) any later version.
7
- #
8
- # This program is distributed in the hope that it will be useful,
9
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
- # GNU General Public License for more details.
12
- #
13
- # You should have received a copy of the GNU General Public License
14
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
-
16
- require "tiny-classifier/base"
17
-
18
- module TinyClassifier
19
- class Trainer < Base
20
- class << self
21
- def run(argv=nil)
22
- argv ||= ARGV.dup
23
- trainer = new
24
- *categories = trainer.parse_command_line_options(argv)
25
- trainer.run(category: categories.first)
26
- end
27
- end
28
-
29
- def initialize
30
- super
31
- option_parser.banner += " CATEGORY"
32
- end
33
-
34
- def run(params)
35
- @category = params[:category]
36
- prepare_category
37
- if input.empty?
38
- error("Error: No effective input.")
39
- false
40
- else
41
- classifier.send("train_#{@category}", input)
42
- save
43
- true
44
- end
45
- end
46
-
47
- private
48
- def prepare_category
49
- unless @category
50
- error("Error: You need to specify the category for the input.")
51
- exit(false)
52
- end
53
-
54
- @category = @category.downcase.strip
55
-
56
- if @category.empty?
57
- error("Error: You need to specify the category for the input.")
58
- exit(false)
59
- end
60
-
61
- unless @categories.include?(@category.capitalize)
62
- error("Error: You need to specify one of valid categories: #{@categories.join(', ')}")
63
- exit(false)
64
- end
65
- end
66
-
67
- def save
68
- data = Marshal.dump(classifier)
69
- File.open(data_file_path, "w") do |file|
70
- file.write(data)
71
- end
72
- end
73
- end
74
- end