tiny-classifier 1.2 → 1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e4640ed17e08cec0d4c7e06a7acedc0527669506
4
- data.tar.gz: 982f06aad8df916300a18b04ca00ac9e0ae2dceb
3
+ metadata.gz: 9f1938c2074b9e055cba5fcfe02fd5dc89d39d89
4
+ data.tar.gz: da79e06a3acbe42ebad1b4576194cf774314298c
5
5
  SHA512:
6
- metadata.gz: dd7a63f5ff86e02b32a63a852f40d17ffce45ccd9ee5d68ee1fc131f68d770f595fae5c12a039d207ef557e2ee23229c2bea0755d7c36b84444bd0721e27e29e
7
- data.tar.gz: 67ad761ba648d8718b1dd6339e83c221856223e180787d14e7761106b7589e588468f6d47d6bc3f3471d2c591868a0d72756532d5e530f944bc7f8f5e10409f4
6
+ metadata.gz: 4b5f7099909ca9da6f754829881ee7de94a703f6c4abc32d958fe94f3a10be7f056acc17f5e1e3aadd66b8e3caa2a72f8c6ed2bcdfe6a8b6e20540114324a685
7
+ data.tar.gz: fb1b1aa131bee28c831c81a59ede9fa7f909b50e03bafb3652397111acfccc09c714480b097a0b6dd38de5ea9baa47d02df7eb3354317bd954efa257d8c0747e
data/README.md CHANGED
@@ -33,15 +33,30 @@ Training:
33
33
  % echo "Oh my god!" | tc-train --labels=positive,negative negative
34
34
  ```
35
35
 
36
- The training data will be saved as `tc.positive-negative.dat` (`tc.` is the fixed prefix, `.dat` is the fixed suffix. The middle part is filled by given labels automatically.) in the current directory. If you hope the file to be saved in any different place, please specify `--base-dir=/path/to/data/directory`.
36
+ The training data will be saved as `tc.negative-positive.dat` in the current directory (`tc.` is the fixed prefix and `.dat` is the fixed suffix; the middle part is filled in automatically from the given labels). If you want the file to be saved somewhere else, specify `--data-dir=/path/to/data/directory`.
37
37
 
38
- Classifying:
38
+ Testing to classify:
39
39
 
40
40
  ~~~
41
41
  % echo "Happy day?" | tc-classify --labels=positive,negative
42
42
  positive
43
43
  ~~~
44
44
 
45
+ Once you think the classifier has been trained sufficiently, you can generate a fixed classifier:
46
+
47
+ ~~~
48
+ % tc-generate-classifier --labels=positive,negative --output-dir=/path/to/dir
49
+ ~~~
50
+
51
+ Then a fixed classifier (an executable Ruby script) will be generated as `tc-classify-negative-positive` (`tc-classify-` is the fixed prefix; the rest is filled in automatically from the given labels).
52
+
53
+ ~~~
54
+ % ls /path/to/dir/
55
+ tc-classify-negative-positive
56
+ % echo "Happy day?" | /path/to/dir/tc-classify-negative-positive
57
+ positive
58
+ ~~~
59
+
45
60
  ## Command line parameters
46
61
 
47
62
  ### Common
@@ -55,10 +70,15 @@ positive
55
70
  `-t`, `--tokenizer=TOKENIZER` (optional)
56
71
  : Tokenizer for input which is not separated by whitespaces. Possible values are: only `mecab`.
57
72
 
58
- ### Trainer
73
+ ### `tc-train` specific parameters
59
74
 
60
75
  The `tc-train` requires one command line argument: the label. You need to specify one of labels given via the `--labels` parameter.
61
76
 
77
+ ### `tc-generate-classifier` specific parameters
78
+
79
+ `-o`, `--output-dir=PATH` (optional)
80
+ : The path to the directory where the classifier will be saved. The default is the current directory.
81
+
62
82
  ## Copyright
63
83
 
64
84
  Copyright (c) 2017 YUKI "Piro" Hiroshi
data/bin/tc-classify CHANGED
@@ -17,4 +17,4 @@
17
17
 
18
18
  require "tiny-classifier/classifier"
19
19
 
20
- Classifier.run
20
+ TinyClassifier::Classifier.run
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+
18
+ require "tiny-classifier/classifier-generator"
19
+
20
+ TinyClassifier::ClassifierGenerator.run
data/bin/tc-train CHANGED
@@ -17,4 +17,4 @@
17
17
 
18
18
  require "tiny-classifier/trainer"
19
19
 
20
- Trainer.run
20
+ TinyClassifier::Trainer.run
@@ -16,119 +16,103 @@
16
16
  require "pathname"
17
17
  require "optparse"
18
18
  require "classifier-reborn"
19
+ require "tiny-classifier/tokenizer"
19
20
 
20
- class TinyClassifierBase
21
- TOKENIZERS = [:none, :mecab]
21
+ module TinyClassifier
22
+ class Base
23
+ attr_reader :tokenizer
22
24
 
23
- def initialize
24
- @tokenizer = :none
25
- @data_dir = Dir.pwd
26
- end
27
-
28
- def parse_command_line_options(command_line_options)
29
- option_parser.parse!(command_line_options)
30
- end
31
-
32
- private
33
- def option_parser
34
- @option_parser ||= create_option_parser
35
- end
36
-
37
- def create_option_parser
38
- parser = OptionParser.new
25
+ def initialize
26
+ @tokenizer = Tokenizer.new
27
+ @data_dir = Dir.pwd
28
+ end
39
29
 
40
- parser.on("-d PATH", "--data-dir=PATH",
41
- "Path to the directory to store training data file (default=current directory)") do |data_dir|
42
- @data_dir = data_dir
30
+ def parse_command_line_options(command_line_options)
31
+ option_parser.parse!(command_line_options)
43
32
  end
44
33
 
45
- parser.on("-l LABELS", "--labels=LABELS",
46
- "List of labels (comma-separated)") do |labels|
47
- @labels = normalize_labels(labels)
34
+ def classifier
35
+ @classifier ||= prepare_classifier
48
36
  end
49
37
 
50
- parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
51
- "Tokenizer (default=#{@tokenizer})") do |tokenizer|
52
- @tokenizer = tokenizer.downcase.to_sym
38
+ private
39
+ def option_parser
40
+ @option_parser ||= create_option_parser
53
41
  end
54
42
 
55
- parser
56
- end
43
+ def create_option_parser
44
+ parser = OptionParser.new
57
45
 
58
- def normalize_labels(labels)
59
- labels
60
- .strip
61
- .downcase
62
- .split(",")
63
- .collect(&:strip)
64
- .reject do |label|
65
- label.empty?
46
+ parser.on("-d PATH", "--data-dir=PATH",
47
+ "Path to the directory to store training data file (default=current directory)") do |data_dir|
48
+ @data_dir = data_dir
66
49
  end
67
- .sort
68
- .collect(&:capitalize)
69
- end
70
50
 
71
- def data_file_name
72
- @data_file_basename ||= prepare_data_file_name
73
- end
51
+ parser.on("-l LABELS", "--labels=LABELS",
52
+ "List of labels (comma-separated)") do |labels|
53
+ @labels = normalize_labels(labels)
54
+ end
74
55
 
75
- def prepare_data_file_name
76
- labels = @labels.join("-").downcase
77
- "tc.#{labels}.dat"
78
- end
56
+ parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
57
+ "Tokenizer (default=#{@tokenizer})") do |tokenizer|
58
+ @tokenizer.type = tokenizer
59
+ end
79
60
 
80
- def data_file_path
81
- @data_file_path ||= prepare_data_file_path
82
- end
61
+ parser
62
+ end
83
63
 
84
- def prepare_data_file_path
85
- path = Pathname(@data_dir)
86
- path + data_file_name
87
- end
64
# Turns the comma-separated value of the --labels option into the canonical
# label list: lower-cased, trimmed, blank entries dropped, sorted, and then
# capitalized.
#
# @param labels [String] comma-separated label names
# @return [Array<String>] sorted, capitalized label names
def normalize_labels(labels)
  names = labels.strip.downcase.split(",").collect(&:strip)
  present = names.reject(&:empty?)
  present.sort.collect(&:capitalize)
end
88
76
 
89
- def classifier
90
- @classifier ||= prepare_classifier
91
- end
77
+ def data_file_name
78
+ @data_file_basename ||= prepare_data_file_name
79
+ end
92
80
 
93
- def prepare_classifier
94
- if data_file_path.exist?
95
- data = File.read(data_file_path.to_s)
96
- Marshal.load(data)
97
- else
98
- ClassifierReborn::Bayes.new(*@labels)
81
+ def prepare_data_file_name
82
+ labels = @labels.join("-").downcase
83
+ "tc.#{labels}.dat"
99
84
  end
100
- end
101
85
 
102
- def input
103
- @input ||= prepare_input
104
- end
86
+ def data_file_path
87
+ @data_file_path ||= prepare_data_file_path
88
+ end
105
89
 
106
- def prepare_input
107
- unless File.pipe?(STDIN)
108
- STDERR.puts("Error: No effective input. You need to give any input via the STDIN.")
109
- exit(false)
90
+ def prepare_data_file_path
91
+ path = Pathname(@data_dir)
92
+ path + data_file_name
93
+ end
94
+
95
# Builds the classifier instance used by this command.
#
# When the training data file exists, the classifier is restored from it;
# otherwise a fresh Bayes classifier is created for the configured labels.
#
# @return [Object] the restored object, or a new ClassifierReborn::Bayes
def prepare_classifier
  if data_file_path.exist?
    # Marshal data is binary, so read it in binary mode: a text-mode read
    # may translate newlines or apply an encoding on some platforms.
    # NOTE(review): Marshal.load can instantiate arbitrary objects, so the
    # data file must come from a trusted source.
    Marshal.load(data_file_path.binread)
  else
    ClassifierReborn::Bayes.new(*@labels)
  end
end
111
- @input = $stdin.readlines.join("\n")
112
- tokenize
113
- @input.strip!
114
- end
115
103
 
116
- def tokenize
117
- case @tokenizer
118
- when :mecab
119
- tokenize_by_mecab
104
+ def input
105
+ @input ||= prepare_input
120
106
  end
121
- end
122
107
 
123
- def tokenize_by_mecab
124
- require "natto"
125
- natto = Natto::MeCab.new
126
- terms = []
127
- natto.parse(@input) do |term|
128
- if term.feature =~ /\A(名詞|形容詞|動詞)/
129
- terms << term.surface
108
# Reads the whole standard input, tokenizes it, and stores the result.
#
# Exits the process with a failure status when STDIN is not a pipe.
#
# @return [String] the tokenized input with surrounding whitespace removed
def prepare_input
  unless File.pipe?(STDIN)
    STDERR.puts("Error: No effective input. You need to give any input via the STDIN.")
    exit(false)
  end
  raw = $stdin.readlines.join(" ")
  # Keep the tokenizer's return value: it returns the tokenized text rather
  # than modifying its argument. Use the non-destructive strip, because
  # strip! returns nil when there is nothing to remove and that nil would
  # become this method's result.
  @input = @tokenizer.tokenize(raw).strip
end
132
- @input = terms.join(" ").strip
133
117
  end
134
118
  end
@@ -0,0 +1,88 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/base"
17
+ require "tiny-classifier/classifier"
18
+ require "fileutils"
19
+ require "base64"
20
+
21
module TinyClassifier
  # Generates a standalone executable Ruby script that embeds the trained
  # classifier (Marshal-dumped and Base64-encoded), so that input can be
  # classified without the training data file.
  #
  # The script is written to "<output dir>/tc-classify-<labels>".
  class ClassifierGenerator < Base
    class << self
      # Command line entry point.
      #
      # @param argv [Array<String>, nil] arguments to parse; defaults to a
      #   copy of ARGV
      def run(argv=nil)
        argv ||= ARGV.dup
        generator = new
        generator.parse_command_line_options(argv)
        generator.run
      end
    end

    def initialize
      super
      @output_dir = Dir.pwd
      option_parser.on("-o PATH", "--output-dir=PATH",
                       "Path to the classifier command to be saved (default=current directory)") do |output_dir|
        @output_dir = output_dir
      end
    end

    # Writes the classifier script and marks it as executable.
    def run
      # The output directory must exist before the file is opened.
      FileUtils.mkdir_p(output_file_path.parent)
      File.open(output_file_path, "w") do |file|
        file.puts("#!/usr/bin/env ruby")
        file.puts("require \"base64\"")
        file.puts("require \"classifier-reborn\"")
        file.puts("require \"tiny-classifier/classifier\"")
        file.puts("classifier_code = Base64.strict_decode64(\"#{encoded_classifier}\")")
        file.puts("classifier = TinyClassifier::Classifier.new")
        file.puts("classifier.classifier = Marshal.load(classifier_code)")
        file.puts("classifier.tokenizer.type = \"#{@tokenizer.type}\"")
        file.puts("classifier.run")
      end
      FileUtils.chmod("a+x", output_file_path)
    end

    private

    def encoded_classifier
      @encoded_classifier ||= prepare_encoded_classifier
    end

    # Serializes the classifier selected by this generator's own options.
    #
    # The inherited #classifier is used instead of parsing ARGV again with a
    # separate Classifier instance: that second parse ignored the argv given
    # to .run and did not know the --output-dir option.
    def prepare_encoded_classifier
      classifier_code = Marshal.dump(classifier)
      Base64.strict_encode64(classifier_code)
    end

    def classifier_name
      @classifier_name ||= prepare_classifier_name
    end

    def prepare_classifier_name
      labels = @labels.join("-").downcase
      "tc-classify-#{labels}"
    end

    def output_file_path
      @output_file_path ||= prepare_output_file_path
    end

    def prepare_output_file_path
      path = Pathname(@output_dir)
      path + classifier_name
    end
  end
end
@@ -15,25 +15,28 @@
15
15
 
16
16
  require "tiny-classifier/base"
17
17
 
18
- class Classifier < TinyClassifierBase
19
- class << self
20
- def run(argv=nil)
21
- argv ||= ARGV.dup
22
- classifier = new
23
- classifier.parse_command_line_options(argv)
24
- classifier.run
18
+ module TinyClassifier
19
+ class Classifier < Base
20
+ class << self
21
+ def run(argv=nil)
22
+ argv ||= ARGV.dup
23
+ classifier = new
24
+ classifier.parse_command_line_options(argv)
25
+ classifier.run
26
+ end
25
27
  end
26
- end
27
28
 
28
- def run(params)
29
- @label = params[:label]
30
- if input.empty?
31
- STDERR.puts("Error: No effective input.")
32
- false
33
- else
34
- label = classifier.classify(input)
35
- puts label.downcase
36
- true
29
+ attr_writer :classifier
30
+
31
+ def run
32
+ if input.empty?
33
+ STDERR.puts("Error: No effective input.")
34
+ false
35
+ else
36
+ label = classifier.classify(input)
37
+ puts label.downcase
38
+ true
39
+ end
37
40
  end
38
41
  end
39
42
  end
@@ -0,0 +1,51 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module TinyClassifier
  # Prepares raw input text for the classifier.
  #
  # The behavior is selected by #type, which may be a String or a Symbol in
  # any letter case: "mecab" segments the text with MeCab, and any other
  # value (including the default :none) passes the input through unchanged.
  class Tokenizer
    TOKENIZERS = [:none, :mecab].freeze

    attr_accessor :type

    # @param params [Hash, nil] optional settings; only :type is read
    def initialize(params = nil)
      @type = params[:type] if params
      @type ||= :none
    end

    # @return [String] the type name, so that interpolating a tokenizer into
    #   a message shows e.g. "none" instead of the object inspection
    def to_s
      @type.to_s
    end

    # @param input [String] text to tokenize
    # @return [String] space-separated terms for :mecab, otherwise the input
    #   itself
    def tokenize(input)
      # Dispatch on @type, the value set through the accessor and the
      # constructor.
      case @type.to_s.downcase.to_sym
      when :mecab
        tokenize_by_mecab(input)
      else
        input
      end
    end

    private

    def tokenize_by_mecab(input)
      # Required here rather than at load time, so natto is only loaded when
      # the mecab tokenizer is actually used.
      require "natto"
      natto = Natto::MeCab.new
      terms = []
      natto.parse(input) do |term|
        # Keep only nouns, adjectives and verbs.
        terms << term.surface if term.feature =~ /\A(名詞|形容詞|動詞)/
      end
      terms.join(" ").strip
    end
  end
end
@@ -15,58 +15,60 @@
15
15
 
16
16
  require "tiny-classifier/base"
17
17
 
18
- class Trainer < TinyClassifierBase
19
- class << self
20
- def run(argv=nil)
21
- argv ||= ARGV.dup
22
- trainer = new
23
- *labels = trainer.parse_command_line_options(argv)
24
- trainer.run(label: labels.first)
18
+ module TinyClassifier
19
+ class Trainer < Base
20
+ class << self
21
+ def run(argv=nil)
22
+ argv ||= ARGV.dup
23
+ trainer = new
24
+ *labels = trainer.parse_command_line_options(argv)
25
+ trainer.run(label: labels.first)
26
+ end
25
27
  end
26
- end
27
-
28
- def initialize
29
- super
30
- option_parser.banner += " LABEL"
31
- end
32
28
 
33
- def run(params)
34
- @label = params[:label]
35
- prepare_label
36
- if input.empty?
37
- STDERR.puts("Error: No effective input.")
38
- false
39
- else
40
- classifier.send("train_#{@label}", input)
41
- save
42
- true
29
+ def initialize
30
+ super
31
+ option_parser.banner += " LABEL"
43
32
  end
44
- end
45
33
 
46
- private
47
- def prepare_label
48
- unless @label
49
- STDERR.puts("Error: You need to specify the label for the input.")
50
- exit(false)
34
+ def run(params)
35
+ @label = params[:label]
36
+ prepare_label
37
+ if input.empty?
38
+ STDERR.puts("Error: No effective input.")
39
+ false
40
+ else
41
+ classifier.send("train_#{@label}", input)
42
+ save
43
+ true
44
+ end
51
45
  end
52
46
 
53
- @label = @label.downcase.strip
47
+ private
48
+ def prepare_label
49
+ unless @label
50
+ STDERR.puts("Error: You need to specify the label for the input.")
51
+ exit(false)
52
+ end
54
53
 
55
- if @label.empty?
56
- STDERR.puts("Error: You need to specify the label for the input.")
57
- exit(false)
58
- end
54
+ @label = @label.downcase.strip
55
+
56
+ if @label.empty?
57
+ STDERR.puts("Error: You need to specify the label for the input.")
58
+ exit(false)
59
+ end
59
60
 
60
- unless @labels.include?(@label.capitalize)
61
- STDERR.puts("Error: You need to specify one of valid labels: #{@labels.join(', ')}")
62
- exit(false)
61
+ unless @labels.include?(@label.capitalize)
62
+ STDERR.puts("Error: You need to specify one of valid labels: #{@labels.join(', ')}")
63
+ exit(false)
64
+ end
63
65
  end
64
- end
65
66
 
66
- def save
67
- data = Marshal.dump(classifier)
68
- File.open(data_file_path, "w") do |file|
69
- file.write(data)
67
# Persists the trained classifier to the data file.
#
# @return [Integer] the number of bytes written
def save
  data = Marshal.dump(classifier)
  # Marshal output is binary, so write in binary mode: text mode may
  # translate newlines on some platforms and corrupt the dump.
  File.open(data_file_path, "wb") do |file|
    file.write(data)
  end
end
71
73
  end
72
74
  end
@@ -21,7 +21,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
21
21
 
22
22
  Gem::Specification.new do |spec|
23
23
  spec.name = "tiny-classifier"
24
- spec.version = "1.2"
24
+ spec.version = "1.3"
25
25
  spec.homepage = "https://github.com/piroor/tiny-classifier"
26
26
  spec.authors = ["YUKI \"Piro\" Hiroshi"]
27
27
  spec.email = ["piro.outsider.reflex@gmail.com"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiny-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.2'
4
+ version: '1.3'
5
5
  platform: ruby
6
6
  authors:
7
7
  - YUKI "Piro" Hiroshi
@@ -42,6 +42,7 @@ description: ''
42
42
  email:
43
43
  - piro.outsider.reflex@gmail.com
44
44
  executables:
45
+ - tc-generate-classifier
45
46
  - tc-train
46
47
  - tc-classify
47
48
  extensions: []
@@ -51,9 +52,12 @@ files:
51
52
  - README.md
52
53
  - Rakefile
53
54
  - bin/tc-classify
55
+ - bin/tc-generate-classifier
54
56
  - bin/tc-train
55
57
  - lib/tiny-classifier/base.rb
58
+ - lib/tiny-classifier/classifier-generator.rb
56
59
  - lib/tiny-classifier/classifier.rb
60
+ - lib/tiny-classifier/tokenizer.rb
57
61
  - lib/tiny-classifier/trainer.rb
58
62
  - tiny-classifier.gemspec
59
63
  homepage: https://github.com/piroor/tiny-classifier