tiny-classifier 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ef03bed349d3c266bab512446b6dff8946d5c14f
4
+ data.tar.gz: 768126f03759f1fe38ef2e155dad3484836f4422
5
+ SHA512:
6
+ metadata.gz: 9c59f06bef1b3fcc30914a410c29ac52444ea77790f5f4f588515c268bb181a2f353083bfe1ab3ceeba94964052807a6cb556ded863712b09f6488c7d1feff07
7
+ data.tar.gz: aa188deccf010883b6d04f3ae63ba0124d9ff4f586690cfccf921be5e1466deed11e72b021083142d0e37134f80a1b462af4e7989af0d3c69c830a19391000c9
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,58 @@
1
+ # README
2
+
3
+ ## Name
4
+
5
+ tiny-classifier
6
+
7
+ ## Description
8
+
9
+ Command line tool to run text classifier based on naive bayes.
10
+
11
+ ## Install
12
+
13
+ ```
14
+ % gem install tiny-classifier
15
+ ```
16
+
17
+ ## Basic usage
18
+
19
+ Training:
20
+
21
+ ```
22
+ % echo "Hello, world!" | tc-train --labels=positive,negative positive
23
+ % echo "I'm very very happy!" | tc-train --labels=positive,negative positive
24
+ % echo "I'm so bad..." | tc-train --labels=positive,negative negative
25
+ % echo "Oh my god!" | tc-train --labels=positive,negative negative
26
+ ```
27
+
28
+ Classifying:
29
+
30
+ ~~~
31
+ % echo "Happy day?" | tc-classify --labels=positive,negative
32
+ positive
33
+ ~~~
34
+
35
+ ## Command line parameters
36
+
37
+ ### Common
38
+
39
+ `--labels=LABELS` (required)
40
+ : A comma-separated list of labels. Use only alphabetic characters; non-alphabetic characters will cause problems.
41
+
42
+ `--data-dir=PATH` (optional)
43
+ : The path to the directory where the training data is saved. Defaults to the current directory.
44
+
45
+ `--tokenizer=TOKENIZER` (optional)
46
+ : Tokenizer for input that is not separated by whitespace. The only supported value is `mecab`.
47
+
48
+ ### Trainer
49
+
50
+ `tc-train` requires one command line argument: the label. It must be one of the labels given via the `--labels` parameter.
51
+
52
+ ## Copyright
53
+
54
+ Copyright (c) 2017 YUKI "Piro" Hiroshi
55
+
56
+ ## License
57
+
58
+ GPLv3 or later. See LICENSE.txt for details.
data/Rakefile ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
# Running plain `rake` builds the gem.
task :default => :build

require "bundler/gem_helper"

# Directory that contains this Rakefile (and the gemspec the helper loads).
base_dir = File.dirname(__FILE__)

helper = Bundler::GemHelper.new(base_dir)

# Use the bare version string as the release tag name.
# NOTE(review): presumably this replaces a prefixed default tag name from
# Bundler::GemHelper -- confirm against the Bundler version in use.
def helper.version_tag
  version
end

# Registers the helper's gem tasks, including the :build task that the
# default task above depends on.
helper.install
data/bin/tc-classify ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+
18
+ require "tiny-classifier/classifier"
19
+
20
+ Classifier.run
data/bin/tc-train ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+
18
+ require "tiny-classifier/trainer"
19
+
20
+ Trainer.run
@@ -0,0 +1,129 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "pathname"
17
+ require "optparse"
18
+ require "classifier-reborn"
19
+
20
# Shared plumbing for the tc-train / tc-classify commands: command line
# option parsing, locating and (de)serializing the training data file, and
# optional tokenization of the input text.
#
# Subclasses are expected to set @input before calling prepare_input.
class TinyClassifierBase
  # Values accepted by --tokenizer. :none leaves the input untouched.
  TOKENIZERS = [:none, :mecab].freeze

  def initialize
    @tokenizer = :none
    @data_dir = Dir.pwd
    @labels = nil
  end

  # Parses (and consumes) the options in +command_line_options+.
  #
  # Returns the remaining non-option arguments.
  # Raises OptionParser::MissingArgument when --labels is absent or lists no
  # usable label, and OptionParser::InvalidArgument for an unknown tokenizer.
  def parse_command_line_options(command_line_options)
    option_parser = create_option_parser
    rest = option_parser.parse!(command_line_options)
    # Every later step (data file name, classifier construction) needs the
    # labels, so fail here with a readable message instead of crashing on nil.
    if @labels.nil? || @labels.empty?
      raise OptionParser::MissingArgument, "--labels"
    end
    rest
  end

  private
  # Builds the option parser shared by all commands.
  def create_option_parser
    parser = OptionParser.new

    parser.on("--data-dir=PATH",
              "Path to the directory to store training data file (default=current directory)") do |data_dir|
      @data_dir = data_dir
    end

    parser.on("--labels=LABELS",
              "List of labels (comma-separated)") do |labels|
      @labels = normalize_labels(labels)
    end

    parser.on("--tokenizer=TOKENIZER",
              "Tokenizer (default=#{@tokenizer})") do |tokenizer|
      # Look the name up in the allowed list instead of interning arbitrary
      # user input; an unknown name used to be silently ignored.
      chosen = TOKENIZERS.find do |candidate|
        candidate.to_s == tokenizer
      end
      raise OptionParser::InvalidArgument, tokenizer if chosen.nil?
      @tokenizer = chosen
    end

    parser
  end

  # Turns "b, A,," into ["A", "B"]: trimmed, blank entries dropped, sorted
  # and capitalized, so the same set of labels always maps to the same
  # data file regardless of the order or case given on the command line.
  def normalize_labels(labels)
    labels
      .strip
      .downcase
      .split(",")
      .collect(&:strip)
      .reject do |label|
        label.empty?
      end
      .sort
      .collect(&:capitalize)
  end

  # Base name of the training data file (memoized).
  def data_file_name
    @data_file_name ||= prepare_data_file_name
  end

  def prepare_data_file_name
    labels = @labels.join("-").downcase
    "tc.#{labels}.dat"
  end

  # Pathname of the training data file inside the data directory (memoized).
  def data_file_path
    @data_file_path ||= prepare_data_file_path
  end

  def prepare_data_file_path
    path = Pathname(@data_dir)
    path + data_file_name
  end

  # The classifier instance (memoized).
  def classifier
    @classifier ||= prepare_classifier
  end

  # Restores the classifier from the data file when it exists, otherwise
  # creates a fresh one for the configured labels.
  def prepare_classifier
    if data_file_path.exist?
      # Marshal data is binary: read it without newline/encoding translation.
      data = File.binread(data_file_path.to_s)
      # NOTE: Marshal.load can instantiate arbitrary objects. Only point
      # --data-dir at data files written by this tool.
      Marshal.load(data)
    else
      ClassifierReborn::Bayes.new(*@labels)
    end
  end

  # Tokenizes @input (if a tokenizer is selected) and trims surrounding
  # whitespace. Assigns a new string instead of stripping in place so the
  # caller's string is never mutated and frozen input is accepted.
  def prepare_input
    tokenize
    @input = @input.to_s.strip
  end

  def tokenize
    case @tokenizer
    when :mecab
      tokenize_by_mecab
    end
  end

  # Replaces @input with the space-separated surface forms of the terms
  # whose MeCab feature string mentions a noun, adjective or verb.
  def tokenize_by_mecab
    # Loaded lazily so the library is only needed when this tokenizer is used.
    require "natto"
    natto = Natto::MeCab.new
    terms = []
    natto.parse(@input) do |term|
      if term.feature =~ /名詞|形容詞|動詞/
        terms << term.surface
      end
    end
    @input = terms.join(" ").strip
  end

  # Serializes the classifier to the data file, in binary mode to match
  # the binary read in prepare_classifier.
  def save
    data = Marshal.dump(classifier)
    File.binwrite(data_file_path.to_s, data)
  end
end
@@ -0,0 +1,41 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/base"
17
+
18
# Command line front end that classifies the text given on standard input
# and prints the resulting label in lower case.
class Classifier < TinyClassifierBase
  class << self
    # Entry point for the tc-classify executable.
    #
    # argv:: command line arguments; defaults to a copy of ARGV.
    def run(argv=nil)
      argv ||= ARGV.dup
      classifier = new
      classifier.parse_command_line_options(argv)
      # Read the input verbatim. Joining the result of readlines with "\n"
      # would double every line break, because readlines keeps terminators.
      classifier.run(input: $stdin.read)
    end
  end

  # Classifies params[:input] and terminates the process: status 1 when the
  # prepared input is empty, otherwise prints the label and exits with 0.
  def run(params)
    @input = params[:input]
    prepare_input
    exit(1) if @input.empty?

    label = classifier.classify(@input)
    puts label.downcase
    exit(0)
  end
end
@@ -0,0 +1,50 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "tiny-classifier/base"
17
+
18
# Command line front end that trains the classifier with the text given on
# standard input for one label, then stores the updated training data.
class Trainer < TinyClassifierBase
  class << self
    # Entry point for the tc-train executable.
    #
    # argv:: command line arguments; defaults to a copy of ARGV. The first
    #        non-option argument is the label to train.
    def run(argv=nil)
      argv ||= ARGV.dup
      trainer = new
      remaining = trainer.parse_command_line_options(argv)
      # Read the input verbatim. Joining the result of readlines with "\n"
      # would double every line break, because readlines keeps terminators.
      trainer.run(label: remaining.first,
                  input: $stdin.read)
    end
  end

  # Trains params[:label] with params[:input] and terminates the process:
  # status 1 when the label is missing/unknown or the prepared input is
  # empty, status 0 after the training data has been saved.
  def run(params)
    @label = params[:label].to_s.strip
    @input = params[:input]
    unless trainable_label?
      $stderr.puts("tc-train: a label listed in --labels is required")
      exit(1)
    end

    prepare_input
    exit(1) if @input.empty?

    # Call the public training API with the validated label instead of
    # dispatching a method name assembled from user input.
    classifier.train(@label.downcase, @input)
    save
    exit(0)
  end

  private
  # True when a label was given and it is one of the labels from --labels.
  # The check against @labels is skipped when no labels have been parsed.
  def trainable_label?
    return false if @label.empty?
    # @labels holds capitalized names, so compare in the same form.
    @labels.nil? || @labels.include?(@label.capitalize)
  end
end
@@ -0,0 +1,42 @@
1
+ # Copyright (C) 2017 YUKI "Piro" Hiroshi
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
# Removes leading and trailing blank lines from a README section and
# terminates it with exactly one newline.
clean_white_space = lambda do |entry|
  entry.gsub(/(\A\n+|\n+\z)/, '') + "\n"
end

# Absolute directory of this gemspec. Every path below is resolved against
# it so the specification loads the same way from any working directory.
base_dir = File.expand_path(File.dirname(__FILE__))

$LOAD_PATH.unshift(File.join(base_dir, "lib"))

Gem::Specification.new do |spec|
  spec.name = "tiny-classifier"
  spec.version = "1.0"
  spec.homepage = "https://github.com/piroor/tiny-classifier"
  spec.authors = ["YUKI \"Piro\" Hiroshi"]
  spec.email = ["piro.outsider.reflex@gmail.com"]

  # The summary (first paragraph) and description (second paragraph, if any)
  # are taken from the "## Description" section of the README.
  readme = File.read(File.join(base_dir, "README.md"))
  readme.force_encoding("UTF-8")
  entries = readme.split(/^\#\#\s(.*)$/)
  description = clean_white_space.call(entries[entries.index("Description") + 1])
  spec.summary, spec.description, = description.split(/\n\n+/, 3)

  # SPDX identifier for "GPLv3 or later".
  spec.license = "GPL-3.0-or-later"

  spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
  # Glob with absolute paths, then turn them back into paths relative to the
  # gem root; sorted so the file list does not depend on directory order.
  library_files = Dir.glob(File.join(base_dir, "lib", "**", "*.rb")).sort
  spec.files += library_files.collect do |path|
    path[(base_dir.length + 1)..-1]
  end
  executable_files = Dir.glob(File.join(base_dir, "bin", "*")).sort
  spec.executables = executable_files.collect do |path|
    File.basename(path)
  end

  spec.add_runtime_dependency("classifier-reborn")
  spec.add_runtime_dependency("natto")
end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tiny-classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: '1.0'
5
+ platform: ruby
6
+ authors:
7
+ - YUKI "Piro" Hiroshi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-06-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: classifier-reborn
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: natto
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: ''
42
+ email:
43
+ - piro.outsider.reflex@gmail.com
44
+ executables:
45
+ - tc-train
46
+ - tc-classify
47
+ extensions: []
48
+ extra_rdoc_files: []
49
+ files:
50
+ - Gemfile
51
+ - README.md
52
+ - Rakefile
53
+ - bin/tc-classify
54
+ - bin/tc-train
55
+ - lib/tiny-classifier/base.rb
56
+ - lib/tiny-classifier/classifier.rb
57
+ - lib/tiny-classifier/trainer.rb
58
+ - tiny-classifier.gemspec
59
+ homepage: https://github.com/piroor/tiny-classifier
60
+ licenses:
61
+ - GPLv3 or later
62
+ metadata: {}
63
+ post_install_message:
64
+ rdoc_options: []
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ requirements: []
78
+ rubyforge_project:
79
+ rubygems_version: 2.5.1
80
+ signing_key:
81
+ specification_version: 4
82
+ summary: Command line tool to run text classifier based on naive bayes.
83
+ test_files: []