tiny-classifier 1.0 → 1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +12 -4
- data/lib/tiny-classifier/base.rb +9 -6
- data/lib/tiny-classifier/classifier.rb +3 -2
- data/lib/tiny-classifier/trainer.rb +29 -3
- data/tiny-classifier.gemspec +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 962697672b93b80fa4ca5efcf46c9e820d184b96
|
4
|
+
data.tar.gz: 199917cf6cae91ae3fed243818bf924426b929ca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 08b97dcd3859de27e4b39796fecfb57e5a978e55c8d47dcda19e700acee7cd8a7451e3e60f7e74c750530b17ee038222853babd278b63450b1f71cd82c3d5b12
|
7
|
+
data.tar.gz: 5c7be101d3209ff3079e597e069e502be00769aa11bb025825047bcc651bbbda5b1037ac361be4a4531dcea1a4cc32c9f82e922bd461ca4ab438036abbe75446
|
data/README.md
CHANGED
@@ -14,6 +14,14 @@ Command line tool to run text classifier based on naive bayes.
|
|
14
14
|
% gem install tiny-classifier
|
15
15
|
```
|
16
16
|
|
17
|
+
If you want to use `--tokenizer=mecab`, you need to install MeCab first, for example:
|
18
|
+
|
19
|
+
```
|
20
|
+
% sudo apt install mecab mecab-ipadic-utf8
|
21
|
+
```
|
22
|
+
|
23
|
+
This is an example for Ubuntu.
|
24
|
+
|
17
25
|
## Basic usage
|
18
26
|
|
19
27
|
Training:
|
@@ -36,13 +44,13 @@ positive
|
|
36
44
|
|
37
45
|
### Common
|
38
46
|
|
39
|
-
`--labels=LABELS` (required)
|
47
|
+
`-l`, `--labels=LABELS` (required)
|
40
48
|
: A comma-separated list of labels. You should use only alphabetic characters. (Non-alphabetical characters will cause problems.)
|
41
49
|
|
42
|
-
`--data-dir=PATH` (optional)
|
43
|
-
: The path to the directory that the training data to be saved. The current directory.
|
50
|
+
`-d`, `--data-dir=PATH` (optional)
|
51
|
+
: The path to the directory where the training data is saved. The current directory is the default value.
|
44
52
|
|
45
|
-
`--tokenizer=TOKENIZER` (optional)
|
53
|
+
`-t`, `--tokenizer=TOKENIZER` (optional)
|
46
54
|
: Tokenizer for input that is not separated by whitespace. The only possible value is `mecab`.
|
47
55
|
|
48
56
|
### Trainer
|
data/lib/tiny-classifier/base.rb
CHANGED
@@ -26,27 +26,30 @@ class TinyClassifierBase
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def parse_command_line_options(command_line_options)
|
29
|
-
option_parser = create_option_parser
|
30
29
|
option_parser.parse!(command_line_options)
|
31
30
|
end
|
32
31
|
|
33
32
|
private
|
33
|
+
def option_parser
|
34
|
+
@option_parser ||= create_option_parser
|
35
|
+
end
|
36
|
+
|
34
37
|
def create_option_parser
|
35
38
|
parser = OptionParser.new
|
36
39
|
|
37
|
-
parser.on("--data-dir=PATH",
|
40
|
+
parser.on("-d PATH", "--data-dir=PATH",
|
38
41
|
"Path to the directory to store training data file (default=current directory)") do |data_dir|
|
39
42
|
@data_dir = data_dir
|
40
43
|
end
|
41
44
|
|
42
|
-
parser.on("--labels=LABELS",
|
45
|
+
parser.on("-l LABELS", "--labels=LABELS",
|
43
46
|
"List of labels (comma-separated)") do |labels|
|
44
47
|
@labels = normalize_labels(labels)
|
45
48
|
end
|
46
49
|
|
47
|
-
parser.on("--tokenizer=TOKENIZER",
|
50
|
+
parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
|
48
51
|
"Tokenizer (default=#{@tokenizer})") do |tokenizer|
|
49
|
-
@tokenizer = tokenizer.to_sym
|
52
|
+
@tokenizer = tokenizer.downcase.to_sym
|
50
53
|
end
|
51
54
|
|
52
55
|
parser
|
@@ -113,7 +116,7 @@ class TinyClassifierBase
|
|
113
116
|
natto = Natto::MeCab.new
|
114
117
|
terms = []
|
115
118
|
natto.parse(@input) do |term|
|
116
|
-
if term.feature =~
|
119
|
+
if term.feature =~ /\A(名詞|形容詞|動詞)/
|
117
120
|
terms << term.surface
|
118
121
|
end
|
119
122
|
end
|
@@ -31,11 +31,12 @@ class Classifier < TinyClassifierBase
|
|
31
31
|
@input = params[:input]
|
32
32
|
prepare_input
|
33
33
|
if @input.empty?
|
34
|
-
|
34
|
+
STDERR.puts("Error: No effective input.")
|
35
|
+
false
|
35
36
|
else
|
36
37
|
label = classifier.classify(@input)
|
37
38
|
puts label.downcase
|
38
|
-
|
39
|
+
true
|
39
40
|
end
|
40
41
|
end
|
41
42
|
end
|
@@ -27,20 +27,46 @@ class Trainer < TinyClassifierBase
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
+
def initialize
|
31
|
+
super
|
32
|
+
option_parser.banner += " LABEL"
|
33
|
+
end
|
34
|
+
|
30
35
|
def run(params)
|
31
36
|
@label = params[:label]
|
32
37
|
@input = params[:input]
|
38
|
+
prepare_label
|
33
39
|
prepare_input
|
34
40
|
if @input.empty?
|
35
|
-
|
41
|
+
STDERR.puts("Error: No effective input.")
|
42
|
+
false
|
36
43
|
else
|
37
|
-
classifier.send("train_#{@label
|
44
|
+
classifier.send("train_#{@label}", @input)
|
38
45
|
save
|
39
|
-
|
46
|
+
true
|
40
47
|
end
|
41
48
|
end
|
42
49
|
|
43
50
|
private
|
51
|
+
def prepare_label
|
52
|
+
unless @label
|
53
|
+
STDERR.puts("Error: You need to specify the label for the input.")
|
54
|
+
exit(false)
|
55
|
+
end
|
56
|
+
|
57
|
+
@label = @label.downcase.strip
|
58
|
+
|
59
|
+
if @label.empty?
|
60
|
+
STDERR.puts("Error: You need to specify the label for the input.")
|
61
|
+
exit(false)
|
62
|
+
end
|
63
|
+
|
64
|
+
unless @labels.include?(@label.capitalize)
|
65
|
+
STDERR.puts("Error: You need to specify one of valid labels: #{@labels.join(', ')}")
|
66
|
+
exit(false)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
44
70
|
def save
|
45
71
|
data = Marshal.dump(classifier)
|
46
72
|
File.open(data_file_path, "w") do |file|
|
data/tiny-classifier.gemspec
CHANGED
@@ -21,7 +21,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
|
|
21
21
|
|
22
22
|
Gem::Specification.new do |spec|
|
23
23
|
spec.name = "tiny-classifier"
|
24
|
-
spec.version = "1.0"
|
24
|
+
spec.version = "1.1"
|
25
25
|
spec.homepage = "https://github.com/piroor/tiny-classifier"
|
26
26
|
spec.authors = ["YUKI \"Piro\" Hiroshi"]
|
27
27
|
spec.email = ["piro.outsider.reflex@gmail.com"]
|