tiny-classifier 1.0 → 1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -4
- data/lib/tiny-classifier/base.rb +9 -6
- data/lib/tiny-classifier/classifier.rb +3 -2
- data/lib/tiny-classifier/trainer.rb +29 -3
- data/tiny-classifier.gemspec +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 962697672b93b80fa4ca5efcf46c9e820d184b96
|
4
|
+
data.tar.gz: 199917cf6cae91ae3fed243818bf924426b929ca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 08b97dcd3859de27e4b39796fecfb57e5a978e55c8d47dcda19e700acee7cd8a7451e3e60f7e74c750530b17ee038222853babd278b63450b1f71cd82c3d5b12
|
7
|
+
data.tar.gz: 5c7be101d3209ff3079e597e069e502be00769aa11bb025825047bcc651bbbda5b1037ac361be4a4531dcea1a4cc32c9f82e922bd461ca4ab438036abbe75446
|
data/README.md
CHANGED
@@ -14,6 +14,14 @@ Command line tool to run text classifier based on naive bayes.
|
|
14
14
|
% gem install tiny-classifier
|
15
15
|
```
|
16
16
|
|
17
|
+
If you want to use `--tokenizer=mecab`, you need to install MeCab, like this:
|
18
|
+
|
19
|
+
```
|
20
|
+
% sudo apt install mecab mecab-ipadic-utf8
|
21
|
+
```
|
22
|
+
|
23
|
+
This example is for Ubuntu.
|
24
|
+
|
17
25
|
## Basic usage
|
18
26
|
|
19
27
|
Training:
|
@@ -36,13 +44,13 @@ positive
|
|
36
44
|
|
37
45
|
### Common
|
38
46
|
|
39
|
-
`--labels=LABELS` (required)
|
47
|
+
`-l`, `--labels=LABELS` (required)
|
40
48
|
: A comma-separated list of labels. You should use only alphabetic characters. (Non-alphabetical characters will cause problems.)
|
41
49
|
|
42
|
-
`--data-dir=PATH` (optional)
|
43
|
-
: The path to the directory that the training data to be saved. The current directory.
|
50
|
+
`-d`, `--data-dir=PATH` (optional)
|
51
|
+
: The path to the directory where the training data is saved. The current directory is the default value.
|
44
52
|
|
45
|
-
`--tokenizer=TOKENIZER` (optional)
|
53
|
+
`-t`, `--tokenizer=TOKENIZER` (optional)
|
46
54
|
: Tokenizer for input which is not separated by whitespace. The only possible value is `mecab`.
|
47
55
|
|
48
56
|
### Trainer
|
data/lib/tiny-classifier/base.rb
CHANGED
@@ -26,27 +26,30 @@ class TinyClassifierBase
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def parse_command_line_options(command_line_options)
|
29
|
-
option_parser = create_option_parser
|
30
29
|
option_parser.parse!(command_line_options)
|
31
30
|
end
|
32
31
|
|
33
32
|
private
|
33
|
+
def option_parser
|
34
|
+
@option_parser ||= create_option_parser
|
35
|
+
end
|
36
|
+
|
34
37
|
def create_option_parser
|
35
38
|
parser = OptionParser.new
|
36
39
|
|
37
|
-
parser.on("--data-dir=PATH",
|
40
|
+
parser.on("-d PATH", "--data-dir=PATH",
|
38
41
|
"Path to the directory to store training data file (default=current directory)") do |data_dir|
|
39
42
|
@data_dir = data_dir
|
40
43
|
end
|
41
44
|
|
42
|
-
parser.on("--labels=LABELS",
|
45
|
+
parser.on("-l LABELS", "--labels=LABELS",
|
43
46
|
"List of labels (comma-separated)") do |labels|
|
44
47
|
@labels = normalize_labels(labels)
|
45
48
|
end
|
46
49
|
|
47
|
-
parser.on("--tokenizer=TOKENIZER",
|
50
|
+
parser.on("-t TOKENIZER", "--tokenizer=TOKENIZER",
|
48
51
|
"Tokenizer (default=#{@tokenizer})") do |tokenizer|
|
49
|
-
@tokenizer = tokenizer.to_sym
|
52
|
+
@tokenizer = tokenizer.downcase.to_sym
|
50
53
|
end
|
51
54
|
|
52
55
|
parser
|
@@ -113,7 +116,7 @@ class TinyClassifierBase
|
|
113
116
|
natto = Natto::MeCab.new
|
114
117
|
terms = []
|
115
118
|
natto.parse(@input) do |term|
|
116
|
-
if term.feature =~
|
119
|
+
if term.feature =~ /\A(名詞|形容詞|動詞)/
|
117
120
|
terms << term.surface
|
118
121
|
end
|
119
122
|
end
|
@@ -31,11 +31,12 @@ class Classifier < TinyClassifierBase
|
|
31
31
|
@input = params[:input]
|
32
32
|
prepare_input
|
33
33
|
if @input.empty?
|
34
|
-
|
34
|
+
STDERR.puts("Error: No effective input.")
|
35
|
+
false
|
35
36
|
else
|
36
37
|
label = classifier.classify(@input)
|
37
38
|
puts label.downcase
|
38
|
-
|
39
|
+
true
|
39
40
|
end
|
40
41
|
end
|
41
42
|
end
|
@@ -27,20 +27,46 @@ class Trainer < TinyClassifierBase
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
+
def initialize
|
31
|
+
super
|
32
|
+
option_parser.banner += " LABEL"
|
33
|
+
end
|
34
|
+
|
30
35
|
def run(params)
|
31
36
|
@label = params[:label]
|
32
37
|
@input = params[:input]
|
38
|
+
prepare_label
|
33
39
|
prepare_input
|
34
40
|
if @input.empty?
|
35
|
-
|
41
|
+
STDERR.puts("Error: No effective input.")
|
42
|
+
false
|
36
43
|
else
|
37
|
-
classifier.send("train_#{@label
|
44
|
+
classifier.send("train_#{@label}", @input)
|
38
45
|
save
|
39
|
-
|
46
|
+
true
|
40
47
|
end
|
41
48
|
end
|
42
49
|
|
43
50
|
private
|
51
|
+
def prepare_label
|
52
|
+
unless @label
|
53
|
+
STDERR.puts("Error: You need to specify the label for the input.")
|
54
|
+
exit(false)
|
55
|
+
end
|
56
|
+
|
57
|
+
@label = @label.downcase.strip
|
58
|
+
|
59
|
+
if @label.empty?
|
60
|
+
STDERR.puts("Error: You need to specify the label for the input.")
|
61
|
+
exit(false)
|
62
|
+
end
|
63
|
+
|
64
|
+
unless @labels.include?(@label.capitalize)
|
65
|
+
STDERR.puts("Error: You need to specify one of valid labels: #{@labels.join(', ')}")
|
66
|
+
exit(false)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
44
70
|
def save
|
45
71
|
data = Marshal.dump(classifier)
|
46
72
|
File.open(data_file_path, "w") do |file|
|
data/tiny-classifier.gemspec
CHANGED
@@ -21,7 +21,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
|
|
21
21
|
|
22
22
|
Gem::Specification.new do |spec|
|
23
23
|
spec.name = "tiny-classifier"
|
24
|
-
spec.version = "1.0"
|
24
|
+
spec.version = "1.1"
|
25
25
|
spec.homepage = "https://github.com/piroor/tiny-classifier"
|
26
26
|
spec.authors = ["YUKI \"Piro\" Hiroshi"]
|
27
27
|
spec.email = ["piro.outsider.reflex@gmail.com"]
|