libsvm_preprocessor 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +0 -0
- data/Rakefile +5 -0
- data/bin/libsvm_pp +33 -0
- data/lib/libsvm_preprocessor/cli.rb +57 -0
- data/lib/libsvm_preprocessor/feature_generator.rb +45 -0
- data/lib/libsvm_preprocessor/global.rb +3 -0
- data/lib/libsvm_preprocessor/preprocessor.rb +136 -0
- data/lib/libsvm_preprocessor/token_map.rb +30 -0
- data/lib/libsvm_preprocessor/tokenizer.rb +44 -0
- data/lib/libsvm_preprocessor/version.rb +3 -0
- data/lib/libsvm_prerpocessor.rb +1 -0
- data/spec/feature_generator_spec.rb +58 -0
- data/spec/preprocessor_spec.rb +111 -0
- data/spec/token_map_spec.rb +60 -0
- data/spec/tokenizer_spec.rb +36 -0
- metadata +90 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: ec1de3e3e31391a33e628f4dc4c6ace1b9c96cd3
|
4
|
+
data.tar.gz: e897e7149c5ace324fba715154402da3efc075d9
|
5
|
+
!binary "U0hBNTEy":
|
6
|
+
metadata.gz: 122a7ad95b42b0b2429aa69d8cac690c90800f870f851ac56a665c69e2ba933cb770b484f45ada754f4a7336a1e2e28c98ce8407f27ab7a98797aca8b8562613
|
7
|
+
data.tar.gz: f725942158aab1a7d8a34105ccad1639664e961f99739f5a05bbe67da1089ef4427200721f0ba527d1aa7fff2b3c52b158b95e2ccbaa84d4bfac64cc847879c4
|
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2013 by Andrea Nodari
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.md
ADDED
File without changes
|
data/Rakefile
ADDED
data/bin/libsvm_pp
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env ruby
# encoding: utf-8

# Command-line entry point of the gem.
#
# Parses the options with CLI.parse, checks that the first remaining argument
# is an existing file and hands it to Preprocessor#use, which writes one
# libsvm-formatted line per CSV row.

# Compare versions numerically: a plain String comparison is lexical and
# would, for instance, rank '10.0.0' below '2.0.0'.
if Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.0.0')
  puts 'This gem supports only Ruby 2.0.0+'
  exit 1
end

$LOAD_PATH.unshift(File.dirname(File.realpath(__FILE__)) + '/../lib')

require 'csv'
require 'libsvm_preprocessor/preprocessor'
require 'libsvm_preprocessor/cli'

# CLI.parse removes the recognised switches from ARGV, leaving the input path.
options = CLI.parse(ARGV)
input_path = ARGV[0]

# A missing argument is reported the same way as a non-existent file instead
# of crashing inside File.exist? with a TypeError.
if input_path.nil? || !File.exist?(input_path)
  puts "Please insert a real input file."
  exit 1
end

preprocessor = Preprocessor.new(options)
preprocessor.use(input_path, testing: options[:testing])

# Legacy, disabled code kept for reference (it refers to names that are not
# defined in this script):
#
# output_dir = File.dirname(File.realpath(__FILE__)) + '/../output'
# input_test = ARGV[1]
# output_test_path = "#{OUTPUT_DIR}/test.svm"
# output_test = File.open(output_test_path, "w")
# CSV.foreach(input_test, OPTIONS_INPUT) do |row|
#   vector = processor.toSVM(processor.push(row, testing: true))
#   output_test.puts vector
# end
# output_test.close
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
|
3
|
+
# Command-line option parser for the libsvm_pp executable.
class CLI
  # Feature modes handled by FeatureGenerator#features.
  MODES = [:unigram, :bigram, :trichar].freeze
  # Languages selectable from the command line.
  LANGUAGES = [:it, :en].freeze

  # Parses +args+ destructively: recognised switches are removed, positional
  # arguments (the input file) are left in place.
  #
  # @param args [Array<String>] command-line arguments, typically ARGV
  # @return [Hash] options with the keys :mode, :lang, :stemming, :stopword,
  #   :stopwords, :testing, :numeric_type and :output
  # @raise [OptionParser::ParseError] on unknown switches or invalid values
  def self.parse(args)
    options = {
      mode: :unigram,
      lang: :it,
      stemming: false,
      # :stopword is the key read by Tokenizer and by the Preprocessor
      # presets; :stopwords is kept as a backward-compatible alias.
      stopword: false,
      stopwords: false,
      testing: false,
      numeric_type: nil,
      output: nil
    }

    opt_parser = OptionParser.new do |opts|
      opts.banner = "libsvm_pp [options] <filename>"

      opts.on("-m [TYPE]", "--mode [TYPE]", MODES,
              "Select unigram (default), bigram or trichar") do |mode|
        # The argument is optional: keep the default when it is omitted.
        options[:mode] = mode if mode
      end

      opts.on("-s", "--stemming", "Use this you want stemming") do |s|
        options[:stemming] = s
      end

      opts.on("-w", "--remove-stopwords",
              "Use this if you want remove stopwords") do |w|
        options[:stopword] = w
        options[:stopwords] = w
      end

      opts.on("-t", "--testing",
              "Use this to use testing mode") do |t|
        options[:testing] = t
      end

      opts.on("-l [TYPE]", "--language [TYPE]", LANGUAGES,
              "Select your language it / en") do |l|
        options[:lang] = l if l
      end

      opts.on("-n N", Integer, "Numeric mode") do |n|
        options[:numeric_type] = n
      end

      opts.on("-o [output]", String, "output file") do |o|
        options[:output] = o if o
      end
    end

    opt_parser.parse!(args)
    options
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# Turns an array of terms into n-gram features and maps every n-gram to a
# numeric id through a TokenMap.
class FeatureGenerator
  # @param options [Hash] only :mode is read here (:unigram, :bigram or
  #   :trichar); it defaults to :unigram. The hash is modified in place.
  def initialize(options = {})
    @token_map = TokenMap.new
    @options = options
    @options[:mode] ||= :unigram
  end

  # @return [Hash] the n-gram => id table of the underlying TokenMap
  def hash_of_ngrams
    @token_map.hash_of_ngrams
  end

  # Builds the features of one document.
  #
  # @param ary_of_terms [Array<String>] tokens of the document
  # @param testing [Boolean] forwarded to TokenMap#token_map for every mode
  #   (the trichar mode used to drop it, so test data registered new ids)
  # @return [Array<Hash>] one single-pair hash { id => ngram } per feature
  # @raise [ArgumentError] when the configured mode is not supported
  def features(ary_of_terms, testing: false)
    ngrams =
      case @options[:mode]
      when :unigram then unigrams(ary_of_terms)
      when :bigram  then unigrams(ary_of_terms) + bigrams(ary_of_terms)
      when :trichar then trichar(ary_of_terms)
      else
        raise ArgumentError, "unsupported mode: #{@options[:mode].inspect}"
      end

    @token_map.token_map(ngrams, testing: testing)
  end

  # Character trigrams of the terms joined by single spaces.
  #
  # @return [Array<Array<String>>] each trigram wrapped in a one-element array;
  #   a text shorter than three characters is returned whole
  def trichar(ary_of_terms)
    string = ary_of_terms.join(" ")
    return [[string]] if string.size < 3

    string.chars.each_cons(3).map { |triple| [triple.join] }
  end

  # @return [Array<Array<String>>] every term wrapped in a one-element array
  def unigrams(ary_of_term)
    ary_of_term.map { |term| [term] }
  end

  # @return [Array<Array<String>>] pairs of adjacent terms; empty when fewer
  #   than two terms are given (an empty input used to raise TypeError)
  def bigrams(ary)
    ary.each_cons(2).to_a
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'libsvm_preprocessor/tokenizer'
|
2
|
+
require 'libsvm_preprocessor/token_map'
|
3
|
+
require 'libsvm_preprocessor/feature_generator'
|
4
|
+
require 'libsvm_preprocessor/global'
|
5
|
+
|
6
|
+
# Converts (category, text) pairs into libsvm vectors.
#
# Texts are tokenized by a Tokenizer, turned into features by a
# FeatureGenerator and counted; categories receive incremental ids starting
# at 0 in order of first appearance.
class Preprocessor
  # Presets selectable through options[:numeric_type].
  OPTIONS_MAP = {
    0  => { lang: "it", mode: :unigram, stemming: false, stopword: false },
    1  => { lang: "it", mode: :bigram,  stemming: false, stopword: false },
    2  => { lang: "it", mode: :unigram, stemming: true,  stopword: false },
    3  => { lang: "it", mode: :bigram,  stemming: true,  stopword: false },
    4  => { lang: "it", mode: :unigram, stemming: false, stopword: true },
    5  => { lang: "it", mode: :bigram,  stemming: false, stopword: true },
    6  => { lang: "it", mode: :unigram, stemming: true,  stopword: true },
    7  => { lang: "it", mode: :bigram,  stemming: true,  stopword: true },
    8  => { lang: "it", mode: :trichar, stemming: true,  stopword: true },
    9  => { lang: "it", mode: :trichar, stemming: true,  stopword: false },
    10 => { lang: "it", mode: :trichar, stemming: false, stopword: true },
    11 => { lang: "it", mode: :trichar, stemming: false, stopword: false },
  }.freeze

  # category => id
  attr_reader :categories
  # { testing: [vectors], training: [vectors] }
  attr_reader :instances
  # { testing: Integer, training: Integer }: distinct features summed over
  # the pushed vectors
  attr_reader :non_zero_features
  # The effective options (after a preset, if any, has been applied).
  attr_reader :options

  # @return [Integer] number of available presets
  def self.options_map_size
    OPTIONS_MAP.size
  end

  # @param key [Integer] preset number
  # @return [String] human-readable description of the preset
  def self.options_map(key)
    OPTIONS_MAP[key].map { |k, v| "#{k}: #{v}" }.join(" | ")
  end

  # @param options [Hash] :lang, :mode, :stemming, :stopword, :output and
  #   :numeric_type; when :numeric_type is set, the matching preset overrides
  #   the tokenization and feature settings
  # @raise [ArgumentError] when :numeric_type names no preset
  def initialize(options = {})
    options = override_options(options) if options[:numeric_type]
    @options = options
    @tokenizer = Tokenizer.new(options)
    @generator = FeatureGenerator.new(options)

    @non_zero_features = { testing: 0, training: 0 }
    @instances = { testing: [], training: [] }

    @categories = {}
    @current_category_id = -1
  end

  # @return [Hash] the n-gram => id table built so far
  def hash_of_ngrams
    @generator.hash_of_ngrams
  end

  # Applies the preset selected by options[:numeric_type].
  #
  # The result is a new hash: the preset values win, every other key (such as
  # :output) is kept, and the OPTIONS_MAP constant is never handed out for
  # modification.
  #
  # @raise [ArgumentError] when the preset does not exist
  def override_options(options)
    preset = OPTIONS_MAP.fetch(options[:numeric_type]) do |key|
      raise ArgumentError,
            "unknown numeric_type #{key.inspect} (expected 0..#{OPTIONS_MAP.size - 1})"
    end
    options.merge(preset)
  end

  # Vectorizes one row and records it.
  #
  # @param data [Array] category followed by the text
  # @param testing [Boolean] store the vector under :testing and do not
  #   register new features
  # @return [Array] [category_id, [{ feature_id => frequency }, ...]]
  def push(data, testing: false)
    category, string = data
    # Ids are integers, hence always truthy: ||= assigns only new categories.
    @categories[category] ||= next_category_id

    v = vectorize(category, string, testing: testing)
    bucket = testing ? :testing : :training
    @instances[bucket] << v
    @non_zero_features[bucket] += v.last.size
    v
  end

  # @param vector [Array] a vector as returned by #push
  # @return [String] "<category> <id>:<freq> ..." in libsvm format
  def toSVM(vector)
    # the following line is made to have clean diff with libshorttext
    return "#{vector.first} " if vector.last.empty?

    features = vector.last.map do |pair|
      id, frequency = pair.first
      "#{id}:#{frequency}"
    end
    "#{vector.first} #{features.join(' ')}"
  end

  # This method is only meant to stringify the vector in very same
  # format of libsvm (in this way diff does not mess up)
  def nice_string(v)
    return v.join(" ") if v[1] != ""

    "#{v[0]} "
  end

  # Converts every row of the CSV file at +input_path+ and writes the
  # resulting lines to options[:output], or to standard output when no output
  # file is configured.
  #
  # @param input_path [String] path of the CSV input
  # @param testing [Boolean] forwarded to #push for every row
  def use(input_path, testing: false)
    require 'csv' # loaded lazily: no other method of this class needs it

    if @options[:output]
      # The block form closes the file even when a row raises.
      File.open(@options[:output], "w") do |output_file|
        write_svm(input_path, output_file, testing)
      end
    else
      write_svm(input_path, $stdout, testing)
    end
  end

  private

  # Writes one libsvm line per CSV row to +io+.
  def write_svm(input_path, io, testing)
    # The options are splatted because CSV.foreach takes them as keywords;
    # a positional hash is interpreted as the open mode on Ruby 3.
    # NOTE(review): CSV_OPTIONS is defined in global.rb (not visible here);
    # this assumes its keys are symbols - confirm.
    CSV.foreach(input_path, **::LibsvmPreprocessor::CSV_OPTIONS) do |row|
      io.puts toSVM(push(row, testing: testing))
    end
  end

  def vectorize(category, string, testing: false)
    tokens = @tokenizer.tokenize(string)
    features = @generator.features(tokens, testing: testing)

    [@categories[category], count_frequency(features)]
  end

  # @param features [Array<Hash>] single-pair hashes { id => ngram }
  # @return [Array<Hash>] { id => occurrences }, sorted by id
  def count_frequency(features)
    counts = Hash.new(0)
    features.each { |feature| counts[feature.keys.first] += 1 }
    counts.sort.map { |id, count| { id => count } }
  end

  # Give the next category id available
  def next_category_id
    @current_category_id += 1
  end
end
|
136
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Assigns incremental numeric ids (starting at 1) to n-grams and remembers
# them across calls.
class TokenMap
  # n-gram => id, for every n-gram registered so far
  attr_reader :hash_of_ngrams

  def initialize
    @hash_of_ngrams = {}
    @current_ngram_id = 0
  end

  # Maps each n-gram to its id.
  #
  # @param ary_of_ngrams [Array] n-grams (each one is used as a hash key)
  # @param testing [Boolean] when true no new id is created and unknown
  #   n-grams are left out of the result
  # @return [Array<Hash>] one single-pair hash { id => ngram } per n-gram, in
  #   input order; repeated n-grams yield repeated entries
  def token_map(ary_of_ngrams, testing: false)
    ary_of_ngrams.each_with_object([]) do |ngram, mapped|
      id = testing ? @hash_of_ngrams[ngram] : (@hash_of_ngrams[ngram] ||= next_ngram_id)
      # id is nil only for an n-gram that is unknown while testing.
      mapped << { id => ngram } if id
    end
  end

  private

  # Give the next term id available
  def next_ngram_id
    @current_ngram_id += 1
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'lingua/stemmer'
|
2
|
+
require 'stopwords'
|
3
|
+
require 'unicode'
|
4
|
+
|
5
|
+
# Splits a text into lowercase alphabetic tokens, optionally removing
# stopwords and stemming every token.
class Tokenizer
  # @param options [Hash] :stopword (default false), :stemming (default
  #   false) and :lang (default "it"); the hash is modified in place to hold
  #   the defaults
  def initialize(options = {})
    @options = options
    @options[:stopword] ||= false
    @options[:stemming] ||= false
    @options[:lang] ||= "it"

    # The language arrives as a Symbol from the CLI and as a String from the
    # Preprocessor presets: hand the same String form to both libraries.
    lang = @options[:lang].to_s
    @filter = Stopwords::Snowball::Filter.new(lang)
    @stemmer = Lingua::Stemmer.new(language: lang)
  end

  # @param string [String, nil] text to tokenize (nil counts as empty text)
  # @return [Array<String>] tokens after the enabled processing steps
  def tokenize(string)
    result = process_text(string)
    result = remove_stopwords(result) if @options[:stopword]
    result = stem_each(result) if @options[:stemming]
    result
  end

  # Lowercases the text, decomposes it with Unicode.nfd, replaces every
  # non-alphabetic character with a space and splits on whitespace.
  #
  # The argument is never modified, so frozen strings are accepted. Digits
  # are removed together with punctuation.
  #
  # @return [Array<String>]
  def process_text(string)
    text = Unicode.nfd(string.to_s.downcase)
    # split(' ') ignores leading, trailing and repeated whitespace.
    text.gsub(/[^[:alpha:]]/, ' ').split(' ')
  end

  # Remove stopwords according to the selected language
  def remove_stopwords(ary)
    @filter.filter(ary)
  end

  # Stem each word according to the selected language
  def stem_each(ary)
    ary.map { |term| @stemmer.stem(term) }
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'libsvm_preprocessor/preprocessor'
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'libsvm_preprocessor/preprocessor'
|
3
|
+
|
4
|
+
# Specs for FeatureGenerator: unigram, bigram and character-trigram features.
describe FeatureGenerator do
  let(:ary_of_terms) { ["a", "b", "c"] }
  let(:ary) { ["mar", "rosso"] }

  context "with default options" do
    let(:generator) { FeatureGenerator.new }

    it "uses unigrams" do
      expected = [{ 1 => ["a"] }, { 2 => ["b"] }, { 3 => ["c"] }]
      expect(generator.features(ary_of_terms)).to eq(expected)
    end

    it "leaves out unknown terms in testing mode" do
      generator.features(ary_of_terms)
      expect(generator.features(["a", "z"], testing: true)).to eq([{ 1 => ["a"] }])
    end
  end

  context "using bigrams" do
    let(:generator) { FeatureGenerator.new(:mode => :bigram) }

    it "uses unigrams followed by bigrams" do
      expected = [{ 1 => ["a"] }, { 2 => ["b"] }, { 3 => ["c"] },
                  { 4 => ["a", "b"] }, { 5 => ["b", "c"] }]
      expect(generator.features(ary_of_terms)).to eq(expected)
    end

    # Duplicates are not dropped: they are mapped to the same id.
    it "gives duplicated terms the same id" do
      expected = [{ 1 => ["a"] }, { 1 => ["a"] }, { 2 => ["a", "a"] }]
      expect(generator.features(["a", "a"])).to eq(expected)
    end
  end

  context "using trichar" do
    let(:generator) { FeatureGenerator.new(:mode => :trichar) }

    it "uses character trigrams" do
      expected = [{ 1 => ["mar"] }, { 2 => ["ar "] }, { 3 => ["r r"] }, { 4 => [" ro"] },
                  { 5 => ["ros"] }, { 6 => ["oss"] }, { 7 => ["sso"] }]
      expect(generator.features(ary)).to eq(expected)
    end

    it "gives duplicated trigrams the same id" do
      expected = [{ 1 => ["aaa"] }, { 1 => ["aaa"] }, { 1 => ["aaa"] }]
      expect(generator.features(["aaaaa"])).to eq(expected)
    end

    it "keeps a text shorter than three characters whole" do
      expected = [{ 1 => ["te"] }]
      expect(generator.features(["te"])).to eq(expected)
    end

    it "builds trigrams across short words" do
      expected = [{ 1 => ["te "] }, { 2 => ["e n"] }, { 3 => [" ne"] }]
      expect(generator.features(["te", "ne"])).to eq(expected)
    end
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'libsvm_preprocessor/preprocessor'
|
3
|
+
|
4
|
+
# Specs for Preprocessor: category ids, frequency vectors, libsvm output and
# testing mode, for the default (unigram), trichar and bigram configurations.
describe Preprocessor do
  describe "default settings" do
    let(:preproc) { Preprocessor.new }
    let(:p_trichar) { Preprocessor.new(mode: :trichar) }

    context "adding a text" do
      it "maps new categories" do
        preproc.push(["category", "bottiglia"])
        expect(preproc.categories["category"]).to eq 0
      end

      it "numbers categories in order of appearance" do
        preproc.push(["first", "bottiglia"])
        preproc.push(["second", "vetro"])
        preproc.push(["first", "plastica"])
        expect(preproc.categories).to eq({ "first" => 0, "second" => 1 })
      end
    end

    context "with default settings" do
      it "produce a new vector" do
        v = preproc.push(["category", "bottiglia"])
        expect(v).to eq([0, [{ 1 => 1 }]])
      end

      it "takes into account frequencies" do
        v = preproc.push(["category", "bottiglia bottiglia bottiglia"])
        expect(v).to eq([0, [{ 1 => 3 }]])
      end

      it "produce svm format" do
        v = preproc.push(["category", "bottiglia bottiglia bottiglia"])
        expect(preproc.toSVM(v)).to eq("0 1:3")
      end
    end

    context "with trichar mode" do
      it "produce a new vector with frequencies" do
        v = p_trichar.push(["category", "osso osso"])
        expect(v).to eq([0, [{ 1 => 2 }, { 2 => 2 }, { 3 => 1 }, { 4 => 1 }, { 5 => 1 }]])
      end
    end

    context "when I am testing" do
      it "ignore new words" do
        v = preproc.push(["category", "bottiglia"], testing: true)
        expect(v).to eq([0, []])
      end

      it "remembers the old ones" do
        preproc.push(["category", "bottiglia"], testing: false)
        v = preproc.push(["category", "bottiglia vetro"], testing: true)
        expect(v).to eq([0, [{ 1 => 1 }]])
      end

      it "produce svm format with blank features" do
        v = preproc.push(["category", "bottiglia"], testing: true)
        expect(preproc.toSVM(v)).to eq("0 ")
      end
    end
  end

  describe "using bigrams as feature" do
    let(:preproc) { Preprocessor.new(mode: :bigram) }

    context "adding a text" do
      it "maps new categories" do
        preproc.push(["category", "bottiglia"])
        expect(preproc.categories["category"]).to eq 0
      end
    end

    context "simple vectorization" do
      it "produce a new vector" do
        v = preproc.push(["category", "bottiglia"])
        expect(v).to eq([0, [{ 1 => 1 }]])
      end

      it "takes into account frequencies" do
        v = preproc.push(["category", "bottiglia bottiglia bottiglia"])
        expect(v).to eq([0, [{ 1 => 3 }, { 2 => 2 }]])
      end

      it "produce svm format" do
        v = preproc.push(["category", "bottiglia bottiglia bottiglia"])
        expect(preproc.toSVM(v)).to eq("0 1:3 2:2")
      end
    end

    context "when I am testing" do
      it "ignore new words" do
        v = preproc.push(["category", "bottiglia"], testing: true)
        expect(v).to eq([0, []])
      end

      it "remembers the old ones" do
        preproc.push(["category", "bottiglia"], testing: false)
        v = preproc.push(["category", "bottiglia vetro"], testing: true)
        expect(v).to eq([0, [{ 1 => 1 }]])
      end

      it "produce svm format with blank features" do
        v = preproc.push(["category", "bottiglia"], testing: true)
        expect(preproc.toSVM(v)).to eq("0 ")
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'libsvm_preprocessor/preprocessor'
|
3
|
+
|
4
|
+
# Specs for TokenMap: id assignment, id reuse across calls and testing mode.
describe TokenMap do
  let(:token_map) { TokenMap.new }

  context "with tokens never seen before" do
    it "assigns incremental ids starting at 1" do
      ngrams = token_map.token_map([["bottiglia"], ["di"], ["vetro"]])
      expected = [{ 1 => ["bottiglia"] }, { 2 => ["di"] }, { 3 => ["vetro"] }]
      expect(ngrams).to eq(expected)
    end
  end

  context "with tokens seen in a previous call" do
    it "reuses the old ids and continues the numbering" do
      token_map.token_map([["bottiglia"], ["di"], ["vetro"]])
      ngrams = token_map.token_map([["bottiglia"], ["di"], ["plastica"]])
      expected = [{ 1 => ["bottiglia"] }, { 2 => ["di"] }, { 4 => ["plastica"] }]
      expect(ngrams).to eq(expected)
    end

    it "reuses the old ids of trichars as well" do
      token_map.token_map([["abc"], ["bc "], ["c a"], [" ab"], ["abc"]])
      ngrams = token_map.token_map([["abc"], ["c a"], ["bot"]])
      expected = [{ 1 => ["abc"] }, { 3 => ["c a"] }, { 5 => ["bot"] }]
      expect(ngrams).to eq(expected)
    end
  end

  context "with duplicated tokens" do
    # Duplicates stay in the result; they only share the same id.
    it "maps every occurrence to the same id" do
      ngrams = token_map.token_map([["bottiglia"], ["di"], ["plastica"], ["plastica"]])
      expected = [{ 1 => ["bottiglia"] }, { 2 => ["di"] },
                  { 3 => ["plastica"] }, { 3 => ["plastica"] }]
      expect(ngrams).to eq(expected)
    end
  end

  context "if I am creating a test file" do
    it "does not consider new terms" do
      token_map.token_map([["bottiglia"], ["di"], ["plastica"]])
      ngrams = token_map.token_map([["polenta"], ["valsugana"]], testing: true)

      expect(ngrams).to eq([])
    end

    it "does not consider new terms but remembers the old ones" do
      token_map.token_map([["bottiglia"], ["di"], ["plastica"]])
      ngrams = token_map.token_map([["tappo"], ["plastica"]], testing: true)

      expect(ngrams).to eq([{ 3 => ["plastica"] }])
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'libsvm_preprocessor/preprocessor'
|
3
|
+
|
4
|
+
# Specs for Tokenizer: plain tokenization, stopword removal and stemming
# (Italian is the default language).
describe Tokenizer do
  let(:tokenizer) { Tokenizer.new }

  context "tokenizer with default settings" do
    it "tokenize a single word" do
      tokens = tokenizer.tokenize("bottiglia")
      expect(tokens).to eq(["bottiglia"])
    end

    # This example used to repeat the single-word input.
    it "tokenize multiple words" do
      tokens = tokenizer.tokenize("bottiglia di vetro")
      expect(tokens).to eq(["bottiglia", "di", "vetro"])
    end
  end

  context "tokenizer with stopword removal" do
    let(:tokenizer) { Tokenizer.new(stopword: true) }

    it "tokenize removing stopwords" do
      tokens = tokenizer.tokenize("bottiglia di vetro")
      expect(tokens).to eq(["bottiglia", "vetro"])
    end
  end

  context "tokenizer with stemming" do
    let(:tokenizer) { Tokenizer.new(stemming: true) }

    it "tokenize stemming each word" do
      tokens = tokenizer.tokenize("bottiglia di vetro")
      expect(tokens).to eq(["bottigl", "di", "vetr"])
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: libsvm_preprocessor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrea Nodari
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-05-31 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: stopwords-filter
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.2.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.2.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: ruby-stemmer
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.9.3
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.9.3
|
41
|
+
description: |2
|
42
|
+
It's a text preprocessor that generate a libsvm input file
|
43
|
+
email: andrea.nodari91@gmail.com
|
44
|
+
executables:
|
45
|
+
- libsvm_pp
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- README.md
|
50
|
+
- Rakefile
|
51
|
+
- LICENSE
|
52
|
+
- lib/libsvm_preprocessor/cli.rb
|
53
|
+
- lib/libsvm_preprocessor/feature_generator.rb
|
54
|
+
- lib/libsvm_preprocessor/global.rb
|
55
|
+
- lib/libsvm_preprocessor/preprocessor.rb
|
56
|
+
- lib/libsvm_preprocessor/token_map.rb
|
57
|
+
- lib/libsvm_preprocessor/tokenizer.rb
|
58
|
+
- lib/libsvm_preprocessor/version.rb
|
59
|
+
- lib/libsvm_prerpocessor.rb
|
60
|
+
- bin/libsvm_pp
|
61
|
+
- spec/feature_generator_spec.rb
|
62
|
+
- spec/preprocessor_spec.rb
|
63
|
+
- spec/token_map_spec.rb
|
64
|
+
- spec/tokenizer_spec.rb
|
65
|
+
homepage: http://github.com/nodo/libsvm_preprocessor
|
66
|
+
licenses:
|
67
|
+
- MIT
|
68
|
+
metadata: {}
|
69
|
+
post_install_message:
|
70
|
+
rdoc_options: []
|
71
|
+
require_paths:
|
72
|
+
- lib
|
73
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
requirements: []
|
84
|
+
rubyforge_project:
|
85
|
+
rubygems_version: 2.0.0.preview3.1
|
86
|
+
signing_key:
|
87
|
+
specification_version: 4
|
88
|
+
summary: It's a text preprocessor that generate a libsvm input file
|
89
|
+
test_files: []
|
90
|
+
has_rdoc: false
|