libsvm_preprocessor 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +0 -0
- data/Rakefile +5 -0
- data/bin/libsvm_pp +33 -0
- data/lib/libsvm_preprocessor/cli.rb +57 -0
- data/lib/libsvm_preprocessor/feature_generator.rb +45 -0
- data/lib/libsvm_preprocessor/global.rb +3 -0
- data/lib/libsvm_preprocessor/preprocessor.rb +136 -0
- data/lib/libsvm_preprocessor/token_map.rb +30 -0
- data/lib/libsvm_preprocessor/tokenizer.rb +44 -0
- data/lib/libsvm_preprocessor/version.rb +3 -0
- data/lib/libsvm_prerpocessor.rb +1 -0
- data/spec/feature_generator_spec.rb +58 -0
- data/spec/preprocessor_spec.rb +111 -0
- data/spec/token_map_spec.rb +60 -0
- data/spec/tokenizer_spec.rb +36 -0
- metadata +90 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: ec1de3e3e31391a33e628f4dc4c6ace1b9c96cd3
+  data.tar.gz: e897e7149c5ace324fba715154402da3efc075d9
+!binary "U0hBNTEy":
+  metadata.gz: 122a7ad95b42b0b2429aa69d8cac690c90800f870f851ac56a665c69e2ba933cb770b484f45ada754f4a7336a1e2e28c98ce8407f27ab7a98797aca8b8562613
+  data.tar.gz: f725942158aab1a7d8a34105ccad1639664e961f99739f5a05bbe67da1089ef4427200721f0ba527d1aa7fff2b3c52b158b95e2ccbaa84d4bfac64cc847879c4
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
+Copyright (c) 2013 by Andrea Nodari
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.md
ADDED
File without changes
data/Rakefile
ADDED
data/bin/libsvm_pp
ADDED
@@ -0,0 +1,33 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+if RUBY_VERSION < '2.0.0'
+  puts 'This gem supports only Ruby 2.0.0+'
+  exit 1
+else
+  $LOAD_PATH.unshift(File.dirname(File.realpath(__FILE__)) + '/../lib')
+
+  require 'csv'
+  require 'libsvm_preprocessor/preprocessor'
+  require 'libsvm_preprocessor/cli'
+
+  options = CLI.parse(ARGV)
+
+  if !File.exist? ARGV[0]
+    puts "Please insert a real input file."
+    exit 1
+  end
+
+  preprocessor = Preprocessor.new(options)
+  preprocessor.use(ARGV[0], testing: options[:testing])
+end
+
+# output_dir = File.dirname(File.realpath(__FILE__)) + '/../output'
+# input_test = ARGV[1]
+# output_test_path = "#{OUTPUT_DIR}/test.svm"
+# output_test = File.open(output_test_path, "w")
+# CSV.foreach(input_test, OPTIONS_INPUT) do |row|
+#   vector = processor.toSVM(processor.push(row, testing: true))
+#   output_test.puts vector
+# end
+# output_test.close
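For context, the executable boils down to roughly the following Ruby once option parsing is done. This is an illustrative sketch, not shipped code: "train.csv" is a made-up path, the two-column category,text row layout is inferred from the specs further down, and the CSV options from global.rb (not shown in this diff) are replaced by CSV defaults. The gem's runtime dependencies (stopwords-filter, ruby-stemmer, unicode) must be installed.

  # Sketch of what bin/libsvm_pp does per CSV row (assumed input file "train.csv").
  require 'csv'
  require 'libsvm_preprocessor/preprocessor'

  preprocessor = Preprocessor.new(mode: :unigram, lang: :it)
  CSV.foreach("train.csv") do |row|   # row ~ ["category", "some text"]
    puts preprocessor.toSVM(preprocessor.push(row, testing: false))
  end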
data/lib/libsvm_preprocessor/cli.rb
ADDED
@@ -0,0 +1,57 @@
+require 'optparse'
+
+class CLI
+
+  def self.parse(args)
+
+    options = {}
+
+    options[:mode] = :unigram
+    options[:lang] = :it
+    options[:stemming] = false
+    options[:stopwords] = false
+    options[:testing] = false
+    options[:numeric_type] = nil
+    options[:output] = nil
+
+    opt_parser = OptionParser.new do |opts|
+      opts.banner = "libsvm_pp [options] <filename>"
+
+      opts.on("-m [TYPE]", "--mode [TYPE]", [:unigram, :bigram],
+              "Select unigram (default) or bigram") do |mode|
+        options[:mode] = mode
+      end
+
+      opts.on("-s", "--stemming", "Use this if you want stemming") do |s|
+        options[:stemming] = s
+      end
+
+      opts.on("-w", "--remove-stopwords",
+              "Use this if you want to remove stopwords") do |w|
+        options[:stopwords] = w
+      end
+
+      opts.on("-t", "--testing",
+              "Use this to use testing mode") do |t|
+        options[:testing] = t
+      end
+
+      opts.on("-l [TYPE]", "--language", [:it, :en],
+              "Select your language it / en") do |l|
+        options[:lang] = l
+      end
+
+      opts.on("-n N", Integer, "Numeric mode") do |n|
+        options[:numeric_type] = n
+      end
+
+      opts.on("-o [output]", String, "output file") do |o|
+        options[:output] = o
+      end
+    end
+
+    opt_parser.parse!(args)
+    options
+  end
+
+end
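A quick sketch of how these defaults and flags combine (the argument vector is invented; it assumes the gem's lib directory is on the load path):

  require 'libsvm_preprocessor/cli'

  options = CLI.parse(%w[--mode=bigram --stemming --remove-stopwords train.csv])
  options[:mode]       #=> :bigram
  options[:stemming]   #=> true
  options[:stopwords]  #=> true
  options[:lang]       #=> :it (default)
  # parse! strips the switches, so "train.csv" is left in the array as the
  # positional input path that bin/libsvm_pp reads from ARGV[0].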
data/lib/libsvm_preprocessor/feature_generator.rb
ADDED
@@ -0,0 +1,45 @@
+class FeatureGenerator
+
+  def hash_of_ngrams
+    @token_map.hash_of_ngrams
+  end
+
+  def initialize(options = {})
+    @token_map = TokenMap.new
+    @options = options
+    @options[:mode] ||= :unigram
+  end
+
+  def features(ary_of_terms, testing: false)
+    if @options[:mode] == :unigram
+      @token_map.token_map(unigrams(ary_of_terms), testing: testing)
+    elsif @options[:mode] == :bigram
+      @token_map.token_map(unigrams(ary_of_terms) +
+                           bigrams(ary_of_terms),
+                           testing: testing)
+    elsif @options[:mode] == :trichar
+      @token_map.token_map trichar(ary_of_terms)
+    end
+  end
+
+  def trichar(ary_of_terms)
+    string = ary_of_terms.join(" ")
+    if string.size < 3
+      return [ [string] ]
+    end
+    string1 = string[0...-2].split(//)
+    string2 = string[1...-1].split(//)
+    string3 = string[2..-1].split(//)
+    string1.zip(string2).zip(string3).map do |x|
+      [x.flatten.join]
+    end
+  end
+
+  def unigrams(ary_of_term)
+    ary_of_term.map { |term| [term] }
+  end
+
+  def bigrams(ary)
+    ary[0...-1].zip(ary[1..-1])
+  end
+end
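To make the modes concrete, an irb-style sketch (illustrative values; TokenMap assigns ids from 1 in insertion order, as its specs below show):

  require 'libsvm_preprocessor/token_map'
  require 'libsvm_preprocessor/feature_generator'

  fg = FeatureGenerator.new(mode: :bigram)
  fg.features(%w[mar rosso])
  #=> [{1=>["mar"]}, {2=>["rosso"]}, {3=>["mar", "rosso"]}]

  # :trichar joins the terms with spaces and slides a 3-character window:
  FeatureGenerator.new(mode: :trichar).trichar(%w[mar rosso]).first(3)
  #=> [["mar"], ["ar "], ["r r"]]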
data/lib/libsvm_preprocessor/preprocessor.rb
ADDED
@@ -0,0 +1,136 @@
+require 'libsvm_preprocessor/tokenizer'
+require 'libsvm_preprocessor/token_map'
+require 'libsvm_preprocessor/feature_generator'
+require 'libsvm_preprocessor/global'
+
+class Preprocessor
+  attr_reader :categories
+  attr_reader :instances
+  attr_reader :non_zero_features
+
+  OPTIONS_MAP = {
+    0 => { lang: "it", mode: :unigram, stemming: false, stopword: false },
+    1 => { lang: "it", mode: :bigram, stemming: false, stopword: false },
+    2 => { lang: "it", mode: :unigram, stemming: true, stopword: false },
+    3 => { lang: "it", mode: :bigram, stemming: true, stopword: false },
+    4 => { lang: "it", mode: :unigram, stemming: false, stopword: true },
+    5 => { lang: "it", mode: :bigram, stemming: false, stopword: true },
+    6 => { lang: "it", mode: :unigram, stemming: true, stopword: true },
+    7 => { lang: "it", mode: :bigram, stemming: true, stopword: true },
+    8 => { lang: "it", mode: :trichar, stemming: true, stopword: true },
+    9 => { lang: "it", mode: :trichar, stemming: true, stopword: false },
+    10 => { lang: "it", mode: :trichar, stemming: false, stopword: true },
+    11 => { lang: "it", mode: :trichar, stemming: false, stopword: false },
+  }
+
+  def hash_of_ngrams
+    @generator.hash_of_ngrams
+  end
+
+  def override_options(options)
+    OPTIONS_MAP[options[:numeric_type]]
+  end
+
+  def self.options_map_size
+    OPTIONS_MAP.size
+  end
+
+  def self.options_map(key)
+    OPTIONS_MAP[key].map { |k, v| "#{k}: #{v}" }.join(" | ")
+  end
+
+  def options
+    @options
+  end
+
+  def initialize(options = {})
+    if options[:numeric_type]
+      options = override_options(options)
+    end
+    @options = options
+    @tokenizer = Tokenizer.new(options)
+    @generator = FeatureGenerator.new(options)
+
+    @non_zero_features = {}
+    @non_zero_features[:testing] = 0
+    @non_zero_features[:training] = 0
+
+    @instances = {}
+    @instances[:testing] = []
+    @instances[:training] = []
+
+    @categories = {}
+    @current_category_id = -1
+  end
+
+  def push(data, testing: false)
+    category, string = data
+    # If it is a new category I need to associate a new id
+    if !@categories[category]
+      @categories[category] = next_category_id
+    end
+    v = vectorize(category, string, testing: testing)
+    if testing
+      @instances[:testing] << v
+      @non_zero_features[:testing] += v.last.size
+    else
+      @instances[:training] << v
+      @non_zero_features[:training] += v.last.size
+    end
+    return v
+  end
+
+  def toSVM(vector)
+    # the trailing space keeps the output diff-clean against libshorttext
+    return "#{vector.first} " if vector.last.empty?
+    features = vector.last
+      .map { |h| "#{h.keys.first}:#{h[h.keys.first]}" }.join(" ")
+    "#{vector.first} #{features}"
+  end
+
+  # This method is only meant to stringify the vector in the very same
+  # format as libsvm (in this way diff does not mess up)
+  def nice_string(v)
+    return v.join(" ") if v[1] != ""
+    return "#{v[0]} "
+  end
+
+  def use(input_path, testing: false)
+    if @options[:output]
+      output_file = File.open(@options[:output], "w")
+      CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
+        output_file.puts toSVM( push(row, testing: testing) )
+      end
+      output_file.close
+    else
+      CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
+        puts toSVM( push(row, testing: testing) )
+      end
+    end
+  end
+
+  private
+
+  def vectorize(category, string, testing: false)
+    tokens = @tokenizer.tokenize(string)
+    features = @generator.features(tokens, testing: testing)
+    ids_with_frequency = count_frequency(features)
+
+    [ @categories[category], ids_with_frequency ]
+  end
+
+  def count_frequency(features)
+    ids = features.map { |x| x.keys.first }.sort
+    result = ids.uniq.map do |id|
+      { id => ids.count(id) }
+    end
+    result
+  end
+
+  # Give the next category id available
+  def next_category_id
+    @current_category_id += 1
+  end
+
+end
+
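The push/toSVM round trip, mirroring the specs below (a sketch; building a Preprocessor always constructs a Tokenizer, so the stemming/stopword gems must be installed, and the "other"/"vetro" values are invented):

  require 'libsvm_preprocessor/preprocessor'

  pre = Preprocessor.new(mode: :bigram)
  v = pre.push(["category", "bottiglia bottiglia bottiglia"])
  v             #=> [0, [{1=>3}, {2=>2}]]
  pre.toSVM(v)  #=> "0 1:3 2:2"

  # In testing mode unseen n-grams are dropped, so a vector can come back empty:
  pre.toSVM(pre.push(["other", "vetro"], testing: true))  #=> "1 "

  # The -n / numeric_type switch just selects a preset from OPTIONS_MAP:
  Preprocessor.new(numeric_type: 3).options
  #=> {:lang=>"it", :mode=>:bigram, :stemming=>true, :stopword=>false}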
data/lib/libsvm_preprocessor/token_map.rb
ADDED
@@ -0,0 +1,30 @@
+class TokenMap
+
+  attr_reader :hash_of_ngrams
+
+  def initialize
+    @hash_of_ngrams = {}
+    @current_ngram_id = 0
+  end
+
+  def token_map(ary_of_ngrams, testing: false)
+    if !testing
+      ary_of_ngrams.each { |ngram| @hash_of_ngrams[ngram] ||= next_ngram_id }
+      ary_of_ngrams.map { |ngram| { @hash_of_ngrams[ngram] => ngram } }
+    else
+      ary_of_ngrams.map do |ngram|
+        { @hash_of_ngrams[ngram] => ngram }
+      end.select do |hash|
+        hash.keys.first
+      end
+    end
+
+  end
+
+  private
+  # Give the next term id available
+  def next_ngram_id
+    @current_ngram_id += 1
+  end
+
+end
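In isolation, TokenMap shows the training/testing asymmetry that the specs below rely on (illustrative values):

  require 'libsvm_preprocessor/token_map'

  tm = TokenMap.new
  tm.token_map([["bottiglia"], ["di"], ["vetro"]])
  #=> [{1=>["bottiglia"]}, {2=>["di"]}, {3=>["vetro"]}]
  tm.token_map([["vetro"], ["nuovo"]], testing: true)
  #=> [{3=>["vetro"]}]   # unseen n-grams are silently dropped in testing mode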
data/lib/libsvm_preprocessor/tokenizer.rb
ADDED
@@ -0,0 +1,44 @@
+require 'lingua/stemmer'
+require 'stopwords'
+require 'unicode'
+
+class Tokenizer
+
+  def initialize(options = {})
+    @options = options
+    @options[:stopword] ||= false
+    @options[:stemming] ||= false
+    @options[:lang] ||= "it"
+    @filter = Stopwords::Snowball::Filter.new(@options[:lang])
+    @stemmer = Lingua::Stemmer.new(language: @options[:lang])
+  end
+
+  def tokenize(string)
+    result = process_text(string)
+    result = remove_stopwords(result) if @options[:stopword]
+    result = stem_each(result) if @options[:stemming]
+    result
+  end
+
+  def process_text(string)
+    string.downcase!
+    string = Unicode.nfd(string)
+    string.gsub!(/[^[:alpha:]]/, ' ')
+    string.gsub!(/([a-z])([0-9])/, '\1 \2')
+    string.gsub!(/([0-9])([a-z])/, '\1 \2')
+    string.gsub!(/\s+/, ' ')
+    string.strip!
+    string.split(' ')
+  end
+
+  # Remove stopwords according to the selected language
+  def remove_stopwords(ary)
+    @filter.filter(ary)
+  end
+
+  # Stem each word according to the selected language
+  def stem_each(ary)
+    ary.map { |term| @stemmer.stem(term) }
+  end
+
+end
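A sketch of the normalization pipeline (the sample sentence is invented; stemming and stopword results follow the tokenizer specs below, and the lingua/stemmer, stopwords-filter and unicode gems must be installed):

  require 'libsvm_preprocessor/tokenizer'

  Tokenizer.new.tokenize("Bottiglia di vetro, 2 litri!")
  #=> ["bottiglia", "di", "vetro", "litri"]   # non-letters are turned into spaces
  Tokenizer.new(stopword: true, stemming: true).tokenize("bottiglia di vetro")
  #=> ["bottigl", "vetr"]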
data/lib/libsvm_prerpocessor.rb
ADDED
@@ -0,0 +1 @@
+require 'libsvm_preprocessor/preprocessor'
data/spec/feature_generator_spec.rb
ADDED
@@ -0,0 +1,58 @@
+require 'rspec'
+require 'libsvm_preprocessor/preprocessor'
+
+describe FeatureGenerator do
+
+  let(:ary_of_terms) { ["a","b","c"] }
+  let(:ary) { ["mar","rosso"] }
+
+  context "with default options" do
+    let(:generator) { FeatureGenerator.new }
+
+    it "uses unigrams" do
+      expected = [{1=>["a"]}, {2=>["b"]}, {3=>["c"]}]
+      expect(generator.features(ary_of_terms)).to eq(expected)
+    end
+  end
+
+  context "using bigrams" do
+    let(:generator) { FeatureGenerator.new(:mode => :bigram) }
+
+    it "uses bigrams" do
+      expected = [{1=>["a"]}, {2=>["b"]}, {3=>["c"]}, {4=>["a","b"]}, {5=>["b","c"]}]
+      expect(generator.features(ary_of_terms)).to eq(expected)
+    end
+
+    it "ignores duplicates" do
+      expected = [{1=>["a"]}, {1=>["a"]}, {2=>["a","a"]}]
+      expect(generator.features(["a","a"])).to eq(expected)
+    end
+
+  end
+
+  context "using trichar" do
+    let(:generator) { FeatureGenerator.new(:mode => :trichar) }
+
+    it "uses trichar" do
+      expected = [{1=>["mar"]}, {2=>["ar "]}, {3=>["r r"]}, {4=>[" ro"]}, {5=>["ros"]}, {6=>["oss"]}, {7=>["sso"]}]
+      expect(generator.features(ary)).to eq(expected)
+    end
+
+    it "ignores duplicates" do
+      expected = [{1=>["aaa"]}, {1=>["aaa"]}, {1=>["aaa"]}]
+      expect(generator.features(["aaaaa"])).to eq(expected)
+    end
+
+    it "works around a short word" do
+      expected = [{1 => ["te"]}]
+      expect(generator.features(["te"])).to eq(expected)
+    end
+
+    it "works around short words" do
+      expected = [{1 => ["te "]}, {2 => ["e n"]}, {3 => [" ne"]}]
+      expect(generator.features(["te", "ne"])).to eq(expected)
+    end
+
+  end
+
+end
data/spec/preprocessor_spec.rb
ADDED
@@ -0,0 +1,111 @@
+require 'rspec'
+require 'libsvm_preprocessor/preprocessor'
+
+describe Preprocessor do
+
+  describe "default settings" do
+    let(:preproc) { Preprocessor.new }
+    let(:p_trichar) { Preprocessor.new(mode: :trichar) }
+
+    context "adding a text" do
+      it "maps new categories" do
+        preproc.push ["category", "bottiglia"]
+        expect(preproc.categories["category"]).to eq 0
+      end
+    end
+
+    context "with default settings" do
+      it "produces a new vector" do
+        v = (preproc.push ["category", "bottiglia"])
+        expect(v).to eq([0, [{1 => 1}]])
+      end
+
+      it "takes into account frequencies" do
+        v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
+        expect(v).to eq([0, [{1 => 3}]])
+      end
+
+      it "produces svm format" do
+        v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
+        result = preproc.toSVM(v)
+        expect(result).to eq("0 1:3")
+      end
+    end
+
+    context "with trichar mode" do
+      it "produces a new vector with frequencies" do
+        v = (p_trichar.push ["category", "osso osso"])
+        expect(v).to eq([0, [{1 => 2}, {2 => 2}, {3 => 1}, {4 => 1}, {5 => 1}]])
+      end
+    end
+
+    context "when I am testing" do
+      it "ignores new words" do
+        v = preproc.push(["category", "bottiglia"], testing: true)
+        expect(v).to eq([0, []])
+      end
+
+      it "remembers the old ones" do
+        preproc.push(["category", "bottiglia"], testing: false)
+        v = preproc.push(["category", "bottiglia vetro"], testing: true)
+        expect(v).to eq([0, [{1 => 1}]])
+      end
+
+      it "produces svm format with blank features" do
+        v = preproc.push(["category", "bottiglia"], testing: true)
+        result = preproc.toSVM(v)
+        expect(result).to eq("0 ")
+      end
+
+    end
+  end
+
+  describe "using bigrams as feature" do
+    let(:preproc) { Preprocessor.new(mode: :bigram) }
+
+    context "adding a text" do
+      it "maps new categories" do
+        preproc.push ["category", "bottiglia"]
+        expect(preproc.categories["category"]).to eq 0
+      end
+    end
+
+    context "simple vectorization" do
+      it "produces a new vector" do
+        v = (preproc.push ["category", "bottiglia"])
+        expect(v).to eq([0, [{1 => 1}]])
+      end
+
+      it "takes into account frequencies" do
+        v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
+        expect(v).to eq([0, [{1 => 3}, {2 => 2}]])
+      end
+
+      it "produces svm format" do
+        v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
+        result = preproc.toSVM(v)
+        expect(result).to eq("0 1:3 2:2")
+      end
+    end
+
+    context "when I am testing" do
+      it "ignores new words" do
+        v = preproc.push(["category", "bottiglia"], testing: true)
+        expect(v).to eq([0, []])
+      end
+
+      it "remembers the old ones" do
+        preproc.push(["category", "bottiglia"], testing: false)
+        v = preproc.push(["category", "bottiglia vetro"], testing: true)
+        expect(v).to eq([0, [{1 => 1}]])
+      end
+
+      it "produces svm format with blank features" do
+        v = preproc.push(["category", "bottiglia"], testing: true)
+        result = preproc.toSVM(v)
+        expect(result).to eq("0 ")
+      end
+
+    end
+  end
+end
data/spec/token_map_spec.rb
ADDED
@@ -0,0 +1,60 @@
+require 'rspec'
+require 'libsvm_preprocessor/preprocessor'
+
+describe TokenMap do
+  let(:token_map) { TokenMap.new }
+
+  context "it maps terms to new ids" do
+    it "maps new tokens" do
+      ngrams = token_map.token_map([["bottiglia"],["di"],["vetro"]])
+      expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {3 => ["vetro"]}]
+      expect(ngrams).to eq(expected)
+    end
+  end
+
+  context "it remembers old ids" do
+    it "maps new tokens" do
+      token_map.token_map([["bottiglia"],["di"],["vetro"]])
+      ngrams = token_map.token_map([["bottiglia"],["di"],["plastica"]])
+      expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {4 => ["plastica"]}]
+      expect(ngrams).to eq(expected)
+    end
+  end
+
+  context "it remembers old ids also with other trichars" do
+    it "maps new tokens" do
+      token_map.token_map([["abc"],["bc "],["c a"],[" ab"],["abc"]])
+      ngrams = token_map.token_map([["abc"],["c a"],["bot"]])
+      expected = [{1 => ["abc"]}, {3 => ["c a"]}, {5 => ["bot"]}]
+      expect(ngrams).to eq(expected)
+    end
+  end
+
+  context "it ignores duplicates" do
+    it "maps new tokens" do
+      ngrams = token_map.token_map([["bottiglia"],["di"],["plastica"],["plastica"]])
+      expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {3 => ["plastica"]}, {3 => ["plastica"]}]
+      expect(ngrams).to eq(expected)
+    end
+  end
+
+  context "if I am creating a test file" do
+    it "does not consider new terms" do
+      token_map.token_map([["bottiglia"],["di"],["plastica"]])
+      ngrams = token_map.token_map([["polenta"],["valsugana"]], testing: true)
+
+      expected = []
+      expect(ngrams).to eq(expected)
+    end
+
+    it "does not consider new terms but remembers the old ones" do
+      token_map.token_map([["bottiglia"],["di"],["plastica"]])
+      ngrams = token_map.token_map([["tappo"],["plastica"]], testing: true)
+
+      expected = [{3 => ["plastica"]}]
+      expect(ngrams).to eq(expected)
+    end
+
+  end
+
+end
data/spec/tokenizer_spec.rb
ADDED
@@ -0,0 +1,36 @@
+require 'rspec'
+require 'libsvm_preprocessor/preprocessor'
+
+describe Tokenizer do
+  let(:tokenizer) { Tokenizer.new }
+
+  context "tokenizer with default settings" do
+    it "tokenizes a single word" do
+      tokens = tokenizer.tokenize("bottiglia")
+      expect(tokens).to eq(["bottiglia"])
+    end
+
+    it "tokenizes multiple words" do
+      tokens = tokenizer.tokenize("bottiglia di vetro")
+      expect(tokens).to eq(["bottiglia", "di", "vetro"])
+    end
+  end
+
+  context "tokenizer with stopword removal" do
+    let(:tokenizer) { Tokenizer.new(stopword: true) }
+
+    it "tokenizes removing stopwords" do
+      tokens = tokenizer.tokenize("bottiglia di vetro")
+      expect(tokens).to eq(["bottiglia", "vetro"])
+    end
+  end
+
+  context "tokenizer with stemming" do
+    let(:tokenizer) { Tokenizer.new(stemming: true) }
+
+    it "tokenizes stemming each word" do
+      tokens = tokenizer.tokenize("bottiglia di vetro")
+      expect(tokens).to eq(["bottigl", "di", "vetr"])
+    end
+  end
+end
metadata
ADDED
@@ -0,0 +1,90 @@
+--- !ruby/object:Gem::Specification
+name: libsvm_preprocessor
+version: !ruby/object:Gem::Version
+  version: '0.1'
+platform: ruby
+authors:
+- Andrea Nodari
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-05-31 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: stopwords-filter
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.1
+- !ruby/object:Gem::Dependency
+  name: ruby-stemmer
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.3
+description: |2
+  It's a text preprocessor that generate a libsvm input file
+email: andrea.nodari91@gmail.com
+executables:
+- libsvm_pp
+extensions: []
+extra_rdoc_files: []
+files:
+- README.md
+- Rakefile
+- LICENSE
+- lib/libsvm_preprocessor/cli.rb
+- lib/libsvm_preprocessor/feature_generator.rb
+- lib/libsvm_preprocessor/global.rb
+- lib/libsvm_preprocessor/preprocessor.rb
+- lib/libsvm_preprocessor/token_map.rb
+- lib/libsvm_preprocessor/tokenizer.rb
+- lib/libsvm_preprocessor/version.rb
+- lib/libsvm_prerpocessor.rb
+- bin/libsvm_pp
+- spec/feature_generator_spec.rb
+- spec/preprocessor_spec.rb
+- spec/token_map_spec.rb
+- spec/tokenizer_spec.rb
+homepage: http://github.com/nodo/libsvm_preprocessor
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.0.preview3.1
+signing_key:
+specification_version: 4
+summary: It's a text preprocessor that generate a libsvm input file
+test_files: []
+has_rdoc: false