libsvm_preprocessor 0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: ec1de3e3e31391a33e628f4dc4c6ace1b9c96cd3
+   data.tar.gz: e897e7149c5ace324fba715154402da3efc075d9
+ !binary "U0hBNTEy":
+   metadata.gz: 122a7ad95b42b0b2429aa69d8cac690c90800f870f851ac56a665c69e2ba933cb770b484f45ada754f4a7336a1e2e28c98ce8407f27ab7a98797aca8b8562613
+   data.tar.gz: f725942158aab1a7d8a34105ccad1639664e961f99739f5a05bbe67da1089ef4427200721f0ba527d1aa7fff2b3c52b158b95e2ccbaa84d4bfac64cc847879c4
data/LICENSE ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2013 by Andrea Nodari
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
File without changes
data/Rakefile ADDED
@@ -0,0 +1,5 @@
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/libsvm_pp ADDED
@@ -0,0 +1,33 @@
+ #!/usr/bin/env ruby
+ # encoding: utf-8
+
+ if RUBY_VERSION < '2.0.0'
+   puts 'This gem supports only Ruby 2.0.0+'
+   exit 1
+ else
+   $LOAD_PATH.unshift(File.dirname(File.realpath(__FILE__)) + '/../lib')
+
+   require 'csv'
+   require 'libsvm_preprocessor/preprocessor'
+   require 'libsvm_preprocessor/cli'
+
+   options = CLI.parse(ARGV)
+
+   if ARGV[0].nil? || !File.exist?(ARGV[0])
+     puts "Please provide an existing input file."
+     exit 1
+   end
+
+   preprocessor = Preprocessor.new(options)
+   preprocessor.use(ARGV[0], testing: options[:testing])
+ end
+
+ # output_dir = File.dirname(File.realpath(__FILE__)) + '/../output'
+ # input_test = ARGV[1]
+ # output_test_path = "#{OUTPUT_DIR}/test.svm"
+ # output_test = File.open(output_test_path, "w")
+ # CSV.foreach(input_test, OPTIONS_INPUT) do |row|
+ #   vector = processor.toSVM(processor.push(row, testing: true))
+ #   output_test.puts vector
+ # end
+ # output_test.close
data/lib/libsvm_preprocessor/cli.rb ADDED
@@ -0,0 +1,57 @@
+ require 'optparse'
+
+ class CLI
+
+   def self.parse(args)
+
+     options = {}
+
+     options[:mode] = :unigram
+     options[:lang] = :it
+     options[:stemming] = false
+     options[:stopword] = false
+     options[:testing] = false
+     options[:numeric_type] = nil
+     options[:output] = nil
+
+     opt_parser = OptionParser.new do |opts|
+       opts.banner = "libsvm_pp [options] <filename>"
+
+       opts.on("-m [TYPE]", "--mode [TYPE]", [:unigram, :bigram],
+               "Select unigram (default) or bigram") do |mode|
+         options[:mode] = mode
+       end
+
+       opts.on("-s", "--stemming", "Use this if you want stemming") do |s|
+         options[:stemming] = s
+       end
+
+       opts.on("-w", "--remove-stopwords",
+               "Use this if you want to remove stopwords") do |w|
+         options[:stopword] = w
+       end
+
+       opts.on("-t", "--testing",
+               "Use this to enable testing mode") do |t|
+         options[:testing] = t
+       end
+
+       opts.on("-l [TYPE]", "--language", [:it, :en],
+               "Select your language: it / en") do |l|
+         options[:lang] = l
+       end
+
+       opts.on("-n N", Integer, "Numeric mode (option preset 0-11)") do |n|
+         options[:numeric_type] = n
+       end
+
+       opts.on("-o [output]", String, "Output file") do |o|
+         options[:output] = o
+       end
+     end
+
+     opt_parser.parse!(args)
+     options
+   end
+
+ end
data/lib/libsvm_preprocessor/feature_generator.rb ADDED
@@ -0,0 +1,45 @@
+ class FeatureGenerator
+
+   def hash_of_ngrams
+     @token_map.hash_of_ngrams
+   end
+
+   def initialize(options = {})
+     @token_map = TokenMap.new
+     @options = options
+     @options[:mode] ||= :unigram
+   end
+
+   def features(ary_of_terms, testing: false)
+     if @options[:mode] == :unigram
+       @token_map.token_map(unigrams(ary_of_terms), testing: testing)
+     elsif @options[:mode] == :bigram
+       @token_map.token_map(unigrams(ary_of_terms) +
+                            bigrams(ary_of_terms),
+                            testing: testing)
+     elsif @options[:mode] == :trichar
+       @token_map.token_map(trichar(ary_of_terms), testing: testing)
+     end
+   end
+
+   def trichar(ary_of_terms)
+     string = ary_of_terms.join(" ")
+     if string.size < 3
+       return [ [string] ]
+     end
+     string1 = string[0...-2].split(//)
+     string2 = string[1...-1].split(//)
+     string3 = string[2..-1].split(//)
+     string1.zip(string2).zip(string3).map do |x|
+       [x.flatten.join]
+     end
+   end
+
+   def unigrams(ary_of_term)
+     ary_of_term.map { |term| [term] }
+   end
+
+   def bigrams(ary)
+     ary[0...-1].zip(ary[1..-1])
+   end
+ end
data/lib/libsvm_preprocessor/global.rb ADDED
@@ -0,0 +1,3 @@
+ module LibsvmPreprocessor
+   CSV_OPTIONS = { col_sep: "\t", headers: false }
+ end
data/lib/libsvm_preprocessor/preprocessor.rb ADDED
@@ -0,0 +1,136 @@
+ require 'libsvm_preprocessor/tokenizer'
+ require 'libsvm_preprocessor/token_map'
+ require 'libsvm_preprocessor/feature_generator'
+ require 'libsvm_preprocessor/global'
+
+ class Preprocessor
+   attr_reader :categories
+   attr_reader :instances
+   attr_reader :non_zero_features
+
+   OPTIONS_MAP = {
+     0  => { lang: "it", mode: :unigram, stemming: false, stopword: false },
+     1  => { lang: "it", mode: :bigram,  stemming: false, stopword: false },
+     2  => { lang: "it", mode: :unigram, stemming: true,  stopword: false },
+     3  => { lang: "it", mode: :bigram,  stemming: true,  stopword: false },
+     4  => { lang: "it", mode: :unigram, stemming: false, stopword: true },
+     5  => { lang: "it", mode: :bigram,  stemming: false, stopword: true },
+     6  => { lang: "it", mode: :unigram, stemming: true,  stopword: true },
+     7  => { lang: "it", mode: :bigram,  stemming: true,  stopword: true },
+     8  => { lang: "it", mode: :trichar, stemming: true,  stopword: true },
+     9  => { lang: "it", mode: :trichar, stemming: true,  stopword: false },
+     10 => { lang: "it", mode: :trichar, stemming: false, stopword: true },
+     11 => { lang: "it", mode: :trichar, stemming: false, stopword: false },
+   }
+
+   def hash_of_ngrams
+     @generator.hash_of_ngrams
+   end
+
+   def override_options(options)
+     OPTIONS_MAP[options[:numeric_type]]
+   end
+
+   def self.options_map_size
+     OPTIONS_MAP.size
+   end
+
+   def self.options_map(key)
+     OPTIONS_MAP[key].map { |k, v| "#{k}: #{v}"}.join(" | ")
+   end
+
+   def options
+     @options
+   end
+
+   def initialize(options = {})
+     if options[:numeric_type]
+       options = override_options(options)
+     end
+     @options = options
+     @tokenizer = Tokenizer.new(options)
+     @generator = FeatureGenerator.new(options)
+
+     @non_zero_features = {}
+     @non_zero_features[:testing] = 0
+     @non_zero_features[:training] = 0
+
+     @instances = {}
+     @instances[:testing] = []
+     @instances[:training] = []
+
+     @categories = {}
+     @current_category_id = -1
+   end
+
+   def push(data, testing: false)
+     category, string = data
+     # If it is a new category, assign it a new id
+     if !@categories[category]
+       @categories[category] = next_category_id
+     end
+     v = vectorize(category, string, testing: testing)
+     if testing
+       @instances[:testing] << v
+       @non_zero_features[:testing] += v.last.size
+     else
+       @instances[:training] << v
+       @non_zero_features[:training] += v.last.size
+     end
+     return v
+   end
+
+   def toSVM(vector)
+     # the following line keeps the output diff-clean against libshorttext
+     return "#{vector.first} " if vector.last.empty?
+     features = vector.last
+       .map {|h| "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
+     "#{vector.first} #{features}"
+   end
+
+   # This method is only meant to stringify the vector in the very same
+   # format as libsvm (so that diff does not mess up)
+   def nice_string(v)
+     return v.join(" ") if v[1] != ""
+     return "#{v[0]} "
+   end
+
+   def use(input_path, testing: false)
+     if @options[:output]
+       output_file = File.open(@options[:output], "w")
+       CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
+         output_file.puts toSVM( push(row, testing: testing) )
+       end
+       output_file.close
+     else
+       CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
+         puts toSVM( push(row, testing: testing) )
+       end
+     end
+   end
+
+   private
+
+   def vectorize(category, string, testing: false)
+     tokens = @tokenizer.tokenize(string)
+     features = @generator.features(tokens, testing: testing)
+     ids_with_frequency = count_frequency(features)
+
+     [ @categories[category], ids_with_frequency ]
+   end
+
+   def count_frequency(features)
+     ids = features.map { |x| x.keys.first }.sort
+     result = ids.uniq.map do |id|
+       { id => ids.count(id) }
+     end
+     result
+   end
+
+   # Give the next available category id
+   def next_category_id
+     @current_category_id += 1
+   end
+
+ end
+
data/lib/libsvm_preprocessor/token_map.rb ADDED
@@ -0,0 +1,30 @@
+ class TokenMap
+
+   attr_reader :hash_of_ngrams
+
+   def initialize
+     @hash_of_ngrams = {}
+     @current_ngram_id = 0
+   end
+
+   def token_map(ary_of_ngrams, testing: false)
+     if !testing
+       ary_of_ngrams.each { |ngram| @hash_of_ngrams[ngram] ||= next_ngram_id }
+       ary_of_ngrams.map { |ngram| { @hash_of_ngrams[ngram] => ngram } }
+     else
+       ary_of_ngrams.map do |ngram|
+         { @hash_of_ngrams[ngram] => ngram }
+       end.select do |hash|
+         hash.keys.first
+       end
+     end
+
+   end
+
+   private
+   # Give the next term id available
+   def next_ngram_id
+     @current_ngram_id += 1
+   end
+
+ end
data/lib/libsvm_preprocessor/tokenizer.rb ADDED
@@ -0,0 +1,44 @@
+ require 'lingua/stemmer'
+ require 'stopwords'
+ require 'unicode'
+
+ class Tokenizer
+
+   def initialize(options = {})
+     @options = options
+     @options[:stopword] ||= false
+     @options[:stemming] ||= false
+     @options[:lang] ||= "it"
+     @filter = Stopwords::Snowball::Filter.new(@options[:lang])
+     @stemmer = Lingua::Stemmer.new(language: @options[:lang])
+   end
+
+   def tokenize(string)
+     result = process_text(string)
+     result = remove_stopwords(result) if @options[:stopword]
+     result = stem_each(result) if @options[:stemming]
+     result
+   end
+
+   def process_text(string)
+     string.downcase!
+     string = Unicode.nfd(string)
+     string.gsub!(/[^[:alpha:]]/, ' ')
+     string.gsub!(/([a-z])([0-9])/, '\1 \2')
+     string.gsub!(/([0-9])([a-z])/, '\1 \2')
+     string.gsub!(/\s+/, ' ')
+     string.strip!
+     string.split(' ')
+   end
+
+   # Remove stopwords according to the selected language
+   def remove_stopwords(ary)
+     @filter.filter(ary)
+   end
+
+   # Stem each word according to the selected language
+   def stem_each(ary)
+     ary.map { |term| @stemmer.stem(term) }
+   end
+
+ end
data/lib/libsvm_preprocessor/version.rb ADDED
@@ -0,0 +1,3 @@
+ module LibsvmPreprocessor
+   VERSION = '0.1'
+ end
data/lib/libsvm_prerpocessor.rb ADDED
@@ -0,0 +1 @@
+ require 'libsvm_preprocessor/preprocessor'
data/spec/feature_generator_spec.rb ADDED
@@ -0,0 +1,58 @@
+ require 'rspec'
+ require 'libsvm_preprocessor/preprocessor'
+
+ describe FeatureGenerator do
+
+   let(:ary_of_terms) { ["a","b","c"] }
+   let(:ary) { ["mar","rosso"] }
+
+   context "with default options" do
+     let(:generator) { FeatureGenerator.new }
+
+     it "uses unigrams" do
+       expected = [{1=>["a"]}, {2=>["b"]}, {3=>["c"]}]
+       expect(generator.features(ary_of_terms)).to eq(expected)
+     end
+   end
+
+   context "using bigrams" do
+     let(:generator) { FeatureGenerator.new(:mode => :bigram) }
+
+     it "uses bigrams" do
+       expected = [{1=>["a"]}, {2=>["b"]}, {3=>["c"]}, {4=>["a","b"]}, {5=>["b","c"]}]
+       expect(generator.features(ary_of_terms)).to eq(expected)
+     end
+
+     it "ignores duplicates" do
+       expected = [{1=>["a"]}, {1=>["a"]}, {2=>["a","a"]}]
+       expect(generator.features(["a","a"])).to eq(expected)
+     end
+
+   end
+
+   context "using trichar" do
+     let(:generator) { FeatureGenerator.new(:mode => :trichar) }
+
+     it "uses trichar" do
+       expected = [{1=>["mar"]}, {2=>["ar "]}, {3=>["r r"]}, {4=> [" ro"]}, {5=>["ros"]}, {6=>["oss"]}, {7=> ["sso"]}]
+       expect(generator.features(ary)).to eq(expected)
+     end
+
+     it "ignores duplicates" do
+       expected = [{1=>["aaa"]}, {1=>["aaa"]},{1=>["aaa"]}]
+       expect(generator.features(["aaaaa"])).to eq(expected)
+     end
+
+     it "handles a word shorter than a trichar" do
+       expected = [{1 => ["te"]}]
+       expect(generator.features(["te"])).to eq(expected)
+     end
+
+     it "handles short words" do
+       expected = [{1 => ["te "]}, {2 => ["e n"]}, {3 => [" ne"]}]
+       expect(generator.features(["te", "ne"])).to eq(expected)
+     end
+
+   end
+
+ end
data/spec/preprocessor_spec.rb ADDED
@@ -0,0 +1,111 @@
+ require 'rspec'
+ require 'libsvm_preprocessor/preprocessor'
+
+ describe Preprocessor do
+
+   describe "default settings" do
+     let(:preproc) { Preprocessor.new }
+     let(:p_trichar) { Preprocessor.new(mode: :trichar) }
+
+     context "adding a text" do
+       it "maps new categories" do
+         preproc.push ["category", "bottiglia"]
+         expect(preproc.categories["category"]).to eq 0
+       end
+     end
+
+     context "with default settings" do
+       it "produces a new vector" do
+         v = (preproc.push ["category", "bottiglia"])
+         expect(v).to eq([0, [{1 => 1}]])
+       end
+
+       it "takes into account frequencies" do
+         v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
+         expect(v).to eq([0, [{1 => 3}]])
+       end
+
+       it "produces svm format" do
+         v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
+         result = preproc.toSVM(v)
+         expect(result).to eq("0 1:3")
+       end
+     end
+
+     context "with trichar mode" do
+       it "produces a new vector with frequencies" do
+         v = (p_trichar.push ["category", "osso osso"])
+         expect(v).to eq([0, [{1 => 2}, {2 => 2}, {3 => 1}, {4 => 1}, {5 => 1}]])
+       end
+     end
+
+     context "when I am testing" do
+       it "ignores new words" do
+         v = preproc.push(["category", "bottiglia"], testing: true)
+         expect(v).to eq([0, []])
+       end
+
+       it "remembers the old ones" do
+         preproc.push(["category", "bottiglia"], testing: false)
+         v = preproc.push(["category", "bottiglia vetro"], testing: true)
+         expect(v).to eq([0, [{1 => 1}]])
+       end
+
+       it "produces svm format with blank features" do
+         v = preproc.push(["category", "bottiglia"], testing: true)
+         result = preproc.toSVM(v)
+         expect(result).to eq("0 ")
+       end
+
+     end
+   end
+
+   describe "using bigrams as feature" do
+     let(:preproc) { Preprocessor.new(mode: :bigram) }
+
+     context "adding a text" do
+       it "maps new categories" do
+         preproc.push ["category", "bottiglia"]
+         expect(preproc.categories["category"]).to eq 0
+       end
+     end
+
+     context "simple vectorization" do
+       it "produces a new vector" do
+         v = (preproc.push ["category", "bottiglia"])
+         expect(v).to eq([0, [{1 => 1}]])
+       end
+
+       it "takes into account frequencies" do
+         v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
+         expect(v).to eq([0, [{1 => 3}, {2 => 2}]])
+       end
+
+       it "produces svm format" do
+         v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
+         result = preproc.toSVM(v)
+         expect(result).to eq("0 1:3 2:2")
+       end
+     end
+
+     context "when I am testing" do
+       it "ignores new words" do
+         v = preproc.push(["category", "bottiglia"], testing: true)
+         expect(v).to eq([0, []])
+       end
+
+       it "remembers the old ones" do
+         preproc.push(["category", "bottiglia"], testing: false)
+         v = preproc.push(["category", "bottiglia vetro"], testing: true)
+         expect(v).to eq([0, [{1 => 1}]])
+       end
+
+       it "produces svm format with blank features" do
+         v = preproc.push(["category", "bottiglia"], testing: true)
+         result = preproc.toSVM(v)
+         expect(result).to eq("0 ")
+       end
+
+     end
+   end
+ end
data/spec/token_map_spec.rb ADDED
@@ -0,0 +1,60 @@
+ require 'rspec'
+ require 'libsvm_preprocessor/preprocessor'
+
+ describe TokenMap do
+   let(:token_map) { TokenMap.new }
+
+   context "it maps terms to new ids" do
+     it "maps new tokens" do
+       ngrams = token_map.token_map([["bottiglia"],["di"],["vetro"]])
+       expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {3 => ["vetro"]}]
+       expect(ngrams).to eq(expected)
+     end
+   end
+
+   context "it remembers old ids" do
+     it "maps new tokens" do
+       token_map.token_map([["bottiglia"],["di"],["vetro"]])
+       ngrams = token_map.token_map([["bottiglia"],["di"],["plastica"]])
+       expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {4 => ["plastica"]}]
+       expect(ngrams).to eq(expected)
+     end
+   end
+
+   context "it remembers old ids also with other trichars" do
+     it "maps new tokens" do
+       token_map.token_map([["abc"],["bc "],["c a"],[" ab"],["abc"]])
+       ngrams = token_map.token_map([["abc"],["c a"],["bot"]])
+       expected = [{1 => ["abc"]}, {3 => ["c a"]}, {5 => ["bot"]}]
+       expect(ngrams).to eq(expected)
+     end
+   end
+
+   context "it ignores duplicates" do
+     it "maps new tokens" do
+       ngrams = token_map.token_map([["bottiglia"],["di"],["plastica"],["plastica"]])
+       expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {3 => ["plastica"]}, {3 => ["plastica"]}]
+       expect(ngrams).to eq(expected)
+     end
+   end
+
+   context "if I am creating a test file" do
+     it "does not consider new terms" do
+       token_map.token_map([["bottiglia"],["di"],["plastica"]])
+       ngrams = token_map.token_map([["polenta"],["valsugana"]], testing: true)
+
+       expected = []
+       expect(ngrams).to eq(expected)
+     end
+
+     it "does not consider new terms but remembers the old ones" do
+       token_map.token_map([["bottiglia"],["di"],["plastica"]])
+       ngrams = token_map.token_map([["tappo"],["plastica"]], testing: true)
+
+       expected = [{3 => ["plastica"]}]
+       expect(ngrams).to eq(expected)
+     end
+
+   end
+
+ end
data/spec/tokenizer_spec.rb ADDED
@@ -0,0 +1,36 @@
+ require 'rspec'
+ require 'libsvm_preprocessor/preprocessor'
+
+ describe Tokenizer do
+   let(:tokenizer) { Tokenizer.new }
+
+   context "tokenizer with default settings" do
+     it "tokenizes a single word" do
+       tokens = tokenizer.tokenize("bottiglia")
+       expect(tokens).to eq(["bottiglia"])
+     end
+
+     it "tokenizes multiple words" do
+       tokens = tokenizer.tokenize("bottiglia di vetro")
+       expect(tokens).to eq(["bottiglia", "di", "vetro"])
+     end
+   end
+
+   context "tokenizer with stopword removal" do
+     let(:tokenizer) { Tokenizer.new(stopword: true) }
+
+     it "tokenizes removing stopwords" do
+       tokens = tokenizer.tokenize("bottiglia di vetro")
+       expect(tokens).to eq(["bottiglia", "vetro"])
+     end
+   end
+
+   context "tokenizer with stemming" do
+     let(:tokenizer) { Tokenizer.new(stemming: true) }
+
+     it "tokenizes stemming each word" do
+       tokens = tokenizer.tokenize("bottiglia di vetro")
+       expect(tokens).to eq(["bottigl", "di", "vetr"])
+     end
+   end
+ end
metadata ADDED
@@ -0,0 +1,90 @@
+ --- !ruby/object:Gem::Specification
+ name: libsvm_preprocessor
+ version: !ruby/object:Gem::Version
+   version: '0.1'
+ platform: ruby
+ authors:
+ - Andrea Nodari
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-05-31 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: stopwords-filter
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.2.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.2.1
+ - !ruby/object:Gem::Dependency
+   name: ruby-stemmer
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.3
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.3
+ description: |2
+   It's a text preprocessor that generates a libsvm input file
+ email: andrea.nodari91@gmail.com
+ executables:
+ - libsvm_pp
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - README.md
+ - Rakefile
+ - LICENSE
+ - lib/libsvm_preprocessor/cli.rb
+ - lib/libsvm_preprocessor/feature_generator.rb
+ - lib/libsvm_preprocessor/global.rb
+ - lib/libsvm_preprocessor/preprocessor.rb
+ - lib/libsvm_preprocessor/token_map.rb
+ - lib/libsvm_preprocessor/tokenizer.rb
+ - lib/libsvm_preprocessor/version.rb
+ - lib/libsvm_prerpocessor.rb
+ - bin/libsvm_pp
+ - spec/feature_generator_spec.rb
+ - spec/preprocessor_spec.rb
+ - spec/token_map_spec.rb
+ - spec/tokenizer_spec.rb
+ homepage: http://github.com/nodo/libsvm_preprocessor
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.0.0.preview3.1
+ signing_key:
+ specification_version: 4
+ summary: It's a text preprocessor that generates a libsvm input file
+ test_files: []
+ has_rdoc: false