libsvm_preprocessor 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: ec1de3e3e31391a33e628f4dc4c6ace1b9c96cd3
4
+ data.tar.gz: e897e7149c5ace324fba715154402da3efc075d9
5
+ !binary "U0hBNTEy":
6
+ metadata.gz: 122a7ad95b42b0b2429aa69d8cac690c90800f870f851ac56a665c69e2ba933cb770b484f45ada754f4a7336a1e2e28c98ce8407f27ab7a98797aca8b8562613
7
+ data.tar.gz: f725942158aab1a7d8a34105ccad1639664e961f99739f5a05bbe67da1089ef4427200721f0ba527d1aa7fff2b3c52b158b95e2ccbaa84d4bfac64cc847879c4
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2013 by Andrea Nodari
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.md ADDED
File without changes
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require 'rspec/core/rake_task'
2
+
3
+ RSpec::Core::RakeTask.new(:spec)
4
+
5
+ task :default => :spec
data/bin/libsvm_pp ADDED
@@ -0,0 +1,33 @@
1
#!/usr/bin/env ruby
# encoding: utf-8

# Command-line entry point: parses the options, checks that an input file was
# given and exists, then prints (or writes, with -o) one libsvm line per row.

# Compare the major version numerically: comparing RUBY_VERSION as a string is
# lexicographic, so a future "10.0.0" would sort before "2.0.0".
if RUBY_VERSION.to_i < 2
  puts 'This gem supports only Ruby 2.0.0+'
  exit 1
else
  # The library is required only after the version check, inside this branch.
  $LOAD_PATH.unshift(File.dirname(File.realpath(__FILE__)) + '/../lib')

  require 'csv'
  require 'libsvm_preprocessor/preprocessor'
  require 'libsvm_preprocessor/cli'

  # CLI.parse strips the recognised switches from ARGV, so whatever is left
  # in first position is the input filename.
  options = CLI.parse(ARGV)
  input_path = ARGV[0]

  # A missing argument must be reported like a missing file instead of
  # letting File.exist?(nil) raise a TypeError.
  if input_path.nil? || !File.exist?(input_path)
    puts "Please insert a real input file."
    exit 1
  end

  preprocessor = Preprocessor.new(options)
  preprocessor.use(input_path, testing: options[:testing])
end
24
+
25
+ # output_dir = File.dirname(File.realpath(__FILE__)) + '/../output'
26
+ # input_test = ARGV[1]
27
+ # output_test_path = "#{OUTPUT_DIR}/test.svm"
28
+ # output_test = File.open(output_test_path, "w")
29
+ # CSV.foreach(input_test, OPTIONS_INPUT) do |row|
30
+ # vector = processor.toSVM(processor.push(row, testing: true))
31
+ # output_test.puts vector
32
+ # end
33
+ # output_test.close
@@ -0,0 +1,57 @@
1
+ require 'optparse'
2
+
3
# Command-line option parser for the libsvm_pp executable.
class CLI

  # Parses +args+ destructively (recognised switches are removed, positional
  # arguments such as the input filename are left in place).
  #
  # Returns a Hash with the keys :mode, :lang, :stemming, :stopwords,
  # :stopword, :testing, :numeric_type and :output.
  #
  # :stopword mirrors :stopwords because the tokenizer and the numeric
  # presets read the singular key; without it the -w switch had no effect.
  # The plural key is kept for backward compatibility.
  def self.parse(args)
    options = {
      mode:         :unigram,
      lang:         :it,
      stemming:     false,
      stopwords:    false,
      stopword:     false,
      testing:      false,
      numeric_type: nil,
      output:       nil
    }

    opt_parser = OptionParser.new do |opts|
      opts.banner = "libsvm_pp [options] <filename>"

      opts.on("-m [TYPE]", "--mode [TYPE]", [:unigram, :bigram],
              "Select unigram (default) or bigram") do |mode|
        options[:mode] = mode
      end

      opts.on("-s", "--stemming", "Use this you want stemming") do |s|
        options[:stemming] = s
      end

      opts.on("-w", "--remove-stopwords",
              "Use this if you want remove stopwords") do |w|
        options[:stopwords] = w
        options[:stopword]  = w
      end

      opts.on("-t", "--testing",
              "Use this to use testing mode") do |t|
        options[:testing] = t
      end

      opts.on("-l [TYPE]", "--language", [:it, :en],
              "Select your language it / en") do |l|
        options[:lang] = l
      end

      opts.on("-n N", Integer, "Numeric mode") do |n|
        options[:numeric_type] = n
      end

      opts.on("-o [output]", String, "output file") do |o|
        options[:output] = o
      end
    end

    opt_parser.parse!(args)
    options
  end

end
@@ -0,0 +1,45 @@
1
# Builds n-gram features out of a list of tokens and tags each of them with
# the integer id handed out by a TokenMap.
class FeatureGenerator

  # options - Hash; only :mode is read here (:unigram, :bigram or :trichar).
  #           A missing or nil :mode is set to :unigram on the given hash.
  def initialize(options = {})
    @token_map = TokenMap.new
    @options = options
    @options[:mode] ||= :unigram
  end

  # The n-gram => id table accumulated by the underlying TokenMap.
  def hash_of_ngrams
    @token_map.hash_of_ngrams
  end

  # Generates the features of +ary_of_terms+ for the configured mode and maps
  # them through the token map.
  #
  # testing - forwarded to TokenMap#token_map in every mode (trichar mode
  #           used to drop it, so testing runs kept registering new n-grams).
  #
  # Returns whatever TokenMap#token_map returns, or nil when the mode is not
  # one of the three recognised symbols.
  def features(ary_of_terms, testing: false)
    ngrams =
      case @options[:mode]
      when :unigram then unigrams(ary_of_terms)
      when :bigram  then unigrams(ary_of_terms) + bigrams(ary_of_terms)
      when :trichar then trichar(ary_of_terms)
      end
    return nil unless ngrams

    @token_map.token_map(ngrams, testing: testing)
  end

  # Joins the terms with single spaces and returns every window of three
  # consecutive characters, each wrapped in a one-element array. A joined
  # string shorter than three characters is returned whole.
  def trichar(ary_of_terms)
    string = ary_of_terms.join(" ")
    return [[string]] if string.size < 3

    string.chars.each_cons(3).map { |window| [window.join] }
  end

  # Wraps every term in a one-element array.
  def unigrams(ary_of_term)
    ary_of_term.map { |term| [term] }
  end

  # Pairs of adjacent terms. each_cons yields nothing for fewer than two
  # terms, whereas slicing an empty array with 1..-1 gave nil and made zip
  # raise a TypeError.
  def bigrams(ary)
    ary.each_cons(2).to_a
  end
end
@@ -0,0 +1,3 @@
1
# Constants shared across the library.
module LibsvmPreprocessor
  # Options passed to CSV.foreach by Preprocessor#use: columns are separated
  # by a tab and the first row is not treated as a header.
  CSV_OPTIONS = { col_sep: "\t", headers: false }
end
@@ -0,0 +1,136 @@
1
require 'csv'

require 'libsvm_preprocessor/tokenizer'
require 'libsvm_preprocessor/token_map'
require 'libsvm_preprocessor/feature_generator'
require 'libsvm_preprocessor/global'
5
+
6
# Converts [category, text] rows into libsvm-style sparse vectors.
#
# Categories are numbered from 0 in order of first appearance; features are
# the ids produced by FeatureGenerator paired with their frequency in the row.
class Preprocessor
  attr_reader :categories
  attr_reader :instances
  attr_reader :non_zero_features
  attr_reader :options

  # Option presets selectable through options[:numeric_type].
  OPTIONS_MAP = {
    0  => { lang: "it", mode: :unigram, stemming: false, stopword: false },
    1  => { lang: "it", mode: :bigram,  stemming: false, stopword: false },
    2  => { lang: "it", mode: :unigram, stemming: true,  stopword: false },
    3  => { lang: "it", mode: :bigram,  stemming: true,  stopword: false },
    4  => { lang: "it", mode: :unigram, stemming: false, stopword: true  },
    5  => { lang: "it", mode: :bigram,  stemming: false, stopword: true  },
    6  => { lang: "it", mode: :unigram, stemming: true,  stopword: true  },
    7  => { lang: "it", mode: :bigram,  stemming: true,  stopword: true  },
    8  => { lang: "it", mode: :trichar, stemming: true,  stopword: true  },
    9  => { lang: "it", mode: :trichar, stemming: true,  stopword: false },
    10 => { lang: "it", mode: :trichar, stemming: false, stopword: true  },
    11 => { lang: "it", mode: :trichar, stemming: false, stopword: false },
  }.freeze

  # The n-gram => id table built so far by the feature generator.
  def hash_of_ngrams
    @generator.hash_of_ngrams
  end

  # Applies the preset selected by options[:numeric_type] on top of +options+.
  #
  # Returns a new Hash: preset keys win, every other key (e.g. :output) is
  # preserved, and neither +options+ nor the preset itself is modified.
  # Raises ArgumentError when no preset exists for the given numeric type.
  def override_options(options)
    preset = OPTIONS_MAP[options[:numeric_type]]
    unless preset
      raise ArgumentError,
            "unknown numeric type #{options[:numeric_type].inspect} " \
            "(expected one of #{OPTIONS_MAP.keys.join(', ')})"
    end
    options.merge(preset)
  end

  # Number of available presets.
  def self.options_map_size
    OPTIONS_MAP.size
  end

  # Human-readable description of the preset stored under +key+.
  def self.options_map(key)
    OPTIONS_MAP[key].map { |k, v| "#{k}: #{v}"}.join(" | ")
  end

  # options - Hash forwarded to Tokenizer and FeatureGenerator. When
  #           :numeric_type is set, the matching preset overrides the
  #           overlapping keys (see #override_options).
  def initialize(options = {})
    options = override_options(options) if options[:numeric_type]
    @options = options
    @tokenizer = Tokenizer.new(options)
    @generator = FeatureGenerator.new(options)

    @non_zero_features = { testing: 0, training: 0 }
    @instances = { testing: [], training: [] }

    @categories = {}
    @current_category_id = -1
  end

  # Vectorizes one [category, text] pair and records it.
  #
  # The vector is appended to instances[:testing] or instances[:training]
  # and its number of distinct features is added to the matching
  # non_zero_features counter. Returns the vector
  # [category_id, [{feature_id => frequency}, ...]].
  def push(data, testing: false)
    category, string = data
    # A category seen for the first time gets the next free id.
    @categories[category] ||= next_category_id

    v = vectorize(category, string, testing: testing)
    bucket = testing ? :testing : :training
    @instances[bucket] << v
    @non_zero_features[bucket] += v.last.size
    v
  end

  # Renders a vector as "<category> <id>:<freq> <id>:<freq> ...".
  def toSVM(vector)
    label = vector.first
    features = vector.last
    # The trailing blank keeps a clean diff with libshorttext output.
    return "#{label} " if features.empty?

    pairs = features.map do |feature|
      id = feature.keys.first
      "#{id}:#{feature[id]}"
    end
    "#{label} #{pairs.join(' ')}"
  end

  # This method is only meant to stringify the vector in the very same
  # format as libsvm (so that diff does not mess up).
  def nice_string(v)
    return "#{v[0]} " if v[1] == ""
    v.join(" ")
  end

  # Reads +input_path+ as tab-separated rows and emits one libsvm line per
  # row, to the file named by options[:output] when set, to stdout otherwise.
  #
  # The output file is opened with a block so it is closed even if a row
  # raises.
  def use(input_path, testing: false)
    if @options[:output]
      File.open(@options[:output], "w") do |output_file|
        each_svm_line(input_path, testing) { |line| output_file.puts line }
      end
    else
      each_svm_line(input_path, testing) { |line| puts line }
    end
  end

  private

  # Yields the libsvm line of every row of +input_path+. The CSV options are
  # splatted as keywords because CSV.foreach takes keyword options and Ruby 3
  # no longer converts a trailing positional Hash into keywords.
  def each_svm_line(input_path, testing)
    CSV.foreach(input_path, **::LibsvmPreprocessor::CSV_OPTIONS) do |row|
      yield toSVM(push(row, testing: testing))
    end
  end

  def vectorize(category, string, testing: false)
    tokens = @tokenizer.tokenize(string)
    features = @generator.features(tokens, testing: testing)
    ids_with_frequency = count_frequency(features)

    [ @categories[category], ids_with_frequency ]
  end

  # Counts how often each feature id occurs, in a single pass, and returns
  # one {id => frequency} hash per distinct id, ordered by id.
  def count_frequency(features)
    counts = Hash.new(0)
    features.each { |feature| counts[feature.keys.first] += 1 }
    counts.sort.map { |id, frequency| { id => frequency } }
  end

  # Give the next category id available
  def next_category_id
    @current_category_id += 1
  end

end
136
+
@@ -0,0 +1,30 @@
1
# Hands out sequential integer ids (starting at 1) to the n-grams it sees and
# remembers them in hash_of_ngrams.
class TokenMap

  attr_reader :hash_of_ngrams

  def initialize
    @current_ngram_id = 0
    @hash_of_ngrams = {}
  end

  # Maps every n-gram of +ary_of_ngrams+ to a one-entry hash {id => ngram}.
  #
  # testing - when false, unseen n-grams are registered with a fresh id;
  #           when true, nothing is registered and unseen n-grams are
  #           left out of the result.
  def token_map(ary_of_ngrams, testing: false)
    return known_only(ary_of_ngrams) if testing

    ary_of_ngrams.map do |ngram|
      { (@hash_of_ngrams[ngram] ||= next_ngram_id) => ngram }
    end
  end

  private

  # Entries for the n-grams that already own an id, in input order.
  def known_only(ngrams)
    ngrams.each_with_object([]) do |ngram, found|
      id = @hash_of_ngrams[ngram]
      found << { id => ngram } if id
    end
  end

  # Give the next term id available
  def next_ngram_id
    @current_ngram_id += 1
  end

end
@@ -0,0 +1,44 @@
1
+ require 'lingua/stemmer'
2
+ require 'stopwords'
3
+ require 'unicode'
4
+
5
# Splits raw text into lower-case alphabetic tokens, optionally removing
# stopwords and stemming each token.
class Tokenizer

  # options - Hash read for :stopword, :stemming and :lang. Missing or nil
  #           entries are set on the given hash to false, false and "it".
  def initialize(options = {})
    @options = options
    @options[:stopword] ||= false
    @options[:stemming] ||= false
    @options[:lang]     ||= "it"
    @filter  = Stopwords::Snowball::Filter.new(@options[:lang])
    @stemmer = Lingua::Stemmer.new(language: @options[:lang])
  end

  # Returns the tokens of +string+; stopword removal runs before stemming
  # when both are enabled.
  def tokenize(string)
    tokens = process_text(string)
    tokens = remove_stopwords(tokens) if @options[:stopword]
    tokens = stem_each(tokens) if @options[:stemming]
    tokens
  end

  # Lower-cases the text, applies Unicode NFD normalization, blanks every
  # character that is not alphabetic and splits on the resulting spaces.
  #
  # The caller's string is never modified (frozen strings are accepted) and
  # nil is treated as empty text. Digits are blanked together with all other
  # non-letters, so no separate letter/digit splitting is needed, and
  # split(' ') already ignores leading, trailing and repeated spaces.
  def process_text(string)
    text = Unicode.nfd(string.to_s.downcase)
    text.gsub(/[^[:alpha:]]/, ' ').split(' ')
  end

  # Remove stopwords according to the selected language
  def remove_stopwords(ary)
    @filter.filter(ary)
  end

  # Stem each word according to the selected language
  def stem_each(ary)
    ary.map { |term| @stemmer.stem(term) }
  end

end
@@ -0,0 +1,3 @@
1
# Release information for the gem.
module LibsvmPreprocessor
  # Version string of this release.
  VERSION = '0.1'
end
@@ -0,0 +1 @@
1
+ require 'libsvm_preprocessor/preprocessor'
@@ -0,0 +1,58 @@
1
+ require 'rspec'
2
+ require 'libsvm_preprocessor/preprocessor'
3
+
4
+ describe FeatureGenerator do
5
+
6
+ let(:ary_of_terms) { ["a","b","c"] }
7
+ let(:ary) { ["mar","rosso"] }
8
+
9
+ context "with default options" do
10
+ let(:generator) { FeatureGenerator.new }
11
+
12
+ it "use unigrams" do
13
+ expected = [{1=>["a"]}, {2=>["b"]}, {3=>["c"]}]
14
+ expect(generator.features(ary_of_terms)).to eq(expected)
15
+ end
16
+ end
17
+
18
+ context "using bigrams" do
19
+ let(:generator) { FeatureGenerator.new(:mode => :bigram) }
20
+
21
+ it "use bigrams" do
22
+ expected = [{1=>["a"]}, {2=>["b"]}, {3=>["c"]}, {4=>["a","b"]}, {5=>["b","c"]}]
23
+ expect(generator.features(ary_of_terms)).to eq(expected)
24
+ end
25
+
26
+ it "use ingnore duplicates" do
27
+ expected = [{1=>["a"]}, {1=>["a"]}, {2=>["a","a"]}]
28
+ expect(generator.features(["a","a"])).to eq(expected)
29
+ end
30
+
31
+ end
32
+
33
+ context "using trichar" do
34
+ let(:generator) { FeatureGenerator.new(:mode => :trichar) }
35
+
36
+ it "use trichar" do
37
+ expected = [{1=>["mar"]}, {2=>["ar "]}, {3=>["r r"]}, {4=> [" ro"]}, {5=>["ros"]}, {6=>["oss"]}, {7=> ["sso"]}]
38
+ expect(generator.features(ary)).to eq(expected)
39
+ end
40
+
41
+ it "ignore duplicates" do
42
+ expected = [{1=>["aaa"]}, {1=>["aaa"]},{1=>["aaa"]}]
43
+ expect(generator.features(["aaaaa"])).to eq(expected)
44
+ end
45
+
46
+ it "workarounds little word" do
47
+ expected = [{1 => ["te"]}]
48
+ expect(generator.features(["te"])).to eq(expected)
49
+ end
50
+
51
+ it "workarounds little words" do
52
+ expected = [{1 => ["te "]}, {2 => ["e n"]}, {3 => [" ne"]}]
53
+ expect(generator.features(["te", "ne"])).to eq(expected)
54
+ end
55
+
56
+ end
57
+
58
+ end
@@ -0,0 +1,111 @@
1
+ require 'rspec'
2
+ require 'libsvm_preprocessor/preprocessor'
3
+
4
+ describe Preprocessor do
5
+
6
+ describe "default settings" do
7
+ let(:preproc) { Preprocessor.new }
8
+ let(:p_trichar) { Preprocessor.new(mode: :trichar) }
9
+
10
+ context "adding a text" do
11
+ it "maps new categories" do
12
+ preproc.push ["category", "bottiglia"]
13
+ expect(preproc.categories["category"]).to eq 0
14
+ end
15
+ end
16
+
17
+ context "with default settings" do
18
+ it "produce a new vector" do
19
+ v = (preproc.push ["category", "bottiglia"])
20
+ expect(v).to eq([0, [{1 => 1}]])
21
+ end
22
+
23
+ it "takes into account frequencies" do
24
+ v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
25
+ expect(v).to eq([0, [{1 => 3}]])
26
+ end
27
+
28
+ it "produce svm format" do
29
+ v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
30
+ result = preproc.toSVM(v)
31
+ expect(result).to eq("0 1:3")
32
+ end
33
+ end
34
+
35
+ context "with trichar mode" do
36
+ it "produce a new vector with frequencies" do
37
+ v = (p_trichar.push ["category", "osso osso"])
38
+ expect(v).to eq([0, [{1 => 2}, {2 => 2}, {3 => 1}, {4 => 1}, {5 => 1}]])
39
+ end
40
+ end
41
+
42
+ context "when I am testing" do
43
+ it "ignore new words" do
44
+ v = preproc.push(["category", "bottiglia"], testing: true)
45
+ expect(v).to eq([0, []])
46
+ end
47
+
48
+ it "remembers the old ones" do
49
+ preproc.push(["category", "bottiglia"], testing: false)
50
+ v = preproc.push(["category", "bottiglia vetro"], testing: true)
51
+ expect(v).to eq([0, [{1 => 1}]])
52
+ end
53
+
54
+ it "produce svm format with blank features" do
55
+ v = preproc.push(["category", "bottiglia"], testing: true)
56
+ result = preproc.toSVM(v)
57
+ expect(result).to eq("0 ")
58
+ end
59
+
60
+ end
61
+ end
62
+
63
+ describe "using bigrams as feature" do
64
+ let(:preproc) { Preprocessor.new(mode: :bigram) }
65
+
66
+ context "adding a text" do
67
+ it "maps new categories" do
68
+ preproc.push ["category", "bottiglia"]
69
+ expect(preproc.categories["category"]).to eq 0
70
+ end
71
+ end
72
+
73
+ context "simple vectorization" do
74
+ it "produce a new vector" do
75
+ v = (preproc.push ["category", "bottiglia"])
76
+ expect(v).to eq([0, [{1 => 1}]])
77
+ end
78
+
79
+ it "takes into account frequencies" do
80
+ v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
81
+ expect(v).to eq([0, [{1 => 3}, {2 => 2}]])
82
+ end
83
+
84
+ it "produce svm format" do
85
+ v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
86
+ result = preproc.toSVM(v)
87
+ expect(result).to eq("0 1:3 2:2")
88
+ end
89
+ end
90
+
91
+ context "when I am testing" do
92
+ it "ignore new words" do
93
+ v = preproc.push(["category", "bottiglia"], testing: true)
94
+ expect(v).to eq([0, []])
95
+ end
96
+
97
+ it "remembers the old ones" do
98
+ preproc.push(["category", "bottiglia"], testing: false)
99
+ v = preproc.push(["category", "bottiglia vetro"], testing: true)
100
+ expect(v).to eq([0, [{1 => 1}]])
101
+ end
102
+
103
+ it "produce svm format with blank features" do
104
+ v = preproc.push(["category", "bottiglia"], testing: true)
105
+ result = preproc.toSVM(v)
106
+ expect(result).to eq("0 ")
107
+ end
108
+
109
+ end
110
+ end
111
+ end
@@ -0,0 +1,60 @@
1
+ require 'rspec'
2
+ require 'libsvm_preprocessor/preprocessor'
3
+
4
+ describe TokenMap do
5
+ let(:token_map) { TokenMap.new }
6
+
7
+ context "it maps terms in new ids" do
8
+ it "maps new tokens" do
9
+ ngrams = token_map.token_map([["bottiglia"],["di"],["vetro"]])
10
+ expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {3 => ["vetro"]}]
11
+ expect(ngrams).to eq(expected)
12
+ end
13
+ end
14
+
15
+ context "it remembers old ids" do
16
+ it "maps new tokens" do
17
+ token_map.token_map([["bottiglia"],["di"],["vetro"]])
18
+ ngrams = token_map.token_map([["bottiglia"],["di"],["plastica"]])
19
+ expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {4 => ["plastica"]}]
20
+ expect(ngrams).to eq(expected)
21
+ end
22
+ end
23
+
24
+ context "it remembers old ids also with other trichars" do
25
+ it "maps new tokens" do
26
+ token_map.token_map([["abc"],["bc "],["c a"],[" ab"],["abc"]])
27
+ ngrams = token_map.token_map([["abc"],["c a"],["bot"]])
28
+ expected = [{1 => ["abc"]}, {3 => ["c a"]}, {5 => ["bot"]}]
29
+ expect(ngrams).to eq(expected)
30
+ end
31
+ end
32
+
33
+ context "it ignores duplicates" do
34
+ it "maps new tokens" do
35
+ ngrams = token_map.token_map([["bottiglia"],["di"],["plastica"],["plastica"]])
36
+ expected = [{1 => ["bottiglia"]}, {2 => ["di"]}, {3 => ["plastica"]}, {3 => ["plastica"]}]
37
+ expect(ngrams).to eq(expected)
38
+ end
39
+ end
40
+
41
+ context "if I am creating a test file" do
42
+ it "does not consider new terms" do
43
+ token_map.token_map([["bottiglia"],["di"],["plastica"]])
44
+ ngrams = token_map.token_map([["polenta"],["valsugana"]], testing: true)
45
+
46
+ expected = []
47
+ expect(ngrams).to eq(expected)
48
+ end
49
+
50
+ it "does not consider new terms but remembers the old ones" do
51
+ token_map.token_map([["bottiglia"],["di"],["plastica"]])
52
+ ngrams = token_map.token_map([["tappo"],["plastica"]], testing: true)
53
+
54
+ expected = [{3 => ["plastica"]}]
55
+ expect(ngrams).to eq(expected)
56
+ end
57
+
58
+ end
59
+
60
+ end
@@ -0,0 +1,36 @@
1
+ require 'rspec'
2
+ require 'libsvm_preprocessor/preprocessor'
3
+
4
# Specs for Tokenizer: plain tokenization, stopword removal and stemming.
describe Tokenizer do
  let(:tokenizer) { Tokenizer.new }

  context "tokenizer with default settings" do
    it "tokenize a single word" do
      tokens = tokenizer.tokenize("bottiglia")
      expect(tokens).to eq(["bottiglia"])
    end

    # This example used to feed a single word, duplicating the one above.
    it "tokenize multiple words" do
      tokens = tokenizer.tokenize("bottiglia di vetro")
      expect(tokens).to eq(["bottiglia", "di", "vetro"])
    end
  end

  context "tokenizer with stopword removal" do
    let(:tokenizer) { Tokenizer.new(stopword: true) }

    it "tokenize removing stopwords" do
      tokens = tokenizer.tokenize("bottiglia di vetro")
      expect(tokens).to eq(["bottiglia", "vetro"])
    end
  end

  # Was labelled "stopword removal" although it exercises stemming.
  context "tokenizer with stemming" do
    let(:tokenizer) { Tokenizer.new(stemming: true) }

    it "tokenize stemming each word" do
      tokens = tokenizer.tokenize("bottiglia di vetro")
      expect(tokens).to eq(["bottigl", "di", "vetr"])
    end
  end
end
metadata ADDED
@@ -0,0 +1,90 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: libsvm_preprocessor
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Andrea Nodari
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-05-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: stopwords-filter
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: ruby-stemmer
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.9.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.9.3
41
+ description: |2
42
+ It's a text preprocessor that generates a libsvm input file
43
+ email: andrea.nodari91@gmail.com
44
+ executables:
45
+ - libsvm_pp
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - README.md
50
+ - Rakefile
51
+ - LICENSE
52
+ - lib/libsvm_preprocessor/cli.rb
53
+ - lib/libsvm_preprocessor/feature_generator.rb
54
+ - lib/libsvm_preprocessor/global.rb
55
+ - lib/libsvm_preprocessor/preprocessor.rb
56
+ - lib/libsvm_preprocessor/token_map.rb
57
+ - lib/libsvm_preprocessor/tokenizer.rb
58
+ - lib/libsvm_preprocessor/version.rb
59
+ - lib/libsvm_prerpocessor.rb
60
+ - bin/libsvm_pp
61
+ - spec/feature_generator_spec.rb
62
+ - spec/preprocessor_spec.rb
63
+ - spec/token_map_spec.rb
64
+ - spec/tokenizer_spec.rb
65
+ homepage: http://github.com/nodo/libsvm_preprocessor
66
+ licenses:
67
+ - MIT
68
+ metadata: {}
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubyforge_project:
85
+ rubygems_version: 2.0.0.preview3.1
86
+ signing_key:
87
+ specification_version: 4
88
+ summary: It's a text preprocessor that generates a libsvm input file
89
+ test_files: []
90
+ has_rdoc: false