libsvm_preprocessor 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +6 -6
- data/README.md +35 -0
- data/lib/libsvm_preprocessor/cli.rb +1 -1
- data/lib/libsvm_preprocessor/preprocessor.rb +18 -9
- data/lib/libsvm_preprocessor/version.rb +1 -1
- data/spec/preprocessor_spec.rb +3 -3
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
5
|
-
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1ec656ea774c188eb2bbb366c91e7b83761de7a1
|
4
|
+
data.tar.gz: 30839b898cb8e8391c3f2756f5f76c4700bd542c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 845c2a7f93cea7c62f34503aa2e028f9ab39bdafcd17aca8c65569c0d904990cb6b188501eb04fb455e7d2f4bce26e05f9d3fd8b793d509431e02960dc35757c
|
7
|
+
data.tar.gz: 8d4a5e748d33098f4ac8c01004e7be422d912a582d8323ccdd74ecc8083bd6c19fd5f81823b9f5867c4fe2e1510dd77ac3c931e383752ba367764c7da78ccaec
|
data/README.md
CHANGED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Libsvm_preprocessor
|
2
|
+
|
3
|
+
This project is a simple Ruby gem that provides a way to transform text into sparse feature vectors in the libsvm/liblinear format (<http://www.csie.ntu.edu.tw/~cjlin/libsvm>).
|
4
|
+
|
5
|
+
Since this tool is intended for short texts, it provides only binary representations of tokens.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
```
|
9
|
+
% libsvm_pp --help
|
10
|
+
libsvm_pp [options] <filename>
|
11
|
+
-m, --mode [TYPE] Select unigram (default)/bigram/trigrams
|
12
|
+
-s, --stemming Use this you want stemming
|
13
|
+
-w, --remove-stopwords Use this if you want remove stopwords
|
14
|
+
-t, --testing Use this to use testing mode
|
15
|
+
-l, --language [TYPE] Select your language it / en
|
16
|
+
-n N Numeric type
|
17
|
+
-o [output] output file
|
18
|
+
```
|
19
|
+
|
20
|
+
It is possible to use the library following these steps:
|
21
|
+
|
22
|
+
```
|
23
|
+
require "libsvm_preprocessor/preprocessor"
|
24
|
+
|
25
|
+
[…]
|
26
|
+
|
27
|
+
preprocessor = Preprocessor.new(numeric_type: i)
|
28
|
+
preprocessor.use("TRAIN.csv", "TRAIN.svm")
|
29
|
+
preprocessor.use("TEST.csv", "TEST.svm", testing: true)
|
30
|
+
```
|
31
|
+
|
32
|
+
In this case TRAIN.svm will contain your training set and TEST.svm will contain the testing set.
|
33
|
+
|
34
|
+
|
35
|
+
This project is far from complete; I will provide better documentation as soon as possible.
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'csv'
|
1
2
|
require 'libsvm_preprocessor/tokenizer'
|
2
3
|
require 'libsvm_preprocessor/token_map'
|
3
4
|
require 'libsvm_preprocessor/feature_generator'
|
@@ -45,11 +46,14 @@ class Preprocessor
|
|
45
46
|
|
46
47
|
def initialize(options = {})
|
47
48
|
if options[:numeric_type]
|
48
|
-
|
49
|
+
new_options = override_options(options)
|
50
|
+
@options = new_options.merge(output: options[:output])
|
51
|
+
else
|
52
|
+
@options = options
|
49
53
|
end
|
50
|
-
|
51
|
-
@tokenizer = Tokenizer.new(options)
|
52
|
-
@generator = FeatureGenerator.new(options)
|
54
|
+
|
55
|
+
@tokenizer = Tokenizer.new(@options)
|
56
|
+
@generator = FeatureGenerator.new(@options)
|
53
57
|
|
54
58
|
@non_zero_features = {}
|
55
59
|
@non_zero_features[:testing] = 0
|
@@ -83,8 +87,13 @@ class Preprocessor
|
|
83
87
|
def toSVM(vector)
|
84
88
|
# the following line is made to have clean diff with libshorttext
|
85
89
|
return "#{vector.first} " if vector.last.empty?
|
86
|
-
features = vector.last
|
87
|
-
|
90
|
+
features = vector.last.map {|h| "#{h.keys.first}:1"}.join(" ")
|
91
|
+
|
92
|
+
# With these lines it takes feature frequencies into account (remove the
|
93
|
+
# similar one above)
|
94
|
+
|
95
|
+
#.map {|h|
|
96
|
+
# "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
|
88
97
|
"#{vector.first} #{features}"
|
89
98
|
end
|
90
99
|
|
@@ -95,9 +104,9 @@ class Preprocessor
|
|
95
104
|
return "#{v[0]} "
|
96
105
|
end
|
97
106
|
|
98
|
-
def use(input_path, testing: false)
|
99
|
-
if
|
100
|
-
output_file = File.open(
|
107
|
+
def use(input_path, output_file=nil, testing: false)
|
108
|
+
if output_file
|
109
|
+
output_file = File.open(output_file, "w")
|
101
110
|
CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
|
102
111
|
output_file.puts toSVM( push(row, testing: testing) )
|
103
112
|
end
|
data/spec/preprocessor_spec.rb
CHANGED
@@ -25,10 +25,10 @@ describe Preprocessor do
|
|
25
25
|
expect(v).to eq([0, [{1 => 3}]])
|
26
26
|
end
|
27
27
|
|
28
|
-
it "produce svm format" do
|
28
|
+
it "produce svm format (without frequency)" do
|
29
29
|
v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
|
30
30
|
result = preproc.toSVM(v)
|
31
|
-
expect(result).to eq("0 1:
|
31
|
+
expect(result).to eq("0 1:1")
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
@@ -84,7 +84,7 @@ describe Preprocessor do
|
|
84
84
|
it "produce svm format" do
|
85
85
|
v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
|
86
86
|
result = preproc.toSVM(v)
|
87
|
-
expect(result).to eq("0 1:
|
87
|
+
expect(result).to eq("0 1:1 2:1")
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
metadata
CHANGED
@@ -1,41 +1,41 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: libsvm_preprocessor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.2'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrea Nodari
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-09-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: stopwords-filter
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 0.2.1
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 0.2.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: ruby-stemmer
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.9.3
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ~>
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 0.9.3
|
41
41
|
description: |2
|
@@ -72,17 +72,17 @@ require_paths:
|
|
72
72
|
- lib
|
73
73
|
required_ruby_version: !ruby/object:Gem::Requirement
|
74
74
|
requirements:
|
75
|
-
- -
|
75
|
+
- - '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
78
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
80
|
+
- - '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
requirements: []
|
84
84
|
rubyforge_project:
|
85
|
-
rubygems_version: 2.0.
|
85
|
+
rubygems_version: 2.0.3
|
86
86
|
signing_key:
|
87
87
|
specification_version: 4
|
88
88
|
summary: It's a text preprocessor that generate a libsvm input file
|