libsvm_preprocessor 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +6 -6
- data/README.md +35 -0
- data/lib/libsvm_preprocessor/cli.rb +1 -1
- data/lib/libsvm_preprocessor/preprocessor.rb +18 -9
- data/lib/libsvm_preprocessor/version.rb +1 -1
- data/spec/preprocessor_spec.rb +3 -3
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
5
|
-
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1ec656ea774c188eb2bbb366c91e7b83761de7a1
|
4
|
+
data.tar.gz: 30839b898cb8e8391c3f2756f5f76c4700bd542c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 845c2a7f93cea7c62f34503aa2e028f9ab39bdafcd17aca8c65569c0d904990cb6b188501eb04fb455e7d2f4bce26e05f9d3fd8b793d509431e02960dc35757c
|
7
|
+
data.tar.gz: 8d4a5e748d33098f4ac8c01004e7be422d912a582d8323ccdd74ecc8083bd6c19fd5f81823b9f5867c4fe2e1510dd77ac3c931e383752ba367764c7da78ccaec
|
data/README.md
CHANGED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Libsvm_preprocessor
|
2
|
+
|
3
|
+
This project is a simple Ruby gem that provides a way to transform a text into a sparse feature vector using the libsvm/liblinear format (<http://www.csie.ntu.edu.tw/~cjlin/libsvm>).
|
4
|
+
|
5
|
+
Since this tool is intended to be used with short texts, it provides only binary representations of tokens.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
```
|
9
|
+
% libsvm_pp --help
|
10
|
+
libsvm_pp [options] <filename>
|
11
|
+
-m, --mode [TYPE] Select unigram (default)/bigram/trigrams
|
12
|
+
-s, --stemming Use this you want stemming
|
13
|
+
-w, --remove-stopwords Use this if you want remove stopwords
|
14
|
+
-t, --testing Use this to use testing mode
|
15
|
+
-l, --language [TYPE] Select your language it / en
|
16
|
+
-n N Numeric type
|
17
|
+
-o [output] output file
|
18
|
+
```
|
19
|
+
|
20
|
+
It is possible to use the library following these steps:
|
21
|
+
|
22
|
+
```
|
23
|
+
require "libsvm_preprocessor/preprocessor"
|
24
|
+
|
25
|
+
[…]
|
26
|
+
|
27
|
+
preprocessor = Preprocessor.new(numeric_type: i)
|
28
|
+
preprocessor.use("TRAIN.csv", "TRAIN.svm")
|
29
|
+
preprocessor.use("TEST.csv", "TEST.svm", testing: true)
|
30
|
+
```
|
31
|
+
|
32
|
+
In this case TRAIN.svm will contain your training set and TEST.svm will contain the testing set.
|
33
|
+
|
34
|
+
|
35
|
+
This project is far from complete; I will provide better documentation as soon as possible.
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'csv'
|
1
2
|
require 'libsvm_preprocessor/tokenizer'
|
2
3
|
require 'libsvm_preprocessor/token_map'
|
3
4
|
require 'libsvm_preprocessor/feature_generator'
|
@@ -45,11 +46,14 @@ class Preprocessor
|
|
45
46
|
|
46
47
|
def initialize(options = {})
|
47
48
|
if options[:numeric_type]
|
48
|
-
|
49
|
+
new_options = override_options(options)
|
50
|
+
@options = new_options.merge(output: options[:output])
|
51
|
+
else
|
52
|
+
@options = options
|
49
53
|
end
|
50
|
-
|
51
|
-
@tokenizer = Tokenizer.new(options)
|
52
|
-
@generator = FeatureGenerator.new(options)
|
54
|
+
|
55
|
+
@tokenizer = Tokenizer.new(@options)
|
56
|
+
@generator = FeatureGenerator.new(@options)
|
53
57
|
|
54
58
|
@non_zero_features = {}
|
55
59
|
@non_zero_features[:testing] = 0
|
@@ -83,8 +87,13 @@ class Preprocessor
|
|
83
87
|
def toSVM(vector)
|
84
88
|
# the following line is made to have clean diff with libshorttext
|
85
89
|
return "#{vector.first} " if vector.last.empty?
|
86
|
-
features = vector.last
|
87
|
-
|
90
|
+
features = vector.last.map {|h| "#{h.keys.first}:1"}.join(" ")
|
91
|
+
|
92
|
+
# With this lines it takes into account features (remove the
|
93
|
+
# similar one above)
|
94
|
+
|
95
|
+
#.map {|h|
|
96
|
+
# "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
|
88
97
|
"#{vector.first} #{features}"
|
89
98
|
end
|
90
99
|
|
@@ -95,9 +104,9 @@ class Preprocessor
|
|
95
104
|
return "#{v[0]} "
|
96
105
|
end
|
97
106
|
|
98
|
-
def use(input_path, testing: false)
|
99
|
-
if
|
100
|
-
output_file = File.open(
|
107
|
+
def use(input_path, output_file=nil, testing: false)
|
108
|
+
if output_file
|
109
|
+
output_file = File.open(output_file, "w")
|
101
110
|
CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
|
102
111
|
output_file.puts toSVM( push(row, testing: testing) )
|
103
112
|
end
|
data/spec/preprocessor_spec.rb
CHANGED
@@ -25,10 +25,10 @@ describe Preprocessor do
|
|
25
25
|
expect(v).to eq([0, [{1 => 3}]])
|
26
26
|
end
|
27
27
|
|
28
|
-
it "produce svm format" do
|
28
|
+
it "produce svm format (without frequency)" do
|
29
29
|
v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
|
30
30
|
result = preproc.toSVM(v)
|
31
|
-
expect(result).to eq("0 1:
|
31
|
+
expect(result).to eq("0 1:1")
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
@@ -84,7 +84,7 @@ describe Preprocessor do
|
|
84
84
|
it "produce svm format" do
|
85
85
|
v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
|
86
86
|
result = preproc.toSVM(v)
|
87
|
-
expect(result).to eq("0 1:
|
87
|
+
expect(result).to eq("0 1:1 2:1")
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
metadata
CHANGED
@@ -1,41 +1,41 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: libsvm_preprocessor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.2'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrea Nodari
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-09-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: stopwords-filter
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 0.2.1
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 0.2.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: ruby-stemmer
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.9.3
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ~>
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 0.9.3
|
41
41
|
description: |2
|
@@ -72,17 +72,17 @@ require_paths:
|
|
72
72
|
- lib
|
73
73
|
required_ruby_version: !ruby/object:Gem::Requirement
|
74
74
|
requirements:
|
75
|
-
- -
|
75
|
+
- - '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
78
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
80
|
+
- - '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
requirements: []
|
84
84
|
rubyforge_project:
|
85
|
-
rubygems_version: 2.0.
|
85
|
+
rubygems_version: 2.0.3
|
86
86
|
signing_key:
|
87
87
|
specification_version: 4
|
88
88
|
summary: It's a text preprocessor that generate a libsvm input file
|