libsvm_preprocessor 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: ec1de3e3e31391a33e628f4dc4c6ace1b9c96cd3
4
- data.tar.gz: e897e7149c5ace324fba715154402da3efc075d9
5
- !binary "U0hBNTEy":
6
- metadata.gz: 122a7ad95b42b0b2429aa69d8cac690c90800f870f851ac56a665c69e2ba933cb770b484f45ada754f4a7336a1e2e28c98ce8407f27ab7a98797aca8b8562613
7
- data.tar.gz: f725942158aab1a7d8a34105ccad1639664e961f99739f5a05bbe67da1089ef4427200721f0ba527d1aa7fff2b3c52b158b95e2ccbaa84d4bfac64cc847879c4
2
+ SHA1:
3
+ metadata.gz: 1ec656ea774c188eb2bbb366c91e7b83761de7a1
4
+ data.tar.gz: 30839b898cb8e8391c3f2756f5f76c4700bd542c
5
+ SHA512:
6
+ metadata.gz: 845c2a7f93cea7c62f34503aa2e028f9ab39bdafcd17aca8c65569c0d904990cb6b188501eb04fb455e7d2f4bce26e05f9d3fd8b793d509431e02960dc35757c
7
+ data.tar.gz: 8d4a5e748d33098f4ac8c01004e7be422d912a582d8323ccdd74ecc8083bd6c19fd5f81823b9f5867c4fe2e1510dd77ac3c931e383752ba367764c7da78ccaec
data/README.md CHANGED
@@ -0,0 +1,35 @@
1
+ # Libsvm_preprocessor
2
+
3
+ This project is a simple Ruby gem that provides a way to transform text into a sparse feature vector in libsvm/liblinear format (<http://www.csie.ntu.edu.tw/~cjlin/libsvm>).
4
+
5
+ Since this tool is intended for short texts, it provides only binary representations of tokens.
6
+
7
+ ## Usage
8
+ ```
9
+ % libsvm_pp --help
10
+ libsvm_pp [options] <filename>
11
+ -m, --mode [TYPE] Select unigram (default)/bigram/trigrams
12
+ -s, --stemming Use this you want stemming
13
+ -w, --remove-stopwords Use this if you want remove stopwords
14
+ -t, --testing Use this to use testing mode
15
+ -l, --language [TYPE] Select your language it / en
16
+ -n N Numeric type
17
+ -o [output] output file
18
+ ```
19
+
20
+ It is possible to use the library following these steps:
21
+
22
+ ```
23
+ require "libsvm_preprocessor/preprocessor"
24
+
25
+ […]
26
+
27
+ preprocessor = Preprocessor.new(numeric_type: i)
28
+ preprocessor.use("TRAIN.csv", "TRAIN.svm")
29
+ preprocessor.use("TEST.csv", "TEST.svm", testing: true)
30
+ ```
31
+
32
+ In this case TRAIN.svm will contain your training set and TEST.svm will contain the testing set.
33
+
34
+
35
+ This project is far from complete; I will provide better documentation as soon as possible.
@@ -41,7 +41,7 @@ class CLI
41
41
  options[:lang] = l
42
42
  end
43
43
 
44
- opts.on("-n N", Integer, "Numeric mode") do |n|
44
+ opts.on("-n N", Integer, "Numeric type") do |n|
45
45
  options[:numeric_type] = n
46
46
  end
47
47
 
@@ -1,3 +1,4 @@
1
+ require 'csv'
1
2
  require 'libsvm_preprocessor/tokenizer'
2
3
  require 'libsvm_preprocessor/token_map'
3
4
  require 'libsvm_preprocessor/feature_generator'
@@ -45,11 +46,14 @@ class Preprocessor
45
46
 
46
47
  def initialize(options = {})
47
48
  if options[:numeric_type]
48
- options = override_options(options)
49
+ new_options = override_options(options)
50
+ @options = new_options.merge(output: options[:output])
51
+ else
52
+ @options = options
49
53
  end
50
- @options = options
51
- @tokenizer = Tokenizer.new(options)
52
- @generator = FeatureGenerator.new(options)
54
+
55
+ @tokenizer = Tokenizer.new(@options)
56
+ @generator = FeatureGenerator.new(@options)
53
57
 
54
58
  @non_zero_features = {}
55
59
  @non_zero_features[:testing] = 0
@@ -83,8 +87,13 @@ class Preprocessor
83
87
  def toSVM(vector)
84
88
  # the following line is made to have clean diff with libshorttext
85
89
  return "#{vector.first} " if vector.last.empty?
86
- features = vector.last
87
- .map {|h| "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
90
+ features = vector.last.map {|h| "#{h.keys.first}:1"}.join(" ")
91
+
92
+ # With these lines it takes feature values into account (remove the
93
+ # similar one above)
94
+
95
+ #.map {|h|
96
+ # "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
88
97
  "#{vector.first} #{features}"
89
98
  end
90
99
 
@@ -95,9 +104,9 @@ class Preprocessor
95
104
  return "#{v[0]} "
96
105
  end
97
106
 
98
- def use(input_path, testing: false)
99
- if @options[:output]
100
- output_file = File.open(@options.output, "w")
107
+ def use(input_path, output_file=nil, testing: false)
108
+ if output_file
109
+ output_file = File.open(output_file, "w")
101
110
  CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
102
111
  output_file.puts toSVM( push(row, testing: testing) )
103
112
  end
@@ -1,3 +1,3 @@
1
1
  module LibsvmPreprocessor
2
- VERSION = '0.1'
2
+ VERSION = '0.2'
3
3
  end
@@ -25,10 +25,10 @@ describe Preprocessor do
25
25
  expect(v).to eq([0, [{1 => 3}]])
26
26
  end
27
27
 
28
- it "produce svm format" do
28
+ it "produce svm format (without frequency)" do
29
29
  v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
30
30
  result = preproc.toSVM(v)
31
- expect(result).to eq("0 1:3")
31
+ expect(result).to eq("0 1:1")
32
32
  end
33
33
  end
34
34
 
@@ -84,7 +84,7 @@ describe Preprocessor do
84
84
  it "produce svm format" do
85
85
  v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
86
86
  result = preproc.toSVM(v)
87
- expect(result).to eq("0 1:3 2:2")
87
+ expect(result).to eq("0 1:1 2:1")
88
88
  end
89
89
  end
90
90
 
metadata CHANGED
@@ -1,41 +1,41 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libsvm_preprocessor
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrea Nodari
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-05-31 00:00:00.000000000 Z
11
+ date: 2013-09-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: stopwords-filter
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: 0.2.1
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: 0.2.1
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: ruby-stemmer
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
33
  version: 0.9.3
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: 0.9.3
41
41
  description: |2
@@ -72,17 +72,17 @@ require_paths:
72
72
  - lib
73
73
  required_ruby_version: !ruby/object:Gem::Requirement
74
74
  requirements:
75
- - - ">="
75
+ - - '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
78
  required_rubygems_version: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ">="
80
+ - - '>='
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  requirements: []
84
84
  rubyforge_project:
85
- rubygems_version: 2.0.0.preview3.1
85
+ rubygems_version: 2.0.3
86
86
  signing_key:
87
87
  specification_version: 4
88
88
  summary: It's a text preprocessor that generate a libsvm input file