libsvm_preprocessor 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: ec1de3e3e31391a33e628f4dc4c6ace1b9c96cd3
4
- data.tar.gz: e897e7149c5ace324fba715154402da3efc075d9
5
- !binary "U0hBNTEy":
6
- metadata.gz: 122a7ad95b42b0b2429aa69d8cac690c90800f870f851ac56a665c69e2ba933cb770b484f45ada754f4a7336a1e2e28c98ce8407f27ab7a98797aca8b8562613
7
- data.tar.gz: f725942158aab1a7d8a34105ccad1639664e961f99739f5a05bbe67da1089ef4427200721f0ba527d1aa7fff2b3c52b158b95e2ccbaa84d4bfac64cc847879c4
2
+ SHA1:
3
+ metadata.gz: 1ec656ea774c188eb2bbb366c91e7b83761de7a1
4
+ data.tar.gz: 30839b898cb8e8391c3f2756f5f76c4700bd542c
5
+ SHA512:
6
+ metadata.gz: 845c2a7f93cea7c62f34503aa2e028f9ab39bdafcd17aca8c65569c0d904990cb6b188501eb04fb455e7d2f4bce26e05f9d3fd8b793d509431e02960dc35757c
7
+ data.tar.gz: 8d4a5e748d33098f4ac8c01004e7be422d912a582d8323ccdd74ecc8083bd6c19fd5f81823b9f5867c4fe2e1510dd77ac3c931e383752ba367764c7da78ccaec
data/README.md CHANGED
@@ -0,0 +1,35 @@
1
+ # Libsvm_preprocessor
2
+
3
+ This project is a simple Ruby gem that provides a way to transform text into sparse feature vectors using the libsvm/liblinear format (<http://www.csie.ntu.edu.tw/~cjlin/libsvm>).
4
+
5
+ Since this tool is intended to be used with short texts, it provides only binary representations of tokens.
6
+
7
+ ## Usage
8
+ ```
9
+ % libsvm_pp --help
10
+ libsvm_pp [options] <filename>
11
+ -m, --mode [TYPE] Select unigram (default)/bigram/trigrams
12
+ -s, --stemming Use this you want stemming
13
+ -w, --remove-stopwords Use this if you want remove stopwords
14
+ -t, --testing Use this to use testing mode
15
+ -l, --language [TYPE] Select your language it / en
16
+ -n N Numeric type
17
+ -o [output] output file
18
+ ```
19
+
20
+ It is possible to use the library following these steps:
21
+
22
+ ```
23
+ require "libsvm_preprocessor/preprocessor"
24
+
25
+ […]
26
+
27
+ preprocessor = Preprocessor.new(numeric_type: i)
28
+ preprocessor.use("TRAIN.csv", "TRAIN.svm")
29
+ preprocessor.use("TEST.csv", "TEST.svm", testing: true)
30
+ ```
31
+
32
+ In this case TRAIN.svm will contain your training set and TEST.svm will contain the testing set.
33
+
34
+
35
+ This project is far from complete; I will provide better documentation as soon as possible.
@@ -41,7 +41,7 @@ class CLI
41
41
  options[:lang] = l
42
42
  end
43
43
 
44
- opts.on("-n N", Integer, "Numeric mode") do |n|
44
+ opts.on("-n N", Integer, "Numeric type") do |n|
45
45
  options[:numeric_type] = n
46
46
  end
47
47
 
@@ -1,3 +1,4 @@
1
+ require 'csv'
1
2
  require 'libsvm_preprocessor/tokenizer'
2
3
  require 'libsvm_preprocessor/token_map'
3
4
  require 'libsvm_preprocessor/feature_generator'
@@ -45,11 +46,14 @@ class Preprocessor
45
46
 
46
47
  def initialize(options = {})
47
48
  if options[:numeric_type]
48
- options = override_options(options)
49
+ new_options = override_options(options)
50
+ @options = new_options.merge(output: options[:output])
51
+ else
52
+ @options = options
49
53
  end
50
- @options = options
51
- @tokenizer = Tokenizer.new(options)
52
- @generator = FeatureGenerator.new(options)
54
+
55
+ @tokenizer = Tokenizer.new(@options)
56
+ @generator = FeatureGenerator.new(@options)
53
57
 
54
58
  @non_zero_features = {}
55
59
  @non_zero_features[:testing] = 0
@@ -83,8 +87,13 @@ class Preprocessor
83
87
  def toSVM(vector)
84
88
  # the following line is made to have clean diff with libshorttext
85
89
  return "#{vector.first} " if vector.last.empty?
86
- features = vector.last
87
- .map {|h| "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
90
+ features = vector.last.map {|h| "#{h.keys.first}:1"}.join(" ")
91
+
92
+ # With this lines it takes into account features (remove the
93
+ # similar one above)
94
+
95
+ #.map {|h|
96
+ # "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
88
97
  "#{vector.first} #{features}"
89
98
  end
90
99
 
@@ -95,9 +104,9 @@ class Preprocessor
95
104
  return "#{v[0]} "
96
105
  end
97
106
 
98
- def use(input_path, testing: false)
99
- if @options[:output]
100
- output_file = File.open(@options.output, "w")
107
+ def use(input_path, output_file=nil, testing: false)
108
+ if output_file
109
+ output_file = File.open(output_file, "w")
101
110
  CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
102
111
  output_file.puts toSVM( push(row, testing: testing) )
103
112
  end
@@ -1,3 +1,3 @@
1
1
  module LibsvmPreprocessor
2
- VERSION = '0.1'
2
+ VERSION = '0.2'
3
3
  end
@@ -25,10 +25,10 @@ describe Preprocessor do
25
25
  expect(v).to eq([0, [{1 => 3}]])
26
26
  end
27
27
 
28
- it "produce svm format" do
28
+ it "produce svm format (without frequency)" do
29
29
  v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
30
30
  result = preproc.toSVM(v)
31
- expect(result).to eq("0 1:3")
31
+ expect(result).to eq("0 1:1")
32
32
  end
33
33
  end
34
34
 
@@ -84,7 +84,7 @@ describe Preprocessor do
84
84
  it "produce svm format" do
85
85
  v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
86
86
  result = preproc.toSVM(v)
87
- expect(result).to eq("0 1:3 2:2")
87
+ expect(result).to eq("0 1:1 2:1")
88
88
  end
89
89
  end
90
90
 
metadata CHANGED
@@ -1,41 +1,41 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libsvm_preprocessor
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrea Nodari
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-05-31 00:00:00.000000000 Z
11
+ date: 2013-09-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: stopwords-filter
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: 0.2.1
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: 0.2.1
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: ruby-stemmer
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
33
  version: 0.9.3
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: 0.9.3
41
41
  description: |2
@@ -72,17 +72,17 @@ require_paths:
72
72
  - lib
73
73
  required_ruby_version: !ruby/object:Gem::Requirement
74
74
  requirements:
75
- - - ">="
75
+ - - '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
78
  required_rubygems_version: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ">="
80
+ - - '>='
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  requirements: []
84
84
  rubyforge_project:
85
- rubygems_version: 2.0.0.preview3.1
85
+ rubygems_version: 2.0.3
86
86
  signing_key:
87
87
  specification_version: 4
88
88
  summary: It's a text preprocessor that generate a libsvm input file