RubyGems - libsvm_preprocessor - Versions diffs - 0.1 → 0.2 - Mend

libsvm_preprocessor 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml +6 -6
data/README.md +35 -0
data/lib/libsvm_preprocessor/cli.rb +1 -1
data/lib/libsvm_preprocessor/preprocessor.rb +18 -9
data/lib/libsvm_preprocessor/version.rb +1 -1
data/spec/preprocessor_spec.rb +3 -3
metadata +9 -9

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-!binary "U0hBMQ==":
-  metadata.gz: ec1de3e3e31391a33e628f4dc4c6ace1b9c96cd3
-  data.tar.gz: e897e7149c5ace324fba715154402da3efc075d9
-!binary "U0hBNTEy":
-  metadata.gz: 122a7ad95b42b0b2429aa69d8cac690c90800f870f851ac56a665c69e2ba933cb770b484f45ada754f4a7336a1e2e28c98ce8407f27ab7a98797aca8b8562613
-  data.tar.gz: f725942158aab1a7d8a34105ccad1639664e961f99739f5a05bbe67da1089ef4427200721f0ba527d1aa7fff2b3c52b158b95e2ccbaa84d4bfac64cc847879c4
+SHA1:
+  metadata.gz: 1ec656ea774c188eb2bbb366c91e7b83761de7a1
+  data.tar.gz: 30839b898cb8e8391c3f2756f5f76c4700bd542c
+SHA512:
+  metadata.gz: 845c2a7f93cea7c62f34503aa2e028f9ab39bdafcd17aca8c65569c0d904990cb6b188501eb04fb455e7d2f4bce26e05f9d3fd8b793d509431e02960dc35757c
+  data.tar.gz: 8d4a5e748d33098f4ac8c01004e7be422d912a582d8323ccdd74ecc8083bd6c19fd5f81823b9f5867c4fe2e1510dd77ac3c931e383752ba367764c7da78ccaec

data/README.md CHANGED Viewed

@@ -0,0 +1,35 @@
+# Libsvm_preprocessor
+This project is a simple Ruby gem that provides a way to transform text into a sparse feature vector in the libsvm/liblinear format (<http://www.csie.ntu.edu.tw/~cjlin/libsvm>).
+Since this tool is intended for short texts, it provides only binary representations of tokens.
+## Usage
+```
+	% libsvm_pp --help
+	libsvm_pp [options] <filename>
+  	 -m, --mode [TYPE]                Select unigram (default)/bigram/trigrams
+     -s, --stemming                   Use this you want stemming
+     -w, --remove-stopwords           Use this if you want remove stopwords
+     -t, --testing                    Use this to use testing mode
+     -l, --language [TYPE]            Select your language it / en
+     -n N                             Numeric type
+	 -o [output]                      output file
+```
+It is possible to use the library following these steps:
+```
+require "libsvm_preprocessor/preprocessor"
+[…]
+preprocessor = Preprocessor.new(numeric_type: i)
+preprocessor.use("TRAIN.csv", "TRAIN.svm")
+preprocessor.use("TEST.csv", "TEST.svm", testing: true)
+```
+In this case TRAIN.svm will contain your training set and TEST.svm will contain the testing set.
+This project is far from complete; I will provide better documentation as soon as possible.

data/lib/libsvm_preprocessor/cli.rb CHANGED Viewed

@@ -41,7 +41,7 @@ class CLI
         options[:lang] = l
       end
-      opts.on("-n N", Integer, "Numeric mode") do |n|
+      opts.on("-n N", Integer, "Numeric type") do |n|
         options[:numeric_type] = n
       end

data/lib/libsvm_preprocessor/preprocessor.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+require 'csv'
 require 'libsvm_preprocessor/tokenizer'
 require 'libsvm_preprocessor/token_map'
 require 'libsvm_preprocessor/feature_generator'
@@ -45,11 +46,14 @@ class Preprocessor
   def initialize(options = {})
     if options[:numeric_type]
-      options = override_options(options)
+      new_options = override_options(options)
+      @options = new_options.merge(output: options[:output])
+    else
+      @options = options
     end
-    @options = options
-    @tokenizer  = Tokenizer.new(options)
-    @generator  = FeatureGenerator.new(options)
+    @tokenizer  = Tokenizer.new(@options)
+    @generator  = FeatureGenerator.new(@options)
     @non_zero_features = {}
     @non_zero_features[:testing]  = 0
@@ -83,8 +87,13 @@ class Preprocessor
   def toSVM(vector)
     # the following line is made to have clean diff with libshorttext
     return "#{vector.first} " if vector.last.empty?
-    features = vector.last
-      .map {|h| "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
+    features = vector.last.map {|h| "#{h.keys.first}:1"}.join(" ")
+    # With this lines it takes into account features (remove the
+    #      similar one above)
+    #.map {|h|
+    #      "#{h.keys.first}:#{h[h.keys.first]}"}.join(" ")
     "#{vector.first}  #{features}"
   end
@@ -95,9 +104,9 @@ class Preprocessor
     return "#{v[0]} "
   end
-  def use(input_path, testing: false)
-    if @options[:output]
-      output_file = File.open(@options.output, "w")
+  def use(input_path, output_file=nil, testing: false)
+    if output_file
+      output_file = File.open(output_file, "w")
       CSV.foreach(input_path, ::LibsvmPreprocessor::CSV_OPTIONS) do |row|
         output_file.puts toSVM( push(row, testing: testing) )
       end

data/lib/libsvm_preprocessor/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module LibsvmPreprocessor
-  VERSION = '0.1'
+  VERSION = '0.2'
 end

data/spec/preprocessor_spec.rb CHANGED Viewed

@@ -25,10 +25,10 @@ describe Preprocessor do
         expect(v).to eq([0, [{1 => 3}]])
       end
-      it "produce svm format" do
+      it "produce svm format (without frequency)" do
         v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
         result = preproc.toSVM(v)
-        expect(result).to eq("0  1:3")
+        expect(result).to eq("0  1:1")
       end
     end
@@ -84,7 +84,7 @@ describe Preprocessor do
       it "produce svm format" do
         v = (preproc.push ["category", "bottiglia bottiglia bottiglia"])
         result = preproc.toSVM(v)
-        expect(result).to eq("0  1:3 2:2")
+        expect(result).to eq("0  1:1 2:1")
       end
     end

metadata CHANGED Viewed

@@ -1,41 +1,41 @@
 --- !ruby/object:Gem::Specification
 name: libsvm_preprocessor
 version: !ruby/object:Gem::Version
-  version: '0.1'
+  version: '0.2'
 platform: ruby
 authors:
 - Andrea Nodari
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-05-31 00:00:00.000000000 Z
+date: 2013-09-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: stopwords-filter
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
         version: 0.2.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
         version: 0.2.1
 - !ruby/object:Gem::Dependency
   name: ruby-stemmer
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
         version: 0.9.3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
         version: 0.9.3
 description: |2
@@ -72,17 +72,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.0.preview3.1
+rubygems_version: 2.0.3
 signing_key:
 specification_version: 4
 summary: It's a text preprocessor that generate a libsvm input file