RubyGems - twss - Versions diffs - 0.0.3 → 0.0.4 - Mend

twss 0.0.3 → 0.0.4

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (14)

data/lib/twss/engine.rb CHANGED

@@ -17,13 +17,14 @@ module TWSS
     def initialize(options = {})
       @data_file = options[:data_file] || DATA_FILE
-      @threshold ||= options[:threshold] || 5.0
+      @threshold ||= options[:threshold] || 7.5
       @classifier = load_classifier_from_file!(@data_file) || new_classifier
     end
     def classify(str)
       if basic_conditions_met?(str)
         c = @classifier.classifications(str)
+        require 'pp'
         c[TRUE] - c[FALSE] > threshold
       else
         false

data/lib/twss/trainer.rb CHANGED

@@ -4,65 +4,108 @@ module TWSS
   class Trainer
-    attr_reader :engine
+    attr_reader :engine, :training_percentage
     def initialize(engine, options = {})
       @engine = engine
       engine.clear_state!
-      @training_set_size = options[:training_set_size] || 100
+      @training_percentage = options[:training_percentage] || 0.9
     end
     def train
       path = File.join(File.dirname(__FILE__), '../../data/')
+      run_training(path)
+      puts "Writing to file..."
+      engine.dump_classifier_to_file
+      run_tests(path)
+    end
+    def total_documents(file)
+      t = 0
+      File.read(file).each_line do |l|
+        t += 1
+      end
+      t
+    end
+    def run_training(path)
+      positive_file = File.join(path, 'twss.txt')
+      negative_file = File.join(path, 'non_twss.txt')
       puts "Clearing state..."
       engine.clear_state!
       puts "Training NON-TWSS strings..."
-      File.read(File.join(path, 'non_twss.txt')).each_line do |l|
-        engine.train(TWSS::Engine::FALSE, strip_tweet(l))
+      File.read(negative_file).each_line do |l|
+        print '.'
+        $stdout.flush
+        engine.train(TWSS::Engine::FALSE, l)
       end
+      puts
       puts "Training TWSS strings..."
-      File.read(File.join(path, 'twss.txt')).each_line do |l|
-        engine.train(TWSS::Engine::TRUE, strip_tweet(l))
+      File.read(positive_file).each_line do |l|
+        print '.'
+        $stdout.flush
+        engine.train(TWSS::Engine::TRUE, l)
       end
-      puts "Writing to file..."
-      engine.dump_classifier_to_file
-      puts "Done."
-      puts
-      run_examples
+      puts
     end
+    def run_tests(path)
+      positive_test_file = File.join(path, 'test_twss.txt')
+      negative_test_file = File.join(path, 'test_non_twss.txt')
+      total_positive = total_documents(positive_test_file)
+      total_negative = total_documents(negative_test_file)
+      false_negatives = 0
+      false_positives = 0
+      total = 0
+      correct = 0
+      test_each(positive_test_file, (total_positive * training_percentage).to_i) do |line, result|
+        if result
+          correct += 1
+        else
+          false_negatives += 1
+        end
+        total += 1
+      end
-    # A little cleanup of the text before we train on it.
-    def strip_tweet(text)
-      t = text.gsub(/[\@\#]\w+\b/i, '') # strip mentions and hashtags
-      t.gsub!(/(RT|OH)\W/i, '') # strip RT's and OH's
-      t.gsub!(/twss/i, '') # strip out twss itself
-      t.gsub!(/http:\/\/[A-Za-z0-9\.\/]+/, '') # URLs
-      t.gsub!(/[\W\d]/, ' ') # now all non word chars and numbers
-      t.strip!
-      t
+      test_each(negative_test_file, (total_negative * training_percentage).to_i) do |line, result|
+        if !result
+          correct += 1
+        else
+          false_positives += 1
+        end
+        total += 1
+      end
+      puts
+      puts "Test set size: #{total}"
+      puts "Overall accuracy: #{100 * correct / total.to_f}%"
+      puts "False positives: #{false_positives} (#{100 * false_positives / total_negative.to_f}%)"
+      puts "False negatives: #{false_negatives} (#{100 * false_negatives / total_positive.to_f}%)"
+      puts
     end
-    def run_examples
-      ["how big is that thing going to get?",
-       "umm... that's the not the right hole",
-       "did you resolve the ticket?",
-       "did you fix the bug?",
-       "you're going to need to go faster",
-       "I'm almost there, keep going",
-       "Ok, send me a pull request",
-       "The president issued a decree",
-       "I don't get it, this isn't working correctly",
-       "finished specialties in the warehouse"].each do |s|
-         puts '"' + s + '" => ' + TWSS(s).to_s
-       end
+    def test_each(file, sample_size, &blk)
+      i = 0
+      File.read(file).each_line do |line|
+        return if i > sample_size
+        l = line.strip
+        unless l.empty?
+          r = TWSS(l)
+          puts l + ' => ' + r.to_s
+          blk.call(l, r)
+          i += 1
+        end
+      end
     end
   end
 end

data/script/collect_non_twss.rb CHANGED

@@ -1,4 +1,21 @@
-require File.expand_path('../lib/twss', File.dirname(__FILE__))
-require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
+require 'rubygems'
+require 'open-uri'
+require 'hpricot'
-TWSS::TweetCollector.new(':)', File.join(File.dirname(__FILE__), '../data/non_twss.txt')).run
+f = File.open(File.expand_path("../../data/non_twss.txt", __FILE__), "w")
+domain = "http://www.fmylife.com"
+200.times do |i|
+  url = domain + "/intimacy?page=#{i}"
+  puts url
+  body = open(url).read
+  doc = Hpricot(body)
+  doc.search('div.post p a.fmllink') do |story|
+    f.puts story.to_plain_text
+  end
+  f.flush
+  sleep rand * 3.0
+end
+f.close

data/script/collect_twss.rb CHANGED

@@ -1,4 +1,24 @@
-require File.expand_path('../lib/twss', File.dirname(__FILE__))
-require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
+require 'rubygems'
+require 'open-uri'
+require 'hpricot'
-TWSS::TweetCollector.new('#twss', File.join(File.dirname(__FILE__), '../data/twss.txt')).run
+# Grab the first 2000 stories from twssstories.com (10 per page)
+f = File.open(File.expand_path("../../data/twss.txt", __FILE__), "w")
+domain = "http://twssstories.com"
+200.times do |i|
+  url = domain + "/node?page=#{i}"
+  puts url
+  doc = Hpricot(open(url).read)
+  doc.search('div.content p') do |story|
+    # now pull out the good stuff...
+    if story.to_plain_text =~ /\"(.*)?\"/
+      f.puts $1
+    end
+  end
+  f.flush
+  sleep rand * 3.0
+end
+f.close

data/script/train.rb CHANGED

@@ -1,6 +1,6 @@
-require File.join(File.dirname(__FILE__), '../lib/twss')
-require File.join(File.dirname(__FILE__), '../lib/twss/trainer')
+require File.expand_path('../../lib/twss', __FILE__)
+require File.expand_path('../../lib/twss/trainer', __FILE__)
 engine = TWSS::Engine.new
 trainer = TWSS::Trainer.new(engine)

data/twss.gemspec CHANGED

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name = "twss"
-  s.version = "0.0.3"
+  s.version = "0.0.4"
   s.platform    = Gem::Platform::RUBY
   s.authors = ["Ben VandenBos"]
   s.email = "bvandenbos@gmail.com"
@@ -17,8 +17,7 @@ Gem::Specification.new do |s|
   s.executables  = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
   s.require_path = 'lib'
-  s.add_runtime_dependency("classifier", [">= 1.3.1"])
+  s.add_runtime_dependency("classifier", ["1.3.1"])
   s.add_development_dependency("twitter", [">= 0"])
 end

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 0
-  - 3
-  version: 0.0.3
+  - 4
+  version: 0.0.4
 platform: ruby
 authors:
 - Ben VandenBos
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-04-29 00:00:00 -07:00
+date: 2011-05-03 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -38,7 +38,7 @@ dependencies:
   requirement: &id002 !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - - ">="
+    - - "="
       - !ruby/object:Gem::Version
         segments:
         - 1
@@ -79,11 +79,12 @@ files:
 - Rakefile
 - data/classifier
 - data/non_twss.txt
+- data/test_non_twss.txt
+- data/test_twss.txt
 - data/twss.txt
 - lib/twss.rb
 - lib/twss/engine.rb
 - lib/twss/trainer.rb
-- lib/twss/tweet_collector.rb
 - script/collect_non_twss.rb
 - script/collect_twss.rb
 - script/train.rb

data/lib/twss/tweet_collector.rb DELETED

@@ -1,29 +0,0 @@
-require 'twitter'
-module TWSS
-  class TweetCollector
-    attr_reader :search, :filename, :limit
-    def initialize(search, filename, limit = 1500)
-      @search, @filename, @limit = search, filename, limit
-    end
-    def run
-      o = File.open(filename, 'a')
-      page, per_page = 1, 100
-      begin
-        Twitter::Search.new.containing(search).per_page(per_page).page(page).each do |tweet|
-          puts tweet.text
-          o.puts tweet.text
-        end
-        page += 1
-        sleep 2
-      end while page * per_page < limit
-      o.close
-    end
-  end
-end