twss 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,13 +17,14 @@ module TWSS
17
17
 
18
18
  def initialize(options = {})
19
19
  @data_file = options[:data_file] || DATA_FILE
20
- @threshold ||= options[:threshold] || 5.0
20
+ @threshold ||= options[:threshold] || 7.5
21
21
  @classifier = load_classifier_from_file!(@data_file) || new_classifier
22
22
  end
23
23
 
24
24
  def classify(str)
25
25
  if basic_conditions_met?(str)
26
26
  c = @classifier.classifications(str)
27
+ require 'pp'
27
28
  c[TRUE] - c[FALSE] > threshold
28
29
  else
29
30
  false
@@ -4,65 +4,108 @@ module TWSS
4
4
 
5
5
  class Trainer
6
6
 
7
- attr_reader :engine
7
+ attr_reader :engine, :training_percentage
8
8
 
9
9
  def initialize(engine, options = {})
10
10
  @engine = engine
11
11
  engine.clear_state!
12
- @training_set_size = options[:training_set_size] || 100
12
+ @training_percentage = options[:training_percentage] || 0.9
13
13
  end
14
14
 
15
15
  def train
16
16
  path = File.join(File.dirname(__FILE__), '../../data/')
17
17
 
18
+ run_training(path)
19
+
20
+ puts "Writing to file..."
21
+ engine.dump_classifier_to_file
22
+
23
+ run_tests(path)
24
+ end
25
+
26
+ def total_documents(file)
27
+ t = 0
28
+ File.read(file).each_line do |l|
29
+ t += 1
30
+ end
31
+ t
32
+ end
33
+
34
+ def run_training(path)
35
+ positive_file = File.join(path, 'twss.txt')
36
+ negative_file = File.join(path, 'non_twss.txt')
37
+
18
38
  puts "Clearing state..."
19
39
  engine.clear_state!
20
40
 
21
41
  puts "Training NON-TWSS strings..."
22
- File.read(File.join(path, 'non_twss.txt')).each_line do |l|
23
- engine.train(TWSS::Engine::FALSE, strip_tweet(l))
42
+ File.read(negative_file).each_line do |l|
43
+ print '.'
44
+ $stdout.flush
45
+ engine.train(TWSS::Engine::FALSE, l)
24
46
  end
47
+ puts
25
48
 
26
49
  puts "Training TWSS strings..."
27
- File.read(File.join(path, 'twss.txt')).each_line do |l|
28
- engine.train(TWSS::Engine::TRUE, strip_tweet(l))
50
+ File.read(positive_file).each_line do |l|
51
+ print '.'
52
+ $stdout.flush
53
+ engine.train(TWSS::Engine::TRUE, l)
29
54
  end
30
-
31
- puts "Writing to file..."
32
- engine.dump_classifier_to_file
33
-
34
- puts "Done."
35
- puts
36
-
37
- run_examples
55
+ puts
38
56
  end
57
+
58
+ def run_tests(path)
59
+ positive_test_file = File.join(path, 'test_twss.txt')
60
+ negative_test_file = File.join(path, 'test_non_twss.txt')
61
+
62
+ total_positive = total_documents(positive_test_file)
63
+ total_negative = total_documents(negative_test_file)
64
+
65
+ false_negatives = 0
66
+ false_positives = 0
67
+ total = 0
68
+ correct = 0
69
+ test_each(positive_test_file, (total_positive * training_percentage).to_i) do |line, result|
70
+ if result
71
+ correct += 1
72
+ else
73
+ false_negatives += 1
74
+ end
75
+ total += 1
76
+ end
39
77
 
40
- # A little cleanup of the text before we train on it.
41
- def strip_tweet(text)
42
- t = text.gsub(/[\@\#]\w+\b/i, '') # strip mentions and hashtags
43
- t.gsub!(/(RT|OH)\W/i, '') # strip RT's and OH's
44
- t.gsub!(/twss/i, '') # strip out twss itself
45
- t.gsub!(/http:\/\/[A-Za-z0-9\.\/]+/, '') # URLs
46
- t.gsub!(/[\W\d]/, ' ') # now all non word chars and numbers
47
- t.strip!
48
- t
78
+ test_each(negative_test_file, (total_negative * training_percentage).to_i) do |line, result|
79
+ if !result
80
+ correct += 1
81
+ else
82
+ false_positives += 1
83
+ end
84
+ total += 1
85
+ end
86
+
87
+ puts
88
+ puts "Test set size: #{total}"
89
+ puts "Overall accuracy: #{100 * correct / total.to_f}%"
90
+ puts "False positives: #{false_positives} (#{100 * false_positives / total_negative.to_f}%)"
91
+ puts "False negatives: #{false_negatives} (#{100 * false_negatives / total_positive.to_f}%)"
92
+ puts
49
93
  end
50
-
51
- def run_examples
52
- ["how big is that thing going to get?",
53
- "umm... that's the not the right hole",
54
- "did you resolve the ticket?",
55
- "did you fix the bug?",
56
- "you're going to need to go faster",
57
- "I'm almost there, keep going",
58
- "Ok, send me a pull request",
59
- "The president issued a decree",
60
- "I don't get it, this isn't working correctly",
61
- "finished specialties in the warehouse"].each do |s|
62
- puts '"' + s + '" => ' + TWSS(s).to_s
63
- end
94
+
95
+ def test_each(file, sample_size, &blk)
96
+ i = 0
97
+ File.read(file).each_line do |line|
98
+ return if i > sample_size
99
+ l = line.strip
100
+ unless l.empty?
101
+ r = TWSS(l)
102
+ puts l + ' => ' + r.to_s
103
+ blk.call(l, r)
104
+ i += 1
105
+ end
106
+ end
64
107
  end
65
-
108
+
66
109
  end
67
110
 
68
111
  end
@@ -1,4 +1,21 @@
1
- require File.expand_path('../lib/twss', File.dirname(__FILE__))
2
- require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'hpricot'
3
4
 
4
- TWSS::TweetCollector.new(':)', File.join(File.dirname(__FILE__), '../data/non_twss.txt')).run
5
+ f = File.open(File.expand_path("../../data/non_twss.txt", __FILE__), "w")
6
+
7
+ domain = "http://www.fmylife.com"
8
+
9
+ 200.times do |i|
10
+ url = domain + "/intimacy?page=#{i}"
11
+ puts url
12
+ body = open(url).read
13
+ doc = Hpricot(body)
14
+ doc.search('div.post p a.fmllink') do |story|
15
+ f.puts story.to_plain_text
16
+ end
17
+ f.flush
18
+ sleep rand * 3.0
19
+ end
20
+
21
+ f.close
@@ -1,4 +1,24 @@
1
- require File.expand_path('../lib/twss', File.dirname(__FILE__))
2
- require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'hpricot'
3
4
 
4
- TWSS::TweetCollector.new('#twss', File.join(File.dirname(__FILE__), '../data/twss.txt')).run
5
+ # Grab the first 2000 stories from twssstories.com (10 per page)
6
+
7
+ f = File.open(File.expand_path("../../data/twss.txt", __FILE__), "w")
8
+
9
+ domain = "http://twssstories.com"
10
+ 200.times do |i|
11
+ url = domain + "/node?page=#{i}"
12
+ puts url
13
+ doc = Hpricot(open(url).read)
14
+ doc.search('div.content p') do |story|
15
+ # now pull out the good stuff...
16
+ if story.to_plain_text =~ /\"(.*)?\"/
17
+ f.puts $1
18
+ end
19
+ end
20
+ f.flush
21
+ sleep rand * 3.0
22
+ end
23
+
24
+ f.close
@@ -1,6 +1,6 @@
1
1
 
2
- require File.join(File.dirname(__FILE__), '../lib/twss')
3
- require File.join(File.dirname(__FILE__), '../lib/twss/trainer')
2
+ require File.expand_path('../../lib/twss', __FILE__)
3
+ require File.expand_path('../../lib/twss/trainer', __FILE__)
4
4
 
5
5
  engine = TWSS::Engine.new
6
6
  trainer = TWSS::Trainer.new(engine)
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "twss"
3
- s.version = "0.0.3"
3
+ s.version = "0.0.4"
4
4
  s.platform = Gem::Platform::RUBY
5
5
  s.authors = ["Ben VandenBos"]
6
6
  s.email = "bvandenbos@gmail.com"
@@ -17,8 +17,7 @@ Gem::Specification.new do |s|
17
17
  s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
18
18
  s.require_path = 'lib'
19
19
 
20
- s.add_runtime_dependency("classifier", [">= 1.3.1"])
20
+ s.add_runtime_dependency("classifier", ["1.3.1"])
21
21
  s.add_development_dependency("twitter", [">= 0"])
22
22
 
23
23
  end
24
-
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 3
9
- version: 0.0.3
8
+ - 4
9
+ version: 0.0.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Ben VandenBos
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-04-29 00:00:00 -07:00
17
+ date: 2011-05-03 00:00:00 -07:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -38,7 +38,7 @@ dependencies:
38
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
- - - ">="
41
+ - - "="
42
42
  - !ruby/object:Gem::Version
43
43
  segments:
44
44
  - 1
@@ -79,11 +79,12 @@ files:
79
79
  - Rakefile
80
80
  - data/classifier
81
81
  - data/non_twss.txt
82
+ - data/test_non_twss.txt
83
+ - data/test_twss.txt
82
84
  - data/twss.txt
83
85
  - lib/twss.rb
84
86
  - lib/twss/engine.rb
85
87
  - lib/twss/trainer.rb
86
- - lib/twss/tweet_collector.rb
87
88
  - script/collect_non_twss.rb
88
89
  - script/collect_twss.rb
89
90
  - script/train.rb
@@ -1,29 +0,0 @@
1
- require 'twitter'
2
-
3
- module TWSS
4
-
5
- class TweetCollector
6
-
7
- attr_reader :search, :filename, :limit
8
-
9
- def initialize(search, filename, limit = 1500)
10
- @search, @filename, @limit = search, filename, limit
11
- end
12
-
13
- def run
14
- o = File.open(filename, 'a')
15
- page, per_page = 1, 100
16
- begin
17
- Twitter::Search.new.containing(search).per_page(per_page).page(page).each do |tweet|
18
- puts tweet.text
19
- o.puts tweet.text
20
- end
21
- page += 1
22
- sleep 2
23
- end while page * per_page < limit
24
- o.close
25
- end
26
-
27
- end
28
-
29
- end