twss 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -17,13 +17,14 @@ module TWSS
17
17
 
18
18
  def initialize(options = {})
19
19
  @data_file = options[:data_file] || DATA_FILE
20
- @threshold ||= options[:threshold] || 5.0
20
+ @threshold ||= options[:threshold] || 7.5
21
21
  @classifier = load_classifier_from_file!(@data_file) || new_classifier
22
22
  end
23
23
 
24
24
  def classify(str)
25
25
  if basic_conditions_met?(str)
26
26
  c = @classifier.classifications(str)
27
+ require 'pp'
27
28
  c[TRUE] - c[FALSE] > threshold
28
29
  else
29
30
  false
@@ -4,65 +4,108 @@ module TWSS
4
4
 
5
5
  class Trainer
6
6
 
7
- attr_reader :engine
7
+ attr_reader :engine, :training_percentage
8
8
 
9
9
  def initialize(engine, options = {})
10
10
  @engine = engine
11
11
  engine.clear_state!
12
- @training_set_size = options[:training_set_size] || 100
12
+ @training_percentage = options[:training_percentage] || 0.9
13
13
  end
14
14
 
15
15
  def train
16
16
  path = File.join(File.dirname(__FILE__), '../../data/')
17
17
 
18
+ run_training(path)
19
+
20
+ puts "Writing to file..."
21
+ engine.dump_classifier_to_file
22
+
23
+ run_tests(path)
24
+ end
25
+
26
+ def total_documents(file)
27
+ t = 0
28
+ File.read(file).each_line do |l|
29
+ t += 1
30
+ end
31
+ t
32
+ end
33
+
34
+ def run_training(path)
35
+ positive_file = File.join(path, 'twss.txt')
36
+ negative_file = File.join(path, 'non_twss.txt')
37
+
18
38
  puts "Clearing state..."
19
39
  engine.clear_state!
20
40
 
21
41
  puts "Training NON-TWSS strings..."
22
- File.read(File.join(path, 'non_twss.txt')).each_line do |l|
23
- engine.train(TWSS::Engine::FALSE, strip_tweet(l))
42
+ File.read(negative_file).each_line do |l|
43
+ print '.'
44
+ $stdout.flush
45
+ engine.train(TWSS::Engine::FALSE, l)
24
46
  end
47
+ puts
25
48
 
26
49
  puts "Training TWSS strings..."
27
- File.read(File.join(path, 'twss.txt')).each_line do |l|
28
- engine.train(TWSS::Engine::TRUE, strip_tweet(l))
50
+ File.read(positive_file).each_line do |l|
51
+ print '.'
52
+ $stdout.flush
53
+ engine.train(TWSS::Engine::TRUE, l)
29
54
  end
30
-
31
- puts "Writing to file..."
32
- engine.dump_classifier_to_file
33
-
34
- puts "Done."
35
- puts
36
-
37
- run_examples
55
+ puts
38
56
  end
57
+
58
+ def run_tests(path)
59
+ positive_test_file = File.join(path, 'test_twss.txt')
60
+ negative_test_file = File.join(path, 'test_non_twss.txt')
61
+
62
+ total_positive = total_documents(positive_test_file)
63
+ total_negative = total_documents(negative_test_file)
64
+
65
+ false_negatives = 0
66
+ false_positives = 0
67
+ total = 0
68
+ correct = 0
69
+ test_each(positive_test_file, (total_positive * training_percentage).to_i) do |line, result|
70
+ if result
71
+ correct += 1
72
+ else
73
+ false_negatives += 1
74
+ end
75
+ total += 1
76
+ end
39
77
 
40
- # A little cleanup of the text before we train on it.
41
- def strip_tweet(text)
42
- t = text.gsub(/[\@\#]\w+\b/i, '') # strip mentions and hashtags
43
- t.gsub!(/(RT|OH)\W/i, '') # strip RT's and OH's
44
- t.gsub!(/twss/i, '') # strip out twss itself
45
- t.gsub!(/http:\/\/[A-Za-z0-9\.\/]+/, '') # URLs
46
- t.gsub!(/[\W\d]/, ' ') # now all non word chars and numbers
47
- t.strip!
48
- t
78
+ test_each(negative_test_file, (total_negative * training_percentage).to_i) do |line, result|
79
+ if !result
80
+ correct += 1
81
+ else
82
+ false_positives += 1
83
+ end
84
+ total += 1
85
+ end
86
+
87
+ puts
88
+ puts "Test set size: #{total}"
89
+ puts "Overall accuracy: #{100 * correct / total.to_f}%"
90
+ puts "False positives: #{false_positives} (#{100 * false_positives / total_negative.to_f}%)"
91
+ puts "False negatives: #{false_negatives} (#{100 * false_negatives / total_positive.to_f}%)"
92
+ puts
49
93
  end
50
-
51
- def run_examples
52
- ["how big is that thing going to get?",
53
- "umm... that's the not the right hole",
54
- "did you resolve the ticket?",
55
- "did you fix the bug?",
56
- "you're going to need to go faster",
57
- "I'm almost there, keep going",
58
- "Ok, send me a pull request",
59
- "The president issued a decree",
60
- "I don't get it, this isn't working correctly",
61
- "finished specialties in the warehouse"].each do |s|
62
- puts '"' + s + '" => ' + TWSS(s).to_s
63
- end
94
+
95
+ def test_each(file, sample_size, &blk)
96
+ i = 0
97
+ File.read(file).each_line do |line|
98
+ return if i > sample_size
99
+ l = line.strip
100
+ unless l.empty?
101
+ r = TWSS(l)
102
+ puts l + ' => ' + r.to_s
103
+ blk.call(l, r)
104
+ i += 1
105
+ end
106
+ end
64
107
  end
65
-
108
+
66
109
  end
67
110
 
68
111
  end
@@ -1,4 +1,21 @@
1
- require File.expand_path('../lib/twss', File.dirname(__FILE__))
2
- require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'hpricot'
3
4
 
4
- TWSS::TweetCollector.new(':)', File.join(File.dirname(__FILE__), '../data/non_twss.txt')).run
5
+ f = File.open(File.expand_path("../../data/non_twss.txt", __FILE__), "w")
6
+
7
+ domain = "http://www.fmylife.com"
8
+
9
+ 200.times do |i|
10
+ url = domain + "/intimacy?page=#{i}"
11
+ puts url
12
+ body = open(url).read
13
+ doc = Hpricot(body)
14
+ doc.search('div.post p a.fmllink') do |story|
15
+ f.puts story.to_plain_text
16
+ end
17
+ f.flush
18
+ sleep rand * 3.0
19
+ end
20
+
21
+ f.close
@@ -1,4 +1,24 @@
1
- require File.expand_path('../lib/twss', File.dirname(__FILE__))
2
- require File.expand_path('../lib/twss/tweet_collector', File.dirname(__FILE__))
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'hpricot'
3
4
 
4
- TWSS::TweetCollector.new('#twss', File.join(File.dirname(__FILE__), '../data/twss.txt')).run
5
+ # Grab the first 2000 stories from twssstories.com (10 per page)
6
+
7
+ f = File.open(File.expand_path("../../data/twss.txt", __FILE__), "w")
8
+
9
+ domain = "http://twssstories.com"
10
+ 200.times do |i|
11
+ url = domain + "/node?page=#{i}"
12
+ puts url
13
+ doc = Hpricot(open(url).read)
14
+ doc.search('div.content p') do |story|
15
+ # now pull out the good stuff...
16
+ if story.to_plain_text =~ /\"(.*)?\"/
17
+ f.puts $1
18
+ end
19
+ end
20
+ f.flush
21
+ sleep rand * 3.0
22
+ end
23
+
24
+ f.close
@@ -1,6 +1,6 @@
1
1
 
2
- require File.join(File.dirname(__FILE__), '../lib/twss')
3
- require File.join(File.dirname(__FILE__), '../lib/twss/trainer')
2
+ require File.expand_path('../../lib/twss', __FILE__)
3
+ require File.expand_path('../../lib/twss/trainer', __FILE__)
4
4
 
5
5
  engine = TWSS::Engine.new
6
6
  trainer = TWSS::Trainer.new(engine)
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "twss"
3
- s.version = "0.0.3"
3
+ s.version = "0.0.4"
4
4
  s.platform = Gem::Platform::RUBY
5
5
  s.authors = ["Ben VandenBos"]
6
6
  s.email = "bvandenbos@gmail.com"
@@ -17,8 +17,7 @@ Gem::Specification.new do |s|
17
17
  s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
18
18
  s.require_path = 'lib'
19
19
 
20
- s.add_runtime_dependency("classifier", [">= 1.3.1"])
20
+ s.add_runtime_dependency("classifier", ["1.3.1"])
21
21
  s.add_development_dependency("twitter", [">= 0"])
22
22
 
23
23
  end
24
-
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 3
9
- version: 0.0.3
8
+ - 4
9
+ version: 0.0.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Ben VandenBos
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-04-29 00:00:00 -07:00
17
+ date: 2011-05-03 00:00:00 -07:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -38,7 +38,7 @@ dependencies:
38
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
- - - ">="
41
+ - - "="
42
42
  - !ruby/object:Gem::Version
43
43
  segments:
44
44
  - 1
@@ -79,11 +79,12 @@ files:
79
79
  - Rakefile
80
80
  - data/classifier
81
81
  - data/non_twss.txt
82
+ - data/test_non_twss.txt
83
+ - data/test_twss.txt
82
84
  - data/twss.txt
83
85
  - lib/twss.rb
84
86
  - lib/twss/engine.rb
85
87
  - lib/twss/trainer.rb
86
- - lib/twss/tweet_collector.rb
87
88
  - script/collect_non_twss.rb
88
89
  - script/collect_twss.rb
89
90
  - script/train.rb
@@ -1,29 +0,0 @@
1
- require 'twitter'
2
-
3
- module TWSS
4
-
5
- class TweetCollector
6
-
7
- attr_reader :search, :filename, :limit
8
-
9
- def initialize(search, filename, limit = 1500)
10
- @search, @filename, @limit = search, filename, limit
11
- end
12
-
13
- def run
14
- o = File.open(filename, 'a')
15
- page, per_page = 1, 100
16
- begin
17
- Twitter::Search.new.containing(search).per_page(per_page).page(page).each do |tweet|
18
- puts tweet.text
19
- o.puts tweet.text
20
- end
21
- page += 1
22
- sleep 2
23
- end while page * per_page < limit
24
- o.close
25
- end
26
-
27
- end
28
-
29
- end