twss 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +7 -1
- data/data/classifier +1139 -2827
- data/data/non_twss.txt +6743 -7806
- data/data/test_non_twss.txt +200 -0
- data/data/test_twss.txt +200 -0
- data/data/twss.txt +1767 -2405
- data/lib/twss/engine.rb +2 -1
- data/lib/twss/trainer.rb +81 -38
- data/script/collect_non_twss.rb +20 -3
- data/script/collect_twss.rb +23 -3
- data/script/train.rb +2 -2
- data/twss.gemspec +2 -3
- metadata +6 -5
- data/lib/twss/tweet_collector.rb +0 -29
data/lib/twss/engine.rb
CHANGED
@@ -17,13 +17,14 @@ module TWSS
|
|
17
17
|
|
18
18
|
def initialize(options = {})
|
19
19
|
@data_file = options[:data_file] || DATA_FILE
|
20
|
-
@threshold ||= options[:threshold] || 5
|
20
|
+
@threshold ||= options[:threshold] || 7.5
|
21
21
|
@classifier = load_classifier_from_file!(@data_file) || new_classifier
|
22
22
|
end
|
23
23
|
|
24
24
|
def classify(str)
|
25
25
|
if basic_conditions_met?(str)
|
26
26
|
c = @classifier.classifications(str)
|
27
|
+
require 'pp'
|
27
28
|
c[TRUE] - c[FALSE] > threshold
|
28
29
|
else
|
29
30
|
false
|
data/lib/twss/trainer.rb
CHANGED
@@ -4,65 +4,108 @@ module TWSS
|
|
4
4
|
|
5
5
|
class Trainer
|
6
6
|
|
7
|
-
attr_reader :engine
|
7
|
+
attr_reader :engine, :training_percentage
|
8
8
|
|
9
9
|
def initialize(engine, options = {})
|
10
10
|
@engine = engine
|
11
11
|
engine.clear_state!
|
12
|
-
@
|
12
|
+
@training_percentage = options[:training_percentage] || 0.9
|
13
13
|
end
|
14
14
|
|
15
15
|
def train
|
16
16
|
path = File.join(File.dirname(__FILE__), '../../data/')
|
17
17
|
|
18
|
+
run_training(path)
|
19
|
+
|
20
|
+
puts "Writing to file..."
|
21
|
+
engine.dump_classifier_to_file
|
22
|
+
|
23
|
+
run_tests(path)
|
24
|
+
end
|
25
|
+
|
26
|
+
def total_documents(file)
|
27
|
+
t = 0
|
28
|
+
File.read(file).each_line do |l|
|
29
|
+
t += 1
|
30
|
+
end
|
31
|
+
t
|
32
|
+
end
|
33
|
+
|
34
|
+
def run_training(path)
|
35
|
+
positive_file = File.join(path, 'twss.txt')
|
36
|
+
negative_file = File.join(path, 'non_twss.txt')
|
37
|
+
|
18
38
|
puts "Clearing state..."
|
19
39
|
engine.clear_state!
|
20
40
|
|
21
41
|
puts "Training NON-TWSS strings..."
|
22
|
-
File.read(
|
23
|
-
|
42
|
+
File.read(negative_file).each_line do |l|
|
43
|
+
print '.'
|
44
|
+
$stdout.flush
|
45
|
+
engine.train(TWSS::Engine::FALSE, l)
|
24
46
|
end
|
47
|
+
puts
|
25
48
|
|
26
49
|
puts "Training TWSS strings..."
|
27
|
-
File.read(
|
28
|
-
|
50
|
+
File.read(positive_file).each_line do |l|
|
51
|
+
print '.'
|
52
|
+
$stdout.flush
|
53
|
+
engine.train(TWSS::Engine::TRUE, l)
|
29
54
|
end
|
30
|
-
|
31
|
-
puts "Writing to file..."
|
32
|
-
engine.dump_classifier_to_file
|
33
|
-
|
34
|
-
puts "Done."
|
35
|
-
puts
|
36
|
-
|
37
|
-
run_examples
|
55
|
+
puts
|
38
56
|
end
|
57
|
+
|
58
|
+
def run_tests(path)
|
59
|
+
positive_test_file = File.join(path, 'test_twss.txt')
|
60
|
+
negative_test_file = File.join(path, 'test_non_twss.txt')
|
61
|
+
|
62
|
+
total_positive = total_documents(positive_test_file)
|
63
|
+
total_negative = total_documents(negative_test_file)
|
64
|
+
|
65
|
+
false_negatives = 0
|
66
|
+
false_positives = 0
|
67
|
+
total = 0
|
68
|
+
correct = 0
|
69
|
+
test_each(positive_test_file, (total_positive * training_percentage).to_i) do |line, result|
|
70
|
+
if result
|
71
|
+
correct += 1
|
72
|
+
else
|
73
|
+
false_negatives += 1
|
74
|
+
end
|
75
|
+
total += 1
|
76
|
+
end
|
39
77
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
78
|
+
test_each(negative_test_file, (total_negative * training_percentage).to_i) do |line, result|
|
79
|
+
if !result
|
80
|
+
correct += 1
|
81
|
+
else
|
82
|
+
false_positives += 1
|
83
|
+
end
|
84
|
+
total += 1
|
85
|
+
end
|
86
|
+
|
87
|
+
puts
|
88
|
+
puts "Test set size: #{total}"
|
89
|
+
puts "Overall accuracy: #{100 * correct / total.to_f}%"
|
90
|
+
puts "False positives: #{false_positives} (#{100 * false_positives / total_negative.to_f}%)"
|
91
|
+
puts "False negatives: #{false_negatives} (#{100 * false_negatives / total_positive.to_f}%)"
|
92
|
+
puts
|
49
93
|
end
|
50
|
-
|
51
|
-
def
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
94
|
+
|
95
|
+
def test_each(file, sample_size, &blk)
|
96
|
+
i = 0
|
97
|
+
File.read(file).each_line do |line|
|
98
|
+
return if i > sample_size
|
99
|
+
l = line.strip
|
100
|
+
unless l.empty?
|
101
|
+
r = TWSS(l)
|
102
|
+
puts l + ' => ' + r.to_s
|
103
|
+
blk.call(l, r)
|
104
|
+
i += 1
|
105
|
+
end
|
106
|
+
end
|
64
107
|
end
|
65
|
-
|
108
|
+
|
66
109
|
end
|
67
110
|
|
68
111
|
end
|
data/script/collect_non_twss.rb
CHANGED
@@ -1,4 +1,21 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'rubygems'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'hpricot'
|
3
4
|
|
4
|
-
|
5
|
+
f = File.open(File.expand_path("../../data/non_twss.txt", __FILE__), "w")
|
6
|
+
|
7
|
+
domain = "http://www.fmylife.com"
|
8
|
+
|
9
|
+
200.times do |i|
|
10
|
+
url = domain + "/intimacy?page=#{i}"
|
11
|
+
puts url
|
12
|
+
body = open(url).read
|
13
|
+
doc = Hpricot(body)
|
14
|
+
doc.search('div.post p a.fmllink') do |story|
|
15
|
+
f.puts story.to_plain_text
|
16
|
+
end
|
17
|
+
f.flush
|
18
|
+
sleep rand * 3.0
|
19
|
+
end
|
20
|
+
|
21
|
+
f.close
|
data/script/collect_twss.rb
CHANGED
@@ -1,4 +1,24 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'rubygems'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'hpricot'
|
3
4
|
|
4
|
-
|
5
|
+
# Grab the first 2000 stories from twssstories.com (10 per page)
|
6
|
+
|
7
|
+
f = File.open(File.expand_path("../../data/twss.txt", __FILE__), "w")
|
8
|
+
|
9
|
+
domain = "http://twssstories.com"
|
10
|
+
200.times do |i|
|
11
|
+
url = domain + "/node?page=#{i}"
|
12
|
+
puts url
|
13
|
+
doc = Hpricot(open(url).read)
|
14
|
+
doc.search('div.content p') do |story|
|
15
|
+
# now pull out the good stuff...
|
16
|
+
if story.to_plain_text =~ /\"(.*)?\"/
|
17
|
+
f.puts $1
|
18
|
+
end
|
19
|
+
end
|
20
|
+
f.flush
|
21
|
+
sleep rand * 3.0
|
22
|
+
end
|
23
|
+
|
24
|
+
f.close
|
data/script/train.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
|
2
|
-
require File.
|
3
|
-
require File.
|
2
|
+
require File.expand_path('../../lib/twss', __FILE__)
|
3
|
+
require File.expand_path('../../lib/twss/trainer', __FILE__)
|
4
4
|
|
5
5
|
engine = TWSS::Engine.new
|
6
6
|
trainer = TWSS::Trainer.new(engine)
|
data/twss.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "twss"
|
3
|
-
s.version = "0.0.3"
|
3
|
+
s.version = "0.0.4"
|
4
4
|
s.platform = Gem::Platform::RUBY
|
5
5
|
s.authors = ["Ben VandenBos"]
|
6
6
|
s.email = "bvandenbos@gmail.com"
|
@@ -17,8 +17,7 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
18
18
|
s.require_path = 'lib'
|
19
19
|
|
20
|
-
s.add_runtime_dependency("classifier", ["
|
20
|
+
s.add_runtime_dependency("classifier", ["1.3.1"])
|
21
21
|
s.add_development_dependency("twitter", [">= 0"])
|
22
22
|
|
23
23
|
end
|
24
|
-
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 3
|
9
|
-
version: 0.0.3
|
8
|
+
- 4
|
9
|
+
version: 0.0.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Ben VandenBos
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-05-03 00:00:00 -07:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
|
-
- - "
|
41
|
+
- - "="
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
segments:
|
44
44
|
- 1
|
@@ -79,11 +79,12 @@ files:
|
|
79
79
|
- Rakefile
|
80
80
|
- data/classifier
|
81
81
|
- data/non_twss.txt
|
82
|
+
- data/test_non_twss.txt
|
83
|
+
- data/test_twss.txt
|
82
84
|
- data/twss.txt
|
83
85
|
- lib/twss.rb
|
84
86
|
- lib/twss/engine.rb
|
85
87
|
- lib/twss/trainer.rb
|
86
|
-
- lib/twss/tweet_collector.rb
|
87
88
|
- script/collect_non_twss.rb
|
88
89
|
- script/collect_twss.rb
|
89
90
|
- script/train.rb
|
data/lib/twss/tweet_collector.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
require 'twitter'
|
2
|
-
|
3
|
-
module TWSS
|
4
|
-
|
5
|
-
class TweetCollector
|
6
|
-
|
7
|
-
attr_reader :search, :filename, :limit
|
8
|
-
|
9
|
-
def initialize(search, filename, limit = 1500)
|
10
|
-
@search, @filename, @limit = search, filename, limit
|
11
|
-
end
|
12
|
-
|
13
|
-
def run
|
14
|
-
o = File.open(filename, 'a')
|
15
|
-
page, per_page = 1, 100
|
16
|
-
begin
|
17
|
-
Twitter::Search.new.containing(search).per_page(per_page).page(page).each do |tweet|
|
18
|
-
puts tweet.text
|
19
|
-
o.puts tweet.text
|
20
|
-
end
|
21
|
-
page += 1
|
22
|
-
sleep 2
|
23
|
-
end while page * per_page < limit
|
24
|
-
o.close
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|