twss 0.0.3 → 0.0.4
This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- data/README.markdown +7 -1
- data/data/classifier +1139 -2827
- data/data/non_twss.txt +6743 -7806
- data/data/test_non_twss.txt +200 -0
- data/data/test_twss.txt +200 -0
- data/data/twss.txt +1767 -2405
- data/lib/twss/engine.rb +2 -1
- data/lib/twss/trainer.rb +81 -38
- data/script/collect_non_twss.rb +20 -3
- data/script/collect_twss.rb +23 -3
- data/script/train.rb +2 -2
- data/twss.gemspec +2 -3
- metadata +6 -5
- data/lib/twss/tweet_collector.rb +0 -29
data/lib/twss/engine.rb
CHANGED
@@ -17,13 +17,14 @@ module TWSS
|
|
17
17
|
|
18
18
|
def initialize(options = {})
|
19
19
|
@data_file = options[:data_file] || DATA_FILE
|
20
|
-
@threshold ||= options[:threshold] || 5
|
20
|
+
@threshold ||= options[:threshold] || 7.5
|
21
21
|
@classifier = load_classifier_from_file!(@data_file) || new_classifier
|
22
22
|
end
|
23
23
|
|
24
24
|
def classify(str)
|
25
25
|
if basic_conditions_met?(str)
|
26
26
|
c = @classifier.classifications(str)
|
27
|
+
require 'pp'
|
27
28
|
c[TRUE] - c[FALSE] > threshold
|
28
29
|
else
|
29
30
|
false
|
data/lib/twss/trainer.rb
CHANGED
@@ -4,65 +4,108 @@ module TWSS
|
|
4
4
|
|
5
5
|
class Trainer
|
6
6
|
|
7
|
-
attr_reader :engine
|
7
|
+
attr_reader :engine, :training_percentage
|
8
8
|
|
9
9
|
def initialize(engine, options = {})
|
10
10
|
@engine = engine
|
11
11
|
engine.clear_state!
|
12
|
-
@
|
12
|
+
@training_percentage = options[:training_percentage] || 0.9
|
13
13
|
end
|
14
14
|
|
15
15
|
def train
|
16
16
|
path = File.join(File.dirname(__FILE__), '../../data/')
|
17
17
|
|
18
|
+
run_training(path)
|
19
|
+
|
20
|
+
puts "Writing to file..."
|
21
|
+
engine.dump_classifier_to_file
|
22
|
+
|
23
|
+
run_tests(path)
|
24
|
+
end
|
25
|
+
|
26
|
+
def total_documents(file)
|
27
|
+
t = 0
|
28
|
+
File.read(file).each_line do |l|
|
29
|
+
t += 1
|
30
|
+
end
|
31
|
+
t
|
32
|
+
end
|
33
|
+
|
34
|
+
def run_training(path)
|
35
|
+
positive_file = File.join(path, 'twss.txt')
|
36
|
+
negative_file = File.join(path, 'non_twss.txt')
|
37
|
+
|
18
38
|
puts "Clearing state..."
|
19
39
|
engine.clear_state!
|
20
40
|
|
21
41
|
puts "Training NON-TWSS strings..."
|
22
|
-
File.read(
|
23
|
-
|
42
|
+
File.read(negative_file).each_line do |l|
|
43
|
+
print '.'
|
44
|
+
$stdout.flush
|
45
|
+
engine.train(TWSS::Engine::FALSE, l)
|
24
46
|
end
|
47
|
+
puts
|
25
48
|
|
26
49
|
puts "Training TWSS strings..."
|
27
|
-
File.read(
|
28
|
-
|
50
|
+
File.read(positive_file).each_line do |l|
|
51
|
+
print '.'
|
52
|
+
$stdout.flush
|
53
|
+
engine.train(TWSS::Engine::TRUE, l)
|
29
54
|
end
|
30
|
-
|
31
|
-
puts "Writing to file..."
|
32
|
-
engine.dump_classifier_to_file
|
33
|
-
|
34
|
-
puts "Done."
|
35
|
-
puts
|
36
|
-
|
37
|
-
run_examples
|
55
|
+
puts
|
38
56
|
end
|
57
|
+
|
58
|
+
def run_tests(path)
|
59
|
+
positive_test_file = File.join(path, 'test_twss.txt')
|
60
|
+
negative_test_file = File.join(path, 'test_non_twss.txt')
|
61
|
+
|
62
|
+
total_positive = total_documents(positive_test_file)
|
63
|
+
total_negative = total_documents(negative_test_file)
|
64
|
+
|
65
|
+
false_negatives = 0
|
66
|
+
false_positives = 0
|
67
|
+
total = 0
|
68
|
+
correct = 0
|
69
|
+
test_each(positive_test_file, (total_positive * training_percentage).to_i) do |line, result|
|
70
|
+
if result
|
71
|
+
correct += 1
|
72
|
+
else
|
73
|
+
false_negatives += 1
|
74
|
+
end
|
75
|
+
total += 1
|
76
|
+
end
|
39
77
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
78
|
+
test_each(negative_test_file, (total_negative * training_percentage).to_i) do |line, result|
|
79
|
+
if !result
|
80
|
+
correct += 1
|
81
|
+
else
|
82
|
+
false_positives += 1
|
83
|
+
end
|
84
|
+
total += 1
|
85
|
+
end
|
86
|
+
|
87
|
+
puts
|
88
|
+
puts "Test set size: #{total}"
|
89
|
+
puts "Overall accuracy: #{100 * correct / total.to_f}%"
|
90
|
+
puts "False positives: #{false_positives} (#{100 * false_positives / total_negative.to_f}%)"
|
91
|
+
puts "False negatives: #{false_negatives} (#{100 * false_negatives / total_positive.to_f}%)"
|
92
|
+
puts
|
49
93
|
end
|
50
|
-
|
51
|
-
def
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
94
|
+
|
95
|
+
def test_each(file, sample_size, &blk)
|
96
|
+
i = 0
|
97
|
+
File.read(file).each_line do |line|
|
98
|
+
return if i > sample_size
|
99
|
+
l = line.strip
|
100
|
+
unless l.empty?
|
101
|
+
r = TWSS(l)
|
102
|
+
puts l + ' => ' + r.to_s
|
103
|
+
blk.call(l, r)
|
104
|
+
i += 1
|
105
|
+
end
|
106
|
+
end
|
64
107
|
end
|
65
|
-
|
108
|
+
|
66
109
|
end
|
67
110
|
|
68
111
|
end
|
data/script/collect_non_twss.rb
CHANGED
@@ -1,4 +1,21 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'rubygems'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'hpricot'
|
3
4
|
|
4
|
-
|
5
|
+
f = File.open(File.expand_path("../../data/non_twss.txt", __FILE__), "w")
|
6
|
+
|
7
|
+
domain = "http://www.fmylife.com"
|
8
|
+
|
9
|
+
200.times do |i|
|
10
|
+
url = domain + "/intimacy?page=#{i}"
|
11
|
+
puts url
|
12
|
+
body = open(url).read
|
13
|
+
doc = Hpricot(body)
|
14
|
+
doc.search('div.post p a.fmllink') do |story|
|
15
|
+
f.puts story.to_plain_text
|
16
|
+
end
|
17
|
+
f.flush
|
18
|
+
sleep rand * 3.0
|
19
|
+
end
|
20
|
+
|
21
|
+
f.close
|
data/script/collect_twss.rb
CHANGED
@@ -1,4 +1,24 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'rubygems'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'hpricot'
|
3
4
|
|
4
|
-
|
5
|
+
# Grab the first 2000 stories from twssstories.com (10 per page)
|
6
|
+
|
7
|
+
f = File.open(File.expand_path("../../data/twss.txt", __FILE__), "w")
|
8
|
+
|
9
|
+
domain = "http://twssstories.com"
|
10
|
+
200.times do |i|
|
11
|
+
url = domain + "/node?page=#{i}"
|
12
|
+
puts url
|
13
|
+
doc = Hpricot(open(url).read)
|
14
|
+
doc.search('div.content p') do |story|
|
15
|
+
# now pull out the good stuff...
|
16
|
+
if story.to_plain_text =~ /\"(.*)?\"/
|
17
|
+
f.puts $1
|
18
|
+
end
|
19
|
+
end
|
20
|
+
f.flush
|
21
|
+
sleep rand * 3.0
|
22
|
+
end
|
23
|
+
|
24
|
+
f.close
|
data/script/train.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
|
2
|
-
require File.
|
3
|
-
require File.
|
2
|
+
require File.expand_path('../../lib/twss', __FILE__)
|
3
|
+
require File.expand_path('../../lib/twss/trainer', __FILE__)
|
4
4
|
|
5
5
|
engine = TWSS::Engine.new
|
6
6
|
trainer = TWSS::Trainer.new(engine)
|
data/twss.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "twss"
|
3
|
-
s.version = "0.0.
|
3
|
+
s.version = "0.0.4"
|
4
4
|
s.platform = Gem::Platform::RUBY
|
5
5
|
s.authors = ["Ben VandenBos"]
|
6
6
|
s.email = "bvandenbos@gmail.com"
|
@@ -17,8 +17,7 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
18
18
|
s.require_path = 'lib'
|
19
19
|
|
20
|
-
s.add_runtime_dependency("classifier", ["
|
20
|
+
s.add_runtime_dependency("classifier", ["1.3.1"])
|
21
21
|
s.add_development_dependency("twitter", [">= 0"])
|
22
22
|
|
23
23
|
end
|
24
|
-
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 4
|
9
|
+
version: 0.0.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Ben VandenBos
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-05-03 00:00:00 -07:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
|
-
- - "
|
41
|
+
- - "="
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
segments:
|
44
44
|
- 1
|
@@ -79,11 +79,12 @@ files:
|
|
79
79
|
- Rakefile
|
80
80
|
- data/classifier
|
81
81
|
- data/non_twss.txt
|
82
|
+
- data/test_non_twss.txt
|
83
|
+
- data/test_twss.txt
|
82
84
|
- data/twss.txt
|
83
85
|
- lib/twss.rb
|
84
86
|
- lib/twss/engine.rb
|
85
87
|
- lib/twss/trainer.rb
|
86
|
-
- lib/twss/tweet_collector.rb
|
87
88
|
- script/collect_non_twss.rb
|
88
89
|
- script/collect_twss.rb
|
89
90
|
- script/train.rb
|
data/lib/twss/tweet_collector.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
require 'twitter'
|
2
|
-
|
3
|
-
module TWSS
|
4
|
-
|
5
|
-
class TweetCollector
|
6
|
-
|
7
|
-
attr_reader :search, :filename, :limit
|
8
|
-
|
9
|
-
def initialize(search, filename, limit = 1500)
|
10
|
-
@search, @filename, @limit = search, filename, limit
|
11
|
-
end
|
12
|
-
|
13
|
-
def run
|
14
|
-
o = File.open(filename, 'a')
|
15
|
-
page, per_page = 1, 100
|
16
|
-
begin
|
17
|
-
Twitter::Search.new.containing(search).per_page(per_page).page(page).each do |tweet|
|
18
|
-
puts tweet.text
|
19
|
-
o.puts tweet.text
|
20
|
-
end
|
21
|
-
page += 1
|
22
|
-
sleep 2
|
23
|
-
end while page * per_page < limit
|
24
|
-
o.close
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|