twss 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/README.markdown +46 -0
- data/Rakefile +52 -0
- data/data/classifier +2332 -0
- data/data/non_twss.txt +6389 -0
- data/data/twss.txt +1947 -0
- data/lib/twss.rb +31 -0
- data/lib/twss/engine.rb +63 -0
- data/lib/twss/trainer.rb +68 -0
- data/lib/twss/tweet_collector.rb +29 -0
- data/script/collect_non_twss.rb +4 -0
- data/script/collect_twss.rb +4 -0
- data/script/train.rb +7 -0
- data/twss.gemspec +61 -0
- metadata +126 -0
data/lib/twss.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
require File.join(File.dirname(__FILE__), 'twss/engine')
|
5
|
+
|
6
|
+
# Top-level namespace and convenience facade for the classifier.
#
# The module lazily builds a single shared TWSS::Engine and exposes that
# engine's +threshold+ reader/writer and +classify+ as module-level methods:
#
#   TWSS.threshold = 4.0
#   TWSS.classify("some sentence to check")
module TWSS

  # Version of the gem. Frozen so the shared constant cannot be mutated
  # in place by a caller.
  Version = '0.0.1'.freeze

  class << self

    extend Forwardable

    # TWSS.threshold and TWSS.threshold= are forwarded to the shared engine.
    def_delegators :engine, :threshold, :threshold=

    # Classifies +str+ with the shared engine and returns the engine's answer.
    def classify(str)
      engine.classify(str)
    end

    # Returns the shared TWSS::Engine, building it with default options the
    # first time it is requested and reusing it afterwards.
    def engine
      @engine ||= TWSS::Engine.new
    end

  end

end
|
27
|
+
|
28
|
+
# Shortcut for TWSS.classify(str).
#
# Defined at the top level so it can be called as TWSS("some text") without
# going through the module explicitly. It passes +str+ straight to
# TWSS.classify and returns whatever that call returns.
|
29
|
+
def TWSS(str)
|
30
|
+
TWSS.classify(str)
|
31
|
+
end
|
data/lib/twss/engine.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'classifier'
|
2
|
+
|
3
|
+
module TWSS

  # Wraps a two-category Bayes classifier (categories TRUE and FALSE) and
  # turns its scores into a yes/no answer.
  #
  # The classifier is restored from a Marshal dump on disk when one is
  # available; otherwise a fresh, untrained classifier is created.
  class Engine

    extend Forwardable

    # Training and raw scoring go straight to the underlying classifier.
    def_delegators :@classifier, :train, :untrain, :classifications

    # Default location of the marshalled, pre-trained classifier.
    DATA_FILE = File.join(File.dirname(__FILE__), '../../data/classifier').freeze

    # Category names used for positive and negative training examples.
    TRUE  = '1'.freeze
    FALSE = '0'.freeze

    # Minimum margin by which the TRUE score must exceed the FALSE score
    # for #classify to answer true.
    attr_accessor :threshold

    # options[:data_file] - path of the classifier dump (default DATA_FILE)
    # options[:threshold] - score margin required by #classify (default 5.0)
    def initialize(options = {})
      @data_file  = options[:data_file] || DATA_FILE
      @threshold  = options[:threshold] || 5.0
      @classifier = load_classifier_from_file!(@data_file) || new_classifier
    end

    # Returns true when +str+ has more than three words and the classifier's
    # TRUE score beats its FALSE score by more than +threshold+; returns
    # false otherwise (including for nil or very short input).
    def classify(str)
      return false unless basic_conditions_met?(str)

      scores = @classifier.classifications(str)
      scores[TRUE] - scores[FALSE] > threshold
    end

    # Dumps the current classifier state to the specified path (defaults to
    # the engine's data file). Returns nil.
    def dump_classifier_to_file(f = @data_file)
      # Marshal output is binary, so write in binary mode; the block form
      # closes the handle even when dumping or writing raises.
      File.open(f, 'wb') { |io| io.write(Marshal.dump(@classifier)) }
      nil
    end

    # Clears out the current classifier instance and nukes the data file.
    def clear_state!
      # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
      File.delete(@data_file) if File.exist?(@data_file)
      @classifier = new_classifier
    end

    private

    # Builds an empty classifier knowing the two categories.
    def new_classifier
      Classifier::Bayes.new(TRUE, FALSE)
    end

    # Only inputs with more than 3 whitespace-separated words are scored.
    # to_s lets nil (and other non-strings) fall through as "too short"
    # instead of raising NoMethodError.
    def basic_conditions_met?(str)
      str.to_s.split(' ').length > 3
    end

    # Given a path to a classifier file, load the instance into memory.
    # Returns nil when the file is missing or cannot be loaded, so that the
    # caller can fall back to a fresh classifier.
    #
    # NOTE: Marshal.load can instantiate arbitrary objects. Only point
    # :data_file at dumps you produced yourself, never at untrusted input.
    def load_classifier_from_file!(f)
      return nil unless File.exist?(f)

      File.open(f, 'rb') { |io| Marshal.load(io) }
    rescue StandardError => e
      # Still best-effort, but no longer silent about a corrupt or
      # incompatible dump.
      warn "TWSS: could not load classifier from #{f} (#{e.class}: #{e.message})"
      nil
    end

  end

end
|
data/lib/twss/trainer.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#require 'twitter'
|
2
|
+
|
3
|
+
module TWSS

  # Rebuilds an engine's classifier from the bundled text corpora
  # (data/non_twss.txt and data/twss.txt, one example per line).
  class Trainer

    # The engine being trained.
    attr_reader :engine

    # engine  - object responding to clear_state!, train, classify and
    #           dump_classifier_to_file (normally a TWSS::Engine)
    # options - :training_set_size is stored (default 100) but is not
    #           consulted anywhere in this class
    #
    # Note that constructing a trainer immediately clears the engine's state.
    def initialize(engine, options = {})
      @engine = engine
      engine.clear_state!
      @training_set_size = options[:training_set_size] || 100
    end

    # Clears the engine, trains it on both corpora, writes the classifier
    # to the engine's data file and prints the verdicts for a few samples.
    def train
      data_dir = File.join(File.dirname(__FILE__), '../../data/')

      puts "Clearing state..."
      engine.clear_state!

      puts "Training NON-TWSS strings..."
      train_from_file(File.join(data_dir, 'non_twss.txt'), TWSS::Engine::FALSE)

      puts "Training TWSS strings..."
      train_from_file(File.join(data_dir, 'twss.txt'), TWSS::Engine::TRUE)

      puts "Writing to file..."
      engine.dump_classifier_to_file

      puts "Done."
      puts

      run_examples
    end

    # A little cleanup of the text before we train on it. Returns a new
    # string; +text+ itself is not modified.
    def strip_tweet(text)
      text.gsub(/[@#]\w+\b/, '').        # strip mentions and hashtags
           gsub(/\b(?:RT|OH)\W/i, '').   # strip RT's and OH's; \b keeps words like "smart" intact
           gsub(/twss/i, '').            # strip out twss itself
           gsub(%r{https?://\S+}, '').   # URLs, https and query strings included
           gsub(/[\W\d]/, ' ').          # now all non word chars and numbers
           strip
    end

    # Prints the verdict of the engine that was just trained for a handful
    # of sample sentences.
    def run_examples
      ["how big is that thing going to get?",
       "umm... that's the not the right hole",
       "did you resolve the ticket?",
       "did you fix the bug?",
       "you're going to need to go faster",
       "I'm almost there, keep going",
       "Ok, send me a pull request",
       "The president issued a decree",
       "I don't get it, this isn't working correctly",
       "finished specialties in the warehouse"].each do |s|
        # Ask the trainer's own engine, which may differ from the global
        # TWSS.engine that the TWSS() shortcut would consult.
        puts %("#{s}" => #{engine.classify(s)})
      end
    end

    private

    # Feeds every line of the file at +path+ to the engine as an example of
    # +category+, reading line by line instead of loading the whole file.
    def train_from_file(path, category)
      File.foreach(path) do |line|
        engine.train(category, strip_tweet(line))
      end
    end

  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'twitter'
|
2
|
+
|
3
|
+
module TWSS

  # Runs a Twitter search and appends the text of each matching tweet to a
  # file, one tweet per line, echoing every tweet to stdout as it goes.
  class TweetCollector

    attr_reader :search, :filename, :limit

    # search   - query handed to Twitter::Search
    # filename - file the tweet texts are appended to
    # limit    - maximum number of tweets to collect (default 1500)
    def initialize(search, filename, limit = 1500)
      @search, @filename, @limit = search, filename, limit
    end

    # Fetches result pages of 100 tweets until +limit+ tweets have been
    # written or the pages needed to reach +limit+ have all been requested.
    # Returns nil.
    def run
      per_page = 100
      # Round up so that the final, possibly partial, page is fetched too.
      # The former `begin ... end while page * per_page < limit` loop tested
      # the already-incremented page number and stopped one page early.
      pages = (limit.to_f / per_page).ceil
      collected = 0

      # Block form closes the file even if the search raises part-way.
      File.open(filename, 'a') do |out|
        1.upto(pages) do |page|
          Twitter::Search.new(search).per_page(per_page).page(page).each do |tweet|
            break if collected >= limit

            puts tweet.text
            out.puts tweet.text
            collected += 1
          end
          # Pause between requests; nothing to wait for after the last page.
          sleep 2 unless page == pages
        end
      end

      nil
    end

  end

end
|
data/script/train.rb
ADDED
data/twss.gemspec
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{twss}
|
8
|
+
s.version = "0.0.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Ben VandenBos"]
|
12
|
+
s.date = %q{2010-08-07}
|
13
|
+
s.description = %q{Pre-trained "That's What She Said" bayes classifier.
|
14
|
+
Given a string, returns true if it's a TWSS joke. Pre-trained from
|
15
|
+
Twitter #twss. Let the twss mashups begin!}
|
16
|
+
s.email = %q{bvandenbos@gmail.com}
|
17
|
+
s.extra_rdoc_files = [
|
18
|
+
"README.markdown"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".gitignore",
|
22
|
+
"README.markdown",
|
23
|
+
"Rakefile",
|
24
|
+
"data/classifier",
|
25
|
+
"data/non_twss.txt",
|
26
|
+
"data/twss.txt",
|
27
|
+
"lib/twss.rb",
|
28
|
+
"lib/twss/engine.rb",
|
29
|
+
"lib/twss/trainer.rb",
|
30
|
+
"lib/twss/tweet_collector.rb",
|
31
|
+
"script/collect_non_twss.rb",
|
32
|
+
"script/collect_twss.rb",
|
33
|
+
"script/train.rb",
|
34
|
+
"twss.gemspec"
|
35
|
+
]
|
36
|
+
s.homepage = %q{http://github.com/bvandenbos/twss}
|
37
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
38
|
+
s.require_paths = ["lib"]
|
39
|
+
s.rubygems_version = %q{1.3.7}
|
40
|
+
s.summary = %q{Pre-trained That's What She Said classifier}
|
41
|
+
|
42
|
+
if s.respond_to? :specification_version then
|
43
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
44
|
+
s.specification_version = 3
|
45
|
+
|
46
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
47
|
+
s.add_runtime_dependency(%q<classifier>, [">= 1.3.1"])
|
48
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
49
|
+
s.add_development_dependency(%q<twitter>, [">= 0"])
|
50
|
+
else
|
51
|
+
s.add_dependency(%q<classifier>, [">= 1.3.1"])
|
52
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
53
|
+
s.add_dependency(%q<twitter>, [">= 0"])
|
54
|
+
end
|
55
|
+
else
|
56
|
+
s.add_dependency(%q<classifier>, [">= 1.3.1"])
|
57
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
58
|
+
s.add_dependency(%q<twitter>, [">= 0"])
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
metadata
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twss
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Ben VandenBos
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-08-07 00:00:00 -07:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: classifier
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 25
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 3
|
33
|
+
- 1
|
34
|
+
version: 1.3.1
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: jeweler
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 3
|
46
|
+
segments:
|
47
|
+
- 0
|
48
|
+
version: "0"
|
49
|
+
type: :development
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: twitter
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 3
|
60
|
+
segments:
|
61
|
+
- 0
|
62
|
+
version: "0"
|
63
|
+
type: :development
|
64
|
+
version_requirements: *id003
|
65
|
+
description: |-
|
66
|
+
Pre-trained "That's What She Said" bayes classifier.
|
67
|
+
Given a string, returns true if it's a TWSS joke. Pre-trained from
|
68
|
+
Twitter #twss. Let the twss mashups begin!
|
69
|
+
email: bvandenbos@gmail.com
|
70
|
+
executables: []
|
71
|
+
|
72
|
+
extensions: []
|
73
|
+
|
74
|
+
extra_rdoc_files:
|
75
|
+
- README.markdown
|
76
|
+
files:
|
77
|
+
- .gitignore
|
78
|
+
- README.markdown
|
79
|
+
- Rakefile
|
80
|
+
- data/classifier
|
81
|
+
- data/non_twss.txt
|
82
|
+
- data/twss.txt
|
83
|
+
- lib/twss.rb
|
84
|
+
- lib/twss/engine.rb
|
85
|
+
- lib/twss/trainer.rb
|
86
|
+
- lib/twss/tweet_collector.rb
|
87
|
+
- script/collect_non_twss.rb
|
88
|
+
- script/collect_twss.rb
|
89
|
+
- script/train.rb
|
90
|
+
- twss.gemspec
|
91
|
+
has_rdoc: true
|
92
|
+
homepage: http://github.com/bvandenbos/twss
|
93
|
+
licenses: []
|
94
|
+
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options:
|
97
|
+
- --charset=UTF-8
|
98
|
+
require_paths:
|
99
|
+
- lib
|
100
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
hash: 3
|
106
|
+
segments:
|
107
|
+
- 0
|
108
|
+
version: "0"
|
109
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
110
|
+
none: false
|
111
|
+
requirements:
|
112
|
+
- - ">="
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
hash: 3
|
115
|
+
segments:
|
116
|
+
- 0
|
117
|
+
version: "0"
|
118
|
+
requirements: []
|
119
|
+
|
120
|
+
rubyforge_project:
|
121
|
+
rubygems_version: 1.3.7
|
122
|
+
signing_key:
|
123
|
+
specification_version: 3
|
124
|
+
summary: Pre-trained That's What She Said classifier
|
125
|
+
test_files: []
|
126
|
+
|