twss 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ require 'rubygems'
2
+ require 'forwardable'
3
+
4
+ require File.join(File.dirname(__FILE__), 'twss/engine')
5
+
6
# Namespace and module-level facade for the classifier.
#
# All module-level calls are served by one lazily created TWSS::Engine, so
# TWSS.threshold, TWSS.threshold= and TWSS.classify act on the same
# instance for the lifetime of the process.
module TWSS

  # Version string of this library.
  Version = '0.0.1'

  extend SingleForwardable

  # TWSS.threshold and TWSS.threshold= are forwarded to the shared engine.
  def_delegators :engine, :threshold, :threshold=

  # Classifies +str+ with the shared engine and returns the engine's answer.
  def self.classify(str)
    engine.classify(str)
  end

  # Returns the shared TWSS::Engine, building it on first use.
  def self.engine
    @engine ||= TWSS::Engine.new
  end

end
27
+
28
# Convenience form: TWSS("some text") returns exactly what
# TWSS.classify("some text") returns.
def TWSS(str)
  ::TWSS.classify(str)
end
@@ -0,0 +1,63 @@
1
+ require 'classifier'
2
+
3
module TWSS

  # Wraps a two-category Classifier::Bayes (categories TRUE and FALSE) and
  # persists it to disk with Marshal.
  #
  # #train, #untrain and #classifications are forwarded straight to the
  # underlying classifier.
  class Engine

    extend Forwardable

    def_delegators :@classifier, :train, :untrain, :classifications

    # Default location of the serialized classifier.
    DATA_FILE = File.join(File.dirname(__FILE__), '../../data/classifier')

    # Category names handed to Classifier::Bayes and used as keys into the
    # hash returned by #classifications.
    TRUE = '1'
    FALSE = '0'

    # Margin by which the TRUE score must exceed the FALSE score.
    attr_accessor :threshold

    # options[:data_file] - path of the serialized classifier
    #                       (defaults to DATA_FILE).
    # options[:threshold] - classification margin (defaults to 5.0).
    #
    # Loads the classifier from the data file when that file exists and can
    # be deserialized; otherwise starts with an untrained classifier.
    def initialize(options = {})
      @data_file = options[:data_file] || DATA_FILE
      @threshold = options[:threshold] || 5.0
      @classifier = load_classifier_from_file!(@data_file) || new_classifier
    end

    # Returns true when +str+ has more than three words and its TRUE score
    # beats its FALSE score by more than #threshold; false otherwise
    # (including for nil or other non-string input).
    def classify(str)
      return false unless basic_conditions_met?(str)

      scores = @classifier.classifications(str)
      scores[TRUE] - scores[FALSE] > threshold
    end

    # Dumps the current classifier state to the specified path (the engine's
    # own data file by default). Returns nil.
    def dump_classifier_to_file(f = @data_file)
      # Block form closes the handle even if the write raises; binary mode
      # keeps the Marshal byte stream intact on every platform.
      File.open(f, 'wb') { |io| io.write(Marshal.dump(@classifier)) }
      nil
    end

    # Clears out the current classifier instance and nukes the data file.
    # Returns the fresh classifier.
    def clear_state!
      File.delete(@data_file) if File.exist?(@data_file)
      @classifier = new_classifier
    end

    private

    # Builds an untrained classifier with the two categories.
    def new_classifier
      Classifier::Bayes.new(TRUE, FALSE)
    end

    # More than 3 whitespace-separated words; false for anything that
    # cannot be split (e.g. nil).
    def basic_conditions_met?(str)
      str.respond_to?(:split) && str.split(' ').length > 3
    end

    # Given a path to a classifier file, load the instance into memory.
    # Returns nil when the file is missing or cannot be deserialized, so the
    # caller can fall back to a fresh classifier.
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects. Only point
    # the engine at data files you trust.
    def load_classifier_from_file!(f)
      return nil unless File.exist?(f)

      begin
        Marshal.load(File.binread(f))
      rescue StandardError
        # Deliberate best effort: an unreadable or corrupt file means
        # "start untrained", not "crash".
        nil
      end
    end

  end

end
@@ -0,0 +1,68 @@
1
+ #require 'twitter'
2
+
3
module TWSS

  # Rebuilds an engine's classifier from two text corpora, one sample per
  # line: non_twss.txt (trained as TWSS::Engine::FALSE) and twss.txt
  # (trained as TWSS::Engine::TRUE).
  class Trainer

    # Default directory holding non_twss.txt and twss.txt.
    DATA_DIR = File.join(File.dirname(__FILE__), '../../data/')

    # Sentences classified and printed after every training run.
    EXAMPLES = [
      "how big is that thing going to get?",
      "umm... that's the not the right hole",
      "did you resolve the ticket?",
      "did you fix the bug?",
      "you're going to need to go faster",
      "I'm almost there, keep going",
      "Ok, send me a pull request",
      "The president issued a decree",
      "I don't get it, this isn't working correctly",
      "finished specialties in the warehouse"
    ].freeze

    attr_reader :engine

    # engine  - object responding to clear_state!, train(category, text),
    #           dump_classifier_to_file and classify(text).
    # options - :training_set_size (default 100; stored, but not read
    #           anywhere in this class) and :data_dir (directory containing
    #           the corpus files; defaults to DATA_DIR).
    #
    # NOTE: constructing a Trainer already calls engine.clear_state!.
    def initialize(engine, options = {})
      @engine = engine
      engine.clear_state!
      @training_set_size = options[:training_set_size] || 100
      @data_dir = options[:data_dir] || DATA_DIR
    end

    # Clears the engine, trains it on both corpora, dumps it to its data
    # file and prints the example classifications. Progress goes to stdout.
    def train
      puts "Clearing state..."
      engine.clear_state!

      puts "Training NON-TWSS strings..."
      train_from_file('non_twss.txt', TWSS::Engine::FALSE)

      puts "Training TWSS strings..."
      train_from_file('twss.txt', TWSS::Engine::TRUE)

      puts "Writing to file..."
      engine.dump_classifier_to_file

      puts "Done."
      puts

      run_examples
    end

    # A little cleanup of the text before we train on it. Returns a new
    # string; +text+ itself is not modified.
    def strip_tweet(text)
      t = text.gsub(/[\@\#]\w+\b/i, '')      # strip mentions and hashtags
      # Whole-word RT/OH only: without the word boundaries "heart is"
      # would lose its "rt " and become "heais".
      t = t.gsub(/\b(?:RT|OH)\b\W?/i, '')
      t = t.gsub(/twss/i, '')                # strip out twss itself
      t = t.gsub(%r{https?://\S+}i, '')      # URLs, http or https, up to whitespace
      t = t.gsub(/[\W\d]/, ' ')              # now all non word chars and numbers
      t.strip
    end

    # Prints each EXAMPLES sentence with the verdict of the engine that was
    # just trained (not a separately loaded one).
    def run_examples
      EXAMPLES.each do |s|
        puts %("#{s}" => #{engine.classify(s)})
      end
    end

    private

    # Feeds every line of +filename+ (inside the data directory) to the
    # engine under +category+, one line at a time.
    def train_from_file(filename, category)
      File.foreach(File.join(@data_dir, filename)) do |line|
        engine.train(category, strip_tweet(line))
      end
    end

  end

end
@@ -0,0 +1,29 @@
1
+ require 'twitter'
2
+
3
module TWSS

  # Appends the text of tweets matching a search query to a file, one tweet
  # per line, echoing each one to stdout as it goes.
  class TweetCollector

    # Number of results requested per search page.
    PER_PAGE = 100

    # Seconds to wait between consecutive page requests.
    PAUSE_SECONDS = 2

    attr_reader :search, :filename, :limit

    # search   - query string handed to Twitter::Search.
    # filename - file the tweet texts are appended to.
    # limit    - maximum number of tweets to record (default 1500).
    def initialize(search, filename, limit = 1500)
      @search, @filename, @limit = search, filename, limit
    end

    # Requests result pages until +limit+ tweets have been written or a page
    # comes back empty. Nothing is requested when +limit+ is zero or
    # negative. Returns nil.
    def run
      # Round up so a limit that is not a multiple of PER_PAGE still gets
      # its final, partially used page.
      pages = (limit.to_f / PER_PAGE).ceil
      written = 0

      # Block form guarantees the file is closed even if a request raises.
      File.open(filename, 'a') do |out|
        1.upto(pages) do |page|
          sleep PAUSE_SECONDS if page > 1
          written_before = written

          Twitter::Search.new(search).per_page(PER_PAGE).page(page).each do |tweet|
            break if written >= limit
            puts tweet.text
            out.puts tweet.text
            written += 1
          end

          # An empty page means there is nothing further to fetch.
          break if written == written_before
        end
      end

      nil
    end

  end

end
@@ -0,0 +1,4 @@
1
+ require File.join(File.dirname(__FILE__), '../lib/twss')
2
+ require File.join(File.dirname(__FILE__), '../lib/twss/tweet_collector')
3
+
4
# Collect tweets matching ':)' into the corpus file that the trainer reads
# as non-TWSS samples.
non_twss_corpus = File.join(File.dirname(__FILE__), '../data/non_twss.txt')
TWSS::TweetCollector.new(':)', non_twss_corpus).run
@@ -0,0 +1,4 @@
1
+ require File.join(File.dirname(__FILE__), '../lib/twss')
2
+ require File.join(File.dirname(__FILE__), '../lib/twss/tweet_collector')
3
+
4
# Collect tweets matching '#twss' into the corpus file that the trainer
# reads as TWSS samples.
twss_corpus = File.join(File.dirname(__FILE__), '../data/twss.txt')
TWSS::TweetCollector.new('#twss', twss_corpus).run
@@ -0,0 +1,7 @@
1
+
2
+ require File.join(File.dirname(__FILE__), '../lib/twss')
3
+ require File.join(File.dirname(__FILE__), '../lib/twss/trainer')
4
+
5
# Build an engine with its default settings and retrain it from the corpus
# files; the trainer prints progress and example classifications.
TWSS::Trainer.new(TWSS::Engine.new).train
@@ -0,0 +1,61 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Package description for the twss gem.
#
# NOTE(review): the header of this file says it is generated by jeweler;
# mirror any change made here in the Jeweler::Tasks block of the Rakefile,
# otherwise regenerating the gemspec will overwrite it.
Gem::Specification.new do |s|
  s.name = %q{twss}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Ben VandenBos"]
  s.date = %q{2010-08-07}
  s.description = %q{Pre-trained "That's What She Said" bayes classifier.
Given a string, returns true if it's a TWSS joke. Pre-trained from
Twitter #twss. Let the twss mashups begin!}
  s.email = %q{bvandenbos@gmail.com}
  s.extra_rdoc_files = [
    "README.markdown"
  ]
  s.files = [
    ".gitignore",
    "README.markdown",
    "Rakefile",
    "data/classifier",
    "data/non_twss.txt",
    "data/twss.txt",
    "lib/twss.rb",
    "lib/twss/engine.rb",
    "lib/twss/trainer.rb",
    "lib/twss/tweet_collector.rb",
    "script/collect_non_twss.rb",
    "script/collect_twss.rb",
    "script/train.rb",
    "twss.gemspec"
  ]
  s.homepage = %q{http://github.com/bvandenbos/twss}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{Pre-trained That's What She Said classifier}

  s.specification_version = 3 if s.respond_to? :specification_version=

  # Declared once: classifier is needed at runtime; jeweler and twitter are
  # only needed to build the gem and to collect training data.
  s.add_runtime_dependency(%q<classifier>, [">= 1.3.1"])
  s.add_development_dependency(%q<jeweler>, [">= 0"])
  s.add_development_dependency(%q<twitter>, [">= 0"])
end
61
+
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twss
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Ben VandenBos
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-08-07 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: classifier
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 25
30
+ segments:
31
+ - 1
32
+ - 3
33
+ - 1
34
+ version: 1.3.1
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: jeweler
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 3
46
+ segments:
47
+ - 0
48
+ version: "0"
49
+ type: :development
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: twitter
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ hash: 3
60
+ segments:
61
+ - 0
62
+ version: "0"
63
+ type: :development
64
+ version_requirements: *id003
65
+ description: |-
66
+ Pre-trained "That's What She Said" bayes classifier.
67
+ Given a string, returns true if it's a TWSS joke. Pre-trained from
68
+ Twitter #twss. Let the twss mashups begin!
69
+ email: bvandenbos@gmail.com
70
+ executables: []
71
+
72
+ extensions: []
73
+
74
+ extra_rdoc_files:
75
+ - README.markdown
76
+ files:
77
+ - .gitignore
78
+ - README.markdown
79
+ - Rakefile
80
+ - data/classifier
81
+ - data/non_twss.txt
82
+ - data/twss.txt
83
+ - lib/twss.rb
84
+ - lib/twss/engine.rb
85
+ - lib/twss/trainer.rb
86
+ - lib/twss/tweet_collector.rb
87
+ - script/collect_non_twss.rb
88
+ - script/collect_twss.rb
89
+ - script/train.rb
90
+ - twss.gemspec
91
+ has_rdoc: true
92
+ homepage: http://github.com/bvandenbos/twss
93
+ licenses: []
94
+
95
+ post_install_message:
96
+ rdoc_options:
97
+ - --charset=UTF-8
98
+ require_paths:
99
+ - lib
100
+ required_ruby_version: !ruby/object:Gem::Requirement
101
+ none: false
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ hash: 3
106
+ segments:
107
+ - 0
108
+ version: "0"
109
+ required_rubygems_version: !ruby/object:Gem::Requirement
110
+ none: false
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ hash: 3
115
+ segments:
116
+ - 0
117
+ version: "0"
118
+ requirements: []
119
+
120
+ rubyforge_project:
121
+ rubygems_version: 1.3.7
122
+ signing_key:
123
+ specification_version: 3
124
+ summary: Pre-trained That's What She Said classifier
125
+ test_files: []
126
+