twss 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,31 @@
1
+ require 'rubygems'
2
+ require 'forwardable'
3
+
4
+ require File.join(File.dirname(__FILE__), 'twss/engine')
5
+
6
# Top-level namespace and convenience facade for the classifier.
#
# The module keeps a single lazily-built TWSS::Engine and exposes
# `classify`, `threshold` and `threshold=` on the module itself so callers
# do not have to manage an engine instance.
module TWSS

  Version = '0.0.1'

  # Module-level delegation: TWSS.threshold / TWSS.threshold= are forwarded
  # to whatever TWSS.engine returns.
  extend SingleForwardable

  def_delegators :engine, :threshold, :threshold=

  # Returns the shared engine, building it on first use.
  def self.engine
    @engine ||= TWSS::Engine.new
  end

  # Classifies +str+ with the shared engine and returns the engine's verdict.
  def self.classify(str)
    engine.classify(str)
  end

end
27
+
28
# Global convenience wrapper: TWSS("some text") gives the same result as
# TWSS.classify("some text").
def TWSS(str)
  ::TWSS.classify(str)
end
@@ -0,0 +1,63 @@
1
+ require 'classifier'
2
+
3
module TWSS

  # Classification engine built on a two-category Classifier::Bayes.
  #
  # The classifier is restored from a Marshal dump when the data file exists
  # and can be read; otherwise a fresh, untrained classifier is created.
  # A string is considered a match when the TRUE score exceeds the FALSE
  # score by more than +threshold+.
  class Engine

    extend Forwardable

    # Training and raw scoring go straight to the wrapped classifier.
    def_delegators :@classifier, :train, :untrain, :classifications

    # Default location of the serialized classifier.
    DATA_FILE = File.join(File.dirname(__FILE__), '../../data/classifier').freeze

    # Category names used for the two classifier buckets.
    TRUE  = '1'.freeze
    FALSE = '0'.freeze

    attr_accessor :threshold

    # options[:data_file] - path of the Marshal dump (defaults to DATA_FILE)
    # options[:threshold] - minimum TRUE-minus-FALSE score gap (defaults to 5.0)
    def initialize(options = {})
      @data_file  = options[:data_file] || DATA_FILE
      @threshold  = options[:threshold] || 5.0
      @classifier = load_classifier_from_file!(@data_file) || new_classifier
    end

    # Returns true when +str+ passes the basic length check and its TRUE score
    # beats its FALSE score by more than the threshold; false otherwise
    # (including for nil input).
    def classify(str)
      return false unless basic_conditions_met?(str)

      scores = @classifier.classifications(str)
      scores[TRUE] - scores[FALSE] > threshold
    end

    # Dumps the current classifier state to the specified path
    # (the engine's own data file by default). Returns nil.
    def dump_classifier_to_file(f = @data_file)
      # Binary mode: Marshal output is not text. The block form closes the
      # handle even if the dump raises.
      File.open(f, 'wb') { |io| Marshal.dump(@classifier, io) }
      nil
    end

    # Clears out the current classifier instance and nukes the data file
    def clear_state!
      File.delete(@data_file) if File.exist?(@data_file)
      @classifier = new_classifier
    end

    private

    # Builds an empty classifier with the TRUE and FALSE categories.
    def new_classifier
      Classifier::Bayes.new(TRUE, FALSE)
    end

    # More than 3 whitespace-separated words are required; nil counts as empty.
    def basic_conditions_met?(str)
      str.to_s.split(' ').length > 3
    end

    # Given a path to a classifier file, load the instance into memory.
    # Returns nil when the file is missing or cannot be deserialized, so the
    # caller can fall back to a fresh classifier.
    #
    # NOTE: Marshal.load can instantiate arbitrary objects; only point the
    # engine at data files you trust.
    def load_classifier_from_file!(f)
      return nil unless File.exist?(f)

      File.open(f, 'rb') { |io| Marshal.load(io) }
    rescue StandardError
      nil
    end

  end

end
@@ -0,0 +1,68 @@
1
+ #require 'twitter'
2
+
3
module TWSS

  # Rebuilds an engine's classifier from the tweet corpora in the data
  # directory (non_twss.txt and twss.txt) and writes the result to disk.
  class Trainer

    attr_reader :engine

    # engine  - the engine to train; its state is cleared immediately.
    # options - :training_set_size is stored (default 100) but is not read
    #           anywhere in this class.
    def initialize(engine, options = {})
      @engine = engine
      engine.clear_state!
      @training_set_size = options[:training_set_size] || 100
    end

    # Clears the engine, trains it on both corpora, dumps the classifier to
    # the engine's data file and prints the verdicts for a few sample strings.
    def train
      path = File.join(File.dirname(__FILE__), '../../data/')

      puts "Clearing state..."
      engine.clear_state!

      puts "Training NON-TWSS strings..."
      train_from_file(File.join(path, 'non_twss.txt'), TWSS::Engine::FALSE)

      puts "Training TWSS strings..."
      train_from_file(File.join(path, 'twss.txt'), TWSS::Engine::TRUE)

      puts "Writing to file..."
      engine.dump_classifier_to_file

      puts "Done."
      puts

      run_examples
    end

    # A little cleanup of the text before we train on it.
    # Returns a new string; +text+ itself is not modified.
    def strip_tweet(text)
      text.gsub(/[@#]\w+\b/, '')               # strip mentions and hashtags
          .gsub(/\b(?:RT|OH)(?:\W|\z)/i, '')   # strip standalone RT / OH tokens only;
                                               # the \b keeps "start", "heart" etc. intact
          .gsub(/twss/i, '')                   # strip out twss itself
          .gsub(%r{https?://\S+}i, '')         # URLs, http and https, up to the next space
          .gsub(/[\W\d]/, ' ')                 # now all non word chars and numbers
          .strip
    end

    # Prints each sample sentence together with the verdict of the engine
    # that this trainer just trained.
    def run_examples
      ["how big is that thing going to get?",
       "umm... that's the not the right hole",
       "did you resolve the ticket?",
       "did you fix the bug?",
       "you're going to need to go faster",
       "I'm almost there, keep going",
       "Ok, send me a pull request",
       "The president issued a decree",
       "I don't get it, this isn't working correctly",
       "finished specialties in the warehouse"].each do |s|
        puts %("#{s}" => #{engine.classify(s)})
      end
    end

    private

    # Feeds every line of +file+ to the engine under +category+.
    def train_from_file(file, category)
      File.foreach(file) do |line|
        engine.train(category, strip_tweet(line))
      end
    end

  end

end
@@ -0,0 +1,29 @@
1
+ require 'twitter'
2
+
3
module TWSS

  # Runs a Twitter search page by page and appends the text of every tweet
  # found to a file (echoing it to stdout as well).
  class TweetCollector

    # Tweets requested per search page.
    PER_PAGE = 100
    # Seconds to wait between page requests.
    PAUSE_SECONDS = 2

    attr_reader :search, :filename, :limit

    # search   - query string handed to Twitter::Search
    # filename - file the tweet texts are appended to
    # limit    - number of tweets to ask for in total (default 1500)
    def initialize(search, filename, limit = 1500)
      @search, @filename, @limit = search, filename, limit
    end

    # Fetches enough pages to cover +limit+ tweets (always at least one page)
    # and appends each tweet's text to +filename+, one per line.
    def run
      # Round up so that e.g. limit = 1500 requests all 15 pages.
      pages = [(limit.to_f / PER_PAGE).ceil, 1].max

      # Block form guarantees the file is closed even if a request raises.
      File.open(filename, 'a') do |out|
        1.upto(pages) do |page|
          Twitter::Search.new(search).per_page(PER_PAGE).page(page).each do |tweet|
            puts tweet.text
            out.puts tweet.text
          end
          sleep PAUSE_SECONDS unless page == pages
        end
      end
    end

  end

end
@@ -0,0 +1,4 @@
1
# Collects tweets matching ':)' and appends them to data/non_twss.txt.
script_dir = File.dirname(__FILE__)

require File.join(script_dir, '../lib/twss')
require File.join(script_dir, '../lib/twss/tweet_collector')

output_file = File.join(script_dir, '../data/non_twss.txt')
TWSS::TweetCollector.new(':)', output_file).run
@@ -0,0 +1,4 @@
1
# Collects tweets matching '#twss' and appends them to data/twss.txt.
script_dir = File.dirname(__FILE__)

require File.join(script_dir, '../lib/twss')
require File.join(script_dir, '../lib/twss/tweet_collector')

output_file = File.join(script_dir, '../data/twss.txt')
TWSS::TweetCollector.new('#twss', output_file).run
@@ -0,0 +1,7 @@
1
+
2
# Builds a default engine and runs the trainer against it.
script_dir = File.dirname(__FILE__)

require File.join(script_dir, '../lib/twss')
require File.join(script_dir, '../lib/twss/trainer')

TWSS::Trainer.new(TWSS::Engine.new).train
@@ -0,0 +1,61 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for twss 0.0.1: metadata, packaged files, and the
# classifier / jeweler / twitter dependencies.
Gem::Specification.new do |spec|
  spec.name    = 'twss'
  spec.version = '0.0.1'

  if spec.respond_to?(:required_rubygems_version=)
    spec.required_rubygems_version = Gem::Requirement.new('>= 0')
  end

  spec.authors     = ['Ben VandenBos']
  spec.date        = '2010-08-07'
  spec.description = %q{Pre-trained "That's What She Said" bayes classifier.
Given a string, returns true if it's a TWSS joke. Pre-trained from
Twitter #twss. Let the twss mashups begin!}
  spec.email       = 'bvandenbos@gmail.com'
  spec.homepage    = 'http://github.com/bvandenbos/twss'
  spec.summary     = "Pre-trained That's What She Said classifier"

  spec.extra_rdoc_files = ['README.markdown']
  spec.files = %w[
    .gitignore
    README.markdown
    Rakefile
    data/classifier
    data/non_twss.txt
    data/twss.txt
    lib/twss.rb
    lib/twss/engine.rb
    lib/twss/trainer.rb
    lib/twss/tweet_collector.rb
    script/collect_non_twss.rb
    script/collect_twss.rb
    script/train.rb
    twss.gemspec
  ]

  spec.rdoc_options     = ['--charset=UTF-8']
  spec.require_paths    = ['lib']
  spec.rubygems_version = '1.3.7'

  # Typed (runtime vs. development) dependencies are only declared when the
  # spec supports specification_version and RubyGems is at least 1.2.0;
  # otherwise all three gems are added as plain dependencies.
  versioned_spec = spec.respond_to?(:specification_version)
  spec.specification_version = 3 if versioned_spec

  typed_dependencies =
    versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')

  if typed_dependencies
    spec.add_runtime_dependency('classifier', ['>= 1.3.1'])
    spec.add_development_dependency('jeweler', ['>= 0'])
    spec.add_development_dependency('twitter', ['>= 0'])
  else
    spec.add_dependency('classifier', ['>= 1.3.1'])
    spec.add_dependency('jeweler', ['>= 0'])
    spec.add_dependency('twitter', ['>= 0'])
  end
end
61
+
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twss
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Ben VandenBos
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-08-07 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: classifier
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 25
30
+ segments:
31
+ - 1
32
+ - 3
33
+ - 1
34
+ version: 1.3.1
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: jeweler
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 3
46
+ segments:
47
+ - 0
48
+ version: "0"
49
+ type: :development
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: twitter
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ hash: 3
60
+ segments:
61
+ - 0
62
+ version: "0"
63
+ type: :development
64
+ version_requirements: *id003
65
+ description: |-
66
+ Pre-trained "That's What She Said" bayes classifier.
67
+ Given a string, returns true if it's a TWSS joke. Pre-trained from
68
+ Twitter #twss. Let the twss mashups begin!
69
+ email: bvandenbos@gmail.com
70
+ executables: []
71
+
72
+ extensions: []
73
+
74
+ extra_rdoc_files:
75
+ - README.markdown
76
+ files:
77
+ - .gitignore
78
+ - README.markdown
79
+ - Rakefile
80
+ - data/classifier
81
+ - data/non_twss.txt
82
+ - data/twss.txt
83
+ - lib/twss.rb
84
+ - lib/twss/engine.rb
85
+ - lib/twss/trainer.rb
86
+ - lib/twss/tweet_collector.rb
87
+ - script/collect_non_twss.rb
88
+ - script/collect_twss.rb
89
+ - script/train.rb
90
+ - twss.gemspec
91
+ has_rdoc: true
92
+ homepage: http://github.com/bvandenbos/twss
93
+ licenses: []
94
+
95
+ post_install_message:
96
+ rdoc_options:
97
+ - --charset=UTF-8
98
+ require_paths:
99
+ - lib
100
+ required_ruby_version: !ruby/object:Gem::Requirement
101
+ none: false
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ hash: 3
106
+ segments:
107
+ - 0
108
+ version: "0"
109
+ required_rubygems_version: !ruby/object:Gem::Requirement
110
+ none: false
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ hash: 3
115
+ segments:
116
+ - 0
117
+ version: "0"
118
+ requirements: []
119
+
120
+ rubyforge_project:
121
+ rubygems_version: 1.3.7
122
+ signing_key:
123
+ specification_version: 3
124
+ summary: Pre-trained That's What She Said classifier
125
+ test_files: []
126
+