RedditPostClassifierBot 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,26 @@
1
# coding: utf-8
# Gem specification for RedditPostClassifierBot: declares the gem's identity,
# the files that get packaged, and its runtime/development dependencies.

# Put ./lib on the load path so the version file can be required before the
# gem is installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'RedditPostClassifierBot/version'

Gem::Specification.new do |spec|
  spec.name          = "RedditPostClassifierBot"
  spec.version       = RedditPostClassifierBot::VERSION
  spec.authors       = ["Diego Salazar"]
  spec.email         = ["diego@greyrobot.com"]

  spec.summary       = %q{Run Naive Bayes classification of Reddit posts}
  spec.description   = %q{This gem wraps Ruby's nbayes gem to do text classification of Reddit posts. It classifies posts according to where they were fetched - frontpage, controversial, top, or bad posts. It can be used to try to predict if a new post will make it to the front page, maybe.}
  spec.homepage      = "https://github.com/DiegoSalazar/RedditPostClassifierBot"

  # Package every git-tracked file except those under test/, spec/ or features/.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  # Executables are taken from exe/ only; nothing under bin/ is installed.
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency "nbayes"

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "RedditPostClassifierBot"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
module RedditPostClassifierBot
  # Wrapper around NBayes::Base that tokenizes text on whitespace and keeps the
  # model in the file named by NBAYES_FILE.
  class NBayesClassifier
    # Location of the persisted model. Taken from the NBAYES_FILE_PATH
    # environment variable when set; read once, when this class is loaded.
    NBAYES_FILE = ENV.fetch "NBAYES_FILE_PATH", "./RPCB-nbayes.yml"

    def initialize
      @nbayes = NBayes::Base.new
    end

    # Feeds one training example to the model.
    #
    # @param text [String] raw text; split into whitespace-separated tokens
    # @param classification [Object] label to associate with the tokens
    # @return [NBayesClassifier] self, so calls can be chained
    def train(text, classification)
      @nbayes.train tokenize(text), classification
      self
    end

    # Classifies a post from its subreddit, title and body.
    #
    # @return whatever NBayes::Base#classify returns for the combined tokens
    def classify(subreddit, title, post)
      @nbayes.classify tokenize("#{subreddit}\n#{title}\n#{post}")
    end

    # Writes the current model to NBAYES_FILE.
    def dump
      @nbayes.dump NBAYES_FILE
    end

    # Restores the model from NBAYES_FILE.
    #
    # @return [Boolean] true when a model was loaded; false when the file was
    #   missing (it is then created empty) or could not be turned into a model
    def load
      restored = @nbayes.load NBAYES_FILE
      # Adopt the restored model when the library hands back a model object;
      # if it returns anything else, @nbayes is left as it is.
      @nbayes = restored if restored.is_a?(NBayes::Base)
      true
    rescue Errno::ENOENT, NoMethodError
      # NOTE(review): NoMethodError is presumably what nbayes raises for an
      # empty or invalid model file - confirm against the nbayes version in use.
      FileUtils.touch NBAYES_FILE unless File.exist?(NBAYES_FILE)
      false
    end

    # Splits text into whitespace-separated tokens. nil is treated as empty
    # text, and leading whitespace does not produce an empty-string token.
    private def tokenize(text = @text)
      text.to_s.split
    end
  end
end
@@ -0,0 +1,121 @@
1
require "json"
require "uri"

module RedditPostClassifierBot
  # Downloads Reddit listings with curl and uses the posts to train, or query,
  # an NBayesClassifier. Each listing in CLASSES is one classification label.
  class RedditTrainer
    REDDIT_URL = "https://www.reddit.com"

    # Classification label => listing path (relative to REDDIT_URL).
    # NOTE(review): :top_hour has no explicit t=hour parameter - confirm that
    # "/top/" really serves the hourly listing.
    CLASSES = {
      hot: "/",
      new_post: "/new/",
      rising: "/rising/",
      controversial: "/controversial/",
      top_hour: "/top/",
      top_day: "/top/?sort=top&t=day",
      top_week: "/top/?sort=top&t=week",
      top_month: "/top/?sort=top&t=month",
      top_year: "/top/?sort=top&t=year"
    }.freeze

    # Result of the most recent #classify call.
    attr_reader :classifications

    # @return [Array<Symbol>] the labels the trainer knows how to train on
    def self.trained_on
      CLASSES.keys
    end

    # @param trials [Integer] maximum number of pages fetched per listing
    # @param per_page [Integer] value sent as the "count" pagination parameter
    # @param debug [Boolean] print progress messages when true
    def initialize(trials = 10, per_page = 200, debug = true)
      @max_trials, @per_page, @debug = trials, per_page, debug
      @posts, @trials_done = [], 0
    end

    # Lazily created classifier shared by all operations on this trainer.
    def nbayes
      @nbayes ||= RedditPostClassifierBot::NBayesClassifier.new
    end

    # Trains the classifier on successive pages of each listing.
    #
    # Every listing is paged with its own cursor (the full id of the last post
    # it returned), and a listing that returns no posts is not requested again.
    # Paging stops once @max_trials pages have been processed in total over the
    # lifetime of this trainer; at least one page is always attempted.
    #
    # @param classes [Hash{Symbol=>String}] label => listing path
    # @return [RedditTrainer] self
    def train(classes = CLASSES)
      pending = classes.to_a
      cursors = {}

      until pending.empty?
        pending = pending.select do |classification, path|
          log "training on #{classification} posts, page #{@trials_done + 1} of #{@max_trials}"

          children = reddit(path, cursors[classification])["data"]["children"]
          children.each do |child|
            post = Post.new child
            @posts << post
            nbayes.train post.serialize, classification
            cursors[classification] = post.full_id
          end

          !children.empty?
        end

        @trials_done += 1
        break if @trials_done >= @max_trials
      end

      self
    end

    # Classifies a single post and remembers the full result in
    # #classifications.
    #
    # @return the result's max_class
    def classify(subreddit, title, post)
      @classifications = nbayes.classify subreddit, title, post
      @classifications.max_class
    end

    # Persists the classifier.
    #
    # @return [RedditTrainer] self
    def dump
      nbayes.dump
      self
    end

    # Loads the persisted classifier; when none can be loaded, trains a fresh
    # one and persists it.
    #
    # @return [RedditTrainer] self
    def load
      train.dump unless nbayes.load
      self
    end

    # Fetches one listing and classifies every post in it.
    #
    # @param path [String] listing path; defaults to the :hot listing
    # @return [Hash] classification => array of [post url, classification] pairs
    def fetch_and_classify(path = CLASSES[:hot])
      posts = reddit(path)["data"]["children"]
      log "Classifying #{posts.size} posts"

      by_url = posts.each_with_object({}) do |child, urls|
        post = Post.new child
        urls[uri_with_base(post.path).to_s] = classify post.subreddit, post.title, post.body
      end

      by_url.group_by { |_, classification| classification }
    end

    def inspect
      "<#{self.class}:#{object_id.to_s(16)} @max_trials=#{@max_trials.inspect} @per_page=#{@per_page.inspect} @debug=#{@debug.inspect} @posts.size=#{@posts.size}>"
    end

    private

    # Downloads a listing as JSON.
    #
    # curl is started with an argument vector (no shell), so nothing in +path+
    # is interpreted by a shell. The session cookie is sent with -b and only
    # when REDDIT_SESSION_ID is set.
    #
    # @param path [String] listing path, optionally with a query string
    # @param after [String, nil] full id of the post to continue after
    # @return [Hash] a hash with an Array at ["data"]["children"]; an empty
    #   listing when the URL is invalid, curl cannot be run, or the response
    #   is not a listing
    def reddit(path, after = nil)
      uri = listing_uri path, after
      return empty_listing unless uri

      command = ["curl", "-s"]
      session = ENV["REDDIT_SESSION_ID"].to_s
      command.push "-b", "reddit_session=#{session}" unless session.empty?
      command << uri.to_s

      listing = JSON.parse IO.popen(command, &:read)
      listing?(listing) ? listing : empty_listing
    rescue JSON::ParserError, SystemCallError => e
      log "fetching #{path} failed: #{e.class}"
      empty_listing
    end

    # Builds the ".json" URL for a listing, keeping the listing's own query
    # parameters and appending the pagination cursor when one is given.
    #
    # NOTE(review): "count" is sent as @per_page, as before - confirm this is
    # the intended meaning of Reddit's count parameter.
    #
    # @return [URI, nil] nil when the result is not a valid URI
    def listing_uri(path, after = nil)
      route, query = path.to_s.split("?", 2)
      params = query.to_s.split("&").reject(&:empty?)

      if after
        params = params.reject { |pair| pair.start_with?("count=", "after=") }
        params.push "count=#{@per_page}", "after=#{after}"
      end

      location = "#{route}.json"
      location += "?#{params.join("&")}" unless params.empty?
      uri_with_base location
    end

    # @return [Boolean] whether +parsed+ has an Array at ["data"]["children"]
    def listing?(parsed)
      parsed.is_a?(Hash) && parsed["data"].is_a?(Hash) && parsed["data"]["children"].is_a?(Array)
    end

    def empty_listing
      { "data" => { "children" => [] } }
    end

    # Resolves +path+ against +base+.
    #
    # @return [URI, nil] nil when +path+ is nil or not a valid URI reference
    def uri_with_base(path, base = REDDIT_URL)
      return unless path

      URI.join base, path
    rescue URI::Error
      nil
    end

    def log(msg)
      puts "[#{self.class}] #{msg}" if @debug
    end

    # One entry of a listing's "children" array.
    class Post
      attr_reader :full_id, :path, :subreddit, :title, :body

      # @param json [Hash] listing child with "kind" and "data" keys
      def initialize(json)
        @data = json["data"] || {}
        @full_id = "#{json["kind"]}_#{@data["id"]}"
        @path = @data["permalink"]
        @subreddit = @data["subreddit"]
        @title = @data["title"]
        # Posts without self text use their url as the body.
        selftext = @data["selftext"].to_s
        @body = selftext.empty? ? @data["url"].to_s : selftext
      end

      # Text used for training: subreddit, title, url and self text, one per line.
      def serialize
        "#{@data["subreddit"]}\n#{@data["title"]}\n#{@data["url"]}\n#{@data["selftext"]}"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module RedditPostClassifierBot
  # Gem version string; frozen so it cannot be mutated in place at runtime.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,20 @@
1
+ require "fileutils"
2
+ require "nbayes"
3
+ require "RedditPostClassifierBot/version"
4
+ require "RedditPostClassifierBot/nbayes_classifier"
5
+ require "RedditPostClassifierBot/reddit_trainer"
6
+ require "RedditPostClassifierBot/good_post_finder"
7
+
8
# Module-level convenience API built on a single, memoized RedditTrainer.
module RedditPostClassifierBot
  class << self
    # Returns the shared trainer, creating it and calling #load on it the
    # first time it is requested.
    def classifier
      @classifier ||= RedditTrainer.new.load
    end

    # Runs training on the shared trainer, then dumps the result of #train.
    def train_classifier
      trained = classifier.train
      trained.dump
    end

    # Fetches the listing at +path+ and classifies its posts with the shared
    # trainer.
    def classify_posts(path)
      classifier.fetch_and_classify(path)
    end
  end
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: RedditPostClassifierBot
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Diego Salazar
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-12-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nbayes
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.9'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.9'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ description: This gem wraps Ruby's nbayes gem to do text classification of Reddit
56
+ posts. It classifies posts according to where they were fetched - frontpage, controversial,
57
+ top, or bad posts. It can be used to try to predict if a new post will make it to
58
+ the front page, maybe.
59
+ email:
60
+ - diego@greyrobot.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - ".gitignore"
66
+ - ".travis.yml"
67
+ - Gemfile
68
+ - README.md
69
+ - RPCB-nbayes.yml
70
+ - Rakefile
71
+ - RedditPostClassifierBot.gemspec
72
+ - bin/console
73
+ - bin/setup
74
+ - lib/RedditPostClassifierBot.rb
75
+ - lib/RedditPostClassifierBot/nbayes_classifier.rb
76
+ - lib/RedditPostClassifierBot/reddit_trainer.rb
77
+ - lib/RedditPostClassifierBot/version.rb
78
+ homepage: https://github.com/DiegoSalazar/RedditPostClassifierBot
79
+ licenses: []
80
+ metadata: {}
81
+ post_install_message:
82
+ rdoc_options: []
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubyforge_project:
97
+ rubygems_version: 2.4.6
98
+ signing_key:
99
+ specification_version: 4
100
+ summary: Run Naive Bayes classification of Reddit posts
101
+ test_files: []