RedditPostClassifierBot 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'RedditPostClassifierBot/version'
5
+
6
# Gem specification for RedditPostClassifierBot.
#
# Packaged files come from `git ls-files` (minus test/spec/features
# directories); executables are whatever lives under `exe/`.
Gem::Specification.new do |spec|
  spec.name          = "RedditPostClassifierBot"
  spec.version       = RedditPostClassifierBot::VERSION
  spec.authors       = ["Diego Salazar"]
  spec.email         = ["diego@greyrobot.com"]

  spec.summary       = %q{Run Naive Bayes classification of Reddit posts}
  # Fixed typo: "where they were fetch" -> "where they were fetched".
  spec.description   = %q{This gem wraps Ruby's nbayes gem to do text classification of Reddit posts. It classifies posts according to where they were fetched - frontpage, controversial, top, or bad posts. It can be used to try to predict if a new post will make it to the front page, maybe.}
  spec.homepage      = "https://github.com/DiegoSalazar/RedditPostClassifierBot"

  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependency: the Naive Bayes implementation this gem wraps.
  spec.add_dependency "nbayes"

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "RedditPostClassifierBot"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
require "fileutils"

module RedditPostClassifierBot
  # Thin wrapper around NBayes::Base that tokenizes text on whitespace and
  # persists the model to NBAYES_FILE.
  class NBayesClassifier
    # Where the model is dumped to / loaded from. Overridable through the
    # NBAYES_FILE_PATH environment variable (read once, at load time).
    NBAYES_FILE = ENV.fetch "NBAYES_FILE_PATH", "./RPCB-nbayes.yml"

    def initialize
      @nbayes = NBayes::Base.new
    end

    # Trains the model with the whitespace-separated tokens of +text+.
    #
    # @param text [String, nil] raw text; nil is treated as empty
    # @param classification [Object] category label handed to NBayes
    # @return [NBayesClassifier] self, so calls can be chained
    def train(text, classification)
      @nbayes.train tokenize(text), classification
      self
    end

    # Classifies a post from its subreddit, title and body, joined by
    # newlines and tokenized the same way as the training text.
    #
    # @return [Object] whatever NBayes::Base#classify returns
    def classify(subreddit, title, post)
      @nbayes.classify tokenize("#{subreddit}\n#{title}\n#{post}")
    end

    # Writes the model to NBAYES_FILE.
    def dump
      @nbayes.dump NBAYES_FILE
    end

    # Loads the model from NBAYES_FILE.
    #
    # @return [Boolean] true when loading succeeded; false when the file was
    #   missing or unusable (in which case an empty file is created if none
    #   exists yet)
    def load
      loaded = @nbayes.load NBAYES_FILE
      # NOTE(review): NBayes::Base#load appears to return the loaded model
      # rather than mutating the receiver - confirm against the nbayes source.
      # Adopt the result only when it looks like a classifier, so this is safe
      # either way.
      @nbayes = loaded if loaded.respond_to?(:classify)
      true
    rescue Errno::ENOENT, NoMethodError
      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
      FileUtils.touch NBAYES_FILE unless File.exist?(NBAYES_FILE)
      false
    end

    # Splits text into whitespace-separated tokens. nil yields no tokens, and
    # leading whitespace does not produce an empty first token.
    private def tokenize(text = nil)
      text.to_s.split
    end
  end
end
@@ -0,0 +1,121 @@
1
require "json"
require "uri"

module RedditPostClassifierBot
  # Fetches Reddit listings as JSON (through the `curl` binary), trains a
  # Naive Bayes classifier with the listing name as the class label, and
  # classifies posts with the trained model.
  class RedditTrainer
    REDDIT_URL = "https://www.reddit.com"

    # Classification label => listing path (relative to REDDIT_URL).
    # NOTE(review): :top_hour points at plain "/top/" with no `t=hour`
    # parameter, unlike its siblings - confirm that is intended.
    CLASSES = {
      hot: "/",
      new_post: "/new/",
      rising: "/rising/",
      controversial: "/controversial/",
      top_hour: "/top/",
      top_day: "/top/?sort=top&t=day",
      top_week: "/top/?sort=top&t=week",
      top_month: "/top/?sort=top&t=month",
      top_year: "/top/?sort=top&t=year"
    }.freeze

    # Result of the most recent #classify call.
    attr_reader :classifications

    # @return [Array<Symbol>] the labels the trainer knows about
    def self.trained_on
      CLASSES.keys
    end

    # @param trials [Integer] maximum number of pages fetched per listing
    # @param per_page [Integer] value sent as the `count` pagination parameter
    # @param debug [Boolean] print progress messages to stdout when true
    def initialize(trials = 10, per_page = 200, debug = true)
      @max_trials, @per_page, @debug = trials, per_page, debug
      @posts, @trials_done = [], 0
    end

    # Lazily built classifier backing this trainer.
    def nbayes
      @nbayes ||= RedditPostClassifierBot::NBayesClassifier.new
    end

    # Trains on every listing in +classes+, following pagination.
    #
    # At least one page is always fetched per call. Paging stops once
    # `trials` pages have been processed (counted across calls) or no listing
    # returned any post to continue from.
    #
    # @param classes [Hash{Symbol=>String}] label => listing path or URL
    # @return [RedditTrainer] self
    def train(classes = CLASSES)
      pages = classes

      loop do
        last_posts = train_page(pages)
        @trials_done += 1
        break if @trials_done >= @max_trials

        pages = next_page_paths(classes, last_posts)
        break if pages.empty?
      end

      self
    end

    # Classifies a single post and remembers the full result in
    # #classifications.
    #
    # @return [Object] the `max_class` of the classifier's result
    def classify(subreddit, title, post)
      @classifications = nbayes.classify subreddit, title, post
      @classifications.max_class
    end

    # Persists the model.
    #
    # @return [RedditTrainer] self
    def dump
      nbayes.dump
      self
    end

    # Loads the persisted model; when that fails, trains from scratch and
    # persists the result.
    #
    # @return [RedditTrainer] self
    def load
      train.dump unless nbayes.load
      self
    end

    # Fetches one listing and classifies each of its posts.
    #
    # @param path [String] listing path; defaults to the :hot listing
    # @return [Hash] classification => array of [post_url, classification]
    def fetch_and_classify(path = CLASSES[:hot])
      posts = listing_children(path)
      log "Classifying #{posts.size} posts"

      by_url = posts.each_with_object({}) do |child, result|
        post = Post.new child
        # Fall back to the raw permalink when it cannot be turned into a URI,
        # so such posts do not all collapse onto the same key.
        url = (uri_with_base(post.path) || post.path).to_s
        result[url] = classify post.subreddit, post.title, post.body
      end

      by_url.group_by { |_, classification| classification }
    end

    def inspect
      "<#{self.class}:#{object_id.to_s(16)} @max_trials=#{@max_trials.inspect} @per_page=#{@per_page.inspect} @debug=#{@debug.inspect} @posts.size=#{@posts.size}>"
    end

    private

    # Trains on one page of every listing in +pages+.
    #
    # @return [Hash{Symbol=>Post}] label => last post seen on that page;
    #   listings that returned nothing are absent
    def train_page(pages)
      pages.each_with_object({}) do |(classification, path), last_posts|
        log "training on #{classification} posts, page #{@trials_done} of #{@max_trials}"

        listing_children(path).each do |child|
          post = Post.new child
          @posts << post
          nbayes.train post.serialize, classification
          last_posts[classification] = post
        end
      end
    end

    # Builds the URLs of the next page for every listing that produced posts.
    # The listing's own query (e.g. sort/t) is kept; only the pagination
    # parameters are replaced.
    def next_page_paths(classes, last_posts)
      classes.each_with_object({}) do |(classification, path), pages|
        last_post = last_posts[classification]
        uri = uri_with_base(path)
        next unless last_post && uri

        params = URI.decode_www_form(uri.query.to_s).reject do |key, _|
          %w[count after].include?(key)
        end
        params << ["count", @per_page.to_s] << ["after", last_post.full_id]
        uri.query = URI.encode_www_form(params)
        pages[classification] = uri.to_s
      end
    end

    # Children of a listing, or [] when the response lacks data/children
    # (for instance an error payload).
    def listing_children(path)
      listing = reddit(path)
      data = listing.is_a?(Hash) ? listing["data"] : nil
      children = data.is_a?(Hash) ? data["children"] : nil
      children.is_a?(Array) ? children : []
    end

    # Fetches +path+ as JSON via curl.
    #
    # curl is invoked without a shell so that `&` in the query string is not
    # interpreted, and the session cookie is sent with `-b` (`-c` would name
    # a cookie-jar file to write instead).
    #
    # @return [Hash] parsed response, or an empty listing when the request or
    #   the parsing fails
    def reddit(path)
      uri = uri_with_base(json_path(path))
      return empty_listing unless uri

      cookie = "reddit_session=#{ENV["REDDIT_SESSION_ID"]}"
      body = IO.popen(["curl", "-s", "-b", cookie, uri.to_s], &:read)
      JSON.parse(body)
    rescue JSON::ParserError, SystemCallError
      empty_listing
    end

    # Inserts ".json" before the query part of a path or URL.
    def json_path(path)
      base, query = path.to_s.split("?", 2)
      query.to_s.empty? ? "#{base}.json" : "#{base}.json?#{query}"
    end

    def empty_listing
      { "data" => { "children" => [] } }
    end

    # Resolves +path+ against +base+.
    #
    # @return [URI::Generic, nil] nil when the path is nil or not a valid URI
    def uri_with_base(path, base = REDDIT_URL)
      return nil if path.nil?

      URI.join(base, path.to_s)
    rescue URI::Error
      nil
    end

    def log(msg)
      puts "[#{self.class}] #{msg}" if @debug
    end

    # One child of a listing response.
    class Post
      attr_reader :full_id, :path, :subreddit, :title, :body

      # @param json [Hash] listing child with "kind" and "data" keys
      def initialize(json)
        @data = json["data"] || {}
        @full_id = "#{json["kind"]}_#{@data["id"]}"
        @path = @data["permalink"]
        @subreddit = @data["subreddit"]
        @title = @data["title"]
        # Link posts have no selftext; use their URL as the body instead.
        @body = (@data["selftext"].to_s.empty? ? @data["url"] : @data["selftext"]).to_s
      end

      # Text representation used for training.
      def serialize
        "#{@data["subreddit"]}\n#{@data["title"]}\n#{@data["url"]}\n#{@data["selftext"]}"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module RedditPostClassifierBot
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,20 @@
1
+ require "fileutils"
2
+ require "nbayes"
3
+ require "RedditPostClassifierBot/version"
4
+ require "RedditPostClassifierBot/nbayes_classifier"
5
+ require "RedditPostClassifierBot/reddit_trainer"
6
+ require "RedditPostClassifierBot/good_post_finder"
7
+
8
# Module-level convenience entry points built around one shared trainer.
module RedditPostClassifierBot
  class << self
    # Shared trainer, created and loaded on first use.
    def classifier
      @classifier ||= RedditTrainer.new.load
    end

    # Trains the shared trainer and dumps the resulting model.
    def train_classifier
      classifier.train.dump
    end

    # Fetches the listing at +path+ and classifies its posts.
    def classify_posts(path)
      classifier.fetch_and_classify path
    end
  end
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: RedditPostClassifierBot
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Diego Salazar
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-12-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nbayes
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.9'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.9'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ description: This gem wraps Ruby's nbayes gem to do text classification of Reddit
56
+ posts. It classifies posts according to where they were fetched - frontpage, controversial,
57
+ top, or bad posts. It can be used to try to predict if a new post will make it to
58
+ the front page, maybe.
59
+ email:
60
+ - diego@greyrobot.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - ".gitignore"
66
+ - ".travis.yml"
67
+ - Gemfile
68
+ - README.md
69
+ - RPCB-nbayes.yml
70
+ - Rakefile
71
+ - RedditPostClassifierBot.gemspec
72
+ - bin/console
73
+ - bin/setup
74
+ - lib/RedditPostClassifierBot.rb
75
+ - lib/RedditPostClassifierBot/nbayes_classifier.rb
76
+ - lib/RedditPostClassifierBot/reddit_trainer.rb
77
+ - lib/RedditPostClassifierBot/version.rb
78
+ homepage: https://github.com/DiegoSalazar/RedditPostClassifierBot
79
+ licenses: []
80
+ metadata: {}
81
+ post_install_message:
82
+ rdoc_options: []
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubyforge_project:
97
+ rubygems_version: 2.4.6
98
+ signing_key:
99
+ specification_version: 4
100
+ summary: Run Naive Bayes classification of Reddit posts
101
+ test_files: []