RedditPostClassifierBot 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +3 -0
- data/Gemfile +4 -0
- data/README.md +86 -0
- data/RPCB-nbayes.yml +4568 -0
- data/Rakefile +1 -0
- data/RedditPostClassifierBot.gemspec +26 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/RedditPostClassifierBot/nbayes_classifier.rb +34 -0
- data/lib/RedditPostClassifierBot/reddit_trainer.rb +121 -0
- data/lib/RedditPostClassifierBot/version.rb +3 -0
- data/lib/RedditPostClassifierBot.rb +20 -0
- metadata +101 -0
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
#
# Gem specification for RedditPostClassifierBot.
# Puts the gem's own lib/ directory on the load path so the version constant
# can be read without the gem being installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) if !$LOAD_PATH.include?(lib_dir)
require 'RedditPostClassifierBot/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name     = "RedditPostClassifierBot"
  gem.version  = RedditPostClassifierBot::VERSION
  gem.authors  = ["Diego Salazar"]
  gem.email    = ["diego@greyrobot.com"]
  gem.homepage = "https://github.com/DiegoSalazar/RedditPostClassifierBot"

  # Human-readable blurbs shown by the registry
  gem.summary     = "Run Naive Bayes classification of Reddit posts"
  gem.description = "This gem wraps Ruby's nbayes gem to do text classification of Reddit posts. It classifies posts according to where they were fetch - frontpage, controversial, top, or bad posts. It can be used to try to predict if a new post will make it to the front page, maybe."

  # Package every git-tracked file except test-only directories.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files         = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # Runtime dependency
  gem.add_dependency "nbayes"

  # Development-only dependencies
  gem.add_development_dependency "bundler", "~> 1.9"
  gem.add_development_dependency "rake", "~> 10.0"
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: boots Bundler, loads the gem, then opens an IRB
# session so the classifier can be tried out interactively.
require "bundler/setup"
require "RedditPostClassifierBot"

# Fixtures or other initialization code can be added here to make
# experimenting with the gem easier.

# To use Pry instead of IRB, add pry to the Gemfile and enable these lines:
# require "pry"
# Pry.start

require "irb"
IRB.start
|
data/bin/setup
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
module RedditPostClassifierBot
  # Thin wrapper around NBayes::Base that tokenizes text on whitespace and
  # persists the trained model to a YAML file.
  class NBayesClassifier
    # Path of the persisted model. Read once, when this file is loaded, from
    # the NBAYES_FILE_PATH environment variable; the fallback is relative to
    # the current working directory.
    NBAYES_FILE = ENV.fetch "NBAYES_FILE_PATH", "./RPCB-nbayes.yml"

    def initialize
      @nbayes = NBayes::Base.new
    end

    # Adds one training example.
    #
    # @param text [String] raw text; it is split into whitespace-separated tokens
    # @param classification [Object] label to associate with the tokens
    # @return [NBayesClassifier] self, so calls can be chained
    def train(text, classification)
      @nbayes.train tokenize(text), classification
      self
    end

    # Classifies a post described by its subreddit, title and body.
    #
    # @return [Object] whatever NBayes::Base#classify returns for the tokens
    def classify(subreddit, title, post)
      @nbayes.classify tokenize("#{subreddit}\n#{title}\n#{post}")
    end

    # Writes the current model to NBAYES_FILE.
    def dump
      @nbayes.dump NBAYES_FILE
    end

    # Replaces the in-memory model with the one stored in NBAYES_FILE.
    #
    # @return [Boolean] true when a model was loaded; false when the file was
    #   missing or could not be turned into a model (in which case an empty
    #   file is created if none exists, as before)
    def load
      # NBayes::Base#load hands back the deserialized model rather than
      # mutating its receiver, so the result has to be kept; previously it was
      # thrown away and the freshly constructed, untrained model stayed in use.
      loaded = @nbayes.load NBAYES_FILE
      @nbayes = loaded if loaded.respond_to?(:classify)
      true
    rescue Errno::ENOENT, NoMethodError
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      FileUtils.touch NBAYES_FILE unless File.exist?(NBAYES_FILE)
      false
    end

    private

    # Splits text into whitespace-separated tokens.
    #
    # A bare String#split skips leading whitespace, so no empty-string token is
    # produced, and #to_s makes a nil argument yield an empty list instead of
    # raising. The @text default is kept from the original signature; nothing
    # in this class assigns it.
    def tokenize(text = @text)
      text.to_s.split
    end
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require "json"
require "uri"

module RedditPostClassifierBot
  # Fetches Reddit listings as JSON (through the curl executable), trains a
  # NBayesClassifier with one class per listing, and classifies posts with the
  # resulting model.
  class RedditTrainer
    REDDIT_URL = "https://www.reddit.com"

    # Classification label => listing path (relative to REDDIT_URL).
    CLASSES = {
      hot: "/",
      new_post: "/new/",
      rising: "/rising/",
      controversial: "/controversial/",
      top_hour: "/top/",
      top_day: "/top/?sort=top&t=day",
      top_week: "/top/?sort=top&t=week",
      top_month: "/top/?sort=top&t=month",
      top_year: "/top/?sort=top&t=year"
    }

    # Result object of the most recent #classify call.
    attr_reader :classifications

    # @return [Array<Symbol>] the labels the trainer knows about
    def self.trained_on
      CLASSES.keys
    end

    # @param trials [Integer] maximum number of pages fetched per listing
    # @param per_page [Integer] value sent as the `count` pagination parameter
    # @param debug [Boolean] print progress messages to stdout when true
    def initialize(trials = 10, per_page = 200, debug = true)
      @max_trials, @per_page, @debug = trials, per_page, debug
      @posts, @trials_done = [], 0
    end

    # Lazily built classifier shared by every method of this trainer.
    def nbayes
      @nbayes ||= RedditPostClassifierBot::NBayesClassifier.new
    end

    # Trains on one page of every listing in +classes+, then keeps following
    # each listing's own "next page" cursor until @max_trials pages have been
    # processed or no listing returns posts any more.
    #
    # @param classes [Hash{Symbol=>String}] label => path or absolute URL
    # @return [RedditTrainer] self
    def train(classes = CLASSES)
      last_posts = {}

      classes.each do |classification, path|
        log "training on #{classification} posts, page #{@trials_done} of #{@max_trials}"

        reddit(path)["data"]["children"].each do |p|
          post = Post.new p
          @posts << post
          last_posts[classification] = post
          nbayes.train post.serialize, classification
        end
      end

      @trials_done += 1
      # `<` (not `<=`) so that at most @max_trials pages are fetched; stop early
      # when nothing came back, because there is no cursor to continue from.
      recurse_to_next_page classes, last_posts if @trials_done < @max_trials && !last_posts.empty?
      self
    end

    # Classifies a single post and remembers the full result.
    #
    # @return [Object] the most likely class (the result's #max_class)
    def classify(subreddit, title, post)
      @classifications = nbayes.classify subreddit, title, post
      @classifications.max_class
    end

    # Persists the model. @return [RedditTrainer] self
    def dump
      nbayes.dump
      self
    end

    # Loads the persisted model; when none can be loaded, trains from Reddit
    # and persists the result. @return [RedditTrainer] self
    def load
      train.dump unless nbayes.load
      self
    end

    # Fetches the listing at +path+ and classifies every post in it.
    #
    # @param path [String] listing path; defaults to the front page (:hot)
    # @return [Hash] classification => array of [post_url, classification] pairs
    def fetch_and_classify(path = CLASSES[:hot])
      posts = reddit(path)["data"]["children"]
      log "Classifying #{posts.size} posts"

      by_url = posts.each_with_object({}) do |p, result|
        post = Post.new p
        result[uri_with_base(post.path).to_s] = classify post.subreddit, post.title, post.body
      end
      by_url.group_by { |_, c| c }
    end

    def inspect
      "<#{self.class}:#{object_id.to_s(16)} @max_trials=#{@max_trials.inspect} @per_page=#{@per_page.inspect} @debug=#{@debug.inspect} @posts.size=#{@posts.size}>"
    end

    private

    # Downloads the JSON form of a listing.
    #
    # Always returns a Hash shaped like { "data" => { "children" => [...] } };
    # any fetch or parse failure, or an unexpected payload, yields an empty
    # listing (best effort, as before) and is reported through #log.
    def reddit(path)
      uri = uri_with_base path
      return empty_listing unless uri

      uri.path = "#{uri.path}.json" unless uri.path.end_with?(".json")

      # Argument-array form of popen: no shell is involved, so `&` in the query
      # string is not treated as a shell operator and the session id cannot
      # inject commands. `-b` sends the cookie; `-c` would only name a cookie
      # jar file to write.
      cmd = ["curl", "-s"]
      session = ENV["REDDIT_SESSION_ID"]
      cmd.push("-b", "reddit_session=#{session}") unless session.to_s.empty?
      cmd << uri.to_s

      listing = JSON.parse IO.popen(cmd, &:read)
      listing?(listing) ? listing : empty_listing
    rescue JSON::ParserError, SystemCallError, URI::Error => e
      log "could not fetch #{path}: #{e.class}: #{e.message}"
      empty_listing
    end

    # A listing with no posts, used whenever Reddit could not be read.
    def empty_listing
      { "data" => { "children" => [] } }
    end

    # True when +json+ has the "data" => "children" => Array shape.
    def listing?(json)
      json.is_a?(Hash) && json["data"].is_a?(Hash) && json["data"]["children"].is_a?(Array)
    end

    # Continues training with the page following each listing's own last post.
    # Listings without an entry in +last_posts+ are dropped.
    #
    # @param classes [Hash{Symbol=>String}] label => path of the page just read
    # @param last_posts [Hash{Symbol=>Post}] label => last post seen on that page
    def recurse_to_next_page(classes, last_posts)
      next_pages = classes.each_with_object({}) do |(classification, path), pages|
        last_post = last_posts[classification]
        next unless last_post

        url = next_page_url path, last_post
        pages[classification] = url if url
      end

      train next_pages unless next_pages.empty?
    end

    # Builds the URL of the page after +last_post+, keeping the listing's other
    # query parameters (e.g. sort/t) and replacing any previous cursor.
    #
    # NOTE(review): Reddit's listing API also has a `limit` parameter; `count`
    # is kept because that is what this code has always sent - confirm intent.
    def next_page_url(path, last_post)
      uri = uri_with_base path
      return unless uri

      params = URI.decode_www_form(uri.query.to_s).reject { |key, _| %w[count after].include?(key) }
      params << ["count", @per_page.to_s] << ["after", last_post.full_id]
      uri.query = URI.encode_www_form(params)
      uri.to_s
    end

    # Resolves +path+ against +base+.
    #
    # @return [URI::Generic, nil] nil when path is nil or not a valid URI
    def uri_with_base(path, base = REDDIT_URL)
      return if path.nil?

      URI.join base, path
    rescue URI::Error
      nil # ignore bad uri
    end

    def log(msg)
      puts "[#{self.class}] #{msg}" if @debug
    end

    # Value object over one child of a Reddit listing.
    class Post
      attr_reader :full_id, :path, :subreddit, :title, :body

      # @param json [Hash] listing child with "kind" and "data" keys
      def initialize(json)
        @data = json["data"] || {}
        @full_id = "#{json["kind"]}_#{@data["id"]}"
        @path = @data["permalink"]
        @subreddit = @data["subreddit"]
        @title = @data["title"]
        # Link posts have no selftext; fall back to the linked URL.
        @body = (@data["selftext"].to_s.empty? ? @data["url"] : @data["selftext"]).to_s
      end

      # Text used for training: subreddit, title, url and selftext, one per line.
      def serialize
        "#{@data["subreddit"]}\n#{@data["title"]}\n#{@data["url"]}\n#{@data["selftext"]}"
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
require "nbayes"
|
3
|
+
require "RedditPostClassifierBot/version"
|
4
|
+
require "RedditPostClassifierBot/nbayes_classifier"
|
5
|
+
require "RedditPostClassifierBot/reddit_trainer"
|
6
|
+
require "RedditPostClassifierBot/good_post_finder"
|
7
|
+
|
8
|
+
module RedditPostClassifierBot
  class << self
    # Process-wide trainer, built on first use and loaded (or trained, when no
    # persisted model can be loaded) via RedditTrainer#load.
    def classifier
      return @classifier if @classifier

      @classifier = RedditTrainer.new.load
    end

    # Runs another training pass on the shared trainer and persists the model.
    def train_classifier
      trainer = classifier
      trainer.train.dump
    end

    # Fetches the listing at +path+ and classifies each of its posts.
    def classify_posts(path)
      classifier.fetch_and_classify(path)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: RedditPostClassifierBot
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Diego Salazar
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-12-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nbayes
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.9'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.9'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
description: This gem wraps Ruby's nbayes gem to do text classification of Reddit
|
56
|
+
posts. It classifies posts according to where they were fetch - frontpage, controversial,
|
57
|
+
top, or bad posts. It can be used to try to predict if a new post will make it to
|
58
|
+
the front page, maybe.
|
59
|
+
email:
|
60
|
+
- diego@greyrobot.com
|
61
|
+
executables: []
|
62
|
+
extensions: []
|
63
|
+
extra_rdoc_files: []
|
64
|
+
files:
|
65
|
+
- ".gitignore"
|
66
|
+
- ".travis.yml"
|
67
|
+
- Gemfile
|
68
|
+
- README.md
|
69
|
+
- RPCB-nbayes.yml
|
70
|
+
- Rakefile
|
71
|
+
- RedditPostClassifierBot.gemspec
|
72
|
+
- bin/console
|
73
|
+
- bin/setup
|
74
|
+
- lib/RedditPostClassifierBot.rb
|
75
|
+
- lib/RedditPostClassifierBot/nbayes_classifier.rb
|
76
|
+
- lib/RedditPostClassifierBot/reddit_trainer.rb
|
77
|
+
- lib/RedditPostClassifierBot/version.rb
|
78
|
+
homepage: https://github.com/DiegoSalazar/RedditPostClassifierBot
|
79
|
+
licenses: []
|
80
|
+
metadata: {}
|
81
|
+
post_install_message:
|
82
|
+
rdoc_options: []
|
83
|
+
require_paths:
|
84
|
+
- lib
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
+
requirements:
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
requirements: []
|
96
|
+
rubyforge_project:
|
97
|
+
rubygems_version: 2.4.6
|
98
|
+
signing_key:
|
99
|
+
specification_version: 4
|
100
|
+
summary: Run Naive Bayes classification of Reddit posts
|
101
|
+
test_files: []
|