paperboy 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+ docs
21
+
22
+ ## PROJECT::SPECIFIC
23
+ example
24
+ *_paperboy_output.html
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 TPM Media LLC
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,19 @@
1
+ # paperboy
2
+
3
+ Use chartbeat's historical API to get popular stories over any time period. Look at those stories' META tags to get context and weed out dead links. Package stories into neat HTML files. Ideal for morning emails. Works with [chartbeat.gem](http://github.com/ashaw/chartbeat) and [stats_combiner.gem](http://github.com/tpm/stats_combiner) (for data filtering).
4
+
5
+ [Docs](http://tpm.github.com/paperboy).
6
+
7
+ Install with `gem install paperboy`
8
+
9
+ To do: tests.
10
+
11
+ ## Author
12
+
13
+ Al Shaw (al@talkingpointsmemo.com)
14
+
15
+ ## License
16
+
17
+ Copyright (c) 2010 TPM Media LLC
18
+
19
+ MIT License (see LICENSE file).
@@ -0,0 +1,65 @@
1
$LOAD_PATH.unshift 'lib'

require 'rubygems'
require 'rake'
require 'rake/clean'

# Gem packaging tasks via jeweler. If jeweler cannot be loaded we only print a
# hint, so the remaining tasks in this file stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "paperboy"
    gem.summary = %Q{A tool to generate HTML summaries of popular stories}
    gem.description = %Q{HTML story list generator from the chartbeat API. Good for daily email newsletters.}
    gem.email = "almshaw@gmail.com"
    gem.homepage = "http://github.com/tpm/paperboy"
    gem.authors = ["Al Shaw"]
    gem.add_dependency 'chartbeat'
    gem.add_dependency 'stats_combiner', '>= 0.0.3'
    gem.add_dependency 'hashie'
    gem.add_dependency 'nokogiri'

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# rocco for docs. When rocco is missing, define an empty :rocco task so that
# the tasks depending on it below can still be declared.
begin
  require 'rocco/tasks'
  Rocco::make 'docs/'
rescue LoadError
  warn "#$! -- rocco tasks not loaded."
  task :rocco
end

desc 'Build rocco docs'
task :docs => :rocco
directory 'docs/'

desc 'Build docs and open in browser for the reading'
task :read => :docs do
  # :docs also builds docs/index.html (see below), so open that file.
  sh 'open docs/index.html'
end

# Make index.html a copy of paperboy.html
file 'docs/index.html' => 'docs/paperboy.html' do
  cp 'docs/paperboy.html', 'docs/index.html', :preserve => true
end
task :docs => 'docs/index.html'
CLEAN.include 'docs/index.html'

# Alias for docs task
task :doc => :docs

# gh-pages

desc "update gh-pages"
task :pages do
  file = "index.html"
  sh "git checkout gh-pages"
  sh "cp ./docs/#{file} #{file}"
  sh "git add #{file}"
  sh "git commit -m 'docs -> gh-pages'"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,371 @@
1
+ require 'erb'
2
+ require 'open-uri'
3
+ require 'chartbeat'
4
+ require 'stats_combiner/filterer'
5
+ require 'hashie'
6
+ require 'nokogiri'
7
+
8
+ # **Paperboy** is a chartbeat-based library for creating
9
+ # HTML files showcasing your most popular stories
10
+ # over the course of a time period (perhaps for a daily newsletter).
11
+ # It sniffs out META tags from URLs to build images and blurbs for the page.
12
+ # Paperboy builds on concepts from [stats_combiner.gem][sc] but relies on chartbeat's
13
+ # historical [snapshots][sn] endpoint, where stats_combiner uses real-time data.
14
+ #
15
+ # [sc]: http://github.com/tpm/stats_combiner
16
+ # [sn]: http://chartbeat.pbworks.com/snapshots
17
+ module Paperboy
18
+
19
+ # `Paperboy::Collector` queries the chartbeat API's snapshots method
20
+ # and consolidates visitors over the specified timespan.
21
+ # Then it pushes out barebones HTML to be gussied-up.
22
class Collector

  # Only the writer is generated here; the reader is the hand-written
  # `outfile` method below, which also checks that the file exists.
  attr_writer :outfile

  # Initialize a `Paperboy` instance. This script is relatively expensive, and
  # is ideally run on a cron, perhaps once a day. Unlike stats_combiner, Paperboy uses
  # historical data, so it really doesn't matter *when* you set this to run, as long as you're
  # grabbing relative timestamps.
  #
  # API key (`:apikey`) and host (`:host`) come from your Chartbeat settings; both are
  # required and a `Paperboy::Error` is raised when either is missing.
  # Start and end times are UNIX timestamps. Paperboy will collect once per `:interval`
  # between them. The default window is four hours long and ends one hour ago.
  # Filters is an instance of `StatsCombiner::Filterer`. To use it, first
  # instantiate a Filterer object with:
  #
  #     e = StatsCombiner::Filterer.new
  #
  # then add filter rules such as
  #
  #     e.add {
  #       :prefix => 'tpmdc',
  #       :title_regex => /\| TPMDC/,
  #       :modify_title => true
  #     }
  #
  # finally, pass `e.filters` to this method.
  #
  # `img_xpath` and `blurb_xpath` are xpath queries that will run on the URL extracted
  # from chartbeat (and any filters run on it) to populate your email with data that might
  # reside in META tags. Here are some I've found useful. `*_xpath` takes the `content`
  # attribute of whatever HEAD tag is queried.
  #
  #     :img_xpath => '//head/meta[@property="og:image"]',
  #     :blurb_xpath => '//head/meta[@name="description"]'
  #
  # Another option is `:interval`, which determines the spacing of the snapshots taken between
  # `start_time` and `end_time`. The default is 3600 seconds, or one hour.
  #
  # Usage example:
  #
  #     p = Paperboy::Collector.new(
  #       :apikey => 'chartbeat_api_key',
  #       :host => 'yourdomain.com',
  #       :start_time => 1277784000,
  #       :end_time => 1277870399,
  #       :interval => 3600,
  #       :filters => e.filters,
  #       :img_xpath => '//head/meta[@property="og:image"]',
  #       :blurb_xpath => '//head/meta[@name="description"]'
  #     )
  #
  # The static file generated by Paperboy will be called "yourdomain.com_paperboy_output.html."
  # Change this with `p.outfile = 'other_name.html'`
  #
  def initialize(opts = {})
    now = Time.now.to_i
    @opts = {
      :apikey => nil,
      :host => nil,
      :start_time => now - 18000, # four hour default window
      :end_time => now - 3600,
      :interval => 3600,
      :filters => nil,
      :img_xpath => nil,
      :blurb_xpath => nil
    }.merge!(opts)

    if @opts[:apikey].nil? || @opts[:host].nil?
      raise Paperboy::Error, "No Chartbeat API Key or Host Specified!"
    end

    @c = Chartbeat.new :apikey => @opts[:apikey], :host => @opts[:host]
    @outfile = "#{@opts[:host]}_paperboy_output.html"

    @stories = []
    @uniq_stories = []
  end

  # **Run** runs the collector according to parameters set up in `new`.
  # By default, it will generate an HTML file in the current directory with a
  # standard bare-bones structure. There is also an option to pass data through an
  # ERB template. That is done like so:
  #
  #     p.run :via => 'erb', :template => '/path/to/tmpl.erb'
  #
  # ERB templates will expect to iterate over a `@stories` array, where each item is
  # a hash of story attributes. See Paperboy::View#erb below for more on templating.
  # Raises `Paperboy::Error` when `:via => 'erb'` is given without a `:template`.
  def run(opts = {})
    @run_opts = {
      :via => 'html',
      :template => nil
    }.merge!(opts)

    result = collect_stories
    v = Paperboy::View.new(result, @outfile)

    if @run_opts[:via] == 'erb'
      if @run_opts[:template].nil?
        raise Paperboy::Error, "A template file must be specified with the erb option."
      end
      v.erb(@run_opts[:template])
    else
      v.html
    end
  end

  # Determine if there is an outfile for this instance. If so, print and return
  # the filename; otherwise raise `Paperboy::Error`.
  def outfile
    unless File.exist?(@outfile)
      raise Paperboy::Error, "No result file: #{@outfile} Try calling `run` first in this directory"
    end
    # Printing is kept for scripts that relied on it; the name is now returned too.
    puts @outfile
    @outfile
  end

  # Get the contents of the HTML file. I.e. the final product of the Paperboy run.
  def html
    File.read(@outfile)
  end

  #### Internals

  protected

  # Find out how many times we'll have to query the Chartbeat API.
  # We'll only do it once per `@opts[:interval]` between start and end times.
  # By default, the interval is 3600 seconds. The start time is always included;
  # the end time is exclusive.
  def get_collection_intervals
    start = @opts[:start_time]
    finish = @opts[:end_time]
    step = @opts[:interval]

    # A zero or negative step would never reach the end time.
    unless step.is_a?(Numeric) && step > 0
      raise Paperboy::Error, "Interval must be a positive number of seconds (got #{step.inspect})"
    end

    times = [start]
    t = start + step
    while t < finish
      times << t
      t += step
    end
    times
  end

  # Query the chartbeat API via the chartbeat gem, and organize
  # stories over the course of the day into a big array.
  # Returns the packaged stories (see `package_stories`).
  def collect_stories
    # Start from a clean slate so calling `run` twice doesn't double-count.
    @stories = []

    get_collection_intervals.each do |time|
      puts "Collecting for #{Time.at(time)}..."
      h = Hashie::Mash.new(@c.snapshots(:timestamp => time))

      titles = h.titles.to_a
      paths = h.active

      if paths.nil?
        warn "Warning! No data collected for #{Time.at(time)}. Results may be skewed! Try setting older timestamps for best results."
      else
        # Index visitor totals by path; the first entry for a path wins.
        totals = {}
        paths.each do |entry|
          totals[entry.path] = entry.total unless totals.key?(entry.path)
        end

        # Match titles to paths and add visitors.
        titles.each do |title|
          title << totals[title[0]] if totals.key?(title[0])
        end
      end

      @stories << titles
    end

    package_stories
  end

  # If filters are enabled, run each story through the Filterer,
  # and modify URL and Title as necessary
  def filter_story(hed, path)
    d = StatsCombiner::Filterer.apply_filters! @opts[:filters], :title => hed, :path => path
    d[:prefix] = d[:prefix] + '.' unless d[:prefix].nil?
    d[:url] = "http://#{d[:prefix]}#{@opts[:host]}#{path}"
    d
  end

  # Find out if we need to filter the stories, and send to `filter_story` if so.
  # Otherwise, weed out the dupes and get ready to package into something we can use.
  # Stories are considered duplicates when their headlines match; their visitor
  # counts are summed. Results land in `@uniq_stories` as `[url, hed, visitors]`.
  def prepackage_stories
    # Rebuilt on every call so repeated runs don't inflate visitor counts.
    @uniq_stories = []

    @stories.each do |hour|
      hour.each do |datum|
        path = datum[0].dup
        hed = datum[1].dup
        visitors = datum[2] || 0

        if @opts[:filters]
          d = filter_story(hed, path)
          hed = d[:title]
          path = d[:path]
          url = d[:url]
        else
          url = "http://#{@opts[:host]}#{path}"
        end

        # A nil path after filtering means the story is skipped.
        next if path.nil?

        existing = @uniq_stories.find { |story| story[1] == hed }
        if existing
          existing[2] += visitors
        else
          @uniq_stories << [url, hed, visitors]
        end
      end
    end
  end

  # First, send stories to be prepackaged. Then, sort them by visitors,
  # and start looking for blurbs and images out on the URLs themselves for the top ten.
  # At some point, it might be a good idea to make the number collected an option.
  # Returns an array of story hashes (`:url`, `:hed`, `:visitors`, `:blurb`, `:img`)
  # for the view to render.
  def package_stories
    prepackage_stories

    top_stories = @uniq_stories.sort { |a, b| b[2] <=> a[2] }[0..9]

    story_pkgs = []

    top_stories.each do |url, hed, visitors|
      begin
        d = Nokogiri::HTML(URI.open(url))
      rescue OpenURI::HTTPError, Timeout::Error
        d = nil
      end

      # Only grab metadata and add this story to the stories array
      # if it's reachable. Otherwise, we'll assume it's a dead link and skip it.
      next if d.nil?

      story_pkgs << {
        :url => url,
        :hed => hed,
        :visitors => visitors,
        :blurb => meta_content(d, @opts[:blurb_xpath]) || '',
        :img => meta_content(d, @opts[:img_xpath]) || ''
      }
    end

    story_pkgs
  end

  # Return the `content` attribute of the first node matching `xpath` in `doc`,
  # or nil when no xpath is configured, nothing matches, or the lookup fails.
  # Metadata is optional decoration, so failures here are deliberately not fatal.
  def meta_content(doc, xpath)
    return nil if xpath.nil?
    node = doc.at_xpath(xpath)
    node && node['content']
  rescue StandardError
    nil
  end

end
280
+
281
+ #### Templating Paperboy
282
+
283
+ # **Paperboy::View** is for templating Paperboy output.
284
class View

  # Views are initialized from the `run` method of `Paperboy::Collector`
  # but can also be invoked separately, if you have an array of stories.
  # `story_pkgs` is an array of hashes with `:url`, `:hed`, `:img` and `:blurb`
  # keys; `outfile` is the path the rendered markup is written to.
  def initialize(story_pkgs, outfile)
    @stories = story_pkgs
    @outfile = outfile
  end

  # HTML is the default output method. It writes a bare-bones
  # page of story output including blurbs and images if available.
  # Story fields are HTML-escaped because they come from external pages.
  def html
    markup = @stories.map { |pkg| story_markup(pkg) }.join
    write(markup)
  end

  # Templatize your story output with embedded ruby.
  # This allows the greatest flexibility for presenting the data.
  #
  # To use, access the `@stories` array, and its component hashes.
  # Values are handed to the template as-is; escape them there if needed.
  # Example:
  #
  #     <h1>My Popular Stories</h1>
  #
  #     <% @stories.each do |story| %>
  #       <div class="story">
  #         <h2><a href="<%= story[:url] %>"><%= story[:hed] %></a></h2>
  #         <% if not story[:img].empty? %>
  #           <div class="img">
  #             <a href="<%= story[:url] %>">
  #               <img src="<%= story[:img] %>">
  #             </a>
  #           </div>
  #         <% end %>
  #         <% if not story[:blurb].empty? %>
  #           <div class="blurb"><%= story[:blurb] %></div>
  #         <% end %>
  #       </div>
  #     <% end %>
  #
  def erb(template)
    source = File.read(template)
    # `binding` exposes this view's `@stories` to the template.
    rendered = ERB.new(source).result(binding)

    write(rendered)
  end

  # Write out flat HTML to a file from either plain html or erb templating.
  # The block form closes the file even if the write raises. Returns nil.
  def write(html)
    File.open(@outfile, "w+") { |f| f.write(html) }
    nil
  end

  private

  # Build the markup for a single story hash. The image and blurb sections
  # are left out when the corresponding value is nil or empty.
  def story_markup(pkg)
    url = ERB::Util.html_escape(pkg[:url])
    hed = ERB::Util.html_escape(pkg[:hed])

    lines = []
    lines << '<div class="story">'
    lines << "<h2><a href=\"#{url}\">#{hed}</a></h2>"

    unless pkg[:img].to_s.empty?
      img = ERB::Util.html_escape(pkg[:img])
      lines << "<div class=\"img\"><a href=\"#{url}\"><img src=\"#{img}\"></a></div>"
    end

    unless pkg[:blurb].to_s.empty?
      lines << "<div class=\"blurb\">#{ERB::Util.html_escape(pkg[:blurb])}</div>"
    end

    lines << '</div>'
    lines.join("\n") + "\n"
  end

end
367
+
368
+ end
369
+
370
module Paperboy
  # Error raised by Paperboy for missing configuration (API key, host, ERB
  # template) and for a missing result file.
  class Error < StandardError
  end
end