websitary 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile ADDED
@@ -0,0 +1,27 @@
1
# -*- ruby -*-
#
# Build script for the websitary gem.
#
# The gem specification is declared through Hoe; its description, url and
# changes fields are read from README.txt and History.txt. Two tag-generation
# tasks are added on top: the one provided by RTagsTask and a plain ctags run.

require 'rubygems'
require 'hoe'
load './lib/websitary.rb'

Hoe.new('websitary', Websitary::VERSION) do |spec|
  spec.rubyforge_name = 'websitiary'
  spec.author = 'Thomas Link'
  spec.email = 'micathom at gmail com'
  spec.summary = 'A unified website news, rss feed, podcast monitor'
  spec.description = spec.paragraphs_of('README.txt', 2..5).join("\n\n")

  # The url is every line of the README's first paragraph except its first.
  readme_head = spec.paragraphs_of('README.txt', 0).first
  spec.url = readme_head.split(/\n/)[1..-1]

  spec.changes = spec.paragraphs_of('History.txt', 0..1).join("\n\n")
  spec.extra_deps << 'hpricot'
  # spec.need_tgz = false
  spec.need_zip = true
end

require 'rtagstask'
RTagsTask.new

# Run ctags over the executables and the library; the command's output is
# discarded.
task(:ctags) do
  %x{ctags --extra=+q -R bin lib}
end

# vim: syntax=Ruby
data/bin/websitary ADDED
@@ -0,0 +1,43 @@
1
#!/usr/bin/env ruby
# websitary.rb -- The website news, rss feed, podcast catching monitor
# @Last Change: 2007-09-09.
# Author::      Thomas Link (micathom at gmail com)
# License::     GPL (see http://www.gnu.org/licenses/gpl.txt)
# Created::     2007-06-09.
#
# Command-line entry point. Without a global :timer option the requested
# command is run once and its exit code becomes the process exit code. With a
# timer the command is re-run (on a fresh Websitary::App each time) until a
# run returns an exit code greater than 1.


require 'websitary'


if __FILE__ == $0
    w = Websitary::App.new(ARGV)
    t = w.configuration.get_optionvalue(:global, :timer)
    if t
        exit_code = 0
        loop do
            exit_code = Websitary::App.new(ARGV).process
            # Leave immediately on failure instead of waiting out one more
            # timer interval before reporting the error.
            break if exit_code > 1
            case t
            when Numeric
                # A number is the pause between two runs, in seconds.
                $logger.info "Sleep: #{t}s"
                sleep t
            when Proc
                # A proc is called between two runs; it is expected to do
                # the waiting itself.
                t.call
            else
                $logger.fatal "Malformed timer: #{t}"
                exit_code = 5
                break
            end
        end
    else
        exit_code = w.process
    end
    exit exit_code
    # sleep 5
end



# vi: ft=ruby:tw=72:ts=2:sw=4
# Local Variables:
# revisionRx: REVISION\s\+=\s\+\'
# End:
data/lib/websitary.rb ADDED
@@ -0,0 +1,610 @@
1
+ # websitary.rb
2
+ # @Last Change: 2007-09-11.
3
+ # Author:: Thomas Link (micathom AT gmail com)
4
+ # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
+ # Created:: 2007-09-08.
6
+ #
7
+ # = TODO
8
+ # * Built-in support for robots.txt
9
+ # * Option to append to output files (e.g. rss)
10
+ # * Option to trim output files (when appending items)
11
+
12
+
13
+ require 'cgi'
14
+ require 'digest/md5'
15
+ require 'ftools'
16
+ require 'net/ftp'
17
+ require 'optparse'
18
+ require 'pathname'
19
+ require 'rbconfig'
20
+ require 'uri'
21
+ require 'open-uri'
22
+ require 'yaml'
23
+ require 'rss'
24
+
25
# Load the third-party libraries on a best-effort basis: a library that cannot
# be loaded is reported on stderr and loading continues with the next one.
#
# Only ScriptError (which includes LoadError and SyntaxError) and
# StandardError are rescued, so that Interrupt, SystemExit and NoMemoryError
# are no longer swallowed while a library is being required.
['hpricot', 'robot_rules'].each do |f|
    begin
        require f
    rescue ScriptError, StandardError => e
        $stderr.puts <<EOT
#{e.message}
Library could not be loaded: #{f}
Please see the requirements section at: http://websitiary.rubyforge.org
EOT
    end
end
36
+
37
+
38
# Namespace of the application; holds its identification constants.
# The strings are frozen so that no caller can modify them in place.
module Websitary
    # Name of the application.
    APPNAME  = 'websitary'.freeze
    # Release version of the gem.
    VERSION  = '0.2.0'.freeze
    # Source revision (matched by the revisionRx pattern in the file trailer).
    REVISION = '2403'.freeze
end
43
+
44
+ require 'websitary/applog'
45
+ require 'websitary/filemtimes'
46
+ require 'websitary/configuration'
47
+ require 'websitary/htmldiff'
48
+
49
+
50
module Websitary
  # The application object: runs the command selected by the configuration
  # (one of the execute_* methods) and collects the resulting text per url.
  #
  # Basic usage:
  #   Websitary::App.new(ARGV).process
  class App
    MINUTE_SECS = 60
    HOUR_SECS   = MINUTE_SECS * 60
    DAY_SECS    = HOUR_SECS * 24

    # Stylesheet written to <cfgdir>/websitary.css when that file is missing.
    DEFAULT_CSS = <<'CSS'
body {
color: black;
background-color: #f0f0f0;
}
a.external {
}
a.old {
}
a.latest {
}
a.toc {
}
ol.toc {
float: left;
width: 200px;
position: fixed;
padding: 0;
margin: 0;
}
li.toc {
list-style: none;
border: 1px solid #e0e0e0;
background-color: #fafafa;
padding: 0.1em;
font-size: 80%;
font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
li.toc:hover {
background-color: #ffff8d;
}
div.contents {
margin-left: 210px;
min-width: 16em;
}
div.webpage {
margin: 5px 0 5px 0;
padding: 5px;
border: 1px solid #e0e0e0;
background-color: white;
}
div.count {
text-align: right;
}
.enclosure {
padding: 4px;
margin: 4px 0 4px 0;
background: #f9f9f9;
}
h1.diff {
font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
h2.rss {
border-top: 10px solid #f0f0f0;
padding-top: 10px;
}
div.diff {
padding-left: 2em;
}
pre.diff {
padding-left: 2em;
}
div.annotation {
font-size: 80%;
}
hr.separator {
width: 100%;
visibility: hidden;
}
.error {
color: yellow;
background-color: red;
}
.highlight-yellow {
background-color: #ffc730;
}
.highlight-red {
background-color: red;
}
.highlight-blue {
background-color: blue;
}
.highlight-aqua {
background-color: aqua;
}
CSS


    # Hash: The output of the diff commands for each url.
    attr_reader :difftext

    # The configurator
    attr_reader :configuration

    # Secs until next update.
    attr_reader :tdiff_min


    # Set up the configuration, make sure the configuration directory
    # exists and install the default stylesheet if there is none yet.
    # args:: Array of command-line (like) arguments.
    def initialize(args=[])
      @configuration = Websitary::Configuration.new(self, args)
      @difftext      = {}
      @tdiff_min     = nil

      ensure_dir(@configuration.cfgdir)
      install_default_css
    end


    # Run the command named by @configuration.execute, i.e. the public
    # method "execute_<command>". The mtimes store is swapped out afterwards
    # in any case (also when the command raises or exits).
    # Returns the command's exit code, or 5 for an unknown command.
    def process
      m = "execute_#{@configuration.execute}"
      if respond_to?(m)
        send(m)
      else
        $logger.fatal "Unknown command: #{@configuration.execute}"
        5
      end
    ensure
      @configuration.mtimes.swap_out
    end


    # Show the currently configured URLs together with the names of their
    # current/backup files and the value of every known option.
    def execute_configuration
      keys = @configuration.options.keys
      @configuration.todo.each do |url|
        # A url may have no options hash of its own.
        data = @configuration.urls[url] || {}
        text = [
          "<b>URL</b><br/>#{CGI.escapeHTML(url.to_s)}<br/>",
          "<b>current</b><br/>#{CGI.escapeHTML(@configuration.latestname(url, true))}<br/>",
          "<b>backup</b><br/>#{CGI.escapeHTML(@configuration.oldname(url, true))}<br/>"
        ]
        (data.keys | keys).each do |k|
          v = @configuration.get(url, k).inspect
          text << "<b>:#{k}</b><br/>#{CGI.escapeHTML(v)}<br/>"
        end
        accumulate(url, text.join("<br/>"))
      end
      show
    end


    # Append +url+ to the list of urls the given configuration has to handle.
    def cmdline_arg_add(configuration, url)
      configuration.todo << url
    end


    # Append the urls to handle as "source" lines to the quick-list profile.
    # Returns 0 on success; exits with code 5 when no quick-list profile
    # file is available.
    def execute_add
      profile   = @configuration.quicklist_profile
      quicklist = profile && @configuration.profile_filename(profile, false)
      if quicklist
        $logger.info "Use quicklist file: #{quicklist}"
        @configuration.write_file(quicklist, 'a') do |io|
          @configuration.todo.each do |url|
            io.puts %{source #{url.inspect}}
          end
        end
        return 0
      end
      $logger.fatal 'No valid quick-list profile defined'
      exit 5
    end


    # Restore previous backups: copy each existing backup file over the
    # current copy of its url. Returns 0.
    def execute_unroll
      @configuration.todo.each do |url|
        latest = @configuration.latestname(url, true)
        backup = @configuration.oldname(url, true)
        next unless File.exist?(backup)
        $logger.warn "Restore: #{url}"
        $logger.debug "Copy: #{backup} => #{latest}"
        copy(backup, latest)
      end
      0
    end


    # Edit currently chosen profiles, then exit with code 0.
    def execute_edit
      @configuration.edit_profile
      exit 0
    end


    # Show the latest report. Returns 0.
    def execute_review
      @configuration.view_output
      0
    end


    # Show the current version of all urls. Urls without a local copy are
    # reported and left out instead of aborting the whole report.
    def execute_latest
      @configuration.todo.each do |url|
        latest = @configuration.latestname(url)
        unless File.exist?(latest)
          $logger.warn "No local copy: #{@configuration.get(url, :title, url).inspect}"
          next
        end
        accumulate(url, File.read(latest))
      end
      show
    end


    # Rebuild the report from the already downloaded copies.
    def execute_rebuild
      execute_downdiff(true, true)
    end


    # Aggregate data for later review (see #execute_show): every non-empty
    # diff is written to its own "aggregate" file instead of being shown.
    # Returns the value of #execute_downdiff.
    def execute_aggregate
      rv = execute_downdiff(false) do |url, difftext, opts|
        if difftext && !difftext.empty?
          aggrbase = @configuration.encoded_filename('aggregate', url, true, 'md5')
          aggrext  = Digest::MD5.hexdigest(Time.now.to_s)
          aggrfile = [aggrbase, aggrext].join('_')
          @configuration.write_file(aggrfile) {|io| io.puts difftext}
        end
      end
      clean_diffs
      rv
    end


    # Show data collected by #execute_aggregate. The aggregate files of a
    # url are joined (with the url's :joindiffs command, by default with
    # newlines), accumulated and then deleted.
    def execute_show
      @configuration.todo.each do |url|
        opts = @configuration.urls[url]
        $logger.debug "Source: #{@configuration.get(url, :title, url)}"
        aggrbase  = @configuration.encoded_filename('aggregate', url, true, 'md5')
        aggrfiles = Dir["#{aggrbase}_*"]
        difftext  = aggrfiles.map {|file| File.read(file)}
        difftext.delete('')
        unless difftext.empty?
          joindiffs = @configuration.get(url, :joindiffs, lambda {|t| t.join("\n")})
          difftext  = @configuration.call_cmd(joindiffs, [difftext]) if joindiffs
          accumulate(url, difftext, opts)
        end
        aggrfiles.each {|file| File.delete(file)}
      end
      show
    end


    # Process the sources in @configuration.todo as defined by profiles
    # and command-line options. Unless a block is given, the differences
    # are stored in @difftext (a Hash).
    # show_output:: If true, show the output with the defined viewer.
    # rebuild::     If true, neither check the age of the local copy nor
    #               download anything; diff the files already on disk.
    # accumulator:: Optional block called with (url, difftext, opts) for
    #               every diff instead of storing it in @difftext. It also
    #               receives diffs reused from a saved diff file, so that
    #               they are not dropped when the saved diffs are deleted.
    # Returns the value of #show if show_output is true; otherwise 0 if
    # @difftext is empty and 1 if it is not.
    def execute_downdiff(show_output=true, rebuild=false, &accumulator)
      @configuration.todo.each do |url|
        opts = @configuration.urls[url]
        $logger.debug "Source: #{@configuration.get(url, :title, url)}"

        diffed = @configuration.diffname(url, true)
        $logger.debug "diffname: #{diffed}"

        if File.exist?(diffed)
          $logger.warn "Reuse old diff: #{@configuration.get(url, :title, url)} => #{diffed}"
          deliver_diff(url, File.read(diffed), opts, accumulator)
          next
        end

        latest = @configuration.latestname(url, true)
        $logger.debug "latest: #{latest}"
        next if !rebuild && skip_url?(url, latest, opts)

        older = @configuration.oldname(url, true)
        $logger.debug "older: #{older}"
        next unless rebuild || download(url, opts, latest, older)

        difftext = diff(url, opts, latest, older)
        next unless difftext
        @configuration.write_file(diffed, 'wb') {|io| io.puts difftext}
        deliver_diff(url, difftext, opts, accumulator)
      end
      return show if show_output
      @difftext.empty? ? 0 : 1
    end


    # Rename +from+ to +to+ (see #copy_move).
    def move(from, to)
      copy_move(:rename, from, to)
    end


    # Copy +from+ to +to+ (see #copy_move).
    def copy(from, to)
      copy_move(:copy, from, to)
    end


    # Apply the File class method +method+ (:copy or :rename) to the two
    # files, give the target the access/modification times of the source
    # and record that mtime for both names in the mtimes store.
    # Does nothing if +from+ does not exist.
    def copy_move(method, from, to)
      return unless File.exist?(from)
      $logger.debug "Overwriting: #{from} -> #{to}" if File.exist?(to)
      lst = File.lstat(from)
      File.send(method, from, to)
      File.utime(lst.atime, lst.mtime, to)
      @configuration.mtimes.set(from, lst.mtime)
      @configuration.mtimes.set(to, lst.mtime)
    end


    # Format a time span given in seconds as whole days ("3d"), whole hours
    # ("5h") or, below one hour, whole minutes ("20m").
    def format_tdiff(secs)
      days = (secs / DAY_SECS).to_i
      return "#{days}d" if days > 0
      hours = (secs / HOUR_SECS).to_i
      return "#{hours}h" if hours > 0
      "#{(secs / MINUTE_SECS).to_i}m"
    end


    # Make sure +dir+ exists as a directory, creating missing parents.
    # fatal_nondir:: If true, exit with code 5 when +dir+ (or one of its
    #                parents) exists but is no directory; if false, log the
    #                problem and return false.
    # Returns true if the directory exists afterwards.
    def ensure_dir(dir, fatal_nondir=true)
      if File.exist?(dir)
        return true if File.directory?(dir)
        if fatal_nondir
          $logger.fatal "Not a directory: #{dir}"
          exit 5
        end
        $logger.info "Not a directory: #{dir}"
        return false
      end

      parent = Pathname.new(dir).parent.to_s
      unless File.directory?(parent)
        # Do not try to create a directory below a parent that could not
        # be set up.
        return false unless ensure_dir(parent, fatal_nondir)
      end
      Dir.mkdir(dir)
      true
    end


    private

    # Write the default stylesheet to the configuration directory unless a
    # stylesheet exists already.
    def install_default_css
      css = File.join(@configuration.cfgdir, 'websitary.css')
      return if File.exist?(css)
      $logger.info "Copying default css file: #{css}"
      @configuration.write_file(css, 'w') do |io|
        io.puts DEFAULT_CSS
      end
    end


    # Hand a diff either to the caller-supplied accumulator (if any) or
    # store it in @difftext.
    def deliver_diff(url, difftext, opts, accumulator)
      if accumulator
        accumulator.call(url, difftext, opts)
      else
        accumulate(url, difftext, opts)
      end
    end


    # Fetch +url+ with its :download command, post-process the text (line
    # filters from +opts+, then the :downloadprocess command) and write it
    # to +latest+. If +older+ is given, an existing +latest+ is moved there
    # first. Every url is downloaded at most once per configuration.
    # Returns true if a new copy was written, false otherwise.
    def download(url, opts, latest, older=nil)
      if @configuration.done.include?(url)
        $logger.info "Already downloaded: #{@configuration.get(url, :title, url).inspect}"
        return false
      end

      $logger.warn "Download: #{@configuration.get(url, :title, url).inspect}"
      @configuration.done << url
      text = @configuration.call_cmd(@configuration.get(url, :download), [url])
      unless text
        $logger.warn "no contents: #{@configuration.get(url, :title, url)}"
        return false
      end

      if opts
        sleepsecs = opts[:sleep]
        sleep sleepsecs if sleepsecs
        text = filter_lines(text, opts)
      end

      pprc = @configuration.get(url, :downloadprocess)
      if pprc
        $logger.debug "download process: #{pprc}"
        text = @configuration.call_cmd(pprc, [text])
      end

      if older
        if File.exist?(latest)
          move(latest, older)
        elsif !File.exist?(older)
          $logger.warn "Initial copy: #{latest.inspect}"
        end
      end
      @configuration.write_file(latest) {|io| io.puts(text)}
      true
    end


    # Apply the line-based options to a downloaded text, in this order:
    # :lines (range of lines to keep), :cols (range of columns to keep),
    # :sort (true or a comparison Proc) and :strip (drop blank lines).
    # Returns the remaining lines joined with newlines.
    def filter_lines(text, opts)
      lines = text.split("\n")
      if (range = opts[:lines])
        $logger.debug "download: lines=#{range}"
        lines = lines[range] || []
      end
      if (range = opts[:cols])
        $logger.debug "download: cols=#{range}"
        lines.map! {|l| l[range]}
        lines.compact!
      end
      if (order = opts[:sort])
        $logger.debug "download: sort=#{order}"
        case order
        when true
          lines.sort!
        when Proc
          lines.sort!(&order)
        end
      end
      if opts[:strip]
        $logger.debug "download: strip!"
        lines.delete_if {|l| l !~ /\S/}
      end
      lines.join("\n")
    end


    # Compare the files +old+ and +new+ with the url's :diff command and
    # pass a non-blank result through its :diffprocess command.
    # Returns the diff text if there is a non-blank difference; the whole
    # content of +new+ if there is no +old+ file and :show_initial is set
    # for the url or globally; nil otherwise.
    def diff(url, opts, new, old)
      if File.exist?(old)
        $logger.debug "diff: #{old} <-> #{new}"
        difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new])

        if difftext =~ /\S/
          if (pprc = @configuration.get(url, :diffprocess))
            $logger.debug "diff process: #{pprc}"
            difftext = @configuration.call_cmd(pprc, [difftext])
          end
          if difftext =~ /\S/
            $logger.warn "Changed: #{@configuration.get(url, :title, url).inspect}"
            return difftext
          end
        end

        $logger.debug "Unchanged: #{@configuration.get(url, :title, url).inspect}"

      elsif File.exist?(new) &&
          (@configuration.get(url, :show_initial) || @configuration.get_optionvalue(:global, :show_initial))

        return File.read(new)

      end
      nil
    end


    # Decide whether +url+ should be left alone because its local copy
    # +latest+ is still young enough according to the age options in +opts+
    # (see #tdiff_with). A missing copy or the :ignore_age option always
    # means "do not skip". When a url is skipped because of a numeric age
    # limit, @tdiff_min is lowered to the remaining seconds if that is
    # less than its current value.
    # Returns true (skip) or false.
    def skip_url?(url, latest, opts)
      opts ||= {}
      return false unless File.exist?(latest) && !opts[:ignore_age]

      tn    = Time.now
      tl    = @configuration.mtimes.mtime(latest)
      td    = tn - tl
      tdiff = tdiff_with(opts, tn, tl)
      case tdiff
      when nil, false
        $logger.debug "Age requirement fulfilled: #{@configuration.get(url, :title, url).inspect}: #{format_tdiff(td)} old"
        false
      when :skip, true
        $logger.info "Skip #{@configuration.get(url, :title, url).inspect}: Only #{format_tdiff(td)} old"
        true
      when Numeric
        return false unless td < tdiff
        tdd = tdiff - td
        @tdiff_min = tdd if @tdiff_min.nil? || tdd < @tdiff_min
        $logger.info "Skip #{@configuration.get(url, :title, url).inspect}: Only #{format_tdiff(td)} old (#{format_tdiff(tdiff)})"
        true
      else
        $logger.fatal "Internal error: tdiff=#{tdiff.inspect}"
        exit 5
      end
    end


    # Translate the age option found first in +opts+ (:hours, :daily,
    # :days_of_week/:wdays, :days_of_month/:mdays, :days, :months) into a
    # value understood by #skip_url?.
    # tn:: Current time.
    # tl:: Time of the last update of the local copy.
    # Returns a minimum age in seconds (:hours, :days), true/:skip if the
    # url has to be skipped, or false if it may be updated.
    def tdiff_with(opts, tn, tl)
      if (hdiff = opts[:hours])
        tdiff = hdiff * HOUR_SECS
        $logger.debug "hours: #{hdiff} (#{tdiff}s)"
      elsif opts[:daily]
        tdiff = tl.year == tn.year && tl.yday == tn.yday
        $logger.debug "daily: #{tl} <=> #{tn} (#{tdiff})"
      elsif (dweek = opts[:days_of_week] || opts[:wdays])
        # The year is part of the period so that the same week number of
        # another year does not count as "already updated this week".
        tdiff = tdiff_x_of_y(dweek, tn.wday, [tn.year, tn.yday / 7], [tl.year, tl.yday / 7])
        $logger.debug "wdays: #{dweek} (#{tdiff})"
      elsif (dmonth = opts[:days_of_month] || opts[:mdays])
        tdiff = tdiff_x_of_y(dmonth, tn.day, [tn.year, tn.month], [tl.year, tl.month])
        $logger.debug "mdays: #{dmonth} (#{tdiff})"
      elsif (ddiff = opts[:days])
        tdiff = ddiff * DAY_SECS
        $logger.debug "days: #{ddiff} (#{tdiff}s)"
      elsif (dmonth = opts[:months])
        tnowm = tn.month + 12 * (tn.year - tl.year)
        tlm   = tl.month
        tdiff = (tnowm - tlm) < dmonth
        $logger.debug "months: #{dmonth} (#{tdiff})"
      else
        tdiff = false
      end
      tdiff
    end


    # Helper for "on day(s) x of period y" rules.
    # eligible::        Day(s) on which an update is allowed: an Integer,
    #                   an Array or a Range.
    # now::             Today's day number within the period.
    # parent_eligible:: The current period (as passed by #tdiff_with).
    # parent_now::      The period of the last update.
    # Returns true (skip) if the last update happened in the current period
    # or if today is no eligible day, false if today is eligible, and :skip
    # if +eligible+ is of an unsupported type.
    def tdiff_x_of_y(eligible, now, parent_eligible, parent_now)
      return true if parent_eligible == parent_now

      case eligible
      when Array, Range
        !eligible.include?(now)
      when Integer
        eligible != now
      else
        $logger.error "Wrong type for :days_of_week/:days_of_month: #{eligible.inspect}"
        :skip
      end
    end


    # Remember +difftext+ as the text to show for +url+.
    def accumulate(url, difftext, opts=nil)
      # opts ||= @configuration.urls[url]
      @difftext[url] = difftext
    end


    # Pass the accumulated texts to the configuration's output and delete
    # the saved diff files afterwards (also if showing fails).
    # Returns the value of @configuration.show_output.
    def show
      @configuration.show_output(@difftext)
    ensure
      clean_diffs
    end


    # Delete every file in the "diff" subdirectory of the configuration
    # directory.
    def clean_diffs
      Dir[File.join(@configuration.cfgdir, 'diff', '*')].each do |f|
        $logger.debug "Delete saved diff: #{f}"
        File.delete(f)
      end
    end

  end
end
605
+
606
+
607
+
608
+ # Local Variables:
609
+ # revisionRx: REVISION\s\+=\s\+\'
610
+ # End: