websitary 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,27 @@
1
# -*- ruby -*-
#
# Build script: declares the gem through Hoe, registers the RTags task and
# adds a +ctags+ task that indexes bin/ and lib/.

require 'rubygems'
require 'hoe'
load './lib/websitary.rb'

# The gem metadata is taken from README.txt and History.txt; the version
# comes from the library itself.
Hoe.new('websitary', Websitary::VERSION) do |spec|
    spec.rubyforge_name = 'websitiary'
    spec.author = 'Thomas Link'
    spec.email = 'micathom at gmail com'
    spec.summary = 'A unified website news, rss feed, podcast monitor'
    spec.description = spec.paragraphs_of('README.txt', 2..5).join("\n\n")
    # Every line of the first README paragraph except the first one.
    spec.url = spec.paragraphs_of('README.txt', 0).first.split(/\n/)[1..-1]
    spec.changes = spec.paragraphs_of('History.txt', 0..1).join("\n\n")
    spec.extra_deps << 'hpricot'
    # spec.need_tgz = false
    spec.need_zip = true
end

require 'rtagstask'
RTagsTask.new

# Rebuild the ctags index for the executables and the library.
task(:ctags) do
    %x{ctags --extra=+q -R bin lib}
end
26
+
27
+ # vim: syntax=Ruby
data/bin/websitary ADDED
@@ -0,0 +1,43 @@
1
#!/usr/bin/env ruby
# websitary.rb -- The website news, rss feed, podcast catching monitor
# @Last Change: 2007-09-09.
# Author::      Thomas Link (micathom at gmail com)
# License::     GPL (see http://www.gnu.org/licenses/gpl.txt)
# Created::     2007-06-09.
#
# Command-line entry point. Without a global :timer option the requested
# command is run once. With a timer the command is repeated, each time with
# a freshly built Websitary::App, for as long as it finishes with an exit
# code of 0 or 1. The process exits with the code of the last run.


require 'websitary'


# NOTE(review): this guard is false when the script is started through a
# generated wrapper that loads it (then $0 names the wrapper) -- confirm
# that the installed executable still runs.
if __FILE__ == $0
    app   = Websitary::App.new(ARGV)
    timer = app.configuration.get_optionvalue(:global, :timer)
    if timer
        exit_code = 0
        while exit_code <= 1
            exit_code = Websitary::App.new(ARGV).process
            # A failed run ends the loop anyway; don't wait for the timer
            # before reporting the failure.
            break if exit_code > 1
            case timer
            when Numeric
                $logger.info "Sleep: #{timer}s"
                sleep timer
            when Proc
                # The proc is responsible for blocking until the next run.
                timer.call
            else
                $logger.fatal "Malformed timer: #{timer}"
                exit_code = 5
                break
            end
        end
    else
        exit_code = app.process
    end
    exit exit_code
    # sleep 5
end
37
+
38
+
39
+
40
+ # vi: ft=ruby:tw=72:ts=2:sw=4
41
+ # Local Variables:
42
+ # revisionRx: REVISION\s\+=\s\+\'
43
+ # End:
data/lib/websitary.rb ADDED
@@ -0,0 +1,610 @@
1
+ # websitary.rb
2
+ # @Last Change: 2007-09-11.
3
+ # Author:: Thomas Link (micathom AT gmail com)
4
+ # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
+ # Created:: 2007-09-08.
6
+ #
7
+ # = TODO
8
+ # * Built-in support for robots.txt
9
+ # * Option to append to output files (e.g. rss)
10
+ # * Option to trim output files (when appending items)
11
+
12
+
13
require 'cgi'
require 'digest/md5'
require 'fileutils'
require 'ftools'
require 'net/ftp'
require 'open-uri'
require 'optparse'
require 'pathname'
require 'rbconfig'
require 'rss'
require 'uri'
require 'yaml'
24
+
25
# Load the third-party libraries. A library that cannot be loaded is
# reported on stderr and loading continues with the next one.
['hpricot', 'robot_rules'].each do |f|
    begin
        require f
    rescue LoadError => e
        # Only a failed load is expected here. Rescuing Exception would
        # also swallow interrupts, exit requests and syntax errors inside
        # the library.
        $stderr.puts <<EOT
#{e.message}
Library could not be loaded: #{f}
Please see the requirements section at: http://websitiary.rubyforge.org
EOT
    end
end
36
+
37
+
38
# Namespace of the application. It carries the identification constants;
# the classes are added by the files required below.
module Websitary
    # Name of the application.
    APPNAME = 'websitary'

    # Release version (also read by the Rakefile).
    VERSION = '0.2.0'

    # Revision counter of the source file.
    REVISION = '2403'
end
43
+
44
+ require 'websitary/applog'
45
+ require 'websitary/filemtimes'
46
+ require 'websitary/configuration'
47
+ require 'websitary/htmldiff'
48
+
49
+
50
+ # Basic usage:
51
+ # Websitary.new(ARGV).process
52
+ class Websitary::App
53
# Lengths of a minute, an hour and a day, in seconds.
MINUTE_SECS = 60
HOUR_SECS   = 60 * MINUTE_SECS
DAY_SECS    = 24 * HOUR_SECS
56
+
57
+
58
# Hash that maps each url to the output of its diff command.
attr_reader :difftext

# The configuration object (a Websitary::Configuration) built from the
# command-line arguments.
attr_reader :configuration

# Seconds until the next update is due: the smallest remaining waiting
# time among the urls skipped because of their age, or nil.
attr_reader :tdiff_min
66
+
67
+
68
# Build the configuration from the arguments, make sure the configuration
# directory exists and install the default stylesheet (websitary.css) there
# unless the file is already present.
#
# args:: Array of command-line (like) arguments.
def initialize(args=[])
    @configuration = Websitary::Configuration.new(self, args)
    @difftext      = {}
    @tdiff_min     = nil

    ensure_dir(@configuration.cfgdir)
    css = File.join(@configuration.cfgdir, 'websitary.css')
    # File.exist? instead of File.exists?: the plural alias is deprecated
    # and no longer available in current Ruby versions.
    unless File.exist?(css)
        $logger.info "Copying default css file: #{css}"
        @configuration.write_file(css, 'w') do |io|
            io.puts <<CSS
body {
    color: black;
    background-color: #f0f0f0;
}
a.external {
}
a.old {
}
a.latest {
}
a.toc {
}
ol.toc {
    float: left;
    width: 200px;
    position: fixed;
    padding: 0;
    margin: 0;
}
li.toc {
    list-style: none;
    border: 1px solid #e0e0e0;
    background-color: #fafafa;
    padding: 0.1em;
    font-size: 80%;
    font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
li.toc:hover {
    background-color: #ffff8d;
}
div.contents {
    margin-left: 210px;
    min-width: 16em;
}
div.webpage {
    margin: 5px 0 5px 0;
    padding: 5px;
    border: 1px solid #e0e0e0;
    background-color: white;
}
div.count {
    text-align: right;
}
.enclosure {
    padding: 4px;
    margin: 4px 0 4px 0;
    background: #f9f9f9;
}
h1.diff {
    font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
h2.rss {
    border-top: 10px solid #f0f0f0;
    padding-top: 10px;
}
div.diff {
    padding-left: 2em;
}
pre.diff {
    padding-left: 2em;
}
div.annotation {
    font-size: 80%;
}
hr.separator {
    width: 100%;
    visibility: hidden;
}
.error {
    color: yellow;
    background-color: red;
}
.highlight-yellow {
    background-color: #ffc730;
}
.highlight-red {
    background-color: red;
}
.highlight-blue {
    background-color: blue;
}
.highlight-aqua {
    background-color: aqua;
}
CSS
        end
    end
end
168
+
169
+
170
# Run the configured command by dispatching to the matching execute_*
# method and return its exit code. An unknown command is logged and yields
# exit code 5. The cached mtimes are swapped out in either case.
def process
    handler = "execute_#{@configuration.execute}"
    unless respond_to?(handler)
        $logger.fatal "Unknown command: #{@configuration.execute}"
        return 5
    end
    send(handler)
ensure
    @configuration.mtimes.swap_out
end
185
+
186
+
187
# Show the currently configured URLs: for every url on the todo list the
# names of the current and the backup copy plus the value of each option
# (the url's own keys and the global option keys) are accumulated as HTML
# and handed to #show, whose result is returned.
def execute_configuration
    global_keys = @configuration.options.keys
    @configuration.todo.each do |url|
        data = @configuration.urls[url]
        # Tolerate urls that have no entry of their own in the url table.
        keys = (data ? data.keys : []) | global_keys
        text = [
            # The url is escaped like every other value, so characters
            # such as "&" or "<" cannot break the generated markup.
            "<b>URL</b><br/>#{CGI.escapeHTML(url.to_s)}<br/>",
            "<b>current</b><br/>#{CGI.escapeHTML(@configuration.latestname(url, true))}<br/>",
            "<b>backup</b><br/>#{CGI.escapeHTML(@configuration.oldname(url, true))}<br/>"
        ]
        keys.each do |k|
            v = @configuration.get(url, k).inspect
            text << "<b>:#{k}</b><br/>#{CGI.escapeHTML(v)}<br/>"
        end
        accumulate(url, text.join("<br/>"))
    end
    return show
end
207
+
208
+
209
# Handle a command-line argument of the "add" command: put the url on the
# todo list of the given configuration. Returns the todo list.
def cmdline_arg_add(configuration, url)
    configuration.todo.push(url)
end
212
+
213
+
214
# Append a "source" line for every url on the todo list to the quick-list
# profile and return 0. Without a usable quick-list profile a fatal message
# is logged and the process exits with code 5.
def execute_add
    quicklist = nil
    if @configuration.quicklist_profile
        quicklist = @configuration.profile_filename(@configuration.quicklist_profile, false)
        $logger.info "Use quicklist file: #{quicklist}"
    end
    unless quicklist
        $logger.fatal 'No valid quick-list profile defined'
        exit 5
    end
    @configuration.write_file(quicklist, 'a') do |out|
        @configuration.todo.each {|url| out.puts "source #{url.inspect}"}
    end
    return 0
end
230
+
231
+
232
# Restore previous backups: for every url on the todo list whose backup
# file exists, copy the backup over the latest copy. Always returns 0.
def execute_unroll
    @configuration.todo.each do |url|
        latest = @configuration.latestname(url, true)
        backup = @configuration.oldname(url, true)
        next unless File.exist?(backup)
        $logger.warn "Restore: #{url}"
        $logger.debug "Copy: #{backup} => #{latest}"
        copy(backup, latest)
    end
    0
end
245
+
246
+
247
# Edit the currently chosen profiles, then terminate the process with exit
# code 0 (the method does not return).
def execute_edit
    @configuration.edit_profile
    exit(0)
end
252
+
253
+
254
# Show the latest report with the configured viewer. Always returns 0.
def execute_review
    @configuration.view_output
    return 0
end
259
+
260
+
261
+ # Show the current version of all urls
262
+ def execute_latest
263
+ @configuration.todo.each do |url|
264
+ latest = @configuration.latestname(url)
265
+ text = File.read(latest)
266
+ accumulate(url, text)
267
+ end
268
+ return show
269
+ end
270
+
271
+
272
# Rebuild the report from the already downloaded copies: run
# #execute_downdiff in rebuild mode and show the output.
def execute_rebuild
    show_output = true
    rebuild     = true
    execute_downdiff(show_output, rebuild)
end
276
+
277
+
278
# Aggregate data for later review (see #execute_show): run
# #execute_downdiff without showing the output and store every non-empty
# diff in its own aggregate file. The file name is the url's encoded
# 'aggregate' name followed by "_" and the MD5 digest of the current time.
# The saved diffs are removed afterwards; the result of #execute_downdiff
# is returned.
def execute_aggregate
    result = execute_downdiff(false) do |url, difftext, opts|
        next unless difftext and !difftext.empty?
        base   = @configuration.encoded_filename('aggregate', url, true, 'md5')
        suffix = Digest::MD5.hexdigest(Time.now.to_s)
        @configuration.write_file("#{base}_#{suffix}") {|out| out.puts difftext}
    end
    clean_diffs
    result
end
291
+
292
+
293
# Show data collected by #execute_aggregate: for every url on the todo list
# the aggregate files are read, the non-empty texts are combined with the
# url's :joindiffs command (default: join with newlines) and accumulated.
# The aggregate files are deleted afterwards. Returns the result of #show.
def execute_show
    @configuration.todo.each do |url|
        opts = @configuration.urls[url]
        $logger.debug "Source: #{@configuration.get(url, :title, url)}"
        aggrbase  = @configuration.encoded_filename('aggregate', url, true, 'md5')
        aggrfiles = Dir["#{aggrbase}_*"]
        chunks    = aggrfiles.map {|file| File.read(file)}
        chunks.compact!
        chunks.delete('')
        if !chunks.empty?
            joindiffs = @configuration.get(url, :joindiffs, lambda {|t| t.join("\n")})
            chunks = @configuration.call_cmd(joindiffs, [chunks]) if joindiffs
            accumulate(url, chunks, opts)
        end
        aggrfiles.each {|file| File.delete(file)}
    end
    show
end
317
+
318
+
319
# Process the sources in @configuration.todo as defined by profiles and
# command-line options. The differences are stored in @difftext (a Hash).
#
# show_output:: If true, show the output with the defined viewer and return
#               its result; otherwise return 0 if no differences were
#               collected and 1 if there are some.
# rebuild::     If true, neither check the age of the local copies nor
#               download anything; only diff the copies already present.
# accumulator:: Optional block that receives (url, difftext, opts) for each
#               newly computed diff instead of #accumulate.
def execute_downdiff(show_output=true, rebuild=false, &accumulator)
    @configuration.todo.each do |url|
        opts = @configuration.urls[url]
        $logger.debug "Source: #{@configuration.get(url, :title, url)}"

        diffed = @configuration.diffname(url, true)
        $logger.debug "diffname: #{diffed}"

        # File.exist? instead of File.exists?: the plural alias is
        # deprecated and no longer available in current Ruby versions.
        if File.exist?(diffed)
            # A diff saved by an earlier run is reused.
            # NOTE(review): this branch always uses #accumulate, even when
            # an accumulator block was given -- confirm this is intended.
            $logger.warn "Reuse old diff: #{@configuration.get(url, :title, url)} => #{diffed}"
            difftext = File.read(diffed)
            accumulate(url, difftext, opts)
        else
            latest = @configuration.latestname(url, true)
            $logger.debug "latest: #{latest}"
            next unless rebuild or !skip_url?(url, latest, opts)

            older = @configuration.oldname(url, true)
            $logger.debug "older: #{older}"

            if rebuild or download(url, opts, latest, older)
                difftext = diff(url, opts, latest, older)
                if difftext
                    # Save the diff so that it can be reused until the
                    # saved diffs are cleaned up.
                    @configuration.write_file(diffed, 'wb') {|io| io.puts difftext}
                    # $logger.debug "difftext: #{difftext}" #DBG#
                    if accumulator
                        accumulator.call(url, difftext, opts)
                    else
                        accumulate(url, difftext, opts)
                    end
                end
            end
        end
    end
    return show if show_output
    return @difftext.empty? ? 0 : 1
end
358
+
359
+
360
# Rename the file +from+ to +to+ via #copy_move, which also carries over
# the timestamps and records them in the configuration's mtimes.
def move(from, to)
    return copy_move(:rename, from, to)
end
363
+
364
+
365
# Copy the file +from+ to +to+ via #copy_move, which also carries over the
# timestamps and records them in the configuration's mtimes.
def copy(from, to)
    return copy_move(:copy, from, to)
end
368
+
369
+
370
# Copy or rename a file while keeping its timestamps. Nothing happens (and
# nil is returned) if +from+ does not exist.
#
# method:: :copy or :rename; any other value is sent to File as before.
# from::   Name of the source file.
# to::     Name of the target file; an existing file is overwritten.
def copy_move(method, from, to)
    return nil unless File.exist?(from)
    $logger.debug "Overwriting: #{from} -> #{to}" if File.exist?(to)
    lst = File.lstat(from)
    if method == :copy
        # File.copy was provided by ftools, which is no longer part of the
        # standard library; FileUtils.cp is the replacement.
        FileUtils.cp(from, to)
    else
        File.send(method, from, to)
    end
    # Give the target the timestamps of the source and record the mtime
    # for both names.
    File.utime(lst.atime, lst.mtime, to)
    @configuration.mtimes.set(from, lst.mtime)
    @configuration.mtimes.set(to, lst.mtime)
end
380
+
381
+
382
# Format a time span for log messages, using the largest unit that is not
# zero: whole days ("2d"), whole hours ("3h") or whole minutes ("25m").
# Spans shorter than an hour are given in minutes instead of "0h".
#
# secs:: The time span in seconds.
def format_tdiff(secs)
    days = (secs / DAY_SECS).to_i
    return "#{days}d" if days > 0
    hours = (secs / HOUR_SECS).to_i
    return "#{hours}h" if hours > 0
    return "#{(secs / MINUTE_SECS).to_i}m"
end
391
+
392
+
393
# Make sure that +dir+ exists as a directory, creating it and any missing
# parent directories if necessary.
#
# dir::          Name of the directory.
# fatal_nondir:: If true (the default), exit with code 5 when +dir+ or one
#                of its parents exists but is not a directory; if false,
#                log the problem and return false.
#
# Returns true if the directory exists afterwards.
def ensure_dir(dir, fatal_nondir=true)
    if File.exist?(dir)
        return true if File.directory?(dir)
        if fatal_nondir
            $logger.fatal "Not a directory: #{dir}"
            exit 5
        end
        $logger.info "Not a directory: #{dir}"
        return false
    end
    parent = Pathname.new(dir).parent.to_s
    # Give up if the parent cannot be provided; calling Dir.mkdir below a
    # non-directory would only raise.
    unless File.directory?(parent) || ensure_dir(parent, fatal_nondir)
        return false
    end
    Dir.mkdir(dir)
    return true
end
411
+
412
+
413
+ private
414
+
415
# Download +url+ with its :download command and store the text in +latest+.
# A url is downloaded only once per run (see @configuration.done).
#
# url::    The source.
# opts::   The url's options or nil; if given, the text is filtered line by
#          line (see #download_filter_lines).
# latest:: Name of the file that receives the text.
# older::  Optional name of the backup file; an existing +latest+ is moved
#          there before it is overwritten.
#
# Returns true if a new copy was written, false if the url was already
# downloaded or yielded no contents.
def download(url, opts, latest, older=nil)
    if @configuration.done.include?(url)
        $logger.info "Already downloaded: #{@configuration.get(url, :title, url).inspect}"
        return false
    end

    $logger.warn "Download: #{@configuration.get(url, :title, url).inspect}"
    @configuration.done << url
    text = @configuration.call_cmd(@configuration.get(url, :download), [url])
    # $logger.debug text #DBG#
    if !text
        $logger.warn "no contents: #{@configuration.get(url, :title, url)}"
        return false
    end

    text = download_filter_lines(text, opts) if opts

    postprocess = @configuration.get(url, :downloadprocess)
    if postprocess
        $logger.debug "download process: #{postprocess}"
        text = @configuration.call_cmd(postprocess, [text])
        # $logger.debug text #DBG#
    end

    if older and File.exist?(latest)
        move(latest, older)
    elsif older and !File.exist?(older)
        $logger.warn "Initial copy: #{latest.inspect}"
    end
    @configuration.write_file(latest) {|out| out.puts(text)}
    true
end


# Apply the line-oriented options to a downloaded text and return the
# result: pause for :sleep seconds, keep the :lines selection, keep the
# :cols part of every line, sort the lines (:sort is true or a Proc used
# for comparing) and drop lines without visible characters (:strip).
def download_filter_lines(text, opts)
    pause = opts[:sleep]
    sleep pause if pause

    lines = text.split("\n")
    if (selection = opts[:lines])
        $logger.debug "download: lines=#{selection}"
        lines = lines[selection] || []
    end
    if (columns = opts[:cols])
        $logger.debug "download: cols=#{columns}"
        lines.map! {|line| line[columns]}
        lines.compact!
    end
    if (order = opts[:sort])
        $logger.debug "download: sort=#{order}"
        case order
        when true
            lines.sort!
        when Proc
            lines.sort!(&order)
        end
    end
    if opts[:strip]
        $logger.debug "download: strip!"
        lines.reject! {|line| line !~ /\S/}
    end
    lines.join("\n")
end
477
+
478
+
479
# Compare the new copy of a source with the old one.
#
# url::  The source.
# opts:: The url's options (not used here).
# new::  Name of the file with the latest copy.
# old::  Name of the file with the previous copy.
#
# If +old+ exists, the url's :diff command is run and its output, after the
# optional :diffprocess command, is returned provided it contains visible
# characters. If +old+ is missing but +new+ exists, the contents of +new+
# are returned when :show_initial is set for the url or globally. In all
# other cases nil is returned.
def diff(url, opts, new, old)
    # File.exist? instead of File.exists?: the plural alias is deprecated
    # and no longer available in current Ruby versions.
    if File.exist?(old)
        $logger.debug "diff: #{old} <-> #{new}"
        difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new])
        # $logger.debug "diff: #{difftext}" #DBG#

        if difftext =~ /\S/
            if (pprc = @configuration.get(url, :diffprocess))
                $logger.debug "diff process: #{pprc}"
                difftext = @configuration.call_cmd(pprc, [difftext])
            end
            # $logger.debug "difftext: #{difftext}" #DBG#
            # The post-processor may have removed everything of interest.
            if difftext =~ /\S/
                $logger.warn "Changed: #{@configuration.get(url, :title, url).inspect}"
                return difftext
            end
        end

        $logger.debug "Unchanged: #{@configuration.get(url, :title, url).inspect}"

    elsif File.exist?(new) and
        (@configuration.get(url, :show_initial) or @configuration.get_optionvalue(:global, :show_initial))

        return File.read(new)

    end
    return nil
end
507
+
508
+
509
# Decide whether a source should be skipped because its local copy is
# still recent enough.
#
# url::    The source.
# latest:: Name of the file with the latest copy.
# opts::   The url's options (may be nil); :ignore_age disables the check,
#          the age requirement itself is evaluated by #tdiff_with.
#
# Returns true if the url should be skipped, false otherwise. When a url
# is skipped because of a numeric age requirement, @tdiff_min is lowered to
# the remaining waiting time if that is smaller than the current value.
def skip_url?(url, latest, opts)
    opts ||= {}
    # File.exist? instead of File.exists?: the plural alias is deprecated
    # and no longer available in current Ruby versions.
    return false unless File.exist?(latest) && !opts[:ignore_age]

    tn = Time.now
    tl = @configuration.mtimes.mtime(latest)
    td = tn - tl
    tdiff = tdiff_with(opts, tn, tl)
    case tdiff
    when nil, false
        $logger.debug "Age requirement fulfilled: #{@configuration.get(url, :title, url).inspect}: #{format_tdiff(td)} old"
        return false
    when :skip, true
        $logger.info "Skip #{@configuration.get(url, :title, url).inspect}: Only #{format_tdiff(td)} old"
        return true
    when Numeric
        # tdiff is the minimum age in seconds.
        if td < tdiff
            tdd = tdiff - td
            @tdiff_min = tdd if @tdiff_min.nil? or tdd < @tdiff_min
            $logger.info "Skip #{@configuration.get(url, :title, url).inspect}: Only #{format_tdiff(td)} old (#{format_tdiff(tdiff)})"
            return true
        end
        return false
    else
        $logger.fatal "Internal error: tdiff=#{tdiff.inspect}"
        exit 5
    end
end
535
+
536
+
537
# Evaluate the age requirement of a source. The first option found, in
# this order, decides: :hours, :daily, :days_of_week (or :wdays),
# :days_of_month (or :mdays), :days, :months.
#
# opts:: The url's options (may be nil).
# tn::   The current time.
# tl::   The modification time of the latest copy.
#
# Returns the minimum age in seconds (:hours, :days), true if the source
# should be skipped and false if it is due, or whatever #tdiff_x_of_y
# returns for the weekday and day-of-month options. Without any age option
# the result is false.
def tdiff_with(opts, tn, tl)
    opts ||= {}
    if (hdiff = opts[:hours])
        tdiff = hdiff * HOUR_SECS
        $logger.debug "hours: #{hdiff} (#{tdiff}s)"
    elsif opts[:daily]
        # Skip if the latest copy was made today.
        tdiff = tl.year == tn.year && tl.yday == tn.yday
        $logger.debug "daily: #{tl} <=> #{tn} (#{tdiff})"
    elsif (dweek = opts[:days_of_week] || opts[:wdays])
        # The period (week of the year) includes the year, as in the
        # :daily case, so a copy from the same week of an earlier year
        # does not count as current.
        tdiff = tdiff_x_of_y(dweek, tn.wday, [tn.year, tn.yday / 7], [tl.year, tl.yday / 7])
        $logger.debug "wdays: #{dweek} (#{tdiff})"
    elsif (dmonth = opts[:days_of_month] || opts[:mdays])
        # Same for months: compare year and month, not just the month.
        tdiff = tdiff_x_of_y(dmonth, tn.day, [tn.year, tn.month], [tl.year, tl.month])
        $logger.debug "mdays: #{dmonth} (#{tdiff})"
    elsif (ddiff = opts[:days])
        tdiff = ddiff * DAY_SECS
        $logger.debug "days: #{ddiff} (#{tdiff}s)"
    elsif (dmonth = opts[:months])
        # Number of calendar months between the two dates.
        tnowm = tn.month + 12 * (tn.year - tl.year)
        tlm   = tl.month
        tdiff = (tnowm - tlm) < dmonth
        $logger.debug "months: #{dmonth} (#{tdiff})"
    else
        tdiff = false
    end
    return tdiff
end
563
+
564
+
565
# Decide whether a source restricted to certain days (of the week or of the
# month) has to be skipped today.
#
# eligible::        The days on which the source may be updated: an Array
#                   or Range of day numbers or a single Integer.
# now::             Today's day number.
# parent_eligible:: The current period (week or month).
# parent_now::      The period of the latest copy.
#
# Returns true (skip) if the latest copy is from the current period or if
# today is not an eligible day, false if the source is due, and :skip if
# +eligible+ is of an unsupported type.
def tdiff_x_of_y(eligible, now, parent_eligible, parent_now)
    return true if parent_eligible == parent_now

    # Dispatch on the type of the day specification (not on today's day
    # number, which is always an Integer).
    case eligible
    when Array, Range
        return !eligible.include?(now)
    when Integer
        return eligible != now
    else
        $logger.error "Wrong type for eligible days: #{eligible.inspect}"
        return :skip
    end
end
580
+
581
+
582
# Remember the text to be reported for +url+ in @difftext, replacing any
# earlier entry. +opts+ is accepted but not used. Returns +difftext+.
def accumulate(url, difftext, opts=nil)
    @difftext.store(url, difftext)
end
586
+
587
+
588
# Hand the accumulated texts to the configuration's output routine and
# return its result. The saved diffs are removed afterwards, even if the
# output routine raises.
def show
    @configuration.show_output(@difftext)
ensure
    clean_diffs
end
595
+
596
+
597
# Delete all saved diffs, i.e. every entry in the "diff" subdirectory of
# the configuration directory.
def clean_diffs
    pattern = File.join(@configuration.cfgdir, 'diff', '*')
    Dir.glob(pattern).each do |saved|
        $logger.debug "Delete saved diff: #{saved}"
        File.delete(saved)
    end
end
603
+
604
+ end
605
+
606
+
607
+
608
+ # Local Variables:
609
+ # revisionRx: REVISION\s\+=\s\+\'
610
+ # End: