websitiary 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (7) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +6 -0
  3. data/README.txt +474 -0
  4. data/Rakefile +20 -0
  5. data/bin/websitiary +1351 -0
  6. data/setup.rb +1585 -0
  7. metadata +71 -0
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
# -*- ruby -*-

require 'rubygems'
require 'hoe'
load './bin/websitiary'

# Package definition. Description, url and changes are extracted from
# README.txt and History.txt.
Hoe.new('websitiary', Websitiary::VERSION) do |spec|
  spec.rubyforge_name = 'websitiary'
  spec.author         = 'Thomas Link'
  spec.email          = 'sanobast-ruby@yahoo.de'
  spec.summary        = 'A simple website monitor'

  # Paragraphs 2..5 of the README form the long description.
  readme_body = spec.paragraphs_of('README.txt', 2..5)
  spec.description = readme_body.join("\n\n")

  # Every line but the first of the README's first paragraph.
  readme_head = spec.paragraphs_of('README.txt', 0).first
  spec.url = readme_head.split(/\n/)[1..-1]

  spec.changes = spec.paragraphs_of('History.txt', 0..1).join("\n\n")
  spec.extra_deps << 'hpricot'
  # spec.need_tgz = false
  spec.need_zip = true
end

# vim: syntax=Ruby
data/bin/websitiary ADDED
@@ -0,0 +1,1351 @@
1
+ #! /usr/bin/ruby.exe
2
+ # websitiary.rb -- Website Monitor
3
+ # @Last Change: 2007-07-16.
4
+ # Author:: Thomas Link (samul AT web de)
5
+ # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
6
+ # Created:: 2007-06-09.
7
+ #
8
+ # = TODO
9
+ # * Find a ruby-based replacement for webdiff (or port webdiff to ruby)
10
+ # * Built-in support for robots.txt
11
+ # * Option to append to output files
12
+ # * Option to trim output files (when appending items)
13
+
14
+
15
+ require 'cgi'
16
+ require 'digest/md5'
17
+ require 'logger'
18
+ require 'optparse'
19
+ require 'pathname'
20
+ require 'rbconfig'
21
+ require 'uri'
22
+ require 'open-uri'
23
+
24
+
25
# Load the third-party libraries. A library that cannot be loaded is
# reported on stderr; the program keeps running without it.
['hpricot', 'robot_rules'].each do |lib|
  begin
    require lib
  rescue ScriptError, StandardError => e
    # ScriptError covers LoadError/SyntaxError raised while requiring.
    # Interrupt and SystemExit are deliberately NOT trapped here.
    $stderr.puts <<EOT
#{e.message}
Library could not be loaded: #{lib}
Please see the requirements section at: http://websitiary.rubyforge.org
EOT
  end
end
36
+
37
+
38
+
39
+ # Basic usage:
40
+ # Websitiary.new(ARGV).process
41
+ class Websitiary
42
+ APPNAME = 'websitiary'
43
+ VERSION = '0.1.0'
44
+ REVISION = '1447'
45
+ MINUTE_SECS = 60
46
+ HOUR_SECS = MINUTE_SECS * 60
47
+ DAY_SECS = HOUR_SECS * 24
48
+
49
# Thin wrapper around Logger. Creating an instance installs a new logger
# in the global $logger; #set_level maps symbolic verbosity names onto
# Logger severities.
class AppLog
  # output:: Log destination handed to Logger.new ($stdout when nil/false).
  def initialize(output=nil)
    @output = output || $stdout
    logger = Logger.new(@output, 'daily')
    logger.progname = APPNAME
    logger.datetime_format = "%H:%M:%S"
    $logger = logger
    set_level
  end


  # level:: :debug, :verbose or :quiet; any other value selects WARN.
  def set_level(level=:default)
    severities = {
      :debug   => Logger::DEBUG,
      :verbose => Logger::INFO,
      :quiet   => Logger::ERROR
    }
    $logger.level = severities.fetch(level, Logger::WARN)
    $logger.debug "Set logger level: #{level}"
  end
end
74
+
75
+
76
+ # This class defines the scope in which profiles are evaluated. Most
77
+ # of its methods are suitable for use in profiles.
78
+ class Configuration
79
+ # Hash (key = URL, value = Hash of options)
80
+ attr_accessor :urls
81
+ # Array of urls to be downloaded.
82
+ attr_accessor :todo
83
+ # Array of downloaded urls.
84
+ attr_accessor :done
85
+ # The user configuration directory
86
+ attr_accessor :cfgdir
87
+ # attr_accessor :default_profiles
88
+ # attr_accessor :options
89
+ # attr_accessor :cmd_edit
90
+
91
+
92
# Install the built-in defaults (options, shortcuts, page template),
# evaluate the 'config.rb' profile and then apply the command-line
# arguments.
# app::  Stored in @app.
# args:: Array of command-line (like) arguments.
def initialize(app, args=[])
  @logger = AppLog.new
  $logger.debug "Configuration#initialize"
  @app    = app
  @urls   = {}
  @todo   = []
  @done   = []
  @robots = {}
  @allow  = {}

  @suffix = {
    'text' => 'txt'
    # 'rss' => 'xml'
  }

  # Default configuration directory: ~/.websitiary, or the current
  # directory when HOME is not set. The first EXISTING directory of
  # %USERPROFILE%/websitiary and <sysconfdir>/websitiary overrides it.
  @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitiary') : '.'
  sysconfdir = RbConfig::CONFIG['sysconfdir']
  candidates = [
    ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitiary'),
    sysconfdir && File.join(sysconfdir, 'websitiary')
  ]
  # compact: a candidate is nil when its environment variable is unset.
  existing = candidates.compact.find {|dir| File.exist?(dir)}
  @cfgdir = existing if existing

  @user_agent = "websitiary/#{Websitiary::VERSION}"

  @cmd_edit = 'vi "%s"'

  @options = {:global => {}}

  # How to compare the old and the latest copy: a command template
  # (old, new) or a Proc taking the two filenames.
  @options[:diff] = {
    :default => :diff,
    :diff => 'diff -d -w -u2 "%s" "%s"',
    :webdiff => lambda do |old, new|
      $logger.debug "webdiff: #{File.basename(new)}"
      $logger.debug %{webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -}
      difftext = `webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -`
      # Only exit status 1 is treated as "there is a difference".
      $?.exitstatus == 1 ? difftext : ''
    end,
  }

  @options[:format] = {
    :default => :diff,
    :diff => %{<pre class="diff">\n%s\n</pre>},
    :webdiff => "%s\n",
  }

  # Post-processing of the diff text. For :diff: drop the first two
  # lines, keep only lines starting with '+' and strip that marker.
  @options[:diffprocess] = {
    :default => :diff,
    :diff => lambda {|text| text.split("\n")[2..-1].delete_if {|l| l =~ /^[^+]/}.map {|l| l[1..-1]}.join("\n")},
    :webdiff => false,
  }

  @options[:download] = {
    :default => :w3m,
  }

  @options[:downloadformat] = {
    :w3m => :text,
    :webdiff => :html,
  }

  @options[:downloadprocess] = {
  }

  @options[:rss] = {
    :version => '2.0',
  }

  @options[:strip_tags] = {
    :default => ['script', 'object', 'form', 'input', 'select', 'iframe', 'head', 'meta', 'link'],
  }

  shortcut :w3m, :delegate => :diff,
    :download => 'w3m -no-cookie -S -F -dump "%s"'

  shortcut :lynx, :delegate => :diff,
    :download => 'lynx -dump "%s"'

  shortcut :links, :delegate => :diff,
    :download => 'links -dump "%s"'

  shortcut :curl, :delegate => :webdiff,
    :download => 'curl --silent "%s"'

  shortcut :wget, :delegate => :webdiff,
    :download => 'wget -q -O - "%s"'

  # Download the page, keep the (url-rewritten, tag-stripped) inner
  # html of <body>. Errors are rendered into the returned html.
  shortcut :body_html, :delegate => :webdiff,
    :strip_tags => :default,
    :download => lambda {|url|
      begin
        doc = Hpricot(open(url))
        doc = doc.at('body')
        if doc
          doc = rewrite_urls(url, doc)
          doc = doc.inner_html
          if (tags = get(url, :strip_tags))
            doc = strip_tags(doc, :format => :hpricot, :tags => tags)
          end
        else
          $logger.warn 'inner html: No body'
        end
        doc.to_s
      rescue StandardError => e
        # $logger.error e #DBG#
        $logger.error e.message
        $logger.debug e.backtrace
        break %{<pre class="error">\n#{e.message}\n</pre>}
      end
    }

  shortcut :openuri, :delegate => :webdiff,
    :download => lambda {|url|
      begin
        open(url).read
      rescue StandardError => e
        # $logger.error e #DBG#
        $logger.error e.message
        $logger.debug e.backtrace
        %{<pre class="error">\n#{e.to_s}\n</pre>}
      end
    }

  # Like :body_html, but also queue links on the same host whose path
  # is eligible (see eligible_path?).
  shortcut :website, :delegate => :webdiff,
    :download => lambda {|url|
      html = @options[:download][:body_html].call(url)
      break unless html
      doc = Hpricot(html)
      push_hrefs(url, doc) do |uri0, pn0, uri, pn|
        eligible_path?(url, uri0.path, uri.path) &&
          uri.host == uri0.host
      end
      html
    }

  # Like :website, but only queue links whose directory equals the
  # directory of the source url.
  shortcut :website_below, :delegate => :webdiff,
    :download => lambda {|url|
      html = @options[:download][:body_html].call(url)
      break unless html
      doc = Hpricot(html)
      if doc
        push_hrefs(url, doc) do |uri0, pn0, uri, pn|
          eligible_path?(url, uri0.path, uri.path) &&
            uri.host == uri0.host &&
            pn.relative_path_from(pn0).to_s == '.'
        end
      end
      html
    }

  shortcut :website_txt, :delegate => :default,
    :download => lambda {|url|
      success, cmd = get_option(:download, :default)
      if success
        html = @options[:download][:website].call(url)
        html_to_text(html)
      end
    }

  shortcut :website_txt_below, :delegate => :default,
    :download => lambda {|url|
      success, cmd = get_option(:download, :default)
      if success
        html = @options[:download][:website_below].call(url)
        html_to_text(html)
      end
    }

  # Page template: ti = title, li = table of contents, bd = body.
  @options[:page] = {:format => lambda do |ti, li, bd|
    template = <<OUT
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>%s</title>
<link rel="stylesheet" href="websitiary.css" type="text/css">
<link rel="alternate" href="websitiary.rss" type="application/rss+xml" title="%s">
</head>
<body>
<ol class="toc">
%s
</ol>
<div class="contents">
%s
</div>
</body>
</html>
OUT
    template % [ti, ti, li, bd]
  end
  }

  # @view = nil
  @view = 'w3m "%s"'
  @default_options = {}
  @default_profiles = []
  @profiles = []
  @outfile = {}
  profile 'config.rb'
  parse_command_line_args(args)
  @output_format ||= ['html']
  @output_title = %{#{APPNAME}: #{@profiles.join(", ")}}
end
297
+
298
+
299
# Parse the command-line options, then load the profiles named by the
# remaining arguments (or the default profiles when none are given).
# args:: Array of command-line arguments; option switches are removed
#        from it by OptionParser#parse!.
# Returns self.
def parse_command_line_args(args)
  $logger.debug "parse_command_line_args: #{args}"
  parser = OptionParser.new do |o|
    o.banner = "Usage: #{APPNAME} [OPTIONS] [PROFILES] > [OUT]"
    o.separator ''
    o.separator "#{APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
    o.separator 'the terms of the GNU General Public License version 2 or newer.'
    o.separator ''

    o.separator 'General Options:'

    o.on('-c', '--cfg=DIR', String, 'Configuration directory') do |dir|
      @cfgdir = dir
    end

    o.on('-e', '--edit=PROFILE', String, 'Edit a profile') do |name|
      edit_profile name
      exit 0
    end

    o.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |formats|
      output_format(*formats.split(/,/))
    end

    o.on('--[no-]ignore-age', 'Ignore age limits') do |flag|
      set :ignore_age => flag
    end

    o.on('--log=DESTINATION', String, 'Log destination') do |dest|
      # '-' yields false, which makes AppLog fall back to $stdout.
      @logger = AppLog.new(dest != '-' && dest)
    end
    o.on('-o', '--output=FILENAME', String, 'Output') do |filename|
      output_file(filename)
    end

    o.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |assignment|
      key, val = assignment.split(/=/, 2)
      # NOTE(review): the value is passed to eval, i.e. it is executed
      # as Ruby code. Only use with trusted command lines.
      set key.intern => eval(val)
    end

    o.on('--review', 'View last diff') do |_|
      view_output
      exit 0
    end

    o.separator ''
    o.separator 'Available profiles:'
    o.separator Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}.join(', ')

    o.separator ''
    o.separator 'Other Options:'

    o.on('--debug', 'Show debug messages') do |_|
      $VERBOSE = $DEBUG = true
      @logger.set_level(:debug)
    end

    o.on('-q', '--quiet', 'Be mostly quiet') do |_|
      @logger.set_level(:quiet)
    end

    o.on('-v', '--verbose', 'Run verbosely') do |_|
      $VERBOSE = true
      @logger.set_level(:verbose)
    end

    o.on_tail('-h', '--help', 'Show this message') do
      puts o
      exit 1
    end
  end

  @profiles = parser.parse!(args)
  @profiles = @default_profiles if @profiles.empty?
  @profiles.each {|name| profile name}

  self
end
379
+
380
+
381
# Look up an option for a url.
# url::     String, must be a key of @urls
# opt::     Symbol
# default:: Returned when nothing else is found; when it is nil/false
#           the option group's :default entry is tried instead.
#
# Resolution order: the url's own value for opt, then its :use value.
# A Symbol value is resolved via get_option(opt, value); any other
# non-nil value is returned as is.
def get(url, opt, default=nil)
  opts = @urls[url]
  $logger.debug "get: opts=#{opts.inspect}"

  # :diffprocess and :format fall back to the url's :diff setting.
  opt_ = opt
  if [:diffprocess, :format].include?(opt) && !opts.has_key?(opt)
    opt_ = :diff
  end

  $logger.debug "get: opt=#{opt} opt_=#{opt_} #{opts[opt_]} #{opts[:use]}"
  val = if opts.has_key?(opt_)
    opts[opt_]
  elsif opts.has_key?(:use)
    opts[:use]
  end

  if val.is_a?(Symbol)
    $logger.debug "get: val=#{val}"
    success, rv = get_option(opt, val)
    $logger.debug "get: #{success}, #{rv}"
    return rv if success
  elsif !val.nil?
    $logger.debug "get: return val=#{val}"
    return val
  end

  unless default
    success, group_default = get_option(opt, :default)
    default = group_default if success
  end

  $logger.debug "get: return default=#{default}"
  default
end
424
+
425
+
426
# Configuration command:
# Declare the profiles that are loaded when no profile is named on the
# command line.
# profile_names:: Any number of profile names.
def default(*profile_names)
  @default_profiles = profile_names
end
431
+
432
+
433
# Configuration command:
# Load a profile.
# profile_name:: '-' reads one url per line from the standard input and
#                registers each as a source. Any other name is resolved
#                with profile_filename and evaluated in the scope of
#                this object.
# Returns true when a profile file was evaluated, false when no file
# was found.
def profile(profile_name)
  if '-' == profile_name
    return readlines.map! {|line| line.chomp}.each {|url| source url}
  end

  fn = profile_filename(profile_name)
  return false unless fn

  $logger.debug "Profile: #{fn}"
  contents = File.read(fn)
  # @current_profile is only set while the profile is being evaluated.
  @current_profile = fn
  begin
    instance_eval(contents)
  ensure
    @current_profile = nil
  end
  true
end
456
+
457
+
458
# Define an options shortcut, i.e. register +symbol+ in several option
# groups at once.
# symbol:: Name of the shortcut
# args::   Hash (option group => value). The special key :delegate
#          names the entry used for every option group NOT listed in
#          args.
def shortcut(symbol, args)
  delegated = args.has_key?(:delegate)
  unlisted = @options.keys - args.keys

  # :downloadprocess
  essential = [:download, :downloadformat, :diff, :format, :diffprocess]
  if !delegated && !(unlisted & essential).empty?
    $logger.warn "Shortcut #{symbol}: Undefined fields: #{unlisted.inspect}"
  end

  if delegated
    unlisted.each {|field| @options[field][symbol] = args[:delegate]}
  end

  args.each do |field, val|
    next if field == :delegate
    @options[field][symbol] = val
  end
end
480
+
481
+
482
# Select the output format(s).
# format:: One or more of 'text', 'html', 'rss'. Any other value is
#          logged as fatal and terminates the program (exit status 5).
def output_format(*format)
  known = ['text', 'html', 'rss']
  if format.any? {|name| !known.include?(name)}
    $logger.fatal "Unknown output format: #{format}"
    exit 5
  end
  @output_format = format
end
490
+
491
+
492
# Remember the output file for an output format.
# filename::  String
# outformat:: Output format the file belongs to; the entry is stored
#             under the key nil when omitted.
def output_file(filename, outformat=nil)
  @outfile[outformat] = filename
end
496
+
497
+
498
# Configuration command:
# Merge entries into an existing option group.
# type::    Symbol, name of the option group
# options:: Hash of entries to merge; an unknown group is logged as an
#           error and otherwise ignored.
def option(type, options)
  $logger.info "option #{type}: #{options.inspect}"
  group = @options[type]
  return group.merge!(options) if group
  $logger.error "Unknown option type: #{type} (#{options.inspect})"
end
511
+
512
+
513
# Set global options.
# options:: Hash; every key/value pair is stored as a top-level entry
#           of @options, replacing an existing entry of that name.
def global(options)
  options.each_pair {|type, value| @options[type] = value}
end
519
+
520
+
521
# Configuration command:
# Set default values for source-options. Sources defined afterwards
# start from these defaults.
# options:: Hash (option name => value)
def set(options)
  $logger.debug "set: #{options.inspect}"
  @default_options.update(options)
end
527
+
528
+
529
# Configuration command:
# Remove default source-options.
# options:: Names of the defaults to remove; unknown names are ignored.
def unset(*options)
  options.each {|name| @default_options.delete(name)}
end
536
+
537
+
538
# Configuration command:
# Define one or more sources.
# urls:: String, one url per line
# opts:: Hash of source-options; they override the current defaults
#        (see #set).
# Every url is registered in @urls and appended to the todo list.
def source(urls, opts={})
  urls.split("\n").each do |url|
    @urls[url] = @default_options.dup.update(opts)
    @todo << url
  end
end
547
+
548
+
549
# Configuration command:
# Install the default download processor.
# block:: Receives the downloaded text (String).
def downloadprocess(&block)
  processors = @options[:downloadprocess]
  processors[:default] = block
end
555
+
556
+
557
# Configuration command:
# Install the default diff processor.
# block:: Receives the diff text (String).
#
# The block is stored in the :diffprocess option group. Storing it
# under :diff (as was done before) replaced the default diff PROGRAM,
# which is called with two filenames rather than with the diff text.
def diffprocess(&block)
  @options[:diffprocess][:default] = block
end
563
+
564
+
565
# Configuration command:
# Set the editor command used for editing profiles.
# cmd:: Command template; '%s' is replaced with the profile's filename.
def edit(cmd)
  @cmd_edit = cmd
end
570
+
571
+
572
# Configuration command:
# Set the viewer command.
# view:: Command template; '%s' is replaced with the output filename.
#        A false/nil value disables viewing.
def view(view)
  @view = view
end
577
+
578
+
579
# Configuration command:
# Choose the default diff program.
# diff:: Stored as the :default entry of the :diff option group.
def diff(diff)
  programs = @options[:diff]
  programs[:default] = diff
end
584
+
585
+
586
# Configuration command:
# Choose the default downloader.
# download:: Stored as the :default entry of the :download option group.
def download(download)
  downloaders = @options[:download]
  downloaders[:default] = download
end
591
+
592
+
593
# Wrap a diff in the format configured for the url (its :format
# source-option).
# url::      String
# difftext:: The diff text to be formatted
def format(url, difftext)
  template = get(url, :format)
  eval_arg(template, [difftext])
end
598
+
599
+
600
# Apply arguments to a format.
# format::  nil, a Proc or a format String
# args::    Array of arguments
# default:: Returned when format is nil
#
# A Proc is called with the arguments. Anything else is filled in with
# the % operator; the resulting string is passed through the optional
# block (process_string) before it is returned.
def eval_arg(format, args, default=nil, &process_string)
  return default if format.nil?

  if format.is_a?(Proc)
    $logger.debug "eval proc: #{format} #{args.inspect}"
    return format.call(*args)
  end

  ca = format % args
  $logger.debug "eval string: #{ca}"
  process_string ? process_string.call(ca) : ca
end
620
+
621
+
622
# Apply the arguments to cmd (a format String or a Proc, see eval_arg).
# A String is executed as a shell command once the arguments are filled
# in; the command's standard output is returned.
# NOTE(review): the arguments are interpolated into the command line
# without shell escaping -- confirm that they are trusted.
def call_cmd(cmd, args, default=nil)
  run = lambda {|command| `#{command}`}
  eval_arg(cmd, args, default, &run)
end
627
+
628
+
629
# Generate the final output and hand it to the viewer.
# difftext:: Hash (url => diff text). Nothing happens (apart from a
#            log message) when it is empty.
#
# For each selected output format the text is produced by
# get_output_<format>. An output file named '-' means: print to the
# standard output; otherwise the file is written and passed to
# view_output_<format>. An unknown format terminates the program
# (exit status 5).
def show_output(difftext)
  if difftext.empty?
    $logger.warn 'No news is good news'
    return
  end

  @output_format.each do |outformat|
    generator = "get_output_#{outformat}"
    unless respond_to?(generator)
      $logger.fatal "Unknown output format: #{outformat}"
      exit 5
    end

    out = send(generator, difftext)
    next unless out

    outfile = get_outfile(outformat)
    if '-' == outfile
      puts out
    else
      File.open(outfile, 'w') {|io| io.puts out}
      send("view_output_#{outformat}", outfile)
    end
  end
end
659
+
660
+
661
# Render the diffs as plain text.
# difftext:: Hash (url => diff text)
#
# Each section consists of the url, the annotation, an empty line and
# the diff (converted to plain text when it looks like html). Sections
# are separated by a line of dashes. Urls without a diff or with an
# empty diff are left out.
def get_output_text(difftext)
  sections = difftext.map do |url, text|
    next unless text
    text = html_to_text(text) if is_html?(text)
    # Yield nil (not false) for empty diffs: compact only removes nil,
    # a false would end up as the word "false" in the output.
    next if text.empty?
    [url, difftext_annotation(url), nil, text].join("\n")
  end
  sections.compact.join("\n\n#{('-' * 68)}\n\n")
end
669
+
670
+
671
# Render the diffs as an RSS feed.
# difftext:: Hash (url => diff text)
#
# Requires the global option :rss[:url]; without it the program is
# terminated (exit status 5). Optional channel fields are taken from
# the :rss option group, optional item fields from the url's
# :rss_<field> source-options.
def get_output_rss(difftext)
  success, rss_url = get_option(:rss, :url)
  unless success
    $logger.fatal "Global option :rss[:url] not defined."
    exit 5
  end

  success, rss_version = get_option(:rss, :version)
  # Loaded on demand: only needed for rss output.
  require "rss/#{rss_version}"

  rss = RSS::Rss.new(rss_version)
  chan = RSS::Rss::Channel.new
  chan.title = @output_title
  [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
    ok, val = get_option(:rss, field)
    # These are CHANNEL fields; no item exists at this point.
    chan.send(format_symbol(field, '%s='), val) if ok
  end
  chan.link = rss_url
  rss.channel = chan

  difftext.each do |url, text|
    rss_format = get(url, :rss_format, :plain_text)
    text = strip_tags(text, :format => rss_format)
    next if text.empty?

    item = RSS::Rss::Channel::Item.new
    item.title = get(url, :title, File.basename(url))
    item.link = eval_arg(get(url, :rewrite_link, '%s'), [url])
    [:author, :date, :enclosure, :category, :pubDate].each do |field|
      val = get(url, format_symbol(field, 'rss_%s'))
      item.send(format_symbol(field, '%s='), val) if val
    end

    annotation = difftext_annotation(url)
    case rss_format
    when :plain_text
      annotation = "<pre>#{annotation}</pre>" if annotation
      item.description = %{#{annotation}<pre>#{text}</pre>}
    else
      item.description = %{<pre>#{annotation}</pre>\n#{text}}
    end
    chan.items << item
  end

  rss.to_s
end
720
+
721
+
722
# Render the diffs as an html page.
# difftext:: Hash (url => diff text)
#
# Tags named by the url's :strip_tags option are removed first; urls
# whose text is empty afterwards are left out. The page consists of a
# table of contents and one section per url and is assembled with the
# :page[:format] template (fallback: :page[:simple]).
def get_output_html(difftext)
  entries = difftext.map do |url, text|
    tags = get(url, :strip_tags)
    text = strip_tags(text, :tags => tags) if tags
    text.empty? ? nil : [url, text]
  end.compact

  toc = entries.map do |url, text|
    lab = Digest::MD5.hexdigest(url)
    ti = get(url, :title, File.basename(url))
    # %{<li class="toc"><a class="toc" href="\##{lab}">#{ti}</a> <a class="external" href="#{url}">[W]</a></li>}
    %{<li class="toc"><a class="toc" href="\##{lab}">#{ti}</a></li>}
  end.join("\n")

  cnt = entries.map do |url, text|
    lab = Digest::MD5.hexdigest(url)
    ti = get(url, :title, File.basename(url))
    # Keep url (the key of the source) and the displayed link apart:
    # the annotation and the format are looked up by the source url.
    if (rewrite = get(url, :rewrite_link))
      link = eval_arg(rewrite, [url])
      ext = ''
    else
      link = url
      old = %{<a class="old" href="#{file_url(backupname(url))}">old</a>}
      lst = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
      ext = %{ (#{old}, #{lst})}
    end
    # The annotation is nil when the backup or latest copy is missing.
    annotation = CGI::escapeHTML(difftext_annotation(url).to_s)
    <<HTML
<div class="webpage">
<h1 class="diff" id="#{lab}"><a class="external" href="#{link}">#{ti}</a>#{ext}</h1>
<div class="annotation">
#{annotation}
</div>
<div class="diff">
#{format(url, text)}
</div>
</div>
HTML
  end.join(('<hr class="separator"/>') + "\n")

  success, template = get_option(:page, :format)
  unless success
    success, template = get_option(:page, :simple)
  end
  eval_arg(template, [@output_title, toc, cnt])
end
767
+
768
+
769
# Filename of the previous copy of url:
# <cfgdir>/old/<encoded url>
def backupname(url)
  basename = encode(url)
  File.join(@cfgdir, 'old', basename)
end
773
+
774
+
775
# Filename of the freshly downloaded copy of url:
# <cfgdir>/latest/<encoded url>
def latestname(url)
  basename = encode(url)
  File.join(@cfgdir, 'latest', basename)
end
779
+
780
+
781
# Guess the directory part of path. A path ending in '/' is taken to
# be a directory itself:
#   foo/bar     -> foo
#   foo/bar.txt -> foo
#   foo/bar/    -> foo/bar
def guess_dir(path)
  return path.chomp('/') if path.end_with?('/')
  File.dirname(path)
end
788
+
789
+
790
# Remove the fragment (the '#' and what follows it) from url.
def canonic_url(url)
  fragment = /#.*$/
  url.sub(fragment, '')
end
794
+
795
+
796
# A copy of the default list of tags to strip (:strip_tags[:default]),
# or nil when no such default is defined.
def strip_tags_default
  found, tags = get_option(:strip_tags, :default)
  found ? tags.dup : nil
end
800
+
801
+
802
# Remove elements from a document.
# doc::  String (parsed with Hpricot) or an already parsed document
# args:: Hash
#        :tags::   Element names to remove (default: strip_tags_default)
#        :format:: :hpricot returns the document object; any other
#                  value X returns doc.to_X (default: html)
def strip_tags(doc, args={})
  tags = args[:tags] || strip_tags_default
  doc = Hpricot(doc) if doc.is_a?(String)
  tags.each {|tag| doc.search(tag).remove}

  return doc if args[:format] == :hpricot
  doc.send("to_#{args[:format] || :html}")
end
818
+
819
+
820
# Decide whether path should be followed.
# url::   Source url whose :match option (a regular expression) is
#         applied to path, if defined
# path0:: Path of the source url
# path::  Path of the candidate
# Without a :match option the file extensions of path0 and path have
# to be equal.
def eligible_path?(url, path0, path)
  rx = get(url, :match)
  return path =~ rx if rx
  File.extname(path0) == File.extname(path)
end
831
+
832
+
833
# Scan a hpricot document for hrefs and push them onto @todo unless
# they are already known (in @done or @todo).
# url::       The url of the document
# hpricot::   The parsed document
# condition:: Block (uri0, pn0, uri, pn) deciding whether a link is
#             followed; uri0/pn0 describe the document, uri/pn the link
#             (URI and Pathname of the guessed directory)
#
# A link inherits the options of url; its :title is the link's
# basename, its :depth (if any) is decremented. Nothing is pushed when
# the url's :depth is <= 0. A link that cannot be processed is logged
# and skipped; it does not stop the scan.
def push_hrefs(url, hpricot, &condition)
  depth = get(url, :depth)
  return if depth and depth <= 0
  uri0 = URI.parse(url)
  pn0 = Pathname.new(guess_dir(File.expand_path(uri0.path)))
  (hpricot / 'a').each do |a|
    href = a['href']
    next if href.nil?
    begin
      curl = canonic_url(href)
      next if @done.include?(curl) or @todo.include?(curl)
      uri = URI.parse(href)
      next unless robots_allowed?(curl, uri)
      pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
      if condition.call(uri0, pn0, uri, pn)
        opts = @urls[url].dup
        opts[:title] = File.basename(curl)
        opts[:depth] = depth - 1 if depth and depth >= 0
        @urls[curl] = opts
        @todo << curl
      end
    rescue StandardError => e
      # E.g. an href that is no valid URI: skip this link only.
      $logger.error e.message
      $logger.debug e.backtrace
    end
  end
rescue StandardError => e
  # $logger.error e #DBG#
  $logger.error e.message
  $logger.debug e.backtrace
end
863
+
864
+
865
# Rewrite the urls of links (a/href) and images (img/src) in doc, see
# rewrite_href.
# url:: String, the url of the document
# doc:: Hpricot document
# Returns doc. An attribute is only replaced when rewrite_href yields
# a new value; otherwise (e.g. for absolute urls) it is left as it is.
def rewrite_urls(url, doc)
  uri = URI.parse(url)
  urd = guess_dir(uri.path)
  [['a', 'href'], ['img', 'src']].each do |element, attribute|
    (doc / element).each do |node|
      rewritten = rewrite_href(node[attribute], url, uri, urd)
      node[attribute] = rewritten if rewritten
    end
  end
  doc
end
881
+
882
+
883
# Try to make href an absolute url.
# href:: The link (String or nil)
# url::  The url of the document containing the link (String)
# uri::  url as URI object
# urd::  The directory part of the document's path
#
# Returns the rewritten url, or nil when href should be left alone:
# no href at all, a href that already starts with a scheme
# ("http:", "mailto:", ...) or a href that cannot be handled. Errors
# are logged.
def rewrite_href(href, url, uri, urd)
  return unless href
  href = href.strip

  # Anchored, so that a colon somewhere else in the link (e.g. in the
  # query) does not make it look like an absolute url.
  return if href =~ /\A[a-zA-Z][a-zA-Z0-9+.\-]*:/

  if uri.relative? and URI.parse(href).relative?
    # Document and link are both relative: join the paths.
    return uri.instance_of?(URI::Generic) ? File.join(urd, href) : nil
  end

  return url + href if href[0..0] == '#'
  uri.merge(href).to_s
rescue StandardError => e
  # $logger.error e #DBG#
  $logger.error e.message
  $logger.debug e.backtrace
  # Explicit nil: callers treat every non-nil value as the new url.
  nil
end
916
+
917
+
918
# Return a Proc that takes a text as argument and highlights the
# occurrences of rx by wrapping them in an element of class "highlight".
# rx::    Regular expression
# group:: Number of the group to put into the element (default: 0,
#         the whole match)
# tag::   The HTML tag to use (default: "span")
def highlighter(rx, group=nil, tag='span')
  opening = %{<#{tag} class="highlight">}
  closing = %{</#{tag}>}
  # "\\N" is gsub's back-reference to group N.
  replacement = "#{opening}\\#{group || 0}#{closing}"
  lambda {|text| text.gsub(rx, replacement)}
end
925
+
926
+
927
+ private
928
+
929
# Annotation for a url's diff: the modification times of the backup
# and of the latest copy, rendered with the url's :format_annotation
# option (default: '%s >>> %s'). Returns nil unless both files exist.
def difftext_annotation(url)
  bak = backupname(url)
  lst = latestname(url)
  return unless File.exist?(bak) and File.exist?(lst)

  template = get(url, :format_annotation, '%s >>> %s')
  eval_arg(template, [File.mtime(bak), File.mtime(lst)])
end
936
+
937
+
938
# Build a Symbol by filling name into format_string,
# e.g. (:author, 'rss_%s') -> :rss_author.
def format_symbol(name, format_string)
  filled = format_string % name.to_s
  filled.to_sym
end
941
+
942
+
943
# Heuristic: does text contain one of a few common html tags?
# Returns the position of the first such tag or nil.
def is_html?(text)
  html_tag = /<(div|a|span|body|html|script|p|table|td|tr|th|li|dt|br|hr|em|b)\b/
  text =~ html_tag
end
946
+
947
+
948
# Convert html to plain text (Hpricot's to_plain_text).
def html_to_text(text)
  doc = Hpricot(text)
  doc.to_plain_text
end
951
+
952
+
953
# Check whether the site's robots.txt allows us to fetch url.
# url:: String
# uri:: url as URI object
#
# Results are cached per url (@allow), parsed rules per host (@robots).
# The answer is true when
# * the RobotRules library is not available (a warning is logged once),
# * uri is relative, i.e. there is no host to ask,
# * robots.txt cannot be loaded or does not exclude url.
def robots_allowed?(url, uri)
  return @allow[url] if @allow.has_key?(url)

  unless defined?(RobotRules)
    unless @robots[:warning]
      $logger.warn 'robots.txt is ignored: Please install robot_rules.rb from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589 in $RUBYLIB'
      @robots[:warning] = true
    end
    @allow[url] = true
    return true
  end

  host = uri.host
  rules = @robots[host]
  unless rules
    # Test the URI object itself: nil.to_s would be "" (i.e. true).
    robots = robots_uri(uri)
    return true unless robots
    rurl = robots.to_s
    begin
      robots_txt = open(rurl).read
      loaded = RobotRules.new(@user_agent)
      loaded.parse(rurl, robots_txt)
      rules = @robots[host] = loaded
      $logger.info "Loaded #{rurl} for #{@user_agent}"
      $logger.debug robots_txt
    rescue StandardError => e
      # No usable robots.txt: nothing is excluded.
      $logger.debug "Could not load #{rurl}: #{e.message}"
    end
  end

  allowed = !(rules and !rules.allowed?(url))
  $logger.info "Excluded url: #{url}" unless allowed
  @allow[url] = allowed
  allowed
end
994
+
995
+
996
# The URI of the robots.txt file on uri's host; nil for relative uris.
def robots_uri(uri)
  uri.relative? ? nil : uri.merge('/robots.txt')
end
999
+
1000
+
1001
# Relative link to a stored copy: the last directory and the basename
# of filename, encoded (':' and '/' are kept).
def file_url(filename)
  parent = File.basename(File.dirname(filename))
  relative = File.join(parent, File.basename(filename))
  # "file://#{encode(filename, ':/')}"
  encode(relative, ':/')
end
1006
+
1007
+
1008
# Like get_option but returns the bare value, or default when the
# option is not defined.
def get_optionvalue(opt, val, default=nil)
  found, value = get_option(opt, val)
  found ? value : default
end
1016
+
1017
+
1018
# Look up entry val in option group opt.
# Returns [true, value] when the entry exists and [false, val]
# otherwise. A Symbol value is taken as a reference to another entry
# of the same group and resolved in turn.
def get_option(opt, val)
  vals = @options[opt]
  $logger.debug "val=#{val} vals=#{vals.inspect}"
  unless vals and vals.has_key?(val)
    $logger.debug "get_option no: #{opt} => #{val.inspect}"
    return [false, val]
  end

  rv = vals[val]
  $logger.debug "get_option ok: #{opt} => #{rv.inspect}"
  if rv.is_a?(Symbol)
    $logger.debug "get_option re: #{rv}"
    get_option(opt, rv)
  else
    $logger.debug "get_option true, #{rv}"
    [true, rv]
  end
end
1037
+
1038
+
1039
# Encode text for use as a filename or link: every character except
# letters, digits, ',', '.', '_', '-' and the characters in chars is
# replaced with %xx (lower-case hex) for each of its bytes.
# text::  String
# chars:: Additional characters to keep (inserted into a character
#         class)
def encode(text, chars='')
  text.gsub(/[^a-zA-Z0-9,._#{chars}-]/) do |char|
    # Go through the bytes: indexing a String yields a String (not a
    # number) on current rubies, and a character may have several bytes.
    char.unpack('C*').map {|byte| '%%%02x' % byte}.join
  end
end
1042
+
1043
+
1044
# File suffix for an output format: the entry in @suffix, or the name
# of the format itself.
def output_suffix(outformat)
  mapped = @suffix[outformat]
  mapped || outformat
end
1047
+
1048
+
1049
# The output file for outformat: the file registered with output_file
# for exactly this format (nil included), otherwise
# <cfgdir>/websitiary.<suffix>, where the suffix belongs to outformat
# or, when that is nil, to the first selected output format.
# NOTE(review): a file registered without a format (key nil) is not
# used when a format is given -- confirm that this is intended.
def get_outfile(outformat=nil)
  registered = @outfile[outformat]
  return registered if registered

  suffix = output_suffix(outformat || @output_format[0])
  File.join(@cfgdir, "websitiary.#{suffix}")
end
1052
+
1053
+
1054
# Show an output file with the viewer of the first selected output
# format.
# outfile:: Filename (default: get_outfile)
def view_output(outfile=nil)
  viewer = "view_output_#{@output_format[0]}"
  send(viewer, outfile || get_outfile)
end
1057
+
1058
+
1059
# Run the viewer command (@view with outfile filled in). Does nothing
# when no viewer is set. All output formats share this viewer.
def view_output_general(outfile)
  return unless @view
  system(@view % outfile)
end
alias :view_output_html :view_output_general
alias :view_output_text :view_output_general
alias :view_output_rss :view_output_general
1067
+
1068
+
1069
# Open a profile in the editor (@cmd_edit with the profile's filename
# filled in).
# profile:: Name of the profile, see profile_filename
#
# The editor is started with system, so that it stays connected to the
# terminal; backticks would capture its output.
def edit_profile(profile)
  fn = profile_filename(profile)
  $logger.debug "edit: #{fn}"
  system(@cmd_edit % fn)
end
1074
+
1075
+
1076
# Find the file of a profile.
# profile_name:: Name of the profile; '.rb' is appended unless the
#                name already has this extension
# The current directory is searched first, then the configuration
# directory. Returns the filename or nil when there is no such file.
def profile_filename(profile_name)
  profile_name = "#{profile_name}.rb" unless File.extname(profile_name) == '.rb'
  ['.', @cfgdir].each do |dir|
    filename = File.join(dir, profile_name)
    return filename if File.exist?(filename)
  end
  nil
end
1088
+
1089
+ end
1090
+
1091
+
1092
+
1093
+ # Hash: The output of the diff commands for each url.
1094
+ attr_reader :difftext
1095
+
1096
+ # The configurator
1097
+ attr_reader :configuration
1098
+
1099
+
1100
# Create the configuration from the arguments, make sure that the
# configuration directory exists (ensure_dir) and write a default
# stylesheet (websitiary.css) into it unless there is one already.
# args:: Array of command-line (like) arguments.
def initialize(args=[])
  @configuration = Configuration.new(self, args)
  @difftext = {}

  ensure_dir(@configuration.cfgdir)
  css = File.join(@configuration.cfgdir, 'websitiary.css')
  return if File.exist?(css)

  $logger.info "Copying default css file: #{css}"
  File.open(css, 'w') do |io|
    io.puts <<CSS
body {
color: black;
background-color: #f0f0f0;
}
a.external {
}
a.old {
}
a.latest {
}
a.toc {
}
ol.toc {
float: left;
width: 200px;
position: fixed;
padding: 0;
margin: 0;
}
li.toc {
list-style: none;
border: 1px solid silver;
background-color: #fafafa;
padding: 0.5em;
font-size: 80%;
font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
li.toc:hover {
background-color: #ffff8d;
}
div.contents {
margin-left: 210px;
min-width: 16em;
}
div.webpage {
margin: 5px 0 5px 0;
padding: 5px;
border: 1px solid silver;
background-color: white;
}
h1.diff {
font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
div.diff {
padding-left: 2em;
}
pre.diff {
padding-left: 2em;
}
hr.separator {
width: 100%;
visibility: hidden;
}
.error {
color: yellow;
background-color: red;
}
.highlight {
background-color: #ffc730;
}
CSS
  end
end
1175
+
1176
+
1177
# Process the sources in @configuration.todo as defined by profiles
# and command-line options. The differences are stored in @difftext
# (a Hash).
# show_output:: If true, show the output with the defined viewer.
#
# A source is skipped when its latest copy is younger than the source's
# :hours or :days option (unless :ignore_age is set). Otherwise the
# latest copy becomes the old one, the source is downloaded and both
# copies are compared.
def process(show_output=true)
  # The todo list may grow while it is processed: downloaders that
  # follow links append to it. Array#each copes with that.
  @configuration.todo.each do |url|
    opts = @configuration.urls[url]
    $logger.debug "Source: #{@configuration.get(url, :title, url)}"
    older = @configuration.backupname(url)
    ensure_dir(File.dirname(older))
    $logger.debug "older: #{older}"
    latest = @configuration.latestname(url)
    ensure_dir(File.dirname(latest))
    $logger.debug "latest: #{latest}"

    if File.exist?(latest) and !opts[:ignore_age]
      # Minimum age in seconds; :hours takes precedence over :days.
      if (hdiff = opts[:hours])
        tdiff = hdiff * HOUR_SECS
        $logger.debug "hours: #{hdiff} (#{tdiff}s)"
      elsif (ddiff = opts[:days])
        tdiff = ddiff * DAY_SECS
        $logger.debug "days: #{ddiff} (#{tdiff}s)"
      else
        tdiff = nil
      end

      if tdiff
        tn = Time.now
        tl = File.mtime(latest)
        td = tn - tl
        if td < tdiff
          $logger.info "Skip #{@configuration.get(url, :title, url).inspect}: Only #{(td / DAY_SECS).to_i}d old (#{(tdiff / DAY_SECS).to_i}d)"
          next
        end
      end
    end

    move(latest, older)
    if download(url, latest, opts)
      difftext = diff(url, older, latest, opts)
      if difftext
        $logger.debug "difftext: #{difftext}"
        accumulate(url, difftext, opts)
      end
    end
  end
  show if show_output
end
1224
+
1225
+
1226
+
1227
+ private
1228
+
1229
# Retrieve the contents of +url+ with the source's :download command and
# write them to the file +latest+.
#
# When +opts+ is given, the text is split into lines and filtered in this
# order: :lines (selection of lines), :cols (selection of columns per
# line), :sort (true or a Proc used as comparison block), :strip (drop
# lines without non-whitespace characters). Afterwards the source's
# :downloadprocess command, if any, is applied to the whole text.
#
# Returns true if a snapshot was written; false if the url was already
# handled in this run or the download command returned nothing.
def download(url, latest, opts)
  already_done = @configuration.done.include?(url)
  if already_done
    $logger.info "Already downloaded: #{@configuration.get(url, :title, url).inspect}"
    return false
  end

  $logger.info "Download: #{@configuration.get(url, :title, url).inspect}"
  # The url is marked as done before the command runs, so a failed
  # download is not retried within the same run.
  @configuration.done << url
  fetcher = @configuration.get(url, :download)
  text = @configuration.call_cmd(fetcher, [url])
  # $logger.debug text
  if !text
    $logger.warn "no contents: #{@configuration.get(url, :title, url)}"
    return false
  end

  if opts
    rows = text.split("\n")

    line_sel = opts[:lines]
    if line_sel
      $logger.debug "download: lines=#{line_sel}"
      rows = rows[line_sel] || []
    end

    col_sel = opts[:cols]
    if col_sel
      $logger.debug "download: cols=#{col_sel}"
      rows = rows.map { |row| row[col_sel] }.compact
    end

    order = opts[:sort]
    if order
      $logger.debug "download: sort=#{order}"
      if true === order
        rows = rows.sort
      elsif Proc === order
        rows = rows.sort(&order)
      end
    end

    if opts[:strip]
      $logger.debug "download: strip!"
      rows = rows.select { |row| row =~ /\S/ }
    end

    text = rows.join("\n")
  end

  postprocessor = @configuration.get(url, :downloadprocess)
  if postprocessor
    $logger.debug "download process: #{postprocessor}"
    text = @configuration.call_cmd(postprocessor, [text])
    $logger.debug text
  end

  File.open(latest, 'w') do |out|
    out.puts(text)
  end
  true
end
1281
+
1282
+
1283
# Compare the snapshots +old+ and +new+ of +url+ with the source's :diff
# command and, if the result is non-blank, pass it through the optional
# :diffprocess command.
#
# Returns the (post-processed) difference text if it contains any
# non-whitespace character; nil if there is no previous snapshot or
# nothing changed. +opts+ is currently unused.
def diff(url, old, new, opts)
  # File.exist? instead of the deprecated File.exists? (removed in
  # Ruby 3.2). Without an older snapshot there is nothing to compare.
  unless File.exist?(old)
    $logger.info "Initial copy: #{old.inspect}"
    return nil
  end

  $logger.debug "diff: #{old} <-> #{new}"
  difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new])
  $logger.debug "diff: #{difftext}"

  if difftext =~ /\S/
    if (pprc = @configuration.get(url, :diffprocess))
      $logger.debug "diff process: #{pprc}"
      difftext = @configuration.call_cmd(pprc, [difftext])
    end
    $logger.debug "difftext: #{difftext}"
    # The post-processor may have reduced the diff to nothing.
    if difftext =~ /\S/
      $logger.warn "Changed: #{@configuration.get(url, :title, url).inspect}"
      return difftext
    end
  end

  $logger.debug "Unchanged: #{@configuration.get(url, :title, url).inspect}"
  nil
end
1307
+
1308
+
1309
# Remember +difftext+ as the difference found for +url+ (any earlier
# entry for the same url is replaced). +opts+ is currently unused.
def accumulate(url, difftext, opts)
  @difftext.store(url, difftext)
end
1312
+
1313
+
1314
# Hand the differences collected so far to the configuration's
# show_output method.
def show
  collected = @difftext
  @configuration.show_output(collected)
end
1317
+
1318
+
1319
# Rename the file +from+ to +to+, replacing +to+ if it already exists.
# Does nothing when +from+ does not exist.
def move(from, to)
  # File.exist? instead of the deprecated File.exists? (removed in
  # Ruby 3.2).
  return unless File.exist?(from)

  $logger.debug "Overwriting: #{from} -> #{to}" if File.exist?(to)
  File.rename(from, to)
end
1325
+
1326
+
1327
# Make sure that +dir+ is a directory. A missing directory is created
# (one level only) and then passed to the optional block, which can be
# used to populate it. If +dir+ exists but is not a directory, a fatal
# message is logged and the program exits with status 5.
def ensure_dir(dir, &fill_dir)
  unless File.exist?(dir)
    Dir.mkdir(dir)
    return fill_dir ? fill_dir.call(dir) : nil
  end
  return nil if File.directory?(dir)

  $logger.fatal "Not a directory: #{dir}"
  exit 5
end
1338
+
1339
+ end
1340
+
1341
+
1342
+
1343
# Run the monitor with the command-line arguments, but only when this
# file is executed directly (not when it is loaded from another file).
Websitiary.new(ARGV).process if $0 == __FILE__
# sleep 5
1347
+
1348
+
1349
+ # Local Variables:
1350
+ # revisionRx: REVISION\s\+=\s\+\'
1351
+ # End: