websitiary 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (7) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +6 -0
  3. data/README.txt +474 -0
  4. data/Rakefile +20 -0
  5. data/bin/websitiary +1351 -0
  6. data/setup.rb +1585 -0
  7. metadata +71 -0
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ load './bin/websitiary'
6
+
7
# Packaging specification for the websitiary gem, declared through Hoe.
# Description, project URL and change log are extracted by paragraph
# index from README.txt and History.txt.
Hoe.new('websitiary', Websitiary::VERSION) do |spec|
  spec.rubyforge_name = 'websitiary'
  spec.author = 'Thomas Link'
  spec.email = 'sanobast-ruby@yahoo.de'
  spec.summary = 'A simple website monitor'

  readme_paragraphs = spec.paragraphs_of('README.txt', 2..5)
  spec.description = readme_paragraphs.join("\n\n")

  # Every line but the first of the README's opening paragraph.
  readme_head = spec.paragraphs_of('README.txt', 0).first
  spec.url = readme_head.split(/\n/)[1..-1]

  history_paragraphs = spec.paragraphs_of('History.txt', 0..1)
  spec.changes = history_paragraphs.join("\n\n")

  spec.extra_deps << 'hpricot'
  # spec.need_tgz = false
  spec.need_zip = true
end
19
+
20
+ # vim: syntax=Ruby
data/bin/websitiary ADDED
@@ -0,0 +1,1351 @@
1
+ #! /usr/bin/ruby.exe
2
+ # websitiary.rb -- Website Monitor
3
+ # @Last Change: 2007-07-16.
4
+ # Author:: Thomas Link (samul AT web de)
5
+ # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
6
+ # Created:: 2007-06-09.
7
+ #
8
+ # = TODO
9
+ # * Find a ruby-based replacement for webdiff (or port webdiff to ruby)
10
+ # * Built-in support for robots.txt
11
+ # * Option to append to output files
12
+ # * Option to trim output files (when appending items)
13
+
14
+
15
+ require 'cgi'
16
+ require 'digest/md5'
17
+ require 'logger'
18
+ require 'optparse'
19
+ require 'pathname'
20
+ require 'rbconfig'
21
+ require 'uri'
22
+ require 'open-uri'
23
+
24
+
25
# Load the optional third-party libraries. A library that is missing or
# fails while loading is reported on stderr and start-up continues.
# Only ScriptError (LoadError, SyntaxError, ...) and StandardError are
# rescued, so that an interrupt or an exit request raised during the
# require is not swallowed.
%w[hpricot robot_rules].each do |lib|
  begin
    require lib
  rescue ScriptError, StandardError => e
    $stderr.puts <<EOT
#{e.message}
Library could not be loaded: #{lib}
Please see the requirements section at: http://websitiary.rubyforge.org
EOT
  end
end
36
+
37
+
38
+
39
+ # Basic usage:
40
+ # Websitiary.new(ARGV).process
41
+ class Websitiary
42
+ APPNAME = 'websitiary'
43
+ VERSION = '0.1.0'
44
+ REVISION = '1447'
45
+ MINUTE_SECS = 60
46
+ HOUR_SECS = MINUTE_SECS * 60
47
+ DAY_SECS = HOUR_SECS * 24
48
+
49
+ # A simple wrapper around Logger.
50
# Creates the process-wide logger ($logger) and translates symbolic
# verbosity names into Logger severities.
class AppLog
  # output:: Destination handed to Logger.new; $stdout when nil or false.
  def initialize(output=nil)
    @output = output ? output : $stdout
    log = Logger.new(@output, 'daily')
    log.progname = APPNAME
    log.datetime_format = "%H:%M:%S"
    $logger = log
    set_level
  end


  # level:: :debug, :verbose or :quiet; any other value selects WARN.
  def set_level(level=:default)
    severities = {
      :debug => Logger::DEBUG,
      :verbose => Logger::INFO,
      :quiet => Logger::ERROR
    }
    $logger.level = severities.fetch(level, Logger::WARN)
    $logger.debug "Set logger level: #{level}"
  end
end
74
+
75
+
76
+ # This class defines the scope in which profiles are evaluated. Most
77
+ # of its methods are suitable for use in profiles.
78
+ class Configuration
79
+ # Hash (key = URL, value = Hash of options)
80
+ attr_accessor :urls
81
+ # Array of urls to be downloaded.
82
+ attr_accessor :todo
83
+ # Array of downloaded urls.
84
+ attr_accessor :done
85
+ # The user configuration directory
86
+ attr_accessor :cfgdir
87
+ # attr_accessor :default_profiles
88
+ # attr_accessor :options
89
+ # attr_accessor :cmd_edit
90
+
91
+
92
# Build the built-in option tables and download shortcuts, load the
# 'config.rb' profile and evaluate the command-line arguments.
# app::  Stored in @app.
# args:: Array of command-line arguments (see parse_command_line_args).
def initialize(app, args=[])
  @logger = AppLog.new
  $logger.debug "Configuration#initialize"
  @app = app
  @urls = {}
  @todo = []
  @done = []
  @robots = {}
  @allow = {}

  @suffix = {
    'text' => 'txt'
    # 'rss' => 'xml'
  }

  @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitiary') : '.'
  # The first existing candidate replaces the HOME-based default.
  # compact drops the USERPROFILE entry when that variable is unset;
  # passing nil to File.exist? would raise a TypeError.
  cfgdir_candidates = [
    ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitiary'),
    File.join(RbConfig::CONFIG['sysconfdir'], 'websitiary')
  ].compact
  existing_cfgdir = cfgdir_candidates.find {|dir| File.exist?(dir)}
  @cfgdir = existing_cfgdir if existing_cfgdir

  @user_agent = "websitiary/#{Websitiary::VERSION}"

  @cmd_edit = 'vi "%s"'

  @options = {:global => {}}

  @options[:diff] = {
    :default => :diff,
    :diff => 'diff -d -w -u2 "%s" "%s"',
    :webdiff => lambda do |old, new|
      $logger.debug "webdiff: #{File.basename(new)}"
      $logger.debug %{webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -}
      difftext = `webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -`
      # Only an exit status of 1 is treated as "there is a diff".
      $?.exitstatus == 1 ? difftext : ''
    end,
  }

  @options[:format] = {
    :default => :diff,
    :diff => %{<pre class="diff">\n%s\n</pre>},
    :webdiff => "%s\n",
  }

  @options[:diffprocess] = {
    :default => :diff,
    # Drop the first two lines, keep only lines starting with '+', and
    # strip that leading character.
    :diff => lambda {|text| text.split("\n")[2..-1].delete_if {|l| l =~ /^[^+]/}.map {|l| l[1..-1]}.join("\n")},
    :webdiff => false,
  }

  @options[:download] = {
    :default => :w3m,
  }

  @options[:downloadformat] = {
    :w3m => :text,
    :webdiff => :html,
  }

  @options[:downloadprocess] = {
  }

  @options[:rss] = {
    :version => '2.0',
  }

  @options[:strip_tags] = {
    :default => ['script', 'object', 'form', 'input', 'select', 'iframe', 'head', 'meta', 'link'],
  }

  shortcut :w3m, :delegate => :diff,
    :download => 'w3m -no-cookie -S -F -dump "%s"'

  shortcut :lynx, :delegate => :diff,
    :download => 'lynx -dump "%s"'

  shortcut :links, :delegate => :diff,
    :download => 'links -dump "%s"'

  shortcut :curl, :delegate => :webdiff,
    :download => 'curl --silent "%s"'

  shortcut :wget, :delegate => :webdiff,
    :download => 'wget -q -O - "%s"'

  shortcut :body_html, :delegate => :webdiff,
    :strip_tags => :default,
    :download => lambda {|url|
      begin
        doc = Hpricot(open(url))
        doc = doc.at('body')
        if doc
          doc = rewrite_urls(url, doc)
          doc = doc.inner_html
          if (tags = get(url, :strip_tags))
            doc = strip_tags(doc, :format => :hpricot, :tags => tags)
          end
        else
          $logger.warn 'inner html: No body'
        end
        doc.to_s
      rescue Exception => e
        # $logger.error e #DBG#
        $logger.error e.message
        $logger.debug e.backtrace
        # The error text becomes the "downloaded" content.
        break %{<pre class="error">\n#{e.message}\n</pre>}
      end
    }

  shortcut :openuri, :delegate => :webdiff,
    :download => lambda {|url|
      begin
        open(url).read
      rescue Exception => e
        # $logger.error e #DBG#
        $logger.error e.message
        $logger.debug e.backtrace
        %{<pre class="error">\n#{e.to_s}\n</pre>}
      end
    }

  # Like :body_html, but also queues links on the same host.
  shortcut :website, :delegate => :webdiff,
    :download => lambda {|url|
      html = @options[:download][:body_html].call(url)
      break unless html
      doc = Hpricot(html)
      push_hrefs(url, doc) do |uri0, pn0, uri, pn|
        eligible_path?(url, uri0.path, uri.path) &&
          uri.host == uri0.host
      end
      html
    }

  # Like :website, but only queues links whose directory equals the
  # directory of the source url.
  shortcut :website_below, :delegate => :webdiff,
    :download => lambda {|url|
      html = @options[:download][:body_html].call(url)
      break unless html
      doc = Hpricot(html)
      if doc
        push_hrefs(url, doc) do |uri0, pn0, uri, pn|
          eligible_path?(url, uri0.path, uri.path) &&
            uri.host == uri0.host &&
            pn.relative_path_from(pn0).to_s == '.'
        end
      end
      html
    }

  shortcut :website_txt, :delegate => :default,
    :download => lambda {|url|
      success, cmd = get_option(:download, :default)
      if success
        html = @options[:download][:website].call(url)
        html_to_text(html)
      end
    }

  shortcut :website_txt_below, :delegate => :default,
    :download => lambda {|url|
      success, cmd = get_option(:download, :default)
      if success
        html = @options[:download][:website_below].call(url)
        html_to_text(html)
      end
    }

  # Page template: title (used twice), table of contents, body.
  @options[:page] = {:format => lambda do |ti, li, bd|
      template = <<OUT
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>%s</title>
<link rel="stylesheet" href="websitiary.css" type="text/css">
<link rel="alternate" href="websitiary.rss" type="application/rss+xml" title="%s">
</head>
<body>
<ol class="toc">
%s
</ol>
<div class="contents">
%s
</div>
</body>
</html>
OUT
      template % [ti, ti, li, bd]
    end
  }

  # @view = nil
  @view = 'w3m "%s"'
  @default_options = {}
  @default_profiles = []
  @profiles = []
  @outfile = {}
  profile 'config.rb'
  parse_command_line_args(args)
  @output_format ||= ['html']
  @output_title = %{#{APPNAME}: #{@profiles.join(", ")}}
end
297
+
298
+
299
# Evaluate the command-line arguments and load the requested profiles.
# Arguments that are not options are taken as profile names; when there
# are none, @default_profiles is used instead.
# args:: Array of Strings; recognised options are removed from it.
# Returns self.
def parse_command_line_args(args)
  $logger.debug "parse_command_line_args: #{args}"
  # The block parameter has its own name so that it does not shadow the
  # local variable the parser is assigned to.
  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{APPNAME} [OPTIONS] [PROFILES] > [OUT]"
    opts.separator ''
    opts.separator "#{APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
    opts.separator 'the terms of the GNU General Public License version 2 or newer.'
    opts.separator ''

    opts.separator 'General Options:'

    opts.on('-c', '--cfg=DIR', String, 'Configuration directory') do |value|
      @cfgdir = value
    end

    opts.on('-e', '--edit=PROFILE', String, 'Edit a profile') do |value|
      edit_profile value
      exit 0
    end

    opts.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |value|
      output_format(*value.split(/,/))
    end

    opts.on('--[no-]ignore-age', 'Ignore age limits') do |bool|
      set :ignore_age => bool
    end

    opts.on('--log=DESTINATION', String, 'Log destination') do |value|
      # '-' yields false here, which makes AppLog fall back to $stdout.
      @logger = AppLog.new(value != '-' && value)
    end
    opts.on('-o', '--output=FILENAME', String, 'Output') do |value|
      output_file(value)
    end

    opts.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |value|
      key, val = value.split(/=/, 2)
      # NOTE(review): the value is evaluated as Ruby code. That is only
      # acceptable as long as the command line is written by the user.
      set key.intern => eval(val)
    end

    opts.on('--review', 'View last diff') do |value|
      view_output
      exit 0
    end

    opts.separator ''
    opts.separator 'Available profiles:'
    opts.separator Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}.join(', ')

    opts.separator ''
    opts.separator 'Other Options:'

    opts.on('--debug', 'Show debug messages') do |v|
      $VERBOSE = $DEBUG = true
      @logger.set_level(:debug)
    end

    opts.on('-q', '--quiet', 'Be mostly quiet') do |v|
      @logger.set_level(:quiet)
    end

    opts.on('-v', '--verbose', 'Run verbosely') do |v|
      $VERBOSE = true
      @logger.set_level(:verbose)
    end

    opts.on_tail('-h', '--help', 'Show this message') do
      puts opts
      exit 1
    end
  end

  @profiles = parser.parse!(args)
  @profiles = @default_profiles if @profiles.empty?
  @profiles.each {|name| profile name}

  self
end
379
+
380
+
381
+ # Retrieve an option for an url
382
+ # url:: String
383
+ # opt:: Symbol
384
# Look up option +opt+ for +url+.
# The source's own value wins; otherwise its :use value is consulted.
# A Symbol value is resolved through get_option; any other non-nil
# value is returned as is. If nothing is found, +default+ is returned,
# or, when no default was given, the option table's :default entry.
# A url without an entry in @urls is treated as having no options.
def get(url, opt, default=nil)
  opts = @urls[url] || {}
  $logger.debug "get: opts=#{opts.inspect}"
  case opt
  when :diffprocess, :format
    # These two fall back to the :diff key of the source options.
    opt_ = opts.has_key?(opt) ? opt : :diff
  else
    opt_ = opt
  end

  $logger.debug "get: opt=#{opt} opt_=#{opt_} #{opts[opt_]} #{opts[:use]}"
  if opts.has_key?(opt_)
    val = opts[opt_]
  elsif opts.has_key?(:use)
    val = opts[:use]
  else
    val = nil
  end

  case val
  when nil
  when Symbol
    $logger.debug "get: val=#{val}"
    success, rv = get_option(opt, val)
    $logger.debug "get: #{success}, #{rv}"
    return rv if success
  else
    $logger.debug "get: return val=#{val}"
    return val
  end

  unless default
    success, default1 = get_option(opt, :default)
    default = default1 if success
  end

  $logger.debug "get: return default=#{default}"
  default
end
424
+
425
+
426
+ # Configuration command:
427
+ # Set the default profiles
428
# profile_names:: Names of the profiles to load when the command line
#                 names none. Replaces any previous list.
def default(*profile_names)
  names = profile_names
  @default_profiles = names
end
431
+
432
+
433
+ # Configuration command:
434
+ # Load a profile
435
# Load a profile and evaluate it in the scope of this object.
# profile_name:: '-' reads one url per line from standard input and
#                registers each via source; any other name is resolved
#                with profile_filename.
# Returns true when a profile file was evaluated, false when no file
# was found. Errors raised by the profile propagate to the caller.
def profile(profile_name)
  case profile_name
  when '-'
    readlines.map! {|l| l.chomp}.each {|url| source url}
  else
    fn = profile_filename(profile_name)
    if fn
      $logger.debug "Profile: #{fn}"
      contents = File.read(fn)
      @current_profile = fn
      begin
        # Passing the filename makes error messages and backtraces point
        # at the profile instead of an anonymous "(eval)".
        self.instance_eval(contents, fn)
      ensure
        @current_profile = nil
      end
      true
    else
      false
    end
  end
end
456
+
457
+
458
+ # Define an options shortcut.
459
# Register +symbol+ as a named entry in the option tables.
# symbol:: Name of the shortcut.
# args::   Hash of option type => value. The special key :delegate
#          names the entry that every option type not listed in +args+
#          is pointed to.
# A warning is logged when there is no :delegate and one of the core
# option types is not supplied.
def shortcut(symbol, args)
  given = args.keys
  missing = @options.keys - given
  delegated = given.include?(:delegate)
  core = [:download, :downloadformat, :diff, :format, :diffprocess]

  # :downloadprocess
  unless delegated
    if missing.any? {|field| core.include?(field)}
      $logger.warn "Shortcut #{symbol}: Undefined fields: #{missing.inspect}"
    end
  end

  if delegated
    target = args[:delegate]
    missing.each {|field| @options[field][symbol] = target}
  end

  args.each do |field, val|
    next if field == :delegate
    @options[field][symbol] = val
  end
end
480
+
481
+
482
+ # Set the output format.
483
# format:: One or more of 'text', 'html', 'rss'.
# Logs a fatal message and exits with status 5 if any name is unknown.
def output_format(*format)
  known = ['text', 'html', 'rss']
  if format.any? {|name| !known.include?(name)}
    $logger.fatal "Unknown output format: #{format}"
    exit 5
  end
  @output_format = format
end
490
+
491
+
492
+ # Set the output file.
493
# filename::  Destination of the generated output.
# outformat:: Key under which the name is stored in @outfile; nil when
#             not given.
def output_file(filename, outformat=nil)
  @outfile.store(outformat, filename)
end
496
+
497
+
498
+ # Configuration command:
499
+ # Set global options.
500
+ # type:: Symbol
501
+ # options:: Hash
502
# Merge +options+ into the option table of +type+. An unknown type is
# logged as an error and otherwise ignored.
def option(type, options)
  $logger.info "option #{type}: #{options.inspect}"
  table = @options[type]
  unless table
    return $logger.error("Unknown option type: #{type} (#{options.inspect})")
  end
  table.merge!(options)
end
511
+
512
+
513
+ # Set a global option.
514
# Store each value of +options+ directly under its key in @options,
# replacing whatever was stored under that key before.
def global(options)
  options.each do |type, value|
    @options.store(type, value)
  end
end
519
+
520
+
521
+ # Configuration command:
522
+ # Set the default value for source-options.
523
# options:: Hash merged into the defaults that source copies into every
#           source it defines afterwards.
def set(options)
  $logger.debug "set: #{options.inspect}"
  @default_options.update(options)
end
527
+
528
+
529
+ # Configuration command:
530
+ # Unset a default source-option.
531
# options:: Keys to remove from the default source-options.
def unset(*options)
  options.each {|name| @default_options.delete(name)}
end
536
+
537
+
538
+ # Configuration command:
539
+ # Define a source.
540
+ # urls:: String
541
# urls:: String with one url per line.
# opts:: Hash of source options; they take precedence over the current
#        default options.
# Every url is registered in @urls and appended to @todo.
def source(urls, opts={})
  lines = urls.split("\n").flatten.compact
  lines.each do |url|
    @urls[url] = @default_options.dup.update(opts)
    @todo.push(url)
  end
end
547
+
548
+
549
+ # Configuration command:
550
+ # Set the default download processor. The block takes the
551
+ # downloaded text (STRING) as argument.
552
# Store the given block as the :default entry of the :downloadprocess
# option table.
def downloadprocess(&block)
  @options[:downloadprocess].store(:default, block)
end
555
+
556
+
557
+ # Configuration command:
558
+ # Set the default diff processor. The block takes the
559
+ # diff text (STRING) as argument.
560
# Store the given block as the default diff processor, i.e. the
# :default entry of the :diffprocess option table. The block receives
# the diff text.
# (Storing it under :diff would replace the default diff *command*,
# which is called with two filenames, not with the diff text.)
def diffprocess(&block)
  @options[:diffprocess][:default] = block
end
563
+
564
+
565
+ # Configuration command:
566
+ # Set the editor.
567
# cmd:: Command template of the editor; edit_profile fills in the
#       profile's filename with String#%.
def edit(cmd)
  editor = cmd
  @cmd_edit = editor
end
570
+
571
+
572
+ # Configuration command:
573
+ # Set the viewer.
574
# view:: Command template of the viewer; the output filename is filled
#        in with String#%. A false or nil value disables viewing.
def view(view)
  viewer = view
  @view = viewer
end
577
+
578
+
579
+ # Configuration command:
580
+ # Set the default diff program.
581
# diff:: New :default entry of the :diff option table.
def diff(diff)
  @options[:diff].store(:default, diff)
end
584
+
585
+
586
+ # Configuration command:
587
+ # Set the default downloader.
588
# download:: New :default entry of the :download option table.
def download(download)
  @options[:download].store(:default, download)
end
591
+
592
+
593
+ # Format a diff according to URL's source options.
594
# Apply the :format option of +url+ to +difftext+.
def format(url, difftext)
  template = get(url, :format)
  eval_arg(template, [difftext])
end
598
+
599
+
600
+ # Apply some arguments to a format.
601
+ # format:: String or Proc
602
+ # args:: Array of Arguments
603
# Apply +args+ to +format+.
# format::  nil, a Proc, or a format String.
# args::    Array; splatted into the Proc or applied with String#%.
# default:: Returned when +format+ is nil.
# When +format+ is a String and a block is given, the formatted string
# is passed to the block and the block's result is returned.
def eval_arg(format, args, default=nil, &process_string)
  return default if format.nil?

  if format.is_a?(Proc)
    $logger.debug "eval proc: #{format} #{args.inspect}"
    return format.call(*args)
  end

  ca = format % args
  $logger.debug "eval string: #{ca}"
  process_string ? process_string.call(ca) : ca
end
620
+
621
+
622
+ # Apply the argument to cmd (a format String or a Proc). If a
623
+ # String, execute the command.
624
# Apply +args+ to +cmd+ via eval_arg. A String command is run through
# the shell and its standard output is returned.
# NOTE(review): the arguments are interpolated into the command line
# unescaped; confirm that callers only pass trusted values.
def call_cmd(cmd, args, default=nil)
  run = lambda {|command_line| `#{command_line}`}
  eval_arg(cmd, args, default, &run)
end
627
+
628
+
629
+ # Generate & view the final output.
630
+ # difftext:: Hash
631
# Render +difftext+ in every configured output format.
# For each format the get_output_<format> method produces the text. It
# is printed to stdout when the output file is '-', otherwise written
# to the file and handed to view_output_<format>.
# An unknown format is fatal (exit status 5).
def show_output(difftext)
  if difftext.empty?
    $logger.warn 'No news is good news'
    return
  end

  @output_format.each do |outformat|
    generator = "get_output_#{outformat}"

    unless respond_to?(generator)
      $logger.fatal "Unknown output format: #{outformat}"
      exit 5
    end

    out = send(generator, difftext)
    next unless out

    outfile = get_outfile(outformat)
    if outfile == '-'
      puts out
    else
      File.open(outfile, 'w') {|io| io.puts out}
      viewer = "view_output_#{outformat}"
      self.send(viewer, outfile)
    end
  end
end
659
+
660
+
661
# Render the diffs as plain text.
# difftext:: url => diff text; HTML diffs are converted to text first.
# Each section consists of the url, the annotation, an empty line and
# the diff. Sections are separated by a line of dashes. Entries whose
# text is nil or empty are left out (skipping with nil, not false, so
# that compact really removes them).
def get_output_text(difftext)
  separator = "\n\n#{('-' * 68)}\n\n"
  sections = difftext.map do |url, text|
    next unless text
    text = html_to_text(text) if is_html?(text)
    next if text.empty?
    [url, difftext_annotation(url), nil, text].join("\n")
  end
  sections.compact.join(separator)
end
669
+
670
+
671
# Render the diffs as an RSS feed and return it as a String.
# difftext:: url => diff text.
# Requires the global option :rss[:url]; without it a fatal message is
# logged and the program exits with status 5.
def get_output_rss(difftext)
  success, rss_url = get_option(:rss, :url)
  unless success
    $logger.fatal "Global option :rss[:url] not defined."
    exit 5
  end

  success, rss_version = get_option(:rss, :version)
  # Loaded on demand: the library to load depends on the configured version.
  require "rss/#{rss_version}"

  rss = RSS::Rss.new(rss_version)
  chan = RSS::Rss::Channel.new
  chan.title = @output_title
  [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
    ok, val = get_option(:rss, field)
    # These are channel-level fields, so they are set on the channel
    # (no item exists yet at this point).
    chan.send(format_symbol(field, '%s='), val) if ok
  end
  chan.link = rss_url
  rss.channel = chan

  difftext.each do |url, text|
    rss_format = get(url, :rss_format, :plain_text)
    text = strip_tags(text, :format => rss_format)
    next if text.empty?

    item = RSS::Rss::Channel::Item.new
    item.title = get(url, :title, File.basename(url))
    item.link = eval_arg(get(url, :rewrite_link, '%s'), [url])
    [:author, :date, :enclosure, :category, :pubDate].each do |field|
      val = get(url, format_symbol(field, 'rss_%s'))
      item.send(format_symbol(field, '%s='), val) if val
    end

    annotation = difftext_annotation(url)
    case rss_format
    when :plain_text
      annotation = "<pre>#{annotation}</pre>" if annotation
      item.description = %{#{annotation}<pre>#{text}</pre>}
    else
      item.description = %{<pre>#{annotation}</pre>\n#{text}}
    end
    chan.items << item
  end

  rss.to_s
end
720
+
721
+
722
# Render the diffs as one HTML page and return it as a String.
# difftext:: url => diff text. Entries that are empty after tag
#            stripping are left out.
# The page consists of a table of contents and one section per url.
# The section heading links to the url, or to the :rewrite_link
# rendering of it when that option is set. Annotation, formatting and
# option lookups always use the original url, since the option tables
# and the stored copies are keyed by it.
def get_output_html(difftext)
  difftext = difftext.map do |url, text|
    tags = get(url, :strip_tags)
    text = strip_tags(text, :tags => tags) if tags
    text.empty? ? nil : [url, text]
  end
  difftext.compact!

  toc = difftext.map do |url, text|
    lab = Digest::MD5.hexdigest(url)
    ti = get(url, :title, File.basename(url))
    # %{<li class="toc"><a class="toc" href="\##{lab}">#{ti}</a> <a class="external" href="#{url}">[W]</a></li>}
    %{<li class="toc"><a class="toc" href="\##{lab}">#{ti}</a></li>}
  end.join("\n")

  cnt = difftext.map do |url, text|
    lab = Digest::MD5.hexdigest(url)
    ti = get(url, :title, File.basename(url))
    if (rewrite = get(url, :rewrite_link))
      link = eval_arg(rewrite, [url])
      ext = ''
    else
      link = url
      old = %{<a class="old" href="#{file_url(backupname(url))}">old</a>}
      lst = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
      ext = %{ (#{old}, #{lst})}
    end
    # difftext_annotation returns nil when one of the copies is missing.
    annotation = difftext_annotation(url).to_s
    <<HTML
<div class="webpage">
<h1 class="diff" id="#{lab}"><a class="external" href="#{link}">#{ti}</a>#{ext}</h1>
<div class="annotation">
#{CGI::escapeHTML(annotation)}
</div>
<div class="diff">
#{format(url, text)}
</div>
</div>
HTML
  end.join(('<hr class="separator"/>') + "\n")

  success, template = get_option(:page, :format)
  unless success
    success, template = get_option(:page, :simple)
  end
  return eval_arg(template, [@output_title, toc, cnt])
end
767
+
768
+
769
+ # Get the backup filename.
770
# Path of the previous copy of +url+: the encoded url inside the 'old'
# subdirectory of the configuration directory.
def backupname(url)
  parts = [@cfgdir, 'old', encode(url)]
  File.join(*parts)
end
773
+
774
+
775
+ # Get the filename for the freshly downloaded copy.
776
# Path of the current copy of +url+: the encoded url inside the
# 'latest' subdirectory of the configuration directory.
def latestname(url)
  parts = [@cfgdir, 'latest', encode(url)]
  File.join(*parts)
end
779
+
780
+
781
+ # Guess path's dirname.
782
+ # foo/bar -> foo
783
+ # foo/bar.txt -> foo
784
+ # foo/bar/ -> foo/bar
785
# A path ending in '/' is taken to be a directory and returned without
# that slash; for anything else the dirname is returned.
def guess_dir(path)
  return File.dirname(path) unless path[-1..-1] == '/'
  path[0..-2]
end
788
+
789
+
790
+ # Strip the url's last part (after #).
791
# Return +url+ with everything from the first '#' to the end of that
# line removed.
def canonic_url(url)
  fragment = /#.*$/
  url.sub(fragment, '')
end
794
+
795
+
796
# A copy of the :default entry of the :strip_tags option table, or nil
# when there is no such entry.
def strip_tags_default
  found, tags = get_option(:strip_tags, :default)
  return nil unless found
  tags.dup
end
800
+
801
+
802
# Remove elements from a document.
# doc::  String (parsed with Hpricot) or an already parsed document.
# args:: :tags   => selectors to remove (default: strip_tags_default)
#        :format => :hpricot returns the document object itself; any
#                   other value X returns doc.to_X (default: to_html).
def strip_tags(doc, args={})
  tags = args[:tags] || strip_tags_default
  doc = Hpricot(doc) if doc.is_a?(String)
  tags.each {|tag| doc.search(tag).remove}

  wanted = args[:format]
  return doc if wanted == :hpricot
  doc.send("to_#{wanted || :html}")
end
818
+
819
+
820
+ # Check whether path is eligible on the basis of url or path0.
821
+ # This checks either for a :match option for url or the extensions
822
+ # of path0 and path.
823
# url::   Source url whose :match option is consulted.
# path0:: Path of the source url.
# path::  Path of the candidate.
# With a :match option the result is that of path =~ rx (match position
# or nil); without one it is true when both paths have the same file
# extension.
def eligible_path?(url, path0, path)
  rx = get(url, :match)
  return path =~ rx if rx
  File.extname(path0) == File.extname(path)
end
831
+
832
+
833
+ # Scan hpricot document for hrefs and push them onto @todo if not
834
+ # already included.
835
# Queue the links found in a downloaded document.
# url::       The url the document was loaded from.
# hpricot::   The parsed document.
# condition:: Block called with (source URI, source directory Pathname,
#             link URI, link directory Pathname); a link is queued only
#             when the block returns a true value.
# Links already in @done or @todo and links excluded by robots.txt are
# skipped. A queued link inherits the options of +url+, gets its
# basename as :title and, if the source has a :depth, that depth minus
# one. Nothing is queued when the source's :depth is 0 or less.
# Errors are logged and abort the scan of this document. Only
# StandardError is rescued, so interrupts and exit requests still
# propagate.
def push_hrefs(url, hpricot, &condition)
  begin
    depth = get(url, :depth)
    return if depth and depth <= 0
    uri0 = URI.parse(url)
    pn0 = Pathname.new(guess_dir(File.expand_path(uri0.path)))
    (hpricot / 'a').each do |a|
      href = a['href']
      next if href.nil?
      curl = canonic_url(href)
      next if @done.include?(curl) or @todo.include?(curl)
      uri = URI.parse(href)
      next unless robots_allowed?(curl, uri)
      pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
      if condition.call(uri0, pn0, uri, pn)
        opts = @urls[url].dup
        opts[:title] = File.basename(curl)
        opts[:depth] = depth - 1 if depth and depth >= 0
        @urls[curl] = opts
        @todo << curl
      end
    end
  rescue StandardError => e
    # $logger.error e #DBG#
    $logger.error e.message
    $logger.debug e.backtrace
  end
end
863
+
864
+
865
+ # Rewrite urls in doc
866
+ # url:: String
867
+ # doc:: Hpricot document
868
# Rewrite the link targets (a/href) and image sources (img/src) of
# +doc+ using rewrite_href.
# url:: The url the document was loaded from.
# doc:: The parsed document; it is modified in place and returned.
# rewrite_href returns nil when there is nothing to rewrite. In that
# case the attribute is left untouched, for links as well as for
# images, instead of being overwritten with nil.
def rewrite_urls(url, doc)
  uri = URI.parse(url)
  urd = guess_dir(uri.path)
  (doc / 'a').each do |a|
    href = rewrite_href(a['href'], url, uri, urd)
    a['href'] = href if href
  end
  (doc / 'img').each do |a|
    href = rewrite_href(a['src'], url, uri, urd)
    a['src'] = href if href
  end
  doc
end
881
+
882
+
883
+ # Try to make href an absolute url.
884
# Compute the rewritten form of a link found in a document.
# href:: The link as found in the document (may be nil).
# url::  The url of the document (String).
# uri::  The same url as a URI object.
# urd::  The directory part of the document's path.
# Returns the rewritten link as a String, or nil when there is nothing
# to rewrite (no href, or the href contains a "word:" sequence and is
# therefore left alone) or when rewriting failed. Failures are logged.
# NOTE(review): the "word:" test is not anchored, so it also matches a
# colon later in the href; confirm whether that is intended.
def rewrite_href(href, url, uri, urd)
  return unless href
  href = href.strip
  rv = nil

  if href =~ /\w+:/
    # nothing to do
  elsif uri.relative? and URI.parse(href).relative?
    rv = File.join(urd, href) if uri.instance_of?(URI::Generic)
  elsif href[0..0] == '#'
    rv = url + href
  else
    rv = uri.merge(href).to_s
  end

  rv
rescue StandardError => e
  # $logger.error e #DBG#
  $logger.error e.message
  $logger.debug e.backtrace
  # Return nil explicitly; otherwise the logger's return value would be
  # handed to the caller as if it were the new link.
  nil
end
916
+
917
+
918
+ # Return a Proc that takes a text as argument and highlights occurrences of rx.
919
+ # rx:: Regular expression
920
+ # group:: A number (default: 0)
921
+ # tag:: The HTML tag to use (default: "span")
922
# Build a Proc that wraps every match of +rx+ in a text in
# <tag class="highlight">...</tag>.
# rx::    Regular expression
# group:: Number of the match group to put inside the tag (default: 0)
# tag::   Name of the HTML tag (default: "span")
def highlighter(rx, group=nil, tag='span')
  replacement = %{<#{tag} class="highlight">\\#{group || 0}</#{tag}>}
  lambda {|text| text.gsub(rx, replacement)}
end
925
+
926
+
927
+ private
928
+
929
# Annotation line for +url+: the modification times of the old and the
# latest copy, rendered with the :format_annotation option (default
# '%s >>> %s'). Returns nil unless both copies exist.
def difftext_annotation(url)
  bak = backupname(url)
  lst = latestname(url)
  return unless File.exist?(bak) and File.exist?(lst)

  template = get(url, :format_annotation, '%s >>> %s')
  eval_arg(template, [File.mtime(bak), File.mtime(lst)])
end
936
+
937
+
938
# Insert +name+ into +format_string+ and return the result as a Symbol.
def format_symbol(name, format_string)
  text = format_string % name.to_s
  text.intern
end
941
+
942
+
943
# Heuristic: does +text+ contain the opening of one of a fixed list of
# HTML tags? Returns the position of the first such tag, or nil.
def is_html?(text)
  markup = /<(div|a|span|body|html|script|p|table|td|tr|th|li|dt|br|hr|em|b)\b/
  text =~ markup
end
946
+
947
+
948
# Parse +text+ with Hpricot and return the document's plain-text
# rendering.
def html_to_text(text)
  doc = Hpricot(text)
  doc.to_plain_text
end
951
+
952
+
953
# May +url+ be downloaded according to the host's robots.txt?
# url:: The url as String (also the cache key).
# uri:: The same url as URI object.
# Decisions are cached in @allow, parsed rules per host in @robots.
# Without the RobotRules library every url is allowed and a warning is
# logged once. A url without a robots.txt location (robots_uri returns
# nil) is allowed without any download attempt. If robots.txt cannot be
# loaded or parsed, this is logged at debug level and, unless rules
# exist at that point, the url is allowed.
def robots_allowed?(url, uri)
  return @allow[url] if @allow.has_key?(url)

  unless defined?(RobotRules)
    unless @robots[:warning]
      $logger.warn 'robots.txt is ignored: Please install robot_rules.rb from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589 in $RUBYLIB'
      @robots[:warning] = true
    end
    @allow[url] = true
    return true
  end

  host = uri.host

  unless (rules = @robots[host])
    # Check for nil before converting to a String: nil.to_s is "",
    # which is a true value and would defeat this guard.
    robots_location = robots_uri(uri)
    return true unless robots_location
    rurl = robots_location.to_s
    begin
      robots_txt = open(rurl).read
      rules = RobotRules.new(@user_agent)
      rules.parse(rurl, robots_txt)
      @robots[host] = rules
      $logger.info "Loaded #{rurl} for #{@user_agent}"
      $logger.debug robots_txt
    rescue StandardError => e
      $logger.debug "Could not load #{rurl}: #{e.message}"
    end
  end

  allowed = !(rules and !rules.allowed?(url))
  $logger.info "Excluded url: #{url}" unless allowed
  @allow[url] = allowed
  allowed
end
994
+
995
+
996
# The URI of '/robots.txt' on the host of +uri+, or nil when +uri+ is
# relative.
def robots_uri(uri)
  uri.relative? ? nil : uri.merge('/robots.txt')
end
999
+
1000
+
1001
# Reduce +filename+ to "<parent directory>/<basename>" and encode it,
# leaving ':' and '/' unescaped.
def file_url(filename)
  parent = File.basename(File.dirname(filename))
  leaf = File.basename(filename)
  # "file://#{encode(filename, ':/')}"
  encode(File.join(parent, leaf), ':/')
end
1006
+
1007
+
1008
# Like get_option, but returns just the value, or +default+ when the
# option is not defined.
def get_optionvalue(opt, val, default=nil)
  ok, found = get_option(opt, val)
  ok ? found : default
end
1016
+
1017
+
1018
# Look up entry +val+ in the option table +opt+.
# Returns [true, value] when found. A Symbol value is treated as a
# reference to another entry of the same table and followed.
# Returns [false, val] when the table or the entry does not exist.
def get_option(opt, val)
  vals = @options[opt]
  $logger.debug "val=#{val} vals=#{vals.inspect}"

  unless vals and vals.has_key?(val)
    $logger.debug "get_option no: #{opt} => #{val.inspect}"
    return [false, val]
  end

  rv = vals[val]
  $logger.debug "get_option ok: #{opt} => #{rv.inspect}"
  if rv.is_a?(Symbol)
    $logger.debug "get_option re: #{rv}"
    get_option(opt, rv)
  else
    $logger.debug "get_option true, #{rv}"
    [true, rv]
  end
end
1037
+
1038
+
1039
# Percent-encode +text+.
# Letters, digits, ',', '.', '_', '-' and the characters listed in
# +chars+ are kept; every other character is replaced by %xx (lowercase
# hex) for each of its bytes.
# The bytes are taken with unpack('C*') because indexing the matched
# String only yields an Integer on Ruby 1.8; later versions return a
# String there, which '%02x' cannot format.
def encode(text, chars='')
  text.gsub(/[^a-zA-Z0-9,._#{chars}-]/) do |t|
    t.unpack('C*').map {|byte| '%%%02x' % byte}.join
  end
end
1042
+
1043
+
1044
# Filename suffix for an output format: the entry in @suffix if there
# is one, otherwise the format name itself.
def output_suffix(outformat)
  mapped = @suffix[outformat]
  mapped ? mapped : outformat
end
1047
+
1048
+
1049
# Output filename for +outformat+: the name registered in @outfile
# under that key, or "websitiary.<suffix>" in the configuration
# directory. Without an explicit format the suffix is derived from the
# first configured output format.
def get_outfile(outformat=nil)
  explicit = @outfile[outformat]
  return explicit if explicit

  suffix = output_suffix(outformat || @output_format[0])
  File.join(@cfgdir, "websitiary.#{suffix}")
end
1052
+
1053
+
1054
# Open the output in the viewer of the first configured output format.
# outfile:: File to show (default: get_outfile).
# This can be called while the command line is still being parsed
# (--review), i.e. before the output format has received its default.
# In that case the format defaults to html here, the same default that
# is applied after parsing.
def view_output(outfile=nil)
  @output_format ||= ['html']
  send("view_output_#{@output_format[0]}", outfile || get_outfile)
end
1057
+
1058
+
1059
# Run the viewer command (@view, with +outfile+ filled in) through the
# shell. Does nothing when no viewer is configured.
def view_output_general(outfile)
  return unless @view
  system((@view % outfile))
end
# All output formats share the same viewer.
alias :view_output_html :view_output_general
alias :view_output_text :view_output_general
alias :view_output_rss :view_output_general
1067
+
1068
+
1069
# Open a profile in the configured editor (@cmd_edit).
# profile:: Profile name, resolved with profile_filename.
# The editor is started with system, like the viewer, so that it keeps
# the terminal's standard output; backticks would capture the output
# and leave a terminal editor without a screen.
# Returns the result of Kernel#system.
def edit_profile(profile)
  fn = profile_filename(profile)
  $logger.debug "edit: #{fn}"
  system(@cmd_edit % fn)
end
1074
+
1075
+
1076
# Locate the file of a profile.
# profile_name:: Name with or without the '.rb' extension.
# The current directory is searched first, then the configuration
# directory. Returns the filename, or nil if there is no such file.
def profile_filename(profile_name)
  profile_name = "#{profile_name}.rb" unless File.extname(profile_name) == '.rb'
  ['.', @cfgdir].each do |dir|
    filename = File.join(dir, profile_name)
    # File.exist? is the spelling available in all Ruby versions.
    return filename if File.exist?(filename)
  end
  nil
end
1088
+
1089
+ end
1090
+
1091
+
1092
+
1093
+ # Hash: The output of the diff commands for each url.
1094
+ attr_reader :difftext
1095
+
1096
+ # The configurator
1097
+ attr_reader :configuration
1098
+
1099
+
1100
+ # args:: Array of command-line (like) arguments.
1101
# Create the configuration from the command-line arguments and make
# sure the configuration directory holds a style sheet.
# args:: Array of command-line (like) arguments.
# A default websitiary.css is written only if none exists, so an edited
# style sheet is never overwritten.
def initialize(args=[])
  @configuration = Configuration.new(self, args)
  @difftext = {}

  ensure_dir(@configuration.cfgdir)
  css = File.join(@configuration.cfgdir, 'websitiary.css')
  # File.exist? is the spelling available in all Ruby versions.
  unless File.exist?(css)
    $logger.info "Copying default css file: #{css}"
    File.open(css, 'w') do |io|
      io.puts <<CSS
body {
    color: black;
    background-color: #f0f0f0;
}
a.external {
}
a.old {
}
a.latest {
}
a.toc {
}
ol.toc {
    float: left;
    width: 200px;
    position: fixed;
    padding: 0;
    margin: 0;
}
li.toc {
    list-style: none;
    border: 1px solid silver;
    background-color: #fafafa;
    padding: 0.5em;
    font-size: 80%;
    font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
li.toc:hover {
    background-color: #ffff8d;
}
div.contents {
    margin-left: 210px;
    min-width: 16em;
}
div.webpage {
    margin: 5px 0 5px 0;
    padding: 5px;
    border: 1px solid silver;
    background-color: white;
}
h1.diff {
    font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
div.diff {
    padding-left: 2em;
}
pre.diff {
    padding-left: 2em;
}
hr.separator {
    width: 100%;
    visibility: hidden;
}
.error {
    color: yellow;
    background-color: red;
}
.highlight {
    background-color: #ffc730;
}
CSS
    end
  end
end
1175
+
1176
+
1177
+ # Process the sources in @configuration.url as defined by profiles
1178
+ # and command-line options. The differences are stored in @difftext (a Hash).
1179
+ # show_output:: If true, show the output with the defined viewer.
1180
# Work through the queue of sources (@configuration.todo): download
# each url, diff it against the previous copy and collect the result.
# show_output:: If true, show the output afterwards.
# A source with an :hours or :days option is skipped while its latest
# copy is younger than that, unless :ignore_age is set.
def process(show_output=true)
  # Iterate over the queue itself rather than a copy, so that any urls
  # appended to it while this loop runs are visited as well.
  @configuration.todo.each do |url|
    opts = @configuration.urls[url]
    $logger.debug "Source: #{@configuration.get(url, :title, url)}"
    older = @configuration.backupname(url)
    ensure_dir(File.dirname(older))
    $logger.debug "older: #{older}"
    latest = @configuration.latestname(url)
    ensure_dir(File.dirname(latest))
    $logger.debug "latest: #{latest}"

    # File.exist? is the spelling available in all Ruby versions.
    if File.exist?(latest) and !opts[:ignore_age]
      if (hdiff = opts[:hours])
        tdiff = hdiff * HOUR_SECS
        $logger.debug "hours: #{hdiff} (#{tdiff}s)"
      elsif (ddiff = opts[:days])
        tdiff = ddiff * DAY_SECS
        $logger.debug "days: #{ddiff} (#{tdiff}s)"
      else
        tdiff = nil
      end

      if tdiff
        tn = Time.now
        tl = File.mtime(latest)
        td = tn - tl
        if td < tdiff
          $logger.info "Skip #{@configuration.get(url, :title, url).inspect}: Only #{(td / DAY_SECS).to_i}d old (#{(tdiff / DAY_SECS).to_i}d)"
          next
        end
      end
    end

    # The current copy becomes the old one before the new download.
    move(latest, older)
    if download(url, latest, opts)
      difftext = diff(url, older, latest, opts)
      if difftext
        $logger.debug "difftext: #{difftext}"
        accumulate(url, difftext, opts)
      end
    end
  end
  show if show_output
end
1224
+
1225
+
1226
+
1227
+ private
1228
+
1229
# Run the :download command configured for +url+ and write the resulting
# text to the file +latest+.
# url::    The source to retrieve.
# latest:: Name of the file that receives the text.
# opts::   Optional hash; :lines and :cols slice the text, :sort (true
#          or a Proc) orders the lines, :strip drops blank lines.
# Returns false if +url+ is already in @configuration.done or the command
# yields no text; true once the file has been written.
def download(url, latest, opts)
  if @configuration.done.include?(url)
    $logger.info "Already downloaded: #{@configuration.get(url, :title, url).inspect}"
    return false
  end

  $logger.info "Download: #{@configuration.get(url, :title, url).inspect}"
  # Registered before the command runs, so a failed attempt also counts.
  @configuration.done << url
  fetcher = @configuration.get(url, :download)
  body = @configuration.call_cmd(fetcher, [url])
  if !body
    $logger.warn "no contents: #{@configuration.get(url, :title, url)}"
    return false
  end

  if opts
    rows = body.split("\n")

    line_range = opts[:lines]
    if line_range
      $logger.debug "download: lines=#{line_range}"
      rows = rows[line_range] || []
    end

    col_range = opts[:cols]
    if col_range
      $logger.debug "download: cols=#{col_range}"
      # Rows shorter than the requested columns slice to nil; drop them.
      rows = rows.map { |row| row[col_range] }.compact
    end

    order = opts[:sort]
    if order
      $logger.debug "download: sort=#{order}"
      case order
      when true then rows = rows.sort
      when Proc then rows = rows.sort(&order)
      end
    end

    if opts[:strip]
      $logger.debug "download: strip!"
      rows = rows.reject { |row| row !~ /\S/ }
    end

    body = rows.join("\n")
  end

  postprocessor = @configuration.get(url, :downloadprocess)
  if postprocessor
    $logger.debug "download process: #{postprocessor}"
    body = @configuration.call_cmd(postprocessor, [body])
    $logger.debug body
  end

  File.open(latest, 'w') do |out|
    out.puts(body)
  end
  true
end
1281
+
1282
+
1283
# Compare the snapshot files +old+ and +new+ of +url+ with the configured
# :diff command and, if set, pass the result through the :diffprocess
# command.
# url::  The source the snapshots belong to.
# old::  File name of the previous snapshot.
# new::  File name of the current snapshot.
# opts:: Not used by this method.
# Returns the difference text if it contains any non-whitespace
# character, nil otherwise (also when +old+ does not exist yet).
def diff(url, old, new, opts)
  # File.exist? instead of File.exists?, which Ruby 3.2 removed.
  unless File.exist?(old)
    # Nothing to compare against on the first run.
    $logger.info "Initial copy: #{old.inspect}"
    return nil
  end

  $logger.debug "diff: #{old} <-> #{new}"
  difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new])
  $logger.debug "diff: #{difftext}"

  if difftext =~ /\S/
    if (pprc = @configuration.get(url, :diffprocess))
      $logger.debug "diff process: #{pprc}"
      difftext = @configuration.call_cmd(pprc, [difftext])
    end
    $logger.debug "difftext: #{difftext}"
    # The post-processor may have reduced the diff to nothing.
    if difftext =~ /\S/
      $logger.warn "Changed: #{@configuration.get(url, :title, url).inspect}"
      return difftext
    end
  end

  $logger.debug "Unchanged: #{@configuration.get(url, :title, url).inspect}"
  nil
end
1307
+
1308
+
1309
# Remember +difftext+ as the difference found for +url+ in @difftext.
# opts is accepted but not used here.
def accumulate(url, difftext, opts)
  @difftext.store(url, difftext)
end
1312
+
1313
+
1314
# Hand the collected differences (@difftext) to the configuration's
# show_output and return whatever it returns.
def show
  collected = @difftext
  @configuration.show_output(collected)
end
1317
+
1318
+
1319
# Rename the file +from+ to +to+, replacing +to+ if it already exists.
# Does nothing (and returns nil) when +from+ does not exist.
def move(from, to)
  # File.exist? instead of File.exists?, which Ruby 3.2 removed.
  return unless File.exist?(from)

  $logger.debug "Overwriting: #{from} -> #{to}" if File.exist?(to)
  File.rename(from, to)
end
1325
+
1326
+
1327
# Make sure the directory +dir+ exists.
# dir::      Path of the directory.
# fill_dir:: Optional block, called with +dir+ only when +dir+ had to be
#            created.
# If +dir+ exists but is not a directory, a fatal message is logged and
# the program exits with status 5. Missing parent directories are created
# as well, since Dir.mkdir alone fails when the parent is absent.
def ensure_dir(dir, &fill_dir)
  if File.exist?(dir)
    return if File.directory?(dir)

    $logger.fatal "Not a directory: #{dir}"
    exit 5
  end

  parent = File.dirname(dir)
  # parent == dir only for a path that is its own dirname (e.g. a root);
  # the check keeps the recursion from looping on such a path.
  ensure_dir(parent) unless parent == dir
  Dir.mkdir(dir)
  fill_dir.call(dir) if fill_dir
end
1338
+
1339
+ end
1340
+
1341
+
1342
+
1343
# Run the monitor with the command-line arguments, but only when this
# file is executed as a program rather than loaded by another file.
if $PROGRAM_NAME == __FILE__
  monitor = Websitiary.new(ARGV)
  monitor.process
  # sleep 5
end
1347
+
1348
+
1349
+ # Local Variables:
1350
+ # revisionRx: REVISION\s\+=\s\+\'
1351
+ # End: