websitiary 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +6 -0
- data/README.txt +474 -0
- data/Rakefile +20 -0
- data/bin/websitiary +1351 -0
- data/setup.rb +1585 -0
- metadata +71 -0
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
require 'hoe'
# Loading the executable makes Websitiary::VERSION available below.
load './bin/websitiary'

# Gem specification handled by Hoe.  Description, project URL and change
# log are extracted from README.txt and History.txt.
Hoe.new('websitiary', Websitiary::VERSION) do |spec|
    readme_head = spec.paragraphs_of('README.txt', 0).first

    spec.rubyforge_name = 'websitiary'
    spec.author = 'Thomas Link'
    spec.email = 'sanobast-ruby@yahoo.de'
    spec.summary = 'A simple website monitor'
    spec.description = spec.paragraphs_of('README.txt', 2..5).join("\n\n")
    # Everything after the first line of the README's first paragraph.
    spec.url = readme_head.split(/\n/)[1..-1]
    spec.changes = spec.paragraphs_of('History.txt', 0..1).join("\n\n")
    spec.extra_deps << 'hpricot'
    # spec.need_tgz = false
    spec.need_zip = true
end
|
19
|
+
|
20
|
+
# vim: syntax=Ruby
|
data/bin/websitiary
ADDED
@@ -0,0 +1,1351 @@
|
|
1
|
+
#! /usr/bin/ruby.exe
|
2
|
+
# websitiary.rb -- Website Monitor
|
3
|
+
# @Last Change: 2007-07-16.
|
4
|
+
# Author:: Thomas Link (samul AT web de)
|
5
|
+
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
6
|
+
# Created:: 2007-06-09.
|
7
|
+
#
|
8
|
+
# = TODO
|
9
|
+
# * Find a ruby-based replacement for webdiff (or port webdiff to ruby)
|
10
|
+
# * Built-in support for robots.txt
|
11
|
+
# * Option to append to output files
|
12
|
+
# * Option to trim output files (when appending items)
|
13
|
+
|
14
|
+
|
15
|
+
require 'cgi'
|
16
|
+
require 'digest/md5'
|
17
|
+
require 'logger'
|
18
|
+
require 'optparse'
|
19
|
+
require 'pathname'
|
20
|
+
require 'rbconfig'
|
21
|
+
require 'uri'
|
22
|
+
require 'open-uri'
|
23
|
+
|
24
|
+
|
25
|
+
# Load the third-party libraries.  A missing library is reported on stderr
# but does not abort the script.
['hpricot', 'robot_rules'].each do |lib|
    begin
        require lib
    rescue ScriptError, StandardError => e
        # ScriptError covers LoadError and SyntaxError.  Unlike a blanket
        # "rescue Exception" this lets Interrupt and SystemExit through.
        $stderr.puts <<EOT
#{e.message}
Library could not be loaded: #{lib}
Please see the requirements section at: http://websitiary.rubyforge.org
EOT
    end
end
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
# Basic usage:
|
40
|
+
# Websitiary.new(ARGV).process
|
41
|
+
class Websitiary
|
42
|
+
APPNAME = 'websitiary'
|
43
|
+
VERSION = '0.1.0'
|
44
|
+
REVISION = '1447'
|
45
|
+
MINUTE_SECS = 60
|
46
|
+
HOUR_SECS = MINUTE_SECS * 60
|
47
|
+
DAY_SECS = HOUR_SECS * 24
|
48
|
+
|
49
|
+
# A simple wrapper around Logger.
|
50
|
+
# Thin wrapper around Logger: creates the global $logger and maps the
# application's verbosity symbols onto Logger severities.
class AppLog
    # output:: Log destination handed to Logger.new; nil/false selects $stdout.
    def initialize(output=nil)
        @output = output ? output : $stdout
        $logger = Logger.new(@output, 'daily')
        $logger.progname = APPNAME
        $logger.datetime_format = "%H:%M:%S"
        set_level
    end


    # Set the severity threshold of $logger.
    # level:: :debug, :verbose or :quiet; any other value selects WARN.
    def set_level(level=:default)
        severities = {
            :debug   => Logger::DEBUG,
            :verbose => Logger::INFO,
            :quiet   => Logger::ERROR
        }
        $logger.level = severities.fetch(level, Logger::WARN)
        $logger.debug "Set logger level: #{level}"
    end
end
|
74
|
+
|
75
|
+
|
76
|
+
# This class defines the scope in which profiles are evaluated. Most
|
77
|
+
# of its methods are suitable for use in profiles.
|
78
|
+
class Configuration
|
79
|
+
# Hash (key = URL, value = Hash of options)
|
80
|
+
attr_accessor :urls
|
81
|
+
# Array of urls to be downloaded.
|
82
|
+
attr_accessor :todo
|
83
|
+
# Array of downloaded urls.
|
84
|
+
attr_accessor :done
|
85
|
+
# The user configuration directory
|
86
|
+
attr_accessor :cfgdir
|
87
|
+
# attr_accessor :default_profiles
|
88
|
+
# attr_accessor :options
|
89
|
+
# attr_accessor :cmd_edit
|
90
|
+
|
91
|
+
|
92
|
+
# Set up the built-in defaults (diff/download/format shortcuts, page
# template, viewer, editor), load the user's config.rb and evaluate the
# command-line arguments.
# app::  The application object (stored in @app)
# args:: Array of command-line arguments
def initialize(app, args=[])
    @logger = AppLog.new
    $logger.debug "Configuration#initialize"
    @app    = app
    @urls   = {}
    @todo   = []
    @done   = []
    @robots = {}
    @allow  = {}

    @suffix = {
        'text' => 'txt'
        # 'rss' => 'xml'
    }

    # Configuration directory: ~/.websitiary (or '.') unless one of the
    # candidate directories below exists.  Candidates whose base directory
    # is unknown (e.g. no USERPROFILE outside of Windows) are dropped;
    # they used to be passed on as nil and raised a TypeError.
    @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitiary') : '.'
    sysconfdir = RbConfig::CONFIG['sysconfdir']
    [
        ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitiary'),
        sysconfdir && File.join(sysconfdir, 'websitiary')
    ].compact.each do |dir|
        if File.exist?(dir)
            @cfgdir = dir
            break
        end
    end

    @user_agent = "websitiary/#{Websitiary::VERSION}"

    @cmd_edit = 'vi "%s"'

    # Read a URL via open-uri and return its contents; the handle is
    # closed by the block form.  Kernel#open no longer accepts URLs on
    # recent rubies, so URI.open is preferred when it is available.
    fetch_url = lambda do |u|
        reader = lambda {|io| io.read}
        URI.respond_to?(:open) ? URI.open(u, &reader) : open(u, &reader)
    end

    @options = {:global => {}}

    @options[:diff] = {
        :default => :diff,
        :diff => 'diff -d -w -u2 "%s" "%s"',
        :webdiff => lambda do |old, new|
            $logger.debug "webdiff: #{File.basename(new)}"
            $logger.debug %{webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -}
            difftext = `webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -`
            # The output is kept only when webdiff exits with status 1.
            $?.exitstatus == 1 ? difftext : ''
        end,
    }

    @options[:format] = {
        :default => :diff,
        :diff => %{<pre class="diff">\n%s\n</pre>},
        :webdiff => "%s\n",
    }

    @options[:diffprocess] = {
        :default => :diff,
        # Drop the two header lines of the unified diff and keep only the
        # added lines without their leading "+".  A diff with fewer than
        # two lines yields an empty text instead of raising.
        :diff => lambda {|text| (text.split("\n")[2..-1] || []).delete_if {|l| l =~ /^[^+]/}.map {|l| l[1..-1]}.join("\n")},
        :webdiff => false,
    }

    @options[:download] = {
        :default => :w3m,
    }

    @options[:downloadformat] = {
        :w3m => :text,
        :webdiff => :html,
    }

    @options[:downloadprocess] = {
    }

    @options[:rss] = {
        :version => '2.0',
    }

    @options[:strip_tags] = {
        :default => ['script', 'object', 'form', 'input', 'select', 'iframe', 'head', 'meta', 'link'],
    }

    shortcut :w3m, :delegate => :diff,
        :download => 'w3m -no-cookie -S -F -dump "%s"'

    shortcut :lynx, :delegate => :diff,
        :download => 'lynx -dump "%s"'

    shortcut :links, :delegate => :diff,
        :download => 'links -dump "%s"'

    shortcut :curl, :delegate => :webdiff,
        :download => 'curl --silent "%s"'

    shortcut :wget, :delegate => :webdiff,
        :download => 'wget -q -O - "%s"'

    # Download the page and return the inner html of its body with
    # rewritten urls and stripped tags.
    shortcut :body_html, :delegate => :webdiff,
        :strip_tags => :default,
        :download => lambda {|url|
            begin
                doc = Hpricot(fetch_url.call(url))
                doc = doc.at('body')
                if doc
                    doc = rewrite_urls(url, doc)
                    doc = doc.inner_html
                    if (tags = get(url, :strip_tags))
                        doc = strip_tags(doc, :format => :hpricot, :tags => tags)
                    end
                else
                    $logger.warn 'inner html: No body'
                end
                doc.to_s
            rescue StandardError => e
                # Download problems are reported in place of the page.
                $logger.error e.message
                $logger.debug e.backtrace
                %{<pre class="error">\n#{e.message}\n</pre>}
            end
        }

    shortcut :openuri, :delegate => :webdiff,
        :download => lambda {|url|
            begin
                fetch_url.call(url)
            rescue StandardError => e
                $logger.error e.message
                $logger.debug e.backtrace
                %{<pre class="error">\n#{e.to_s}\n</pre>}
            end
        }

    # Like :body_html, but also queue links on the same host.
    shortcut :website, :delegate => :webdiff,
        :download => lambda {|url|
            html = @options[:download][:body_html].call(url)
            break unless html
            doc = Hpricot(html)
            push_hrefs(url, doc) do |uri0, pn0, uri, pn|
                eligible_path?(url, uri0.path, uri.path) &&
                    uri.host == uri0.host
            end
            html
        }

    # Like :website, but only queue links located in the same directory.
    shortcut :website_below, :delegate => :webdiff,
        :download => lambda {|url|
            html = @options[:download][:body_html].call(url)
            break unless html
            doc = Hpricot(html)
            if doc
                push_hrefs(url, doc) do |uri0, pn0, uri, pn|
                    eligible_path?(url, uri0.path, uri.path) &&
                        uri.host == uri0.host &&
                        pn.relative_path_from(pn0).to_s == '.'
                end
            end
            html
        }

    shortcut :website_txt, :delegate => :default,
        :download => lambda {|url|
            success, cmd = get_option(:download, :default)
            if success
                html = @options[:download][:website].call(url)
                html_to_text(html)
            end
        }

    shortcut :website_txt_below, :delegate => :default,
        :download => lambda {|url|
            success, cmd = get_option(:download, :default)
            if success
                html = @options[:download][:website_below].call(url)
                html_to_text(html)
            end
        }

    # Page template: title (used twice), table of contents, contents.
    @options[:page] = {
        :format => lambda do |ti, li, bd|
            template = <<OUT
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>%s</title>
<link rel="stylesheet" href="websitiary.css" type="text/css">
<link rel="alternate" href="websitiary.rss" type="application/rss+xml" title="%s">
</head>
<body>
<ol class="toc">
%s
</ol>
<div class="contents">
%s
</div>
</body>
</html>
OUT
            template % [ti, ti, li, bd]
        end
    }

    # @view = nil
    @view = 'w3m "%s"'
    @default_options = {}
    @default_profiles = []
    @profiles = []
    @outfile = {}
    profile 'config.rb'
    # The default has to be in place before the arguments are parsed:
    # --review reads @output_format while parsing.
    @output_format ||= ['html']
    parse_command_line_args(args)
    @output_title = %{#{APPNAME}: #{@profiles.join(", ")}}
end
|
297
|
+
|
298
|
+
|
299
|
+
# Evaluate the command-line arguments and load the requested profiles
# (or the default profiles if none were named).
# args:: Array of Strings; recognised options are removed from it
# Returns self.
def parse_command_line_args(args)
    $logger.debug "parse_command_line_args: #{args}"

    parser = OptionParser.new
    parser.banner = "Usage: #{APPNAME} [OPTIONS] [PROFILES] > [OUT]"
    parser.separator ''
    parser.separator "#{APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
    parser.separator 'the terms of the GNU General Public License version 2 or newer.'
    parser.separator ''

    parser.separator 'General Options:'

    parser.on('-c', '--cfg=DIR', String, 'Configuration directory') {|dir| @cfgdir = dir}

    parser.on('-e', '--edit=PROFILE', String, 'Edit a profile') do |name|
        edit_profile name
        exit 0
    end

    parser.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |formats|
        output_format(*formats.split(/,/))
    end

    parser.on('--[no-]ignore-age', 'Ignore age limits') {|flag| set :ignore_age => flag}

    parser.on('--log=DESTINATION', String, 'Log destination') do |dest|
        # "-" keeps logging on the default destination.
        @logger = AppLog.new(dest != '-' && dest)
    end

    parser.on('-o', '--output=FILENAME', String, 'Output') {|filename| output_file(filename)}

    parser.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |assignment|
        key, val = assignment.split(/=/, 2)
        # The value is evaluated as ruby code.
        set key.intern => eval(val)
    end

    parser.on('--review', 'View last diff') do |_|
        view_output
        exit 0
    end

    parser.separator ''
    parser.separator 'Available profiles:'
    profile_names = Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}
    parser.separator profile_names.join(', ')

    parser.separator ''
    parser.separator 'Other Options:'

    parser.on('--debug', 'Show debug messages') do |_|
        $VERBOSE = $DEBUG = true
        @logger.set_level(:debug)
    end

    parser.on('-q', '--quiet', 'Be mostly quiet') {|_| @logger.set_level(:quiet)}

    parser.on('-v', '--verbose', 'Run verbosely') do |_|
        $VERBOSE = true
        @logger.set_level(:verbose)
    end

    parser.on_tail('-h', '--help', 'Show this message') do
        puts parser
        exit 1
    end

    # Whatever is left after option parsing is a list of profile names.
    @profiles = parser.parse!(args)
    @profiles = @default_profiles if @profiles.empty?
    @profiles.each {|pn| profile pn}

    self
end
|
379
|
+
|
380
|
+
|
381
|
+
# Retrieve an option for an url
|
382
|
+
# url:: String
|
383
|
+
# opt:: Symbol
|
384
|
+
# Retrieve an option for an url.
# url::     String
# opt::     Symbol
# default:: Value returned if neither the url's options nor the global
#           :default entry for opt provide a value
#
# Lookup order: the url's own value for opt (for :diffprocess and
# :format the url's :diff entry is used unless opt is set explicitly),
# then the url's :use shortcut.  A Symbol value is resolved via
# get_option; any other value is returned as is.  An url that was never
# registered is treated as having no options.
def get(url, opt, default=nil)
    opts = @urls[url] || {}
    $logger.debug "get: opts=#{opts.inspect}"
    case opt
    when :diffprocess, :format
        opt_ = opts.has_key?(opt) ? opt : :diff
    else
        opt_ = opt
    end

    $logger.debug "get: opt=#{opt} opt_=#{opt_} #{opts[opt_]} #{opts[:use]}"
    if opts.has_key?(opt_)
        val = opts[opt_]
    elsif opts.has_key?(:use)
        val = opts[:use]
    else
        val = nil
    end

    case val
    when nil
        # nothing defined for this url: fall through to the defaults
    when Symbol
        $logger.debug "get: val=#{val}"
        success, rv = get_option(opt, val)
        $logger.debug "get: #{success}, #{rv}"
        return rv if success
    else
        $logger.debug "get: return val=#{val}"
        return val
    end

    unless default
        success, default1 = get_option(opt, :default)
        default = default1 if success
    end

    $logger.debug "get: return default=#{default}"
    return default
end
|
424
|
+
|
425
|
+
|
426
|
+
# Configuration command:
|
427
|
+
# Set the default profiles
|
428
|
+
# Remember which profiles are loaded when none is named on the command
# line.
# profile_names:: any number of profile names
def default(*profile_names)
    names = profile_names
    @default_profiles = names
end
|
431
|
+
|
432
|
+
|
433
|
+
# Configuration command:
|
434
|
+
# Load a profile
|
435
|
+
# Load a profile, i.e. evaluate a ruby file in the context of this
# configuration object.
# profile_name:: Name of the profile; "-" reads one url per line from
#                the standard input instead.
# Returns true if the profile file was found and evaluated, false if
# not.  For "-" the Array of urls is returned.
def profile(profile_name)
    if '-' === profile_name
        return readlines.map! {|l| l.chomp}.each {|url| source url}
    end

    fn = profile_filename(profile_name)
    return false unless fn

    $logger.debug "Profile: #{fn}"
    contents = File.read(fn)
    # @current_profile is only set while the profile is being evaluated.
    @current_profile = fn
    begin
        self.instance_eval(contents)
    ensure
        @current_profile = nil
    end
    true
end
|
456
|
+
|
457
|
+
|
458
|
+
# Define an options shortcut.
|
459
|
+
# Register the name symbol in the option groups.
# symbol:: Name of the shortcut
# args::   Hash: option group => value for this shortcut.  The special
#          key :delegate names the shortcut that is used in every group
#          not mentioned in args.
#
# Groups that are not Hashes (e.g. values stored via #global) are left
# alone, and values for unknown groups are reported instead of raising.
def shortcut(symbol, args)
    ak = args.keys
    dk = @options.keys - ak
    delegating = ak.include?(:delegate)

    # :downloadprocess
    if !delegating and
        dk.any? {|e| [:download, :downloadformat, :diff, :format, :diffprocess].include?(e)}
        $logger.warn "Shortcut #{symbol}: Undefined fields: #{dk.inspect}"
    end

    if delegating
        dk.each do |field|
            group = @options[field]
            group[symbol] = args[:delegate] if group.is_a?(Hash)
        end
    end

    args.each do |field, val|
        next if field == :delegate
        group = @options[field]
        if group.is_a?(Hash)
            group[symbol] = val
        else
            $logger.error "Shortcut #{symbol}: Unknown option type: #{field} (#{val.inspect})"
        end
    end
end
|
480
|
+
|
481
|
+
|
482
|
+
# Set the output format.
|
483
|
+
# Select the output format(s).
# format:: one or more of "text", "html", "rss"
# Logs a fatal error and exits with status 5 on any other name.
def output_format(*format)
    known = ['text', 'html', 'rss']
    if format.any? {|name| !known.include?(name)}
        $logger.fatal "Unknown output format: #{format}"
        exit 5
    end
    @output_format = format
end
|
490
|
+
|
491
|
+
|
492
|
+
# Set the output file.
|
493
|
+
# Remember the name of the output file.
# filename::  String
# outformat:: Output format the file is meant for; nil if no format
#             was given
def output_file(filename, outformat=nil)
    @outfile.store(outformat, filename)
end
|
496
|
+
|
497
|
+
|
498
|
+
# Configuration command:
|
499
|
+
# Set global options.
|
500
|
+
# type:: Symbol
|
501
|
+
# options:: Hash
|
502
|
+
# Merge values into one of the existing option groups.
# type::    Symbol, name of the group (e.g. :diff, :download, :rss)
# options:: Hash of values to be merged into the group
# An unknown group is reported via the logger.
def option(type, options)
    $logger.info "option #{type}: #{options.inspect}"
    group = @options[type]
    return group.merge!(options) if group
    $logger.error "Unknown option type: #{type} (#{options.inspect})"
end
|
511
|
+
|
512
|
+
|
513
|
+
# Set a global option.
|
514
|
+
# Store values directly under their keys in @options, replacing
# whatever was stored there before.
# options:: Hash
def global(options)
    options.each_pair {|type, value| @options.store(type, value)}
end
|
519
|
+
|
520
|
+
|
521
|
+
# Configuration command:
|
522
|
+
# Set the default value for source-options.
|
523
|
+
# Add to the default source-options, i.e. the options every
# subsequently defined source starts with.
# options:: Hash
def set(options)
    $logger.debug "set: #{options.inspect}"
    @default_options.update(options)
end
|
527
|
+
|
528
|
+
|
529
|
+
# Configuration command:
|
530
|
+
# Unset a default source-option.
|
531
|
+
# Remove entries from the default source-options.
# options:: names of the options to be removed
def unset(*options)
    options.each {|name| @default_options.delete(name)}
end
|
536
|
+
|
537
|
+
|
538
|
+
# Configuration command:
|
539
|
+
# Define a source.
|
540
|
+
# urls:: String
|
541
|
+
# Register urls as sources and queue them for download.
# urls:: String with one url per line (an Array of such Strings is
#        accepted as well).  Surrounding whitespace is removed and
#        blank lines are skipped.
# opts:: Hash of source options; they take precedence over the current
#        default options.
# Returns the Array of registered urls.
def source(urls, opts={})
    list = [urls].flatten.compact.map {|text| text.split("\n")}.flatten
    list = list.map {|url| url.strip}.reject {|url| url.empty?}
    list.each do |url|
        @urls[url] = @default_options.dup.update(opts)
        @todo << url
    end
end
|
547
|
+
|
548
|
+
|
549
|
+
# Configuration command:
|
550
|
+
# Set the default download processor. The block takes the
|
551
|
+
# downloaded text (STRING) as argument.
|
552
|
+
# Install block as the default download processor.
# block:: stored as a Proc under @options[:downloadprocess][:default]
def downloadprocess(&block)
    @options[:downloadprocess].store(:default, block)
end
|
555
|
+
|
556
|
+
|
557
|
+
# Configuration command:
|
558
|
+
# Set the default diff processor. The block takes the
|
559
|
+
# diff text (STRING) as argument.
|
560
|
+
# Install block as the default diff processor.
# block:: stored as a Proc under @options[:diffprocess][:default]
#
# The processor is stored in the :diffprocess group.  It used to be
# written to @options[:diff][:default], which holds the diff program.
def diffprocess(&block)
    @options[:diffprocess][:default] = block
end
|
563
|
+
|
564
|
+
|
565
|
+
# Configuration command:
|
566
|
+
# Set the editor.
|
567
|
+
# Choose the editor command used for editing profiles.
# cmd:: Command template; "%s" is replaced with the filename
def edit(cmd)
    editor = cmd
    @cmd_edit = editor
end
|
570
|
+
|
571
|
+
|
572
|
+
# Configuration command:
|
573
|
+
# Set the viewer.
|
574
|
+
# Choose the viewer command used for displaying the output.
# view:: Command template; "%s" is replaced with the output filename.
#        A false value disables viewing.
def view(view)
    viewer = view
    @view = viewer
end
|
577
|
+
|
578
|
+
|
579
|
+
# Configuration command:
|
580
|
+
# Set the default diff program.
|
581
|
+
# Choose the default diff program.
# diff:: stored as the :default entry of the :diff option group
def diff(diff)
    @options[:diff].store(:default, diff)
end
|
584
|
+
|
585
|
+
|
586
|
+
# Configuration command:
|
587
|
+
# Set the default downloader.
|
588
|
+
# Choose the default downloader.
# download:: stored as the :default entry of the :download option group
def download(download)
    @options[:download].store(:default, download)
end
|
591
|
+
|
592
|
+
|
593
|
+
# Format a diff according to URL's source options.
|
594
|
+
# Wrap a diff in the format configured for url.
# url::      String
# difftext:: String
def format(url, difftext)
    eval_arg(get(url, :format), [difftext])
end
|
598
|
+
|
599
|
+
|
600
|
+
# Apply some arguments to a format.
|
601
|
+
# format:: String or Proc
|
602
|
+
# args:: Array of Arguments
|
603
|
+
# Apply some arguments to a format.
# format::  String (a format template) or Proc; nil or false means
#           "not defined"
# args::    Array of arguments
# default:: Value returned if no format is defined
# process_string:: Optional block; receives the expanded String and
#           provides the return value (not used for Procs)
def eval_arg(format, args, default=nil, &process_string)
    case format
    when nil, false
        # false is used in the option tables for "disabled" entries.
        return default
    when Proc
        $logger.debug "eval proc: #{format} #{args.inspect}"
        return format.call(*args)
    else
        ca = format % args
        $logger.debug "eval string: #{ca}"
        return process_string ? process_string.call(ca) : ca
    end
end
|
620
|
+
|
621
|
+
|
622
|
+
# Apply the argument to cmd (a format String or a Proc). If a
|
623
|
+
# String, execute the command.
|
624
|
+
# Apply args to cmd.  If cmd is a format String, the expanded string is
# run as a shell command and its output is returned; a Proc is called
# with args.
# default:: Value returned if cmd is nil
def call_cmd(cmd, args, default=nil)
    run_in_shell = lambda {|command| `#{command}`}
    eval_arg(cmd, args, default, &run_in_shell)
end
|
627
|
+
|
628
|
+
|
629
|
+
# Generate & view the final output.
|
630
|
+
# difftext:: Hash
|
631
|
+
# Generate the output in every selected output format.  The result is
# printed if the output file is "-"; otherwise it is written to the
# output file, which is then passed to the viewer.
# difftext:: Hash (url => diff text); nothing is generated if empty
def show_output(difftext)
    if difftext.empty?
        $logger.warn 'No news is good news'
        return
    end

    @output_format.each do |outformat|
        generator = "get_output_#{outformat}"

        unless respond_to?(generator)
            $logger.fatal "Unknown output format: #{outformat}"
            exit 5
        end

        out = send(generator, difftext)
        next unless out

        outfile = get_outfile(outformat)
        if '-' === outfile
            puts out
        else
            File.open(outfile, 'w') {|io| io.puts out}
            viewer = "view_output_#{outformat}"
            self.send(viewer, outfile)
        end
    end
end
|
659
|
+
|
660
|
+
|
661
|
+
# Generate the plain text output.
# difftext:: Hash (url => diff text)
# Returns a String: one section per url (url, annotation, blank line,
# text), sections separated by a line of dashes.  Urls without any text
# are left out.
def get_output_text(difftext)
    sections = []
    difftext.each do |url, text|
        next unless text
        text = html_to_text(text) if is_html?(text)
        # Skip empty texts entirely.  A bare "!empty? && ..." used to
        # yield false here, which ended up in the output as "false".
        next if text.empty?
        sections << [url, difftext_annotation(url), nil, text].join("\n")
    end
    sections.join("\n\n#{('-' * 68)}\n\n")
end
|
669
|
+
|
670
|
+
|
671
|
+
# Generate the RSS output.
# difftext:: Hash (url => diff text)
# Returns the feed as a String.  Requires the global option
# :rss[:url]; otherwise a fatal error is logged and the program exits
# with status 5.
def get_output_rss(difftext)
    success, rss_url = get_option(:rss, :url)
    unless success
        $logger.fatal "Global option :rss[:url] not defined."
        exit 5
    end

    success, rss_version = get_option(:rss, :version)
    # Loaded on demand: only needed when RSS output was requested.
    require "rss/#{rss_version}"

    rss = RSS::Rss.new(rss_version)
    chan = RSS::Rss::Channel.new
    chan.title = @output_title
    [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
        ok, val = get_option(:rss, field)
        # These are channel attributes.  They were formerly sent to an
        # undefined local ("item"), which raised a NameError.
        chan.send(format_symbol(field, '%s='), val) if ok
    end
    chan.link = rss_url
    rss.channel = chan

    difftext.each do |url, text|
        rss_format = get(url, :rss_format, :plain_text)
        text = strip_tags(text, :format => rss_format)
        next if text.empty?

        item = RSS::Rss::Channel::Item.new
        item.title = get(url, :title, File.basename(url))
        item.link = eval_arg(get(url, :rewrite_link, '%s'), [url])
        [:author, :date, :enclosure, :category, :pubDate].each do |field|
            val = get(url, format_symbol(field, 'rss_%s'))
            item.send(format_symbol(field, '%s='), val) if val
        end

        annotation = difftext_annotation(url)
        case rss_format
        when :plain_text
            annotation = "<pre>#{annotation}</pre>" if annotation
            item.description = %{#{annotation}<pre>#{text}</pre>}
        else
            item.description = %{<pre>#{annotation}</pre>\n#{text}}
        end
        chan.items << item
    end

    return rss.to_s
end
|
720
|
+
|
721
|
+
|
722
|
+
# Generate the HTML output: a table of contents plus one section per
# url, inserted into the page template (:page[:format], with
# :page[:simple] as fallback).
# difftext:: Hash (url => diff text)
def get_output_html(difftext)
    entries = difftext.map do |url, text|
        tags = get(url, :strip_tags)
        text = strip_tags(text, :tags => tags) if tags
        text.empty? ? nil : [url, text]
    end.compact

    toc = entries.map do |url, text|
        lab = Digest::MD5.hexdigest(url)
        ti = get(url, :title, File.basename(url))
        # %{<li class="toc"><a class="toc" href="\##{lab}">#{ti}</a> <a class="external" href="#{url}">[W]</a></li>}
        %{<li class="toc"><a class="toc" href="\##{lab}">#{ti}</a></li>}
    end.join("\n")

    cnt = entries.map do |url, text|
        lab = Digest::MD5.hexdigest(url)
        ti = get(url, :title, File.basename(url))
        # The rewritten link is only used for the heading.  Option
        # lookups and file names below need the registered url.
        if (rewrite = get(url, :rewrite_link))
            link = eval_arg(rewrite, [url])
            ext = ''
        else
            link = url
            old = %{<a class="old" href="#{file_url(backupname(url))}">old</a>}
            lst = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
            ext = %{ (#{old}, #{lst})}
        end
        # difftext_annotation yields nil if one of the copies is missing.
        annotation = CGI::escapeHTML(difftext_annotation(url).to_s)
        <<HTML
<div class="webpage">
<h1 class="diff" id="#{lab}"><a class="external" href="#{link}">#{ti}</a>#{ext}</h1>
<div class="annotation">
#{annotation}
</div>
<div class="diff">
#{format(url, text)}
</div>
</div>
HTML
    end.join(('<hr class="separator"/>') + "\n")

    success, template = get_option(:page, :format)
    unless success
        success, template = get_option(:page, :simple)
    end
    return eval_arg(template, [@output_title, toc, cnt])
end
|
767
|
+
|
768
|
+
|
769
|
+
# Get the backup filename.
|
770
|
+
# Filename of the backup copy of url: the encoded url below
# <cfgdir>/old.
def backupname(url)
    basename = encode(url)
    File.join(@cfgdir, 'old', basename)
end
|
773
|
+
|
774
|
+
|
775
|
+
# Get the filename for the freshly downloaded copy.
|
776
|
+
# Filename of the freshly downloaded copy of url: the encoded url below
# <cfgdir>/latest.
def latestname(url)
    basename = encode(url)
    File.join(@cfgdir, 'latest', basename)
end
|
779
|
+
|
780
|
+
|
781
|
+
# Guess path's dirname.
|
782
|
+
# foo/bar -> foo
|
783
|
+
# foo/bar.txt -> foo
|
784
|
+
# foo/bar/ -> foo/bar
|
785
|
+
# Guess path's dirname.
#     foo/bar     -> foo
#     foo/bar.txt -> foo
#     foo/bar/    -> foo/bar
#     /           -> /
# The root directory is returned as "/" rather than as an empty string,
# so that the result stays an absolute path.
def guess_dir(path)
    return File.dirname(path) unless path[-1..-1] == '/'
    dir = path[0..-2]
    dir.empty? ? '/' : dir
end
|
788
|
+
|
789
|
+
|
790
|
+
# Strip the url's last part (after #).
|
791
|
+
# Return a copy of url without its fragment, i.e. without the part
# starting at the first "#".
def canonic_url(url)
    fragment = /#.*$/
    url.sub(fragment) { '' }
end
|
794
|
+
|
795
|
+
|
796
|
+
# Return a copy of the default list of tags to be stripped
# (:strip_tags[:default]), or nil if no such list is defined.
def strip_tags_default
    found, tags = get_option(:strip_tags, :default)
    found ? tags.dup : nil
end
|
800
|
+
|
801
|
+
|
802
|
+
# Remove elements from a document.
# doc::  String (parsed with Hpricot) or an already parsed document
# args:: Hash
#        :tags::   Tag name or Array of tag names to be removed
#                  (default: the :strip_tags[:default] list; nothing is
#                  removed if neither is defined)
#        :format:: :hpricot returns the document itself; any other value
#                  x returns doc.to_x (default: :html)
def strip_tags(doc, args={})
    tags = args[:tags] || strip_tags_default
    doc = Hpricot(doc) if doc.is_a?(String)
    # Array() makes a single tag name or a missing list safe to iterate.
    Array(tags).each do |tag|
        doc.search(tag).remove
    end
    case args[:format]
    when :hpricot
        doc
    else
        doc.send("to_#{args[:format] || :html}")
    end
end
|
818
|
+
|
819
|
+
|
820
|
+
# Check whether path is eligible on the basis of url or path0.
|
821
|
+
# This checks either for a :match option for url or the extensions
|
822
|
+
# of path0 and path.
|
823
|
+
# Decide whether path qualifies as a follow-up of url.  If url has a
# :match option, path is matched against that regular expression (the
# match position or nil is returned); otherwise path and path0 must
# have the same file extension.
def eligible_path?(url, path0, path)
    rx = get(url, :match)
    return path =~ rx if rx
    return File.extname(path0) == File.extname(path)
end
|
831
|
+
|
832
|
+
|
833
|
+
# Scan hpricot document for hrefs and push them onto @todo if not
|
834
|
+
# already included.
|
835
|
+
# Queue the links found in a document.
# url::       String, the url of the document
# hpricot::   The parsed document
# condition:: Block called with (uri0, pn0, uri, pn): the URI and the
#             directory (Pathname) of url and of the link.  The link is
#             queued if the block returns a true value.
#
# Queued links inherit the options of url, get their basename as title
# and, if url has a :depth option, a depth decreased by one.  Nothing
# is queued if the depth is used up.  A link that cannot be processed
# (e.g. a mailto: link without a path) is skipped; it does not end the
# scan of the remaining links.
def push_hrefs(url, hpricot, &condition)
    begin
        depth = get(url, :depth)
        return if depth and depth <= 0
        uri0 = URI.parse(url)
        pn0 = Pathname.new(guess_dir(File.expand_path(uri0.path)))
        (hpricot / 'a').each do |a|
            href = a['href']
            next if href.nil?
            begin
                curl = canonic_url(href)
                next if @done.include?(curl) or @todo.include?(curl)
                uri = URI.parse(href)
                next unless robots_allowed?(curl, uri)
                pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
                if condition.call(uri0, pn0, uri, pn)
                    opts = @urls[url].dup
                    opts[:title] = File.basename(curl)
                    opts[:depth] = depth - 1 if depth and depth >= 0
                    @urls[curl] = opts
                    @todo << curl
                end
            rescue StandardError => e
                $logger.warn "Skipping link #{href.inspect}: #{e.message}"
                $logger.debug e.backtrace
            end
        end
    rescue StandardError => e
        # Problems with url itself, e.g. an unparsable url.
        $logger.error e.message
        $logger.debug e.backtrace
    end
end
|
863
|
+
|
864
|
+
|
865
|
+
# Rewrite urls in doc
|
866
|
+
# url:: String
|
867
|
+
# doc:: Hpricot document
|
868
|
+
# Rewrite the urls of links and images in doc (see rewrite_href).
# url:: String, the url of the document
# doc:: Hpricot document
# Returns doc.
def rewrite_urls(url, doc)
    uri = URI.parse(url)
    urd = guess_dir(uri.path)
    # rewrite_href returns nil if a url has to be left alone; in that
    # case the attribute must keep its value (for links as for images).
    [['a', 'href'], ['img', 'src']].each do |tag, attr|
        (doc / tag).each do |elt|
            href = rewrite_href(elt[attr], url, uri, urd)
            elt[attr] = href if href
        end
    end
    doc
end
|
881
|
+
|
882
|
+
|
883
|
+
# Try to make href an absolute url.
|
884
|
+
# Try to make href an absolute url.
# href:: String or nil, the url found in the document
# url::  String, the url of the document
# uri::  URI, the parsed url
# urd::  String, the directory part of the document's path
#
# Returns the rewritten url, or nil if href has to be left alone (no
# href, href already has a scheme) or cannot be processed; problems are
# logged.
def rewrite_href(href, url, uri, urd)
    return nil unless href
    begin
        href = href.strip

        # Only a scheme at the very beginning counts; a colon somewhere
        # else (e.g. in a query string) does not make href absolute.
        if href =~ /\A[a-zA-Z][a-zA-Z0-9+.-]*:/
            nil
        elsif uri.relative? and URI.parse(href).relative?
            # The document itself was addressed by a relative path.
            uri.instance_of?(URI::Generic) ? File.join(urd, href) : nil
        elsif href[0..0] == '#'
            url + href
        else
            uri.merge(href).to_s
        end
    rescue StandardError => e
        $logger.error e.message
        $logger.debug e.backtrace
        nil
    end
end
|
916
|
+
|
917
|
+
|
918
|
+
# Return a Proc that takes a text as argument and highlights occurrences of rx.
|
919
|
+
# rx:: Regular expression
|
920
|
+
# group:: A number (default: 0)
|
921
|
+
# tag:: The HTML tag to use (default: "span")
|
922
|
+
# Build a Proc that wraps every match of rx in text with
# <tag class="highlight">...</tag>.
# rx::    Regular expression
# group:: Number of the group to be shown inside the tag (default: 0,
#         the whole match)
# tag::   The HTML tag to use (default: "span")
def highlighter(rx, group=nil, tag='span')
    backref = "\\#{group || 0}"
    replacement = %{<#{tag} class="highlight">#{backref}</#{tag}>}
    lambda {|text| text.gsub(rx, replacement)}
end
|
925
|
+
|
926
|
+
|
927
|
+
private
|
928
|
+
|
929
|
+
# Describe the compared versions of url: the modification times of the
# backup and of the latest copy, formatted with the url's
# :format_annotation option (default: '%s >>> %s').  Returns nil unless
# both copies exist.
def difftext_annotation(url)
    bak = backupname(url)
    lst = latestname(url)
    return nil unless File.exist?(bak) and File.exist?(lst)
    template = get(url, :format_annotation, '%s >>> %s')
    eval_arg(template, [File.mtime(bak), File.mtime(lst)])
end
|
936
|
+
|
937
|
+
|
938
|
+
# Insert name into format_string and return the result as a Symbol,
# e.g. (:author, 'rss_%s') -> :rss_author.
def format_symbol(name, format_string)
    sprintf(format_string, name.to_s).intern
end
|
941
|
+
|
942
|
+
|
943
|
+
# Guess whether text is html, i.e. whether it contains the opening of
# one of some common tags.  Returns the match position or nil.
def is_html?(text)
    html_tag = /<(div|a|span|body|html|script|p|table|td|tr|th|li|dt|br|hr|em|b)\b/
    text =~ html_tag
end
|
946
|
+
|
947
|
+
|
948
|
+
# Convert html to plain text by means of Hpricot.
# text:: String containing html
def html_to_text(text)
    doc = Hpricot(text)
    doc.to_plain_text
end
|
951
|
+
|
952
|
+
|
953
|
+
# Check whether the site's robots.txt permits retrieving url.
# url:: String
# uri:: URI, the parsed url
#
# Results are remembered per url (@allow), parsed rules per host
# (@robots).  Without the RobotRules library every url is allowed; a
# warning is logged once.  An url is also allowed if robots.txt cannot
# be located or loaded.
def robots_allowed?(url, uri)
    return @allow[url] if @allow.has_key?(url)

    unless defined?(RobotRules)
        unless @robots[:warning]
            $logger.warn 'robots.txt is ignored: Please install robot_rules.rb from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589 in $RUBYLIB'
            @robots[:warning] = true
        end
        @allow[url] = true
        return true
    end

    host = uri.host

    unless (rules = @robots[host])
        # robots_uri yields nil for relative urls; the check has to
        # happen before the conversion to a String (nil.to_s is "").
        ruri = robots_uri(uri)
        return true unless ruri
        rurl = ruri.to_s
        begin
            # Kernel#open no longer accepts urls on recent rubies; the
            # block form closes the handle.
            reader = lambda {|io| io.read}
            robots_txt = URI.respond_to?(:open) ? URI.open(rurl, &reader) : open(rurl, &reader)
            rules = RobotRules.new(@user_agent)
            rules.parse(rurl, robots_txt)
            @robots[host] = rules
            $logger.info "Loaded #{rurl} for #{@user_agent}"
            $logger.debug robots_txt
        rescue StandardError => e
            # No usable robots.txt: nothing is excluded.
            rules = nil
            $logger.debug "robots.txt not loaded: #{rurl}: #{e.message}"
        end
    end

    rv = if rules and !rules.allowed?(url)
        $logger.info "Excluded url: #{url}"
        false
    else
        true
    end
    @allow[url] = rv
    return rv
end
|
994
|
+
|
995
|
+
|
996
|
+
# Return the URI of the robots.txt file of uri's site, or nil if uri
# is relative.
def robots_uri(uri)
    return nil if uri.relative?
    uri.merge('/robots.txt')
end
|
999
|
+
|
1000
|
+
|
1001
|
+
# Return the encoded relative name of filename: the name of its
# directory plus its basename (":" and "/" are left as they are).
def file_url(filename)
    dir_name = File.basename(File.dirname(filename))
    relative = File.join(dir_name, File.basename(filename))
    # "file://#{encode(filename, ':/')}"
    encode(relative, ':/')
end
|
1006
|
+
|
1007
|
+
|
1008
|
+
# Like get_option, but return just the value, or default if the option
# is not defined.
def get_optionvalue(opt, val, default=nil)
    found, value = get_option(opt, val)
    found ? value : default
end
|
1016
|
+
|
1017
|
+
|
1018
|
+
# Look up the key +val+ in the option table @options[opt].
#
# A Symbol stored as value is treated as an alias: the lookup continues
# with that symbol as the new key.
#
# Returns a two-element array:
# [true, value]:: A non-Symbol value was found.
# [false, key]::  There is no table for +opt+, the (possibly aliased)
#                 key is missing, or the aliases form a cycle; +key+ is
#                 the key that was looked up last.
def get_option(opt, val)
  vals = @options[opt]
  # Keys already visited while following aliases; guards against alias
  # cycles, which would otherwise never terminate.
  seen = []
  loop do
    $logger.debug "val=#{val} vals=#{vals.inspect}"
    unless vals and vals.has_key?(val)
      $logger.debug "get_option no: #{opt} => #{val.inspect}"
      return [false, val]
    end

    rv = vals[val]
    $logger.debug "get_option ok: #{opt} => #{rv.inspect}"
    unless rv.is_a?(Symbol)
      $logger.debug "get_option true, #{rv}"
      return [true, rv]
    end

    seen << val
    if seen.include?(rv)
      $logger.warn "get_option: circular alias for #{opt}: #{rv.inspect}"
      return [false, val]
    end
    $logger.debug "get_option re: #{rv}"
    val = rv
  end
end
|
1037
|
+
|
1038
|
+
|
1039
|
+
# Percent-encode +text+: every character other than a-z, A-Z, 0-9,
# ',', '.', '_', '-' and the characters in +chars+ is replaced with
# %xx (lowercase hex) for each of its bytes.
# chars:: Additional characters to leave as they are. The string is
#         inserted verbatim into a regexp character class.
def encode(text, chars='')
  text.gsub(/[^a-zA-Z0-9,._#{chars}-]/) do |match|
    # Go via the bytes: String#[] returns an Integer only in ruby 1.8,
    # and a multibyte character needs one escape per byte.
    match.unpack('C*').map {|byte| '%%%02x' % byte}.join
  end
end
|
1042
|
+
|
1043
|
+
|
1044
|
+
# Return the file suffix registered in @suffix for +outformat+; if
# there is none, the format name itself is used as suffix.
def output_suffix(outformat)
  mapped = @suffix[outformat]
  mapped ? mapped : outformat
end
|
1047
|
+
|
1048
|
+
|
1049
|
+
# Return the name of the output file for +outformat+.
#
# A file name registered in @outfile for this format takes precedence.
# Otherwise the name is "websitiary.SUFFIX" in @cfgdir, where SUFFIX
# is derived from +outformat+ or, if that is nil, from the first entry
# of @output_format.
def get_outfile(outformat=nil)
  explicit = @outfile[outformat]
  return explicit if explicit

  format = outformat || @output_format[0]
  File.join(@cfgdir, "websitiary.#{output_suffix(format)}")
end
|
1052
|
+
|
1053
|
+
|
1054
|
+
# Display the output by dispatching to the view_output_FORMAT method
# for the first entry of @output_format.
# outfile:: The file to show; defaults to get_outfile.
def view_output(outfile=nil)
  viewer = "view_output_#{@output_format[0]}"
  target = outfile || get_outfile
  send(viewer, target)
end
|
1057
|
+
|
1058
|
+
|
1059
|
+
# Run the viewer command on +outfile+. @view is a format string into
# which the file name is inserted; nothing happens if @view isn't set.
# The html, text and rss formats all use this viewer.
def view_output_general(outfile)
  return unless @view

  command = @view % outfile
  system(command)
end
alias :view_output_html :view_output_general
alias :view_output_text :view_output_general
alias :view_output_rss :view_output_general
|
1067
|
+
|
1068
|
+
|
1069
|
+
# Open the file of the profile named +profile+ (as located by
# profile_filename) with the edit command. @cmd_edit is a format
# string into which the file name is inserted.
# Returns the output of the command.
def edit_profile(profile)
  path = profile_filename(profile)
  $logger.debug "edit: #{path}"
  command = @cmd_edit % path
  `#{command}`
end
|
1074
|
+
|
1075
|
+
|
1076
|
+
# Locate the file of a profile.
# profile_name:: Name of the profile; '.rb' is appended unless the
#                name already has this extension.
#
# The current directory is searched first, then @cfgdir.
# Returns the file name, or nil if the profile doesn't exist in
# either directory.
def profile_filename(profile_name)
  profile_name = "#{profile_name}.rb" unless File.extname(profile_name) == '.rb'
  ['.', @cfgdir].each do |dir|
    candidate = File.join(dir, profile_name)
    # File.exist? is available in every ruby version; the alias
    # File.exists? is gone in ruby 3.2.
    return candidate if File.exist?(candidate)
  end
  nil
end
|
1088
|
+
|
1089
|
+
end
|
1090
|
+
|
1091
|
+
|
1092
|
+
|
1093
|
+
# Hash: The output of the diff commands for each url.
|
1094
|
+
attr_reader :difftext
|
1095
|
+
|
1096
|
+
# The configurator
|
1097
|
+
attr_reader :configuration
|
1098
|
+
|
1099
|
+
|
1100
|
+
# args:: Array of command-line (like) arguments.
|
1101
|
+
# Set up the configuration and make sure the configuration directory
# exists. If this directory doesn't contain a 'websitiary.css' yet, the
# default stylesheet is written there.
# args:: Array of command-line (like) arguments; passed on to
#        Configuration.new.
def initialize(args=[])
  @configuration = Configuration.new(self, args)
  @difftext = {}

  ensure_dir(@configuration.cfgdir)
  css = File.join(@configuration.cfgdir, 'websitiary.css')
  # File.exist? is available in every ruby version; the alias
  # File.exists? is gone in ruby 3.2.
  unless File.exist?(css)
    $logger.info "Copying default css file: #{css}"
    File.open(css, 'w') do |io|
      io.puts <<CSS
body {
color: black;
background-color: #f0f0f0;
}
a.external {
}
a.old {
}
a.latest {
}
a.toc {
}
ol.toc {
float: left;
width: 200px;
position: fixed;
padding: 0;
margin: 0;
}
li.toc {
list-style: none;
border: 1px solid silver;
background-color: #fafafa;
padding: 0.5em;
font-size: 80%;
font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
li.toc:hover {
background-color: #ffff8d;
}
div.contents {
margin-left: 210px;
min-width: 16em;
}
div.webpage {
margin: 5px 0 5px 0;
padding: 5px;
border: 1px solid silver;
background-color: white;
}
h1.diff {
font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
div.diff {
padding-left: 2em;
}
pre.diff {
padding-left: 2em;
}
hr.separator {
width: 100%;
visibility: hidden;
}
.error {
color: yellow;
background-color: red;
}
.highlight {
background-color: #ffc730;
}
CSS
    end
  end
end
|
1175
|
+
|
1176
|
+
|
1177
|
+
# Process the sources in @configuration.url as defined by profiles
|
1178
|
+
# and command-line options. The differences are stored in @difftext (a Hash).
|
1179
|
+
# show_output:: If true, show the output with the defined viewer.
|
1180
|
+
# Process the urls in @configuration.todo: skip sources that were
# retrieved too recently, move the latest copy to the backup, download
# a new copy, and diff the two. Non-empty differences are collected
# via accumulate.
# show_output:: If true, show the output with the defined viewer.
def process(show_output=true)
  @configuration.todo.each do |url|
    opts = @configuration.urls[url]
    $logger.debug "Source: #{@configuration.get(url, :title, url)}"
    older = @configuration.backupname(url)
    ensure_dir(File.dirname(older))
    $logger.debug "older: #{older}"
    latest = @configuration.latestname(url)
    ensure_dir(File.dirname(latest))
    $logger.debug "latest: #{latest}"

    # File.exist? is available in every ruby version; the alias
    # File.exists? is gone in ruby 3.2.
    if File.exist?(latest) and !opts[:ignore_age]
      # Minimum age in seconds; :hours takes precedence over :days.
      if (hdiff = opts[:hours])
        tdiff = hdiff * HOUR_SECS
        $logger.debug "hours: #{hdiff} (#{tdiff}s)"
      elsif (ddiff = opts[:days])
        tdiff = ddiff * DAY_SECS
        $logger.debug "days: #{ddiff} (#{tdiff}s)"
      else
        tdiff = nil
      end

      if tdiff
        tn = Time.now
        tl = File.mtime(latest)
        td = tn - tl
        if td < tdiff
          # NOTE: Both values are reported in whole days, also for
          # intervals given in hours.
          $logger.info "Skip #{@configuration.get(url, :title, url).inspect}: Only #{(td / DAY_SECS).to_i}d old (#{(tdiff / DAY_SECS).to_i}d)"
          next
        end
      end
    end

    # The former latest copy becomes the backup the new download is
    # compared with.
    move(latest, older)
    if download(url, latest, opts)
      difftext = diff(url, older, latest, opts)
      if difftext
        $logger.debug "difftext: #{difftext}"
        accumulate(url, difftext, opts)
      end
    end
  end
  show if show_output
end
|
1224
|
+
|
1225
|
+
|
1226
|
+
|
1227
|
+
private
|
1228
|
+
|
1229
|
+
def download(url, latest, opts)
|
1230
|
+
if @configuration.done.include?(url)
|
1231
|
+
$logger.info "Already downloaded: #{@configuration.get(url, :title, url).inspect}"
|
1232
|
+
return false
|
1233
|
+
end
|
1234
|
+
|
1235
|
+
$logger.info "Download: #{@configuration.get(url, :title, url).inspect}"
|
1236
|
+
@configuration.done << url
|
1237
|
+
text = @configuration.call_cmd(@configuration.get(url, :download), [url])
|
1238
|
+
# $logger.debug text
|
1239
|
+
unless text
|
1240
|
+
$logger.warn "no contents: #{@configuration.get(url, :title, url)}"
|
1241
|
+
return false
|
1242
|
+
end
|
1243
|
+
|
1244
|
+
if opts
|
1245
|
+
text = text.split("\n")
|
1246
|
+
if (range = opts[:lines])
|
1247
|
+
$logger.debug "download: lines=#{range}"
|
1248
|
+
text = text[range] || []
|
1249
|
+
end
|
1250
|
+
if (range = opts[:cols])
|
1251
|
+
$logger.debug "download: cols=#{range}"
|
1252
|
+
text.map! {|l| l[range]}
|
1253
|
+
text.compact!
|
1254
|
+
end
|
1255
|
+
if (o = opts[:sort])
|
1256
|
+
$logger.debug "download: sort=#{o}"
|
1257
|
+
case o
|
1258
|
+
when true
|
1259
|
+
text.sort!
|
1260
|
+
when Proc
|
1261
|
+
text.sort!(&o)
|
1262
|
+
end
|
1263
|
+
end
|
1264
|
+
if (o = opts[:strip])
|
1265
|
+
$logger.debug "download: strip!"
|
1266
|
+
text.delete_if {|l| l !~ /\S/}
|
1267
|
+
end
|
1268
|
+
text = text.join("\n")
|
1269
|
+
end
|
1270
|
+
|
1271
|
+
pprc = @configuration.get(url, :downloadprocess)
|
1272
|
+
if pprc
|
1273
|
+
$logger.debug "download process: #{pprc}"
|
1274
|
+
text = @configuration.call_cmd(pprc, [text])
|
1275
|
+
$logger.debug text
|
1276
|
+
end
|
1277
|
+
|
1278
|
+
File.open(latest, 'w') {|io| io.puts(text)}
|
1279
|
+
return true
|
1280
|
+
end
|
1281
|
+
|
1282
|
+
|
1283
|
+
# Compare the files +old+ and +new+ with the url's :diff command.
# The output is passed through the url's :diffprocess command, if there
# is one.
# opts:: Not used here.
#
# Returns the difference as text, or nil if +old+ doesn't exist (first
# download) or if the (processed) difference contains no non-blank
# characters.
def diff(url, old, new, opts)
  # File.exist? is available in every ruby version; the alias
  # File.exists? is gone in ruby 3.2.
  unless File.exist?(old)
    $logger.info "Initial copy: #{old.inspect}"
    return nil
  end

  $logger.debug "diff: #{old} <-> #{new}"
  difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new])
  $logger.debug "diff: #{difftext}"

  if difftext =~ /\S/
    if (pprc = @configuration.get(url, :diffprocess))
      $logger.debug "diff process: #{pprc}"
      difftext = @configuration.call_cmd(pprc, [difftext])
    end
    $logger.debug "difftext: #{difftext}"
    # Check again: the post-processing may have removed everything.
    if difftext =~ /\S/
      $logger.warn "Changed: #{@configuration.get(url, :title, url).inspect}"
      return difftext
    end
  end

  $logger.debug "Unchanged: #{@configuration.get(url, :title, url).inspect}"
  nil
end
|
1307
|
+
|
1308
|
+
|
1309
|
+
# Remember +difftext+ as the difference found for +url+ in @difftext.
# opts:: Not used here.
def accumulate(url, difftext, opts)
  @difftext.store(url, difftext)
end
|
1312
|
+
|
1313
|
+
|
1314
|
+
# Hand the collected differences (@difftext) to the configuration's
# show_output method.
def show
  collected = @difftext
  @configuration.show_output(collected)
end
|
1317
|
+
|
1318
|
+
|
1319
|
+
# Rename the file +from+ to +to+, replacing +to+ if it exists.
# Nothing happens if +from+ doesn't exist.
def move(from, to)
  # File.exist? is available in every ruby version; the alias
  # File.exists? is gone in ruby 3.2.
  if File.exist?(from)
    $logger.debug "Overwriting: #{from} -> #{to}" if File.exist?(to)
    File.rename(from, to)
  end
end
|
1325
|
+
|
1326
|
+
|
1327
|
+
# Make sure the directory +dir+ exists.
#
# If nothing exists under this name, the directory is created (one
# level only) and the optional block is called with +dir+ so that the
# new directory can be populated. If +dir+ exists but is no directory,
# a fatal error is logged and the program exits with status 5.
def ensure_dir(dir, &fill_dir)
  if File.exist?(dir)
    return if File.directory?(dir)

    $logger.fatal "Not a directory: #{dir}"
    exit 5
  end

  Dir.mkdir(dir)
  fill_dir.call(dir) if fill_dir
end
|
1338
|
+
|
1339
|
+
end
|
1340
|
+
|
1341
|
+
|
1342
|
+
|
1343
|
+
# Run the monitor with the command-line arguments when this file is
# executed as a script (but not when it is loaded, e.g. from the Rakefile).
if $0 == __FILE__
  app = Websitiary.new(ARGV)
  app.process
  # sleep 5
end
|
1347
|
+
|
1348
|
+
|
1349
|
+
# Local Variables:
|
1350
|
+
# revisionRx: REVISION\s\+=\s\+\'
|
1351
|
+
# End:
|