websitiary 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +6 -0
- data/README.txt +474 -0
- data/Rakefile +20 -0
- data/bin/websitiary +1351 -0
- data/setup.rb +1585 -0
- metadata +71 -0
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
# Release tasks are generated by Hoe. The version constant lives in the
# executable itself, so that file is loaded before the spec is built.
require 'rubygems'
require 'hoe'
load './bin/websitiary'

Hoe.new('websitiary', Websitiary::VERSION) do |spec|
  spec.rubyforge_name = 'websitiary'
  spec.author         = 'Thomas Link'
  spec.email          = 'sanobast-ruby@yahoo.de'
  spec.summary        = 'A simple website monitor'

  # Long description: paragraphs 2 through 5 of the README.
  spec.description = spec.paragraphs_of('README.txt', 2..5).join("\n\n")

  # URL(s): every line but the first of the README's first paragraph.
  readme_head = spec.paragraphs_of('README.txt', 0).first
  spec.url    = readme_head.split(/\n/)[1..-1]

  # Change log: the first two paragraphs of the history file.
  spec.changes = spec.paragraphs_of('History.txt', 0..1).join("\n\n")

  spec.extra_deps << 'hpricot'
  # spec.need_tgz = false
  spec.need_zip = true
end
|
19
|
+
|
20
|
+
# vim: syntax=Ruby
|
data/bin/websitiary
ADDED
@@ -0,0 +1,1351 @@
|
|
1
|
+
#! /usr/bin/ruby.exe
|
2
|
+
# websitiary.rb -- Website Monitor
|
3
|
+
# @Last Change: 2007-07-16.
|
4
|
+
# Author:: Thomas Link (samul AT web de)
|
5
|
+
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
6
|
+
# Created:: 2007-06-09.
|
7
|
+
#
|
8
|
+
# = TODO
|
9
|
+
# * Find a ruby-based replacement for webdiff (or port webdiff to ruby)
|
10
|
+
# * Built-in support for robots.txt
|
11
|
+
# * Option to append to output files
|
12
|
+
# * Option to trim output files (when appending items)
|
13
|
+
|
14
|
+
|
15
|
+
require 'cgi'
|
16
|
+
require 'digest/md5'
|
17
|
+
require 'logger'
|
18
|
+
require 'optparse'
|
19
|
+
require 'pathname'
|
20
|
+
require 'rbconfig'
|
21
|
+
require 'uri'
|
22
|
+
require 'open-uri'
|
23
|
+
|
24
|
+
|
25
|
+
# Load the third-party libraries. A library that cannot be loaded is
# reported on stderr and start-up continues without it.
['hpricot', 'robot_rules'].each do |f|
  begin
    require f
  rescue ScriptError, StandardError => e
    # LoadError is a ScriptError. Interrupt, SystemExit and NoMemoryError
    # are deliberately not rescued so that Ctrl-C and exit keep working
    # while the libraries are being loaded.
    $stderr.puts <<EOT
#{e.message}
Library could not be loaded: #{f}
Please see the requirements section at: http://websitiary.rubyforge.org
EOT
  end
end
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
# Basic usage:
|
40
|
+
# Websitiary.new(ARGV).process
|
41
|
+
class Websitiary
|
42
|
+
APPNAME = 'websitiary'
|
43
|
+
VERSION = '0.1.0'
|
44
|
+
REVISION = '1447'
|
45
|
+
MINUTE_SECS = 60
|
46
|
+
HOUR_SECS = MINUTE_SECS * 60
|
47
|
+
DAY_SECS = HOUR_SECS * 24
|
48
|
+
|
49
|
+
# A simple wrapper around Logger.
|
50
|
+
class AppLog
  # Create the global logger ($logger).
  # output:: Log destination handed to Logger.new; $stdout when nil/false
  def initialize(output=nil)
    @output = output || $stdout
    $logger = Logger.new(@output, 'daily')
    $logger.progname        = APPNAME
    $logger.datetime_format = "%H:%M:%S"
    set_level
  end


  # Map a verbosity name onto a Logger severity.
  # level:: :debug, :verbose or :quiet; anything else selects WARN
  def set_level(level=:default)
    severities = {
      :debug   => Logger::DEBUG,
      :verbose => Logger::INFO,
      :quiet   => Logger::ERROR
    }
    $logger.level = severities.fetch(level, Logger::WARN)
    $logger.debug "Set logger level: #{level}"
  end
end
|
74
|
+
|
75
|
+
|
76
|
+
# This class defines the scope in which profiles are evaluated. Most
|
77
|
+
# of its methods are suitable for use in profiles.
|
78
|
+
class Configuration
|
79
|
+
# Hash (key = URL, value = Hash of options)
|
80
|
+
attr_accessor :urls
|
81
|
+
# Array of urls to be downloaded.
|
82
|
+
attr_accessor :todo
|
83
|
+
# Array of downloaded urls.
|
84
|
+
attr_accessor :done
|
85
|
+
# The user configuration directory
|
86
|
+
attr_accessor :cfgdir
|
87
|
+
# attr_accessor :default_profiles
|
88
|
+
# attr_accessor :options
|
89
|
+
# attr_accessor :cmd_edit
|
90
|
+
|
91
|
+
|
92
|
+
# Set up the built-in defaults (diff/download/format shortcuts, page
# template, viewer, editor), load the +config.rb+ profile and finally
# process the command-line arguments.
# app::  The application object (kept in @app)
# args:: Array of command-line arguments (see parse_command_line_args)
def initialize(app, args=[])
  @logger = AppLog.new
  $logger.debug "Configuration#initialize"
  @app    = app
  @urls   = {}
  @todo   = []
  @done   = []
  @robots = {}
  @allow  = {}

  @suffix = {
    'text' => 'txt'
    # 'rss'  => 'xml'
  }

  # Start with ~/.websitiary (or the current directory when HOME is not
  # set); the first existing candidate below takes precedence.
  @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitiary') : '.'
  [
    ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitiary'),
    File.join(RbConfig::CONFIG['sysconfdir'], 'websitiary')
  ].compact.each do |dir|
    # compact: without USERPROFILE the first candidate is nil and
    # File.exist?(nil) raises a TypeError.
    if File.exist?(dir)
      @cfgdir = dir
      break
    end
  end

  @user_agent = "websitiary/#{Websitiary::VERSION}"

  @cmd_edit = 'vi "%s"'

  @options = {:global => {}}

  @options[:diff] = {
    :default => :diff,
    :diff    => 'diff -d -w -u2 "%s" "%s"',
    :webdiff => lambda do |old, new|
      $logger.debug "webdiff: #{File.basename(new)}"
      $logger.debug %{webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -}
      difftext = `webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -`
      # Only exit status 1 is taken to mean "there is a difference".
      $?.exitstatus == 1 ? difftext : ''
    end,
  }

  @options[:format] = {
    :default => :diff,
    :diff    => %{<pre class="diff">\n%s\n</pre>},
    :webdiff => "%s\n",
  }

  @options[:diffprocess] = {
    :default => :diff,
    # Drop the two header lines, keep only added lines, strip the "+".
    :diff    => lambda {|text| text.split("\n")[2..-1].delete_if {|l| l =~ /^[^+]/}.map {|l| l[1..-1]}.join("\n")},
    :webdiff => false,
  }

  @options[:download] = {
    :default => :w3m,
  }

  @options[:downloadformat] = {
    :w3m     => :text,
    :webdiff => :html,
  }

  @options[:downloadprocess] = {
  }

  @options[:rss] = {
    :version => '2.0',
  }

  @options[:strip_tags] = {
    :default => ['script', 'object', 'form', 'input', 'select', 'iframe', 'head', 'meta', 'link'],
  }

  shortcut :w3m, :delegate => :diff,
    :download => 'w3m -no-cookie -S -F -dump "%s"'

  shortcut :lynx, :delegate => :diff,
    :download => 'lynx -dump "%s"'

  shortcut :links, :delegate => :diff,
    :download => 'links -dump "%s"'

  shortcut :curl, :delegate => :webdiff,
    :download => 'curl --silent "%s"'

  shortcut :wget, :delegate => :webdiff,
    :download => 'wget -q -O - "%s"'

  shortcut :body_html, :delegate => :webdiff,
    :strip_tags => :default,
    :download => lambda {|url|
      begin
        doc = Hpricot(open(url))
        doc = doc.at('body')
        if doc
          doc = rewrite_urls(url, doc)
          doc = doc.inner_html
          if (tags = get(url, :strip_tags))
            doc = strip_tags(doc, :format => :hpricot, :tags => tags)
          end
        else
          $logger.warn 'inner html: No body'
        end
        doc.to_s
      rescue Exception => e
        # Best effort: a failed download is logged and reported inline
        # as the page text instead of aborting the run.
        # $logger.error e  #DBG#
        $logger.error e.message
        $logger.debug e.backtrace
        break %{<pre class="error">\n#{e.message}\n</pre>}
      end
    }

  shortcut :openuri, :delegate => :webdiff,
    :download => lambda {|url|
      begin
        open(url).read
      rescue Exception => e
        # $logger.error e  #DBG#
        $logger.error e.message
        $logger.debug e.backtrace
        %{<pre class="error">\n#{e.to_s}\n</pre>}
      end
    }

  # Like :body_html, but links on the same host are queued as well.
  shortcut :website, :delegate => :webdiff,
    :download => lambda {|url|
      html = @options[:download][:body_html].call(url)
      break unless html
      doc = Hpricot(html)
      push_hrefs(url, doc) do |uri0, pn0, uri, pn|
        eligible_path?(url, uri0.path, uri.path) &&
          uri.host == uri0.host
      end
      html
    }

  # Like :website, but only links in the same directory are queued.
  shortcut :website_below, :delegate => :webdiff,
    :download => lambda {|url|
      html = @options[:download][:body_html].call(url)
      break unless html
      doc = Hpricot(html)
      if doc
        push_hrefs(url, doc) do |uri0, pn0, uri, pn|
          eligible_path?(url, uri0.path, uri.path) &&
            uri.host == uri0.host &&
            pn.relative_path_from(pn0).to_s == '.'
        end
      end
      html
    }

  shortcut :website_txt, :delegate => :default,
    :download => lambda {|url|
      success, cmd = get_option(:download, :default)
      if success
        html = @options[:download][:website].call(url)
        html_to_text(html)
      end
    }

  shortcut :website_txt_below, :delegate => :default,
    :download => lambda {|url|
      success, cmd = get_option(:download, :default)
      if success
        html = @options[:download][:website_below].call(url)
        html_to_text(html)
      end
    }

  # Page template; arguments: title, table of contents, body.
  @options[:page] = {:format => lambda do |ti, li, bd|
      template = <<OUT
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>%s</title>
<link rel="stylesheet" href="websitiary.css" type="text/css">
<link rel="alternate" href="websitiary.rss" type="application/rss+xml" title="%s">
</head>
<body>
<ol class="toc">
%s
</ol>
<div class="contents">
%s
</div>
</body>
</html>
OUT
      template % [ti, ti, li, bd]
    end
  }

  # @view = nil
  @view = 'w3m "%s"'
  @default_options  = {}
  @default_profiles = []
  @profiles = []
  @outfile  = {}
  profile 'config.rb'
  parse_command_line_args(args)
  @output_format ||= ['html']
  @output_title = %{#{APPNAME}: #{@profiles.join(", ")}}
end
|
297
|
+
|
298
|
+
|
299
|
+
# Parse the command-line options. Whatever remains after option parsing
# is taken as the list of profiles to load; without any, the default
# profiles are loaded.
# args:: Array of command-line arguments (option switches are removed)
# Returns self.
def parse_command_line_args(args)
  $logger.debug "parse_command_line_args: #{args}"
  parser = OptionParser.new do |opts|
    opts.banner =  "Usage: #{APPNAME} [OPTIONS] [PROFILES] > [OUT]"
    opts.separator ''
    opts.separator "#{APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
    opts.separator 'the terms of the GNU General Public License version 2 or newer.'
    opts.separator ''

    opts.separator 'General Options:'

    opts.on('-c', '--cfg=DIR', String, 'Configuration directory') do |value|
      @cfgdir = value
    end

    opts.on('-e', '--edit=PROFILE', String, 'Edit a profile') do |value|
      edit_profile value
      exit 0
    end

    opts.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |value|
      output_format(*value.split(/,/))
    end

    opts.on('--[no-]ignore-age', 'Ignore age limits') do |bool|
      set :ignore_age => bool
    end

    opts.on('--log=DESTINATION', String, 'Log destination') do |value|
      # "-" selects the default destination of AppLog.
      @logger = AppLog.new(value != '-' && value)
    end

    opts.on('-o', '--output=FILENAME', String, 'Output') do |value|
      output_file(value)
    end

    opts.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |value|
      key, val = value.split(/=/, 2)
      # Without "=" there is no value to evaluate; report a usage error
      # instead of failing inside eval.
      raise OptionParser::InvalidArgument, value unless val
      # NOTE(review): the value is evaluated as Ruby code. This is only
      # acceptable because it comes from the user's own command line.
      set key.intern => eval(val)
    end

    opts.on('--review', 'View last diff') do |value|
      view_output
      exit 0
    end

    opts.separator ''
    opts.separator 'Available profiles:'
    opts.separator Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}.join(', ')

    opts.separator ''
    opts.separator 'Other Options:'

    opts.on('--debug', 'Show debug messages') do |v|
      $VERBOSE = $DEBUG = true
      @logger.set_level(:debug)
    end

    opts.on('-q', '--quiet', 'Be mostly quiet') do |v|
      @logger.set_level(:quiet)
    end

    opts.on('-v', '--verbose', 'Run verbosely') do |v|
      $VERBOSE = true
      @logger.set_level(:verbose)
    end

    opts.on_tail('-h', '--help', 'Show this message') do
      puts opts
      exit 1
    end
  end

  @profiles = parser.parse!(args)
  @profiles = @default_profiles if @profiles.empty?
  @profiles.each {|pn| profile pn}

  self
end
|
379
|
+
|
380
|
+
|
381
|
+
# Retrieve an option for an url
|
382
|
+
# url:: String
|
383
|
+
# opt:: Symbol
|
384
|
+
def get(url, opt, default=nil)
  # Resolution order: the url's own value for opt, then the url's :use
  # shortcut, then the default argument, then the global :default entry
  # for opt. A url that was never registered has no options of its own
  # and simply falls through to the defaults.
  opts = @urls[url] || {}
  $logger.debug "get: opts=#{opts.inspect}"

  # :diffprocess and :format follow the url's :diff setting unless they
  # are set explicitly.
  opt_ = opt
  if [:diffprocess, :format].include?(opt) and !opts.has_key?(opt)
    opt_ = :diff
  end

  $logger.debug "get: opt=#{opt} opt_=#{opt_} #{opts[opt_]} #{opts[:use]}"
  val = if opts.has_key?(opt_)
          opts[opt_]
        elsif opts.has_key?(:use)
          opts[:use]
        end

  case val
  when nil
    # nothing configured for this url
  when Symbol
    # A symbol names an entry of the global option table.
    $logger.debug "get: val=#{val}"
    success, rv = get_option(opt, val)
    $logger.debug "get: #{success}, #{rv}"
    return rv if success
  else
    $logger.debug "get: return val=#{val}"
    return val
  end

  unless default
    success, default1 = get_option(opt, :default)
    default = default1 if success
  end

  $logger.debug "get: return default=#{default}"
  default
end
|
424
|
+
|
425
|
+
|
426
|
+
# Configuration command:
|
427
|
+
# Set the default profiles
|
428
|
+
def default(*profile_names)
  # Kept as given; these profiles are loaded when none is named on the
  # command line.
  @default_profiles = profile_names
end
|
431
|
+
|
432
|
+
|
433
|
+
# Configuration command:
|
434
|
+
# Load a profile
|
435
|
+
def profile(profile_name)
  # "-": read one url per line from standard input. $stdin is used
  # explicitly because Kernel#readlines reads through ARGF, which would
  # try to open whatever is left in ARGV as input files.
  if profile_name == '-'
    return $stdin.readlines.map {|l| l.chomp}.each {|url| source url}
  end

  # Otherwise evaluate the profile file in the scope of this object.
  # Returns true if the profile was found and evaluated, false if not.
  fn = profile_filename(profile_name)
  return false unless fn

  $logger.debug "Profile: #{fn}"
  contents = File.read(fn)
  @current_profile = fn
  begin
    # The filename makes error messages and backtraces point at the
    # profile instead of "(eval)".
    instance_eval(contents, fn)
  ensure
    @current_profile = nil
  end
  true
end
|
456
|
+
|
457
|
+
|
458
|
+
# Define an options shortcut.
|
459
|
+
def shortcut(symbol, args)
  # symbol:: Name of the shortcut
  # args::   Hash (option category => value); :delegate names the
  #          shortcut used for every category not listed explicitly
  delegated = args.has_key?(:delegate)
  undefined = @options.keys - args.keys
  essential = [:download, :downloadformat, :diff, :format, :diffprocess]

  # :downloadprocess
  unless delegated or (undefined & essential).empty?
    $logger.warn "Shortcut #{symbol}: Undefined fields: #{undefined.inspect}"
  end

  if delegated
    target = args[:delegate]
    undefined.each {|field| @options[field][symbol] = target}
  end

  args.each do |field, val|
    next if field == :delegate
    @options[field][symbol] = val
  end
end
|
480
|
+
|
481
|
+
|
482
|
+
# Set the output format.
|
483
|
+
def output_format(*format)
  # Only text, html and rss are known; anything else is fatal (exit 5).
  unknown = format - ['text', 'html', 'rss']
  unless unknown.empty?
    $logger.fatal "Unknown output format: #{format}"
    exit 5
  end
  @output_format = format
end
|
490
|
+
|
491
|
+
|
492
|
+
# Set the output file.
|
493
|
+
def output_file(filename, outformat=nil)
  # filename::  Destination file
  # outformat:: Output format the file is registered for (nil if none
  #             was given)
  @outfile.store(outformat, filename)
end
|
496
|
+
|
497
|
+
|
498
|
+
# Configuration command:
|
499
|
+
# Set global options.
|
500
|
+
# type:: Symbol
|
501
|
+
# options:: Hash
|
502
|
+
def option(type, options)
  # Merge options into the existing category; unknown categories are
  # reported and otherwise ignored.
  $logger.info "option #{type}: #{options.inspect}"
  category = @options[type]
  return category.merge!(options) if category
  $logger.error "Unknown option type: #{type} (#{options.inspect})"
end
|
511
|
+
|
512
|
+
|
513
|
+
# Set a global option.
|
514
|
+
def global(options)
  # options:: Hash; each key replaces the whole entry of that name in
  #           the option table.
  options.each_pair {|type, value| @options.store(type, value)}
end
|
519
|
+
|
520
|
+
|
521
|
+
# Configuration command:
|
522
|
+
# Set the default value for source-options.
|
523
|
+
def set(options)
  # options:: Hash merged into the defaults applied to subsequently
  #           defined sources.
  $logger.debug "set: #{options.inspect}"
  @default_options.update(options)
end
|
527
|
+
|
528
|
+
|
529
|
+
# Configuration command:
|
530
|
+
# Unset a default source-option.
|
531
|
+
def unset(*options)
  # options:: Names of the default source-options to remove.
  options.each {|name| @default_options.delete(name)}
end
|
536
|
+
|
537
|
+
|
538
|
+
# Configuration command:
|
539
|
+
# Define a source.
|
540
|
+
# urls:: String
|
541
|
+
def source(urls, opts={})
  # urls:: String with one url per line
  # opts:: Hash of source options; overrides the current defaults
  # Surrounding whitespace is removed and blank lines are skipped, so
  # indented multi-line strings can be used without registering empty
  # or padded urls.
  urls.split("\n").each do |line|
    url = line.strip
    next if url.empty?
    @urls[url] = @default_options.dup.update(opts)
    @todo << url
  end
end
|
547
|
+
|
548
|
+
|
549
|
+
# Configuration command:
|
550
|
+
# Set the default download processor. The block takes the
|
551
|
+
# downloaded text (STRING) as argument.
|
552
|
+
def downloadprocess(&block)
  # The block is stored as the :default entry of the :downloadprocess
  # options.
  @options[:downloadprocess].store(:default, block)
end
|
555
|
+
|
556
|
+
|
557
|
+
# Configuration command:
|
558
|
+
# Set the default diff processor. The block takes the
|
559
|
+
# diff text (STRING) as argument.
|
560
|
+
def diffprocess(&block)
  # The processor belongs to the :diffprocess options. Storing it under
  # :diff would replace the default diff program (see #diff) with a
  # block that expects the diff text.
  @options[:diffprocess][:default] = block
end
|
563
|
+
|
564
|
+
|
565
|
+
# Configuration command:
|
566
|
+
# Set the editor.
|
567
|
+
def edit(cmd)
  # cmd:: Command template; "%s" is replaced with the profile's filename
  #       (see edit_profile).
  @cmd_edit = cmd
end
|
570
|
+
|
571
|
+
|
572
|
+
# Configuration command:
|
573
|
+
# Set the viewer.
|
574
|
+
def view(view)
  # view:: Command template; "%s" is replaced with the output file
  #        (see view_output_general).
  @view = view
end
|
577
|
+
|
578
|
+
|
579
|
+
# Configuration command:
|
580
|
+
# Set the default diff program.
|
581
|
+
def diff(diff)
  # Stored as the :default entry of the :diff options.
  @options[:diff].store(:default, diff)
end
|
584
|
+
|
585
|
+
|
586
|
+
# Configuration command:
|
587
|
+
# Set the default downloader.
|
588
|
+
def download(download)
  # Stored as the :default entry of the :download options.
  @options[:download].store(:default, download)
end
|
591
|
+
|
592
|
+
|
593
|
+
# Format a diff according to URL's source options.
|
594
|
+
def format(url, difftext)
  # The url's :format option (a format string or a Proc) is applied to
  # the diff text.
  eval_arg(get(url, :format), [difftext])
end
|
598
|
+
|
599
|
+
|
600
|
+
# Apply some arguments to a format.
|
601
|
+
# format:: String or Proc
|
602
|
+
# args:: Array of Arguments
|
603
|
+
def eval_arg(format, args, default=nil, &process_string)
  # format:: String (applied with %), Proc (called with args), or
  #          nil/false (return default)
  # The optional block post-processes the formatted string; it is not
  # used for Procs.
  case format
  when nil, false
    # false is used in the option tables to switch a step off
    # (e.g. :webdiff => false); treat it like "not set".
    default
  when Proc
    $logger.debug "eval proc: #{format} #{args.inspect}"
    format.call(*args)
  else
    ca = format % args
    $logger.debug "eval string: #{ca}"
    process_string ? process_string.call(ca) : ca
  end
end
|
620
|
+
|
621
|
+
|
622
|
+
# Apply the argument to cmd (a format String or a Proc). If a
|
623
|
+
# String, execute the command.
|
624
|
+
def call_cmd(cmd, args, default=nil)
  # A Proc is called with args; a String is filled in with args and run
  # through the shell, returning the command's standard output.
  eval_arg(cmd, args, default) do |command_line|
    `#{command_line}`
  end
end
|
627
|
+
|
628
|
+
|
629
|
+
# Generate & view the final output.
|
630
|
+
# difftext:: Hash
|
631
|
+
def show_output(difftext)
  # For every requested output format: generate the text with
  # get_output_FORMAT, then print it (outfile "-") or write it to the
  # outfile and open the viewer via view_output_FORMAT.
  if difftext.empty?
    $logger.warn 'No news is good news'
    return
  end

  @output_format.each do |outformat|
    generator = "get_output_#{outformat}"

    unless respond_to?(generator)
      $logger.fatal "Unknown output format: #{outformat}"
      exit 5
    end

    out = send(generator, difftext)
    next unless out

    outfile = get_outfile(outformat)
    if outfile == '-'
      puts out
    else
      File.open(outfile, 'w') {|io| io.puts out}
      send("view_output_#{outformat}", outfile)
    end
  end
end
|
659
|
+
|
660
|
+
|
661
|
+
# Render the diffs as plain text, one section per url, separated by a
# dashed line. HTML diffs are converted to text first.
# difftext:: Hash (url => diff text)
def get_output_text(difftext)
  sections = difftext.map do |url, text|
    next unless text
    text = html_to_text(text) if is_html?(text)
    # Skip empty diffs with nil: compact does not remove false, so a
    # false here would show up as the word "false" in the output.
    next if text.empty?
    [url, difftext_annotation(url), nil, text].join("\n")
  end
  sections.compact.join("\n\n#{('-' * 68)}\n\n")
end
|
669
|
+
|
670
|
+
|
671
|
+
# Render the diffs as an RSS feed and return it as a String. The global
# option :rss[:url] is required; without it the program exits with
# status 5.
# difftext:: Hash (url => diff text)
def get_output_rss(difftext)
  success, rss_url = get_option(:rss, :url)
  unless success
    $logger.fatal "Global option :rss[:url] not defined."
    exit 5
  end

  success, rss_version = get_option(:rss, :version)
  require "rss/#{rss_version}"

  rss  = RSS::Rss.new(rss_version)
  chan = RSS::Rss::Channel.new
  chan.title = @output_title
  [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
    ok, val = get_option(:rss, field)
    # These are channel-level settings. No item exists at this point,
    # so they must be applied to the channel.
    chan.send(format_symbol(field, '%s='), val) if ok
  end
  chan.link   = rss_url
  rss.channel = chan

  difftext.each do |url, text|
    rss_format = get(url, :rss_format, :plain_text)
    text = strip_tags(text, :format => rss_format)
    next if text.empty?

    item = RSS::Rss::Channel::Item.new
    item.title = get(url, :title, File.basename(url))
    item.link  = eval_arg(get(url, :rewrite_link, '%s'), [url])
    # Per-url item fields come from the source options :rss_author,
    # :rss_date etc.
    [:author, :date, :enclosure, :category, :pubDate].each do |field|
      val = get(url, format_symbol(field, 'rss_%s'))
      item.send(format_symbol(field, '%s='), val) if val
    end

    annotation = difftext_annotation(url)
    case rss_format
    when :plain_text
      annotation = "<pre>#{annotation}</pre>" if annotation
      item.description = %{#{annotation}<pre>#{text}</pre>}
    else
      item.description = %{<pre>#{annotation}</pre>\n#{text}}
    end
    chan.items << item
  end

  rss.to_s
end
|
720
|
+
|
721
|
+
|
722
|
+
# Render the diffs as an HTML page (table of contents plus one section
# per url) using the :page template.
# difftext:: Hash (url => diff text)
def get_output_html(difftext)
  pages = difftext.map do |url, text|
    tags = get(url, :strip_tags)
    text = strip_tags(text, :tags => tags) if tags
    text.empty? ? nil : [url, text]
  end.compact

  toc = pages.map do |url, text|
    lab = Digest::MD5.hexdigest(url)
    ti  = get(url, :title, File.basename(url))
    # %{<li class="toc"><a class="toc" href="\##{lab}">#{ti}</a> <a class="external" href="#{url}">[W]</a></li>}
    %{<li class="toc"><a class="toc" href="\##{lab}">#{ti}</a></li>}
  end.join("\n")

  cnt = pages.map do |url, text|
    lab = Digest::MD5.hexdigest(url)
    ti  = get(url, :title, File.basename(url))
    # The rewritten link is only used for the heading. Options and
    # cached files are registered under the original url, so all
    # lookups below keep using url.
    if (rewrite = get(url, :rewrite_link))
      link = eval_arg(rewrite, [url])
      ext  = ''
    else
      link = url
      old  = %{<a class="old" href="#{file_url(backupname(url))}">old</a>}
      lst  = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
      ext  = %{ (#{old}, #{lst})}
    end
    # There is no annotation unless both cached copies exist.
    annotation = CGI::escapeHTML(difftext_annotation(url).to_s)
    <<HTML
<div class="webpage">
<h1 class="diff" id="#{lab}"><a class="external" href="#{link}">#{ti}</a>#{ext}</h1>
<div class="annotation">
#{annotation}
</div>
<div class="diff">
#{format(url, text)}
</div>
</div>
HTML
  end.join(('<hr class="separator"/>') + "\n")

  success, template = get_option(:page, :format)
  unless success
    success, template = get_option(:page, :simple)
  end
  eval_arg(template, [@output_title, toc, cnt])
end
|
767
|
+
|
768
|
+
|
769
|
+
# Get the backup filename.
|
770
|
+
def backupname(url)
  # The file lives in the "old" subdirectory of the configuration
  # directory, named after the encoded url.
  backup_dir = File.join(@cfgdir, 'old')
  File.join(backup_dir, encode(url))
end
|
773
|
+
|
774
|
+
|
775
|
+
# Get the filename for the freshly downloaded copy.
|
776
|
+
def latestname(url)
  # The file lives in the "latest" subdirectory of the configuration
  # directory, named after the encoded url.
  latest_dir = File.join(@cfgdir, 'latest')
  File.join(latest_dir, encode(url))
end
|
779
|
+
|
780
|
+
|
781
|
+
# Guess path's dirname.
|
782
|
+
# foo/bar -> foo
|
783
|
+
# foo/bar.txt -> foo
|
784
|
+
# foo/bar/ -> foo/bar
|
785
|
+
def guess_dir(path)
  # A trailing slash marks path itself as a directory; otherwise the
  # last component is taken to be a file name.
  if path[-1, 1] == '/'
    path[0..-2]
  else
    File.dirname(path)
  end
end
|
788
|
+
|
789
|
+
|
790
|
+
# Strip the url's last part (after #).
|
791
|
+
def canonic_url(url)
  # Everything from the first "#" up to the end of that line is removed.
  url.sub(/#[^\n]*/, '')
end
|
794
|
+
|
795
|
+
|
796
|
+
# Return a copy of the default list of tags to strip (global option
# :strip_tags, entry :default), or nil if there is none.
def strip_tags_default
  found, tags = get_option(:strip_tags, :default)
  return nil unless found
  tags.dup
end
|
800
|
+
|
801
|
+
|
802
|
+
# Remove elements from a document.
# doc::  String (parsed with Hpricot) or an already parsed document
# args:: Hash; :tags   => Array of tag names (default: strip_tags_default),
#              :format => :hpricot to get the document back, otherwise
#                         the name X of the to_X method used to render
#                         the result (default: html)
def strip_tags(doc, args={})
  tags = args[:tags] || strip_tags_default
  doc  = Hpricot(doc) if String === doc
  tags.each {|tag| doc.search(tag).remove}

  fmt = args[:format]
  return doc if fmt == :hpricot
  doc.send("to_#{fmt || :html}")
end
|
818
|
+
|
819
|
+
|
820
|
+
# Check whether path is eligible on the basis of url or path0.
|
821
|
+
# This checks either for a :match option for url or the extensions
|
822
|
+
# of path0 and path.
|
823
|
+
def eligible_path?(url, path0, path)
  # With a :match option for url, path has to match that regexp;
  # otherwise both paths need the same file extension.
  rx = get(url, :match)
  return path =~ rx if rx
  File.extname(path0) == File.extname(path)
end
|
831
|
+
|
832
|
+
|
833
|
+
# Scan hpricot document for hrefs and push them onto @todo if not
|
834
|
+
# already included.
|
835
|
+
def push_hrefs(url, hpricot, &condition)
  # url::       The page the document was loaded from
  # hpricot::   The parsed document
  # condition:: Block called with (uri0, pn0, uri, pn), i.e. the URI and
  #             directory of the page and of the link; the link is
  #             queued if the block returns a true value
  # Queued links inherit the page's options, with :title set to the
  # link's basename and :depth (if set) reduced by one.
  depth = get(url, :depth)
  return if depth and depth <= 0
  uri0 = URI.parse(url)
  pn0  = Pathname.new(guess_dir(File.expand_path(uri0.path)))
  (hpricot / 'a').each do |a|
    # Errors are handled per link so that a single malformed href does
    # not stop the remaining links from being scanned.
    begin
      href = a['href']
      next if href.nil?
      curl = canonic_url(href)
      next if @done.include?(curl) or @todo.include?(curl)
      uri = URI.parse(href)
      next unless robots_allowed?(curl, uri)
      pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
      if condition.call(uri0, pn0, uri, pn)
        opts = @urls[url].dup
        opts[:title] = File.basename(curl)
        opts[:depth] = depth - 1 if depth and depth >= 0
        @urls[curl] = opts
        @todo << curl
      end
    rescue StandardError => e
      $logger.error e.message
      $logger.debug e.backtrace
    end
  end
rescue StandardError => e
  # Problems with the page's own url are logged, not raised.
  $logger.error e.message
  $logger.debug e.backtrace
end
|
863
|
+
|
864
|
+
|
865
|
+
# Rewrite urls in doc
|
866
|
+
# url:: String
|
867
|
+
# doc:: Hpricot document
|
868
|
+
def rewrite_urls(url, doc)
  # Link targets (a/href) and image sources (img/src) are replaced with
  # the result of rewrite_href. Returns doc.
  uri = URI.parse(url)
  urd = guess_dir(uri.path)
  [['a', 'href'], ['img', 'src']].each do |tag, attribute|
    (doc / tag).each do |elem|
      rewritten = rewrite_href(elem[attribute], url, uri, urd)
      # rewrite_href yields no String for attributes it leaves alone
      # (e.g. urls that are already absolute); those must keep their
      # value instead of being overwritten.
      elem[attribute] = rewritten if String === rewritten
    end
  end
  doc
end
|
881
|
+
|
882
|
+
|
883
|
+
# Try to make href an absolute url.
|
884
|
+
def rewrite_href(href, url, uri, urd)
  # href:: The attribute value (may be nil)
  # url::  The url of the page (String)
  # uri::  url as URI
  # urd::  The directory part of uri's path
  # Returns the absolute url as String, or nil if href needs no
  # rewriting (nil, or it already starts with a scheme), or if it cannot
  # be rewritten.
  return nil unless href
  href = href.strip

  if href =~ /\A\w+:/
    # Already absolute. The match is anchored because a colon further
    # inside (e.g. in a query string) does not make a url absolute.
    nil
  elsif uri.relative? and URI.parse(href).relative?
    uri.instance_of?(URI::Generic) ? File.join(urd, href) : nil
  elsif href[0..0] == '#'
    url + href
  else
    uri.merge(href).to_s
  end
rescue StandardError => e
  $logger.error e.message
  $logger.debug e.backtrace
  # Explicit nil: the logger call above would otherwise make this
  # method return true, which callers would store as attribute value.
  nil
end
|
916
|
+
|
917
|
+
|
918
|
+
# Return a Proc that takes a text as argument and highlights occurrences of rx.
|
919
|
+
# rx:: Regular expression
|
920
|
+
# group:: A number (default: 0)
|
921
|
+
# tag:: The HTML tag to use (default: "span")
|
922
|
+
def highlighter(rx, group=nil, tag='span')
  # The replacement wraps the selected match group (the whole match by
  # default) in <tag class="highlight">.
  replacement = %{<#{tag} class="highlight">\\#{group || 0}</#{tag}>}
  lambda do |text|
    text.gsub(rx, replacement)
  end
end
|
925
|
+
|
926
|
+
|
927
|
+
private
|
928
|
+
|
929
|
+
# Describe a diff by the modification times of the backup and of the
# latest copy, formatted with the url's :format_annotation option
# (default: "%s >>> %s"). Returns nil unless both files exist.
def difftext_annotation(url)
  copies = [backupname(url), latestname(url)]
  return nil unless copies.all? {|fn| File.exist?(fn)}
  mtimes = copies.map {|fn| File.mtime(fn)}
  eval_arg(get(url, :format_annotation, '%s >>> %s'), mtimes)
end
|
936
|
+
|
937
|
+
|
938
|
+
# Build a Symbol by inserting name into format_string,
# e.g. (:author, 'rss_%s') -> :rss_author.
def format_symbol(name, format_string)
  formatted = format_string % name.to_s
  formatted.intern
end
|
941
|
+
|
942
|
+
|
943
|
+
# Guess whether text contains HTML by looking for one of a few common
# opening tags. Returns the position of the first such tag, or nil.
def is_html?(text)
  common_tag = /<(div|a|span|body|html|script|p|table|td|tr|th|li|dt|br|hr|em|b)\b/
  text =~ common_tag
end
|
946
|
+
|
947
|
+
|
948
|
+
# Convert HTML to plain text by means of Hpricot.
def html_to_text(text)
  document = Hpricot(text)
  document.to_plain_text
end
|
951
|
+
|
952
|
+
|
953
|
+
# Check whether the site's robots.txt allows url to be downloaded.
# url:: String
# uri:: url as URI
# Results are cached per url in @allow, parsed rules per host in
# @robots. Without the RobotRules library every url is allowed (a
# warning is logged once).
def robots_allowed?(url, uri)
  return @allow[url] if @allow.has_key?(url)

  if defined?(RobotRules)
    host = uri.host

    unless (rules = @robots[host])
      # robots_uri is nil for relative uris. Test it before converting
      # to a String: nil.to_s is "", which is a true value.
      ruri = robots_uri(uri)
      return true unless ruri
      rurl = ruri.to_s
      begin
        robots_txt = open(rurl).read
        rules      = RobotRules.new(@user_agent)
        rules.parse(rurl, robots_txt)
        @robots[host] = rules
        $logger.info "Loaded #{rurl} for #{@user_agent}"
        $logger.debug robots_txt
      rescue StandardError => e
        # Best effort: if robots.txt cannot be loaded, nothing is
        # excluded. Leave a trace in the debug log.
        $logger.debug "Could not load #{rurl}: #{e.message}"
      end
    end

    rv = if rules and !rules.allowed?(url)
           $logger.info "Excluded url: #{url}"
           false
         else
           true
         end
    @allow[url] = rv
    return rv
  end

  unless @robots[:warning]
    $logger.warn 'robots.txt is ignored: Please install robot_rules.rb from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589 in $RUBYLIB'
    @robots[:warning] = true
  end
  @allow[url] = true
  return true
end
|
994
|
+
|
995
|
+
|
996
|
+
# Return the URI of the robots.txt on uri's host, or nil if uri is
# relative.
def robots_uri(uri)
  return nil if uri.relative?
  uri.merge('/robots.txt')
end
|
999
|
+
|
1000
|
+
|
1001
|
+
# Turn a cache filename into a relative link: only the last directory
# and the file name are kept, then encoded with ":" and "/" left as is.
def file_url(filename)
  parent = File.basename(File.dirname(filename))
  leaf   = File.basename(filename)
  # "file://#{encode(filename, ':/')}"
  encode(File.join(parent, leaf), ':/')
end
|
1006
|
+
|
1007
|
+
|
1008
|
+
# Look up an option value, falling back to +default+ when it is not defined.
#
# opt:: The option table to search (forwarded to get_option).
# val:: The key within that table (forwarded to get_option).
# default:: Returned when get_option reports that the key is unknown.
#
# Returns the value found by get_option, or +default+.
def get_optionvalue(opt, val, default=nil)
  found, value = get_option(opt, val)
  found ? value : default
end
|
1016
|
+
|
1017
|
+
|
1018
|
+
# Resolve the entry +val+ in the option table @options[opt].
#
# opt:: Key into @options selecting the option table.
# val:: Key within that table.
#
# Returns [true, value] when the table has the key. A Symbol value is
# treated as a reference to another entry of the same table and is
# resolved recursively. Returns [false, val] when the table is missing
# or does not contain the key.
def get_option(opt, val)
  table = @options[opt]
  $logger.debug "val=#{val} vals=#{table.inspect}"

  if !table || !table.has_key?(val)
    $logger.debug "get_option no: #{opt} => #{val.inspect}"
    return [false, val]
  end

  resolved = table[val]
  $logger.debug "get_option ok: #{opt} => #{resolved.inspect}"

  if Symbol === resolved
    # A symbol points at another entry of the same table.
    $logger.debug "get_option re: #{resolved}"
    get_option(opt, resolved)
  else
    $logger.debug "get_option true, #{resolved}"
    [true, resolved]
  end
end
|
1037
|
+
|
1038
|
+
|
1039
|
+
# Percent-encode +text+ for use in file names and URLs.
#
# text:: The string to encode.
# chars:: Extra characters to leave unescaped. The string is spliced
#         verbatim into a regexp character class, so it must be valid there.
#
# Returns a copy of +text+ in which every character other than ASCII
# letters, digits, ',', '.', '_', '-' and those in +chars+ is replaced by
# one lower-case "%xx" escape per byte.
def encode(text, chars='')
  text.gsub(/[^a-zA-Z0-9,._#{chars}-]/) do |unsafe|
    # Work on the bytes explicitly: String#[] returns an Integer only on
    # Ruby 1.8, and a multibyte character needs one escape per byte.
    unsafe.unpack('C*').map { |byte| '%%%02x' % byte }.join
  end
end
|
1042
|
+
|
1043
|
+
|
1044
|
+
# Determine the file suffix used for an output format.
#
# outformat:: The output format name.
#
# Returns the entry registered for the format in @suffix, or the format
# name itself when there is none.
def output_suffix(outformat)
  registered = @suffix[outformat]
  registered ? registered : outformat
end
|
1047
|
+
|
1048
|
+
|
1049
|
+
# Determine the output file for an output format.
#
# outformat:: The output format; when nil, the first entry of
#             @output_format decides the suffix of the default name.
#
# Returns the file registered for +outformat+ in @outfile if there is
# one, otherwise "websitiary.<suffix>" inside @cfgdir.
def get_outfile(outformat=nil)
  explicit = @outfile[outformat]
  return explicit if explicit

  suffix = output_suffix(outformat || @output_format[0])
  File.join(@cfgdir, "websitiary.#{suffix}")
end
|
1052
|
+
|
1053
|
+
|
1054
|
+
# Show an output file with the viewer method of the primary output format.
#
# outfile:: The file to show; defaults to the result of get_outfile.
#
# Dispatches to the method named "view_output_<format>", where <format>
# is the first entry of @output_format, and returns its result.
def view_output(outfile=nil)
  viewer = "view_output_#{@output_format[0]}"
  target = outfile || get_outfile
  send(viewer, target)
end
|
1057
|
+
|
1058
|
+
|
1059
|
+
# Open +outfile+ with the external viewer command, if one is configured.
#
# outfile:: The file name substituted into the @view format string.
#
# Returns the result of Kernel#system, or nil when @view is not set.
# The html, text and rss viewers are plain aliases of this method.
def view_output_general(outfile)
  return unless @view

  command = @view % outfile
  system(command)
end
alias view_output_html view_output_general
alias view_output_text view_output_general
alias view_output_rss view_output_general
|
1067
|
+
|
1068
|
+
|
1069
|
+
# Open a profile in the configured editor.
#
# profile:: The profile name, resolved to a file via profile_filename.
#
# Substitutes the file name into the @cmd_edit format string, runs the
# result as a shell command and returns the command's standard output.
def edit_profile(profile)
  fn = profile_filename(profile)
  $logger.debug "edit: #{fn}"
  command = @cmd_edit % fn
  `#{command}`
end
|
1074
|
+
|
1075
|
+
|
1076
|
+
# Locate the file that defines a profile.
#
# profile_name:: The profile name, with or without the ".rb" extension.
#
# Looks in the current directory first and in @cfgdir second.
# Returns the path of the first existing file, or nil if there is none.
def profile_filename(profile_name)
  profile_name = "#{profile_name}.rb" unless File.extname(profile_name) == '.rb'

  ['.', @cfgdir].each do |dir|
    candidate = File.join(dir, profile_name)
    # File.exist? rather than the File.exists? alias, which newer rubies
    # no longer provide.
    return candidate if File.exist?(candidate)
  end
  nil
end
|
1088
|
+
|
1089
|
+
end
|
1090
|
+
|
1091
|
+
|
1092
|
+
|
1093
|
+
# Hash that maps each url to the output of its diff command.
|
1094
|
+
attr_reader :difftext
|
1095
|
+
|
1096
|
+
# The Configuration instance created from the command-line arguments.
|
1097
|
+
attr_reader :configuration
|
1098
|
+
|
1099
|
+
|
1100
|
+
# Set up the monitor: build the configuration from +args+, make sure the
# configuration directory exists and install a default stylesheet
# (websitiary.css) there unless one is already present.
#
# args:: Array of command-line (like) arguments.
def initialize(args=[])
  @configuration = Configuration.new(self, args)
  @difftext = {}

  ensure_dir(@configuration.cfgdir)
  css = File.join(@configuration.cfgdir, 'websitiary.css')
  # File.exist? rather than the File.exists? alias, which newer rubies no
  # longer provide. An existing stylesheet is never overwritten.
  unless File.exist?(css)
    $logger.info "Copying default css file: #{css}"
    File.open(css, 'w') do |io|
      io.puts <<CSS
body {
color: black;
background-color: #f0f0f0;
}
a.external {
}
a.old {
}
a.latest {
}
a.toc {
}
ol.toc {
float: left;
width: 200px;
position: fixed;
padding: 0;
margin: 0;
}
li.toc {
list-style: none;
border: 1px solid silver;
background-color: #fafafa;
padding: 0.5em;
font-size: 80%;
font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
li.toc:hover {
background-color: #ffff8d;
}
div.contents {
margin-left: 210px;
min-width: 16em;
}
div.webpage {
margin: 5px 0 5px 0;
padding: 5px;
border: 1px solid silver;
background-color: white;
}
h1.diff {
font-family: Verdana, Myriad Web, Syntax, sans-serif;
}
div.diff {
padding-left: 2em;
}
pre.diff {
padding-left: 2em;
}
hr.separator {
width: 100%;
visibility: hidden;
}
.error {
color: yellow;
background-color: red;
}
.highlight {
background-color: #ffc730;
}
CSS
    end
  end
end
|
1175
|
+
|
1176
|
+
|
1177
|
+
# Process the sources in @configuration.todo as defined by profiles
# and command-line options. The differences are stored in @difftext (a Hash).
#
# For every url the previous download is moved to the backup name, the
# source is downloaded again and diffed against the backup. A source is
# skipped when its latest copy is younger than the interval given by its
# :hours or :days option, unless :ignore_age is set.
#
# show_output:: If true, show the output with the defined viewer.
def process(show_output=true)
  @configuration.todo.each do |url|
    opts = @configuration.urls[url]
    $logger.debug "Source: #{@configuration.get(url, :title, url)}"
    older = @configuration.backupname(url)
    ensure_dir(File.dirname(older))
    $logger.debug "older: #{older}"
    latest = @configuration.latestname(url)
    ensure_dir(File.dirname(latest))
    $logger.debug "latest: #{latest}"

    # download accepts nil options, so the age check must not crash on
    # them either. opts itself is passed on unchanged.
    age_opts = opts || {}

    # File.exist? rather than the File.exists? alias, which newer rubies
    # no longer provide.
    if File.exist?(latest) && !age_opts[:ignore_age]
      if (hdiff = age_opts[:hours])
        tdiff = hdiff * HOUR_SECS
        $logger.debug "hours: #{hdiff} (#{tdiff}s)"
      elsif (ddiff = age_opts[:days])
        tdiff = ddiff * DAY_SECS
        $logger.debug "days: #{ddiff} (#{tdiff}s)"
      else
        tdiff = nil
      end

      if tdiff
        # Age of the latest copy in seconds, judged by its mtime.
        td = Time.now - File.mtime(latest)
        if td < tdiff
          $logger.info "Skip #{@configuration.get(url, :title, url).inspect}: Only #{(td / DAY_SECS).to_i}d old (#{(tdiff / DAY_SECS).to_i}d)"
          next
        end
      end
    end

    move(latest, older)
    if download(url, latest, opts)
      difftext = diff(url, older, latest, opts)
      if difftext
        $logger.debug "difftext: #{difftext}"
        accumulate(url, difftext, opts)
      end
    end
  end
  show if show_output
end
|
1224
|
+
|
1225
|
+
|
1226
|
+
|
1227
|
+
private
|
1228
|
+
|
1229
|
+
# Fetch the current contents of +url+ and store them in the file +latest+.
#
# url:: The source to download; each url is fetched at most once per run
#       (tracked in @configuration.done).
# latest:: Name of the file that receives the (filtered) contents.
# opts:: The source's option Hash, or nil. When given, the text is handled
#        line by line: :lines selects a range of lines, :cols a range of
#        columns per line, :sort sorts the lines (true or a comparison
#        Proc), :strip drops lines without non-blank characters.
#
# Afterwards the text is passed through the source's :downloadprocess
# command, if any. Returns true when the file was written, false when the
# url was already downloaded or the download command delivered nothing.
def download(url, latest, opts)
  if @configuration.done.include?(url)
    $logger.info "Already downloaded: #{@configuration.get(url, :title, url).inspect}"
    return false
  end

  $logger.info "Download: #{@configuration.get(url, :title, url).inspect}"
  @configuration.done << url
  text = @configuration.call_cmd(@configuration.get(url, :download), [url])
  if !text
    $logger.warn "no contents: #{@configuration.get(url, :title, url)}"
    return false
  end

  if opts
    rows = text.split("\n")

    line_range = opts[:lines]
    if line_range
      $logger.debug "download: lines=#{line_range}"
      rows = rows[line_range] || []
    end

    col_range = opts[:cols]
    if col_range
      $logger.debug "download: cols=#{col_range}"
      # Lines shorter than the requested columns yield nil and are dropped.
      rows.map! { |row| row[col_range] }
      rows.compact!
    end

    order = opts[:sort]
    if order
      $logger.debug "download: sort=#{order}"
      case order
      when true then rows.sort!
      when Proc then rows.sort!(&order)
      end
    end

    if opts[:strip]
      $logger.debug "download: strip!"
      rows.reject! { |row| row !~ /\S/ }
    end

    text = rows.join("\n")
  end

  postprocess = @configuration.get(url, :downloadprocess)
  if postprocess
    $logger.debug "download process: #{postprocess}"
    text = @configuration.call_cmd(postprocess, [text])
    $logger.debug text
  end

  File.open(latest, 'w') { |io| io.puts(text) }
  true
end
|
1281
|
+
|
1282
|
+
|
1283
|
+
# Compare the previous and the current copy of a source.
#
# url:: The source; selects the :diff and :diffprocess commands.
# old:: File name of the previous copy.
# new:: File name of the current copy.
# opts:: The source's options (not used here).
#
# Returns the output of the diff command, passed through the :diffprocess
# command if one is defined, provided it contains any non-blank character.
# Returns nil when nothing changed or when there is no previous copy yet.
def diff(url, old, new, opts)
  # File.exist? rather than the File.exists? alias, which newer rubies no
  # longer provide.
  unless File.exist?(old)
    $logger.info "Initial copy: #{old.inspect}"
    return nil
  end

  $logger.debug "diff: #{old} <-> #{new}"
  difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new])
  $logger.debug "diff: #{difftext}"

  if difftext =~ /\S/
    if (pprc = @configuration.get(url, :diffprocess))
      $logger.debug "diff process: #{pprc}"
      difftext = @configuration.call_cmd(pprc, [difftext])
    end
    $logger.debug "difftext: #{difftext}"
    # The post-processor may have filtered everything away.
    if difftext =~ /\S/
      $logger.warn "Changed: #{@configuration.get(url, :title, url).inspect}"
      return difftext
    end
  end

  $logger.debug "Unchanged: #{@configuration.get(url, :title, url).inspect}"
  nil
end
|
1307
|
+
|
1308
|
+
|
1309
|
+
# Remember the differences found for a source.
#
# url:: The source; used as key in @difftext.
# difftext:: The diff output to store.
# opts:: The source's options (not used here).
#
# Returns +difftext+.
def accumulate(url, difftext, opts)
  @difftext.store(url, difftext)
end
|
1312
|
+
|
1313
|
+
|
1314
|
+
# Hand the accumulated differences (@difftext) to the configuration's
# show_output method and return its result.
def show
  collected = @difftext
  @configuration.show_output(collected)
end
|
1317
|
+
|
1318
|
+
|
1319
|
+
# Rename the file +from+ to +to+, replacing +to+ if it exists.
#
# from:: The file to move; nothing happens when it does not exist.
# to:: The new file name.
#
# Returns the result of File.rename, or nil when there was nothing to move.
def move(from, to)
  # File.exist? rather than the File.exists? alias, which newer rubies no
  # longer provide.
  return unless File.exist?(from)

  $logger.debug "Overwriting: #{from} -> #{to}" if File.exist?(to)
  File.rename(from, to)
end
|
1325
|
+
|
1326
|
+
|
1327
|
+
# Make sure the directory +dir+ exists.
#
# dir:: The directory name. Only the last path component is created.
# fill_dir:: Optional block, called with +dir+ right after the directory
#            was newly created (not when it already existed).
#
# Logs a fatal error and exits with status 5 when +dir+ exists but is not
# a directory. Returns the block's result for a new directory, nil otherwise.
def ensure_dir(dir, &fill_dir)
  if !File.exist?(dir)
    Dir.mkdir(dir)
    return fill_dir ? fill_dir.call(dir) : nil
  end

  return nil if File.directory?(dir)

  $logger.fatal "Not a directory: #{dir}"
  exit 5
end
|
1338
|
+
|
1339
|
+
end
|
1340
|
+
|
1341
|
+
|
1342
|
+
|
1343
|
+
# Run the monitor with the command-line arguments, but only when this file
# is executed directly and not when it is merely loaded.
Websitiary.new(ARGV).process if __FILE__ == $PROGRAM_NAME
# sleep 5
|
1347
|
+
|
1348
|
+
|
1349
|
+
# Local Variables:
|
1350
|
+
# revisionRx: REVISION\s\+=\s\+\'
|
1351
|
+
# End:
|