websitary 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +57 -0
- data/Manifest.txt +11 -0
- data/README.txt +732 -0
- data/Rakefile +27 -0
- data/bin/websitary +43 -0
- data/lib/websitary.rb +610 -0
- data/lib/websitary/applog.rb +39 -0
- data/lib/websitary/configuration.rb +1505 -0
- data/lib/websitary/filemtimes.rb +50 -0
- data/lib/websitary/htmldiff.rb +93 -0
- data/setup.rb +1585 -0
- metadata +76 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
# applog.rb
|
2
|
+
# @Last Change: 2007-09-11.
|
3
|
+
# Author:: Thomas Link (micathom AT gmail com)
|
4
|
+
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
5
|
+
# Created:: 2007-09-08.
|
6
|
+
|
7
|
+
require 'logger'
|
8
|
+
|
9
|
+
|
10
|
+
# A simple wrapper around Logger.
|
11
|
+
class Websitary::AppLog
    # Create the application logger and publish it as the global $logger.
    # output:: Log destination handed to Logger.new (default: $stdout)
    def initialize(output=nil)
        @output = output || $stdout
        $logger = log = Logger.new(@output, 'daily')
        log.progname = Websitary::APPNAME
        log.datetime_format = "%H:%M:%S"
        set_level
    end


    # Map a symbolic verbosity onto a Logger severity.
    # level:: :debug, :verbose or :quiet; anything else selects WARN
    def set_level(level=:default)
        severities = {
            :debug   => Logger::DEBUG,
            :verbose => Logger::INFO,
            :quiet   => Logger::ERROR,
        }
        $logger.level = severities.fetch(level, Logger::WARN)
        $logger.debug "Set logger level: #{level}"
    end
end
|
35
|
+
|
36
|
+
|
37
|
+
# Local Variables:
|
38
|
+
# revisionRx: REVISION\s\+=\s\+\'
|
39
|
+
# End:
|
@@ -0,0 +1,1505 @@
|
|
1
|
+
# configuration.rb
|
2
|
+
# @Last Change: 2007-09-16.
|
3
|
+
# Author:: Thomas Link (micathom AT gmail com)
|
4
|
+
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
5
|
+
# Created:: 2007-09-08.
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
# This class defines the scope in which profiles are evaluated. Most
|
10
|
+
# of its methods are suitable for use in profiles.
|
11
|
+
class Websitary::Configuration
|
12
|
+
# Hash (key = URL, value = Hash of options)
|
13
|
+
attr_accessor :urls
|
14
|
+
# Array of urls to be downloaded.
|
15
|
+
attr_accessor :todo
|
16
|
+
# Array of downloaded urls.
|
17
|
+
attr_accessor :done
|
18
|
+
# The user configuration directory
|
19
|
+
attr_accessor :cfgdir
|
20
|
+
# What to do
|
21
|
+
attr_accessor :execute
|
22
|
+
# Global Options
|
23
|
+
attr_accessor :options
|
24
|
+
# Cached mtimes
|
25
|
+
attr_accessor :mtimes
|
26
|
+
# The name of the quicklist profile
|
27
|
+
attr_accessor :quicklist_profile
|
28
|
+
# attr_accessor :default_profiles
|
29
|
+
# attr_accessor :cmd_edit
|
30
|
+
|
31
|
+
|
32
|
+
def initialize(app, args=[])
    # Set up defaults, load 'config.rb' and then evaluate the command line.
    # app::  The application object (queried for execute_*/cmdline_arg_* methods)
    # args:: Array of command-line arguments
    @logger = Websitary::AppLog.new
    $logger.debug "Configuration#initialize"
    @app = app

    # Default: ~/.websitary (or the current directory if HOME is unset).
    # An existing %USERPROFILE%/websitary or <sysconfdir>/websitary takes
    # precedence. Candidates whose base directory is unknown are dropped,
    # since File.exist?(nil) raises a TypeError.
    @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitary') : '.'
    # RbConfig is the current name of the module formerly reachable as Config.
    sysconfdir = RbConfig::CONFIG['sysconfdir']
    candidates = [
        ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitary'),
        sysconfdir && File.join(sysconfdir, 'websitary')
    ].compact
    existing = candidates.find {|dir| File.exist?(dir)}
    @cfgdir = existing if existing

    @cmd_edit          = 'vi "%s"'
    @execute           = 'downdiff'
    @quicklist_profile = 'quicklist'
    @user_agent        = "websitary/#{Websitary::VERSION}"
    @view              = 'w3m "%s"'

    @allow            = {}
    @default_options  = {}
    @default_profiles = [@quicklist_profile]
    @done             = []
    @mtimes           = Websitary::FileMTimes.new(self)
    @outfile          = {}
    @profiles         = []
    @robots           = {}
    @todo             = []
    @urlencmap        = {}
    @urls             = {}

    @suffix = {
        'text' => 'txt'
        # 'rss' => 'xml'
    }

    migrate
    initialize_options
    profile 'config.rb'
    parse_command_line_args(args)

    # Only applies when neither config.rb nor the command line chose a format.
    @output_format ||= ['html']
    @output_title = %{#{Websitary::APPNAME}: #{@profiles.join(", ")}}
end
|
78
|
+
|
79
|
+
|
80
|
+
def parse_command_line_args(args)
    # Evaluate the command line: options are consumed, the remaining
    # arguments are taken as profile names (default: @default_profiles).
    # args:: Array of Strings (modified in place by OptionParser#parse!)
    # Returns self.
    $logger.debug "parse_command_line_args: #{args}"
    parser = OptionParser.new do |opts|
        opts.banner =  "Usage: #{Websitary::APPNAME} [OPTIONS] [PROFILES] > [OUT]"
        opts.separator ''
        opts.separator "#{Websitary::APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
        opts.separator 'the terms of the GNU General Public License version 2 or newer.'
        opts.separator ''

        opts.separator 'General Options:'

        opts.on('-c', '--cfg=DIR', String, 'Configuration directory') do |value|
            @cfgdir = value
        end

        opts.on('-e', '--execute=COMMAND', String, 'Define what to do (default: downdiff)') do |value|
            @execute = value
        end

        opts.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |value|
            output_format(*value.split(/,/))
        end

        opts.on('--[no-]ignore-age', 'Ignore age limits') do |bool|
            set :ignore_age => bool
        end

        opts.on('--log=DESTINATION', String, 'Log destination') do |value|
            # '-' means "keep logging to stdout" (AppLog falls back to $stdout).
            @logger = Websitary::AppLog.new(value != '-' && value)
        end

        opts.on('-o', '--output=FILENAME', String, 'Output') do |value|
            output_file(value)
        end

        opts.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |value|
            key, val = value.split(/=/, 2)
            # NOTE(security): the value is evaluated as Ruby code. This is
            # only acceptable because it comes from the invoking user's own
            # command line; never feed it data from an untrusted source.
            set key.intern => eval(val)
        end

        opts.on('-t', '--timer=N', Numeric, 'Repeat every N seconds (never exit)') do |value|
            global(:timer => value)
        end

        opts.separator ''
        opts.separator "Available commands (default: #@execute):"
        # Object#methods returns Strings or Symbols depending on the Ruby
        # version; normalize with to_s before extracting the command name.
        commands = @app.methods.map {|m| m.to_s[/^execute_(.*)$/, 1]}.compact.sort
        opts.separator commands.join(', ')

        opts.separator ''
        opts.separator 'Available profiles:'
        opts.separator Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}.join(', ')

        opts.separator ''
        opts.separator 'Other Options:'

        opts.on('--debug', 'Show debug messages') do |v|
            $VERBOSE = $DEBUG = true
            @logger.set_level(:debug)
        end

        opts.on('-q', '--quiet', 'Be mostly quiet') do |v|
            @logger.set_level(:quiet)
        end

        opts.on('-v', '--verbose', 'Run verbosely') do |v|
            $VERBOSE = true
            @logger.set_level(:verbose)
        end

        opts.on_tail('-h', '--help', 'Show this message') do
            puts opts
            exit 1
        end
    end

    @profiles = parser.parse!(args)
    @profiles = @default_profiles if @profiles.empty?

    # A command may claim the positional arguments for itself by
    # providing a cmdline_arg_COMMAND method on the application object.
    cla_handler = "cmdline_arg_#{@execute}"
    cla_handler = nil unless @app.respond_to?(cla_handler)
    @profiles.each do |pn|
        if cla_handler
            @app.send(cla_handler, self, pn)
        else
            profile pn
        end
    end

    self
end
|
185
|
+
|
186
|
+
|
187
|
+
# Retrieve an option for an url
|
188
|
+
# url:: String
|
189
|
+
# opt:: Symbol
|
190
|
+
def get(url, opt, default=nil)
    # Look up +opt+ for +url+: the URL's own options first (falling back to
    # its :use shortcut), then the global :default for that option type.
    url_opts = @urls[url]
    if !url_opts
        $logger.debug "Non-registered URL: #{url}"
        return default
    end
    $logger.debug "get: opts=#{url_opts.inspect}"

    # :diffprocess and :format follow the URL's :diff setting unless they
    # are given explicitly.
    key = opt
    if [:diffprocess, :format].include?(opt) && !url_opts.has_key?(opt)
        key = :diff
    end

    $logger.debug "get: opt=#{opt} opt_=#{key}"
    $logger.debug "get: #{url_opts[key]} #{url_opts[:use]}" if url_opts
    val = url_opts.fetch(key) { url_opts.fetch(:use, nil) }

    if val.is_a?(Symbol)
        # A symbol names an entry of the global option table.
        $logger.debug "get: val=#{val}"
        success, rv = get_option(opt, val)
        $logger.debug "get: #{success}, #{rv}"
        return rv if success
    elsif !val.nil?
        $logger.debug "get: return val=#{val}"
        return val
    end

    unless default
        success, fallback = get_option(opt, :default)
        default = fallback if success
    end

    $logger.debug "get: return default=#{default}"
    default
end
|
235
|
+
|
236
|
+
|
237
|
+
def get_optionvalue(opt, val, default=nil)
    # Resolve +val+ through the option table when it is a Symbol;
    # any other value is returned untouched.
    return val unless val.is_a?(Symbol)
    found, resolved = get_option(opt, val)
    found ? resolved : default
end
|
250
|
+
|
251
|
+
|
252
|
+
def get_option(opt, val)
    # Resolve the entry +val+ of the option table +opt+.
    # opt:: Symbol, the option type (a key of @options)
    # val:: Symbol, the entry name
    # Returns [true, value] on success and [false, last_name_tried]
    # otherwise. Entries that are Symbols are treated as aliases and
    # followed; a circular alias chain is reported as a failure instead
    # of recursing until the stack overflows.
    vals    = @options[opt]
    visited = []
    loop do
        $logger.debug "val=#{val} vals=#{vals.inspect}"
        unless vals and vals.has_key?(val)
            $logger.debug "get_option no: #{opt} => #{val.inspect}"
            return [false, val]
        end

        rv = vals[val]
        $logger.debug "get_option ok: #{opt} => #{rv.inspect}"
        unless rv.is_a?(Symbol)
            $logger.debug "get_option true, #{rv}"
            return [true, rv]
        end

        visited << val
        if visited.include?(rv)
            $logger.error "get_option: circular reference in #{opt}: #{rv.inspect}"
            return [false, rv]
        end
        $logger.debug "get_option re: #{rv}"
        val = rv
    end
end
|
271
|
+
|
272
|
+
|
273
|
+
# Configuration command:
|
274
|
+
# Set the default profiles
|
275
|
+
def default(*profile_names)
    # profile_names:: Profiles to load when none is named on the command line
    @default_profiles = profile_names
end
|
278
|
+
|
279
|
+
|
280
|
+
def quicklist(profile_name)
    # Configuration command:
    # Set the name of the quicklist profile.
    @quicklist_profile = profile_name
end
|
283
|
+
|
284
|
+
|
285
|
+
# Configuration command:
|
286
|
+
# Load a profile
|
287
|
+
def profile(profile_name)
    # profile_name:: '-' (read URLs from the input, one per line),
    #                '__END__' (evaluate the DATA section) or a profile name.
    # Returns the result of eval_profile, or false when nothing was evaluated.
    if '-' == profile_name
        readlines.each {|line| source line.chomp}
        return false
    end

    if '__END__' == profile_name
        $logger.debug "Profile: __END__"
        return eval_profile(DATA.read)
    end

    fn = profile_filename(profile_name)
    return false unless fn
    $logger.debug "Profile: #{fn}"
    eval_profile(File.read(fn), fn)
end
|
305
|
+
|
306
|
+
|
307
|
+
# Define an options shortcut.
|
308
|
+
def shortcut(symbol, args)
    # symbol:: Name of the shortcut
    # args::   Hash (option type => value); the special key :delegate names
    #          the entry to be used for every option type not listed.
    given     = args.keys
    unlisted  = @options.keys - given
    delegated = given.include?(:delegate)
    essential = [:download, :downloadformat, :diff, :format, :diffprocess]

    # :downloadprocess
    if !delegated and unlisted.any? {|type| essential.include?(type)}
        $logger.warn "Shortcut #{symbol}: Undefined fields: #{unlisted.inspect}"
    end

    if delegated
        target = args[:delegate]
        unlisted.each {|type| @options[type].store(symbol, target)}
    end

    args.each_pair do |type, value|
        next if type == :delegate
        @options[type].store(symbol, value)
    end
end
|
329
|
+
|
330
|
+
|
331
|
+
# Set the output format.
|
332
|
+
def output_format(*format)
    # format:: One or more of 'text', 'html', 'rss'; anything else is fatal.
    known = ['text', 'html', 'rss']
    if format.any? {|name| !known.include?(name)}
        $logger.fatal "Unknown output format: #{format}"
        exit 5
    end
    @output_format = format
end
|
339
|
+
|
340
|
+
|
341
|
+
# Set the output file.
|
342
|
+
def output_file(filename, outformat=nil)
    # outformat:: The format this file is meant for; nil registers the
    #             filename under the nil key.
    @outfile.store(outformat, filename)
end
|
345
|
+
|
346
|
+
|
347
|
+
# Configuration command:
|
348
|
+
# Set global options.
|
349
|
+
# type:: Symbol
|
350
|
+
# options:: Hash
|
351
|
+
def option(type, options)
    # Merge +options+ into the table of an existing option type;
    # unknown types are reported and ignored.
    $logger.info "option #{type}: #{options.inspect}"
    table = @options[type]
    return table.merge!(options) if table
    $logger.error "Unknown option type: #{type} (#{options.inspect})"
end
|
360
|
+
|
361
|
+
|
362
|
+
# Set a global option.
|
363
|
+
def global(options)
    # options:: Hash (name => value) stored in the :global option table
    options.each_pair do |name, setting|
        @options[:global].store(name, setting)
    end
end
|
368
|
+
|
369
|
+
|
370
|
+
# Configuration command:
|
371
|
+
# Set the default value for source-options.
|
372
|
+
def set(options)
    # options:: Hash merged into the defaults applied to subsequent sources
    $logger.debug "set: #{options.inspect}"
    @default_options.update(options)
end
|
376
|
+
|
377
|
+
|
378
|
+
# Configuration command:
|
379
|
+
# Unset a default source-option.
|
380
|
+
def unset(*options)
    # options:: Names of the default source-options to remove
    options.each {|name| @default_options.delete(name)}
end
|
385
|
+
|
386
|
+
|
387
|
+
# Configuration command:
|
388
|
+
# Define a source.
|
389
|
+
# urls:: String
|
390
|
+
def source(urls, opts={})
    # Register one or more URLs with the current default options.
    # urls:: String with one URL per line, or an Array of such Strings
    # opts:: Hash of source options overriding the defaults
    # Surrounding whitespace is stripped and blank lines are skipped, so
    # an indented multi-line list does not register bogus URLs.
    # Returns the Array of URLs that were registered.
    list = [urls].flatten.compact.map {|chunk| chunk.split("\n")}.flatten
    list = list.map {|url| url.strip}.reject {|url| url.empty?}
    list.each do |url|
        @urls[url] = @default_options.dup.update(opts)
        @todo << url
    end
end
|
396
|
+
|
397
|
+
|
398
|
+
# Configuration command:
|
399
|
+
# Set the default download processor. The block takes the
|
400
|
+
# downloaded text (STRING) as argument.
|
401
|
+
def downloadprocess(&block)
    # Stored as the :default entry of the :downloadprocess option table.
    @options[:downloadprocess].store(:default, block)
end
|
404
|
+
|
405
|
+
|
406
|
+
# Configuration command:
|
407
|
+
# Set the default diff processor. The block takes the
|
408
|
+
# diff text (STRING) as argument.
|
409
|
+
def diffprocess(&block)
    # Register +block+ as the default post-processor for diff text.
    # It is stored in the :diffprocess option table. Storing it under
    # :diff (as was done before) replaced the default diff *program*,
    # which is invoked with the old and new file rather than the diff text.
    @options[:diffprocess][:default] = block
end
|
412
|
+
|
413
|
+
|
414
|
+
# Configuration command:
|
415
|
+
# Set the editor.
|
416
|
+
def edit(cmd)
    # cmd:: Command template; '%s' is replaced with the profile's filename
    @cmd_edit = cmd
end
|
419
|
+
|
420
|
+
|
421
|
+
# Configuration command:
|
422
|
+
# Set the viewer.
|
423
|
+
def view(view)
    # view:: Viewer command template (the default is 'w3m "%s"')
    @view = view
end
|
426
|
+
|
427
|
+
|
428
|
+
# Configuration command:
|
429
|
+
# Set the default diff program.
|
430
|
+
def diff(diff)
    # Stored as the :default entry of the :diff option table.
    @options[:diff].store(:default, diff)
end
|
433
|
+
|
434
|
+
|
435
|
+
# Configuration command:
|
436
|
+
# Set the default downloader.
|
437
|
+
def download(download)
    # Stored as the :default entry of the :download option table.
    @options[:download].store(:default, download)
end
|
440
|
+
|
441
|
+
|
442
|
+
# Format a diff according to URL's source options.
|
443
|
+
def format(url, difftext)
    # Apply the URL's :format to the diff text; without a format the
    # text is returned as is.
    formatter = get(url, :format)
    eval_arg(formatter, [difftext], difftext)
end
|
447
|
+
|
448
|
+
|
449
|
+
# Apply some arguments to a format.
|
450
|
+
# format:: String or Proc
|
451
|
+
# args:: Array of Arguments
|
452
|
+
def eval_arg(format, args, default=nil, &process_string)
    # nil format  -> +default+
    # Proc format -> result of calling it with +args+
    # otherwise   -> format % args, passed through the block if one is given
    return default if format.nil?

    if format.is_a?(Proc)
        $logger.debug "eval proc: #{format}/#{args.size}"
        return format.call(*args)
    end

    filled = format % args
    process_string ? process_string.call(filled) : filled
end
|
470
|
+
|
471
|
+
|
472
|
+
# Apply the argument to cmd (a format String or a Proc). If a
|
473
|
+
# String, execute the command.
|
474
|
+
def call_cmd(cmd, args, default=nil)
    # A Proc is called with +args+; a String is filled in with +args+ and
    # handed to the shell, and the command's standard output is returned.
    # NOTE(review): the arguments are interpolated into the shell command
    # unescaped -- confirm that callers never pass untrusted text.
    run_in_shell = lambda {|command_line| `#{command_line}`}
    eval_arg(cmd, args, default, &run_in_shell)
end
|
477
|
+
|
478
|
+
|
479
|
+
# Generate & view the final output.
|
480
|
+
# difftext:: Hash
|
481
|
+
def show_output(difftext)
    # Render +difftext+ once per selected output format, then print it
    # (outfile '-') or write it to the output file and open the viewer.
    # Returns 0 if there was nothing to show, 1 otherwise.
    if difftext.empty?
        parts = ['No news is good news']
        parts << "try again in #{@app.format_tdiff(@app.tdiff_min)}" if @app.tdiff_min
        $logger.warn parts.join('; ')
        return 0
    end

    @output_format.each do |outformat|
        generator = "get_output_#{outformat}"
        unless respond_to?(generator)
            $logger.fatal "Unknown output format: #{outformat}"
            exit 5
        end

        rendered = send(generator, difftext)
        next unless rendered

        target = get_outfile(outformat)
        if '-' == target
            puts rendered
        else
            write_file(target) {|io| io.puts rendered}
            send("view_output_#{outformat}", target)
        end
    end
    1
end
|
512
|
+
|
513
|
+
|
514
|
+
def get_output_text(difftext)
    # Build the plain-text report.
    # difftext:: Hash (url => diff text)
    # Each non-empty entry becomes a section consisting of the (possibly
    # rewritten) link, the annotation, an empty line and the text.
    # Entries without text are skipped; previously an empty text yielded
    # +false+, which survived #compact and ended up in the report as the
    # string "false".
    sections = difftext.map do |url, text|
        next unless text
        text = html_to_text(text) if is_html?(text)
        next if text.empty?
        [
            eval_arg(get(url, :rewrite_link, '%s'), [url]),
            difftext_annotation(url),
            nil,
            text
        ].join("\n")
    end
    sections.compact.join("\n\n#{('-' * 68)}\n\n")
end
|
527
|
+
|
528
|
+
|
529
|
+
def get_output_rss(difftext)
    # Build the RSS feed.
    # difftext:: Hash (url => diff text)
    # Requires the global option :rss[:url]; exits with status 5 if it
    # is not defined. Returns the feed as a String.
    success, rss_url = get_option(:rss, :url)
    unless success
        $logger.fatal "Global option :rss[:url] not defined."
        exit 5
    end

    _, rss_version = get_option(:rss, :version)

    rss  = RSS::Rss.new(rss_version)
    chan = RSS::Rss::Channel.new
    chan.title = @output_title
    [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
        ok, val = get_option(:rss, field)
        # These are channel-level fields. The setter used to be sent to
        # +item+, which is not defined at this point (NameError).
        chan.send(format_symbol(field, '%s='), val) if ok
    end
    chan.link = rss_url
    rss.channel = chan

    difftext.each do |url, text|
        rss_format = get(url, :rss_format, 'plain_text')
        text = strip_tags(text, :format => rss_format)
        next if text.empty?

        item = RSS::Rss::Channel::Item.new
        item.date  = Time.now
        item.title = get(url, :title, File.basename(url))
        item.link  = eval_arg(get(url, :rewrite_link, '%s'), [url])
        # Per-URL overrides are given as source options named rss_FIELD.
        [:author, :date, :enclosure, :category, :pubDate].each do |field|
            val = get(url, format_symbol(field, 'rss_%s'))
            item.send(format_symbol(field, '%s='), val) if val
        end

        annotation = difftext_annotation(url)
        annotation = "<pre>#{annotation}</pre>" if annotation
        case rss_format
        when 'plain_text'
            item.description = %{#{annotation}<pre>#{text}</pre>}
        else
            item.description = %{#{annotation}\n#{text}}
        end
        chan.items << item
    end

    rss.to_s
end
|
579
|
+
|
580
|
+
|
581
|
+
def get_output_html(difftext)
    # Build the HTML report: a table of contents plus one section per URL,
    # filled into the page template (:page => :format, else :simple).
    entries = []
    difftext.each do |url, text|
        tags = get(url, :strip_tags)
        text = strip_tags(text, :tags => tags) if tags
        entries << [url, text] unless text.empty?
    end
    sort_difftext!(entries)

    toc_items = entries.map do |url, text|
        ti  = get(url, :title, File.basename(url))
        tid = html_toc_id(url)
        bid = html_body_id(url)
        %{<li id="#{tid}" class="toc"><a class="toc" href="\##{bid}">#{ti}</a></li>}
    end
    toc = toc_items.join("\n")

    sections = []
    entries.each_with_index do |(url, text), pos|
        idx = pos + 1
        ti  = get(url, :title, File.basename(url))
        bid = html_body_id(url)
        rewrite = get(url, :rewrite_link)
        if rewrite
            urlr = eval_arg(rewrite, [url])
            ext  = ''
        else
            # Without a rewritten link, offer the cached copies as well.
            old  = %{<a class="old" href="#{file_url(oldname(url))}">old</a>}
            lst  = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
            ext  = %{ (#{old}, #{lst})}
            urlr = url
        end
        note = difftext_annotation(url)
        sections << <<HTML
<div id="#{bid}" class="webpage">
<div class="count">
#{idx}
</div>
<h1 class="diff">
<a class="external" href="#{urlr}">#{ti}</a>#{ext}
</h1>
<div class="annotation">
#{note && CGI::escapeHTML(note)}
</div>
<div class="diff,difftext">
#{format(url, text)}
</div>
</div>
HTML
    end
    cnt = sections.join("<hr class=\"separator\"/>\n")

    found, template = get_option(:page, :format)
    _, template = get_option(:page, :simple) unless found
    eval_arg(template, [@output_title, toc, cnt])
end
|
636
|
+
|
637
|
+
|
638
|
+
# Get the diff filename.
|
639
|
+
def diffname(url, ensure_dir=false)
    # Diffs live in the 'diff' subdirectory and always use md5 names.
    subdir, cache_type = 'diff', 'md5'
    encoded_filename(subdir, url, ensure_dir, cache_type)
end
|
642
|
+
|
643
|
+
|
644
|
+
# Get the backup filename.
|
645
|
+
def oldname(url, ensure_dir=false, type=nil)
    # Backups live in the 'old' subdirectory.
    subdir = 'old'
    encoded_filename(subdir, url, ensure_dir, type)
end
|
648
|
+
|
649
|
+
|
650
|
+
# Get the filename for the freshly downloaded copy.
|
651
|
+
def latestname(url, ensure_dir=false, type=nil)
    # Fresh downloads live in the 'latest' subdirectory.
    subdir = 'latest'
    encoded_filename(subdir, url, ensure_dir, type)
end
|
654
|
+
|
655
|
+
|
656
|
+
def url_from_filename(filename)
    # Reverse lookup in the map filled by encoded_filename.
    # Returns the URL, or nil (with a warning) for an unknown filename.
    url = @urlencmap[filename]
    url ? $logger.debug("Map filename: #{filename} -> #{url}") : $logger.warn("Unmapped filename: #{filename}")
    url
end
|
665
|
+
|
666
|
+
|
667
|
+
def encoded_filename(dir, url, ensure_dir=false, type=nil)
    # Map +url+ to a file below @cfgdir/+dir+ and remember the mapping.
    # Falls back to an md5-based name if the directory cannot be ensured,
    # the path exceeds :global => :filename_size (default 255) or the
    # path is an existing directory.
    cache_type = type || get(url, :cachetype, 'tree')
    $logger.debug "encoded_filename: type=#{cache_type} url=#{url}"
    path = File.join(@cfgdir, dir, encoded_basename(url, cache_type))
    $logger.debug "encoded_filename: rv0=#{path}"

    max_size  = get_optionvalue(:global, :filename_size, 255)
    dir_ready = ensure_dir ? @app.ensure_dir(File.dirname(path), false) : true
    unless dir_ready && path.size <= max_size && !File.directory?(path)
        $logger.info "Can't use filename, try 'md5' instead: #{url}"
        path = File.join(@cfgdir, dir, encoded_basename(url, :md5))
    end

    @urlencmap[path] = url
    path
end
|
684
|
+
|
685
|
+
|
686
|
+
def encoded_basename(url, type='tree')
    # Dispatch to encoded_basename_TYPE; an unknown type is fatal.
    encoder = "encoded_basename_#{type}"
    unless respond_to?(encoder)
        $logger.fatal "Unknown cache type: #{type}"
        exit 5
    end
    send(encoder, url)
end
|
695
|
+
|
696
|
+
|
697
|
+
def encoded_basename_tree(url)
    # Encode the URL with '/' passed as second argument to #encode, then
    # pass the result through ensure_filename.
    encoded = encode(url, '/')
    ensure_filename(encoded)
end
|
700
|
+
|
701
|
+
|
702
|
+
def encoded_basename_flat(url)
    # The encoded URL itself is the basename.
    flat_name = encode(url)
    flat_name
end
|
705
|
+
|
706
|
+
|
707
|
+
def encoded_basename_md5(url)
    # Hex-encoded MD5 digest of the URL.
    digest = Digest::MD5.hexdigest(url)
    digest
end
|
710
|
+
|
711
|
+
|
712
|
+
def urlextname(url)
    # Return the extension of the URL's path (e.g. ".html", or "" if the
    # path has none). Returns nil if the URL cannot be parsed or has no
    # path. Only StandardError is rescued so that interrupts and exit
    # requests are no longer swallowed.
    begin
        return File.extname(URI.parse(url).path)
    rescue StandardError
        return nil
    end
end
|
718
|
+
|
719
|
+
|
720
|
+
# Guess path's dirname.
|
721
|
+
# foo/bar -> foo
|
722
|
+
# foo/bar.txt -> foo
|
723
|
+
# foo/bar/ -> foo/bar
|
724
|
+
def guess_dir(path)
    # A trailing slash marks +path+ itself as the directory; otherwise
    # the last component is dropped.
    return path[0...-1] if path[-1, 1] == '/'
    File.dirname(path)
end
|
727
|
+
|
728
|
+
|
729
|
+
# Strip the url's last part (after #).
|
730
|
+
def canonic_url(url)
    # Everything from the first '#' up to the end of the line is dropped.
    fragment = /#.*$/
    url.sub(fragment, '')
end
|
733
|
+
|
734
|
+
|
735
|
+
def strip_tags_default
    # Returns a copy of the :strip_tags => :default list, or nil if
    # there is none.
    found, tags = get_option(:strip_tags, :default)
    found ? tags.dup : nil
end
|
739
|
+
|
740
|
+
|
741
|
+
def strip_tags(doc, args={})
    # Remove elements from a document.
    # doc::  String (parsed with Hpricot) or an already parsed document
    # args:: :tags   => selectors to remove (default: strip_tags_default)
    #        :format => :hpricot returns the document itself; any other
    #                   value selects the to_FORMAT method (default: to_html)
    tags = args[:tags] || strip_tags_default
    doc  = Hpricot(doc) if doc.is_a?(String)
    tags.each {|selector| doc.search(selector).remove}

    output = args[:format]
    return doc if :hpricot == output
    doc.send("to_#{output || :html}")
end
|
757
|
+
|
758
|
+
|
759
|
+
# Check whether path is eligible on the basis of url or path0.
|
760
|
+
# This checks either for a :match option for url or the extensions
|
761
|
+
# of path0 and path.
|
762
|
+
def eligible_path?(url, path0, path)
    # With a :match option for +url+, +path+ must match that pattern;
    # otherwise +path0+ and +path+ must have the same extension.
    pattern = get(url, :match)
    return path =~ pattern if pattern
    File.extname(path0) == File.extname(path)
end
|
770
|
+
|
771
|
+
|
772
|
+
# Scan hpricot document for hrefs and push them onto @todo if not
|
773
|
+
# already included.
|
774
|
+
def push_hrefs(url, hpricot, &condition)
    # Queue the links found in a downloaded page.
    # url::       String, the page's URL
    # hpricot::   The parsed page
    # condition:: Block called with (page URI, page dir, link URI, link dir);
    #             the link is queued only if it returns a true value.
    # Nothing is queued if the page asks robots not to follow links or if
    # the URL's :depth is exhausted. Queued links inherit the page's
    # options with a title suffix and a decremented :depth.
    begin
        return if robots?(hpricot, 'nofollow')
        depth = get(url, :depth)
        return if depth and depth <= 0
        uri0 = URI.parse(url)
        pn0  = Pathname.new(guess_dir(uri0.path))
        (hpricot / 'a').each do |a|
            # Errors are handled per link: one malformed href must not
            # prevent the remaining links from being examined.
            begin
                href = a['href']
                next if href.nil? or href == url or href =~ /^\s*javascript:/
                # robots_allowed? receives the URI of the href as written
                # in the page, so parse it before the href is rewritten.
                uri  = URI.parse(href)
                href = rewrite_href(href, url, uri0, pn0, true)
                # rewrite_href returns nil for hrefs it cannot handle;
                # check before canonic_url, which requires a String.
                next if href.nil?
                curl = canonic_url(href)
                next if @done.include?(curl) or @todo.include?(curl)
                next unless robots_allowed?(curl, uri)
                uri = URI.parse(href)
                pn  = Pathname.new(guess_dir(uri.path))
                if condition.call(uri0, pn0, uri, pn)
                    opts = @urls[url].dup
                    opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
                    opts[:depth] = depth - 1 if depth and depth >= 0
                    @urls[curl] = opts
                    @todo << curl
                end
            rescue StandardError => e
                $logger.error e.message
                $logger.debug e.backtrace
            end
        end
    rescue StandardError => e
        $logger.error e.message
        $logger.debug e.backtrace
    end
end
|
809
|
+
|
810
|
+
|
811
|
+
# Rewrite urls in doc
|
812
|
+
# url:: String
|
813
|
+
# doc:: Hpricot document
|
814
|
+
def rewrite_urls(url, doc)
    # Rewrite the href of every <a> (preferring local copies) and the src
    # of every <img> (never local) via rewrite_href. Attributes that
    # cannot be rewritten are left alone. Returns +doc+.
    uri = URI.parse(url)
    urd = guess_dir(uri.path)
    [['a', 'href', true], ['img', 'src', false]].each do |tag, attribute, local|
        (doc / tag).each do |node|
            rewritten = rewrite_href(node[attribute], url, uri, urd, local)
            node[attribute] = rewritten if rewritten
        end
    end
    doc
end
|
827
|
+
|
828
|
+
|
829
|
+
# Try to make href an absolute url.
|
830
|
+
def rewrite_href(href, url, uri=nil, urd=nil, local=false)
    # href::  String, the link as found in the document (may be nil)
    # url::   String, the URL of the document containing the link
    # uri::   URI of +url+ (parsed from +url+ if nil)
    # urd::   Directory part of +uri+'s path (guessed if nil)
    # local:: If true, a relative link is replaced with the filename of
    #         its cached copy when the target is queued, done or cached.
    # Returns the rewritten href or nil (no href, a javascript: link, or
    # an error, which is logged). Only StandardError is rescued so that
    # interrupts and exit requests propagate.
    begin
        return if !href or href =~ /^\s*javascript:/
        urh = URI.parse(href)
        uri ||= URI.parse(url)
        urd ||= guess_dir(uri.path)
        rv   = nil
        href = href.strip

        if href =~ /\w+:/
            # Anything containing "word:" is taken as is.
            rv = href
        elsif urh.relative?
            if uri.relative?
                # Both relative: join the paths (generic URIs only).
                rv = File.join(urd, href) if uri.instance_of?(URI::Generic)
            else
                rv = uri.merge(href).to_s
                if local
                    hf = latestname(rv)
                    rv = hf if @todo.include?(rv) or @done.include?(rv) or File.exist?(hf)
                end
            end
        # NOTE(review): the branches below look unreachable -- a non-relative
        # href has a scheme and so already matched /\w+:/ above. Kept as is.
        elsif href[0..0] == '#'
            rv = url + href
        elsif uri.host == urh.host
            rv = uri.merge(href).to_s
        else
            rv = href
        end

        case rv
        when String
            return rv
        when nil
        else
            $logger.error "Internal error: href=#{href}"
            $logger.debug caller.join("\n")
        end
        return
    rescue StandardError => e
        $logger.error e.message
        $logger.debug e.backtrace
    end
    return nil
end
|
889
|
+
|
890
|
+
|
891
|
+
# Return a Proc that takes a text as argument and highlights occurrences of rx.
|
892
|
+
# rx:: Regular expression
|
893
|
+
# color:: A string, sets the class to highlight-color (default: "yellow")
|
894
|
+
# group:: A number (default: 0)
|
895
|
+
# tag:: The HTML tag to use (default: "span")
|
896
|
+
def highlighter(rx, color=nil, group=nil, tag='span')
    # The returned lambda wraps the selected match group of every
    # occurrence of +rx+ in <tag class="highlight-COLOR">...</tag>.
    lambda do |text|
        css_class = "highlight-#{color || 'yellow'}"
        text.gsub(rx, %{<#{tag} class="#{css_class}">\\#{group || 0}</#{tag}>})
    end
end
|
899
|
+
|
900
|
+
|
901
|
+
def view_output(outfile=nil)
    # Open the viewer of the first selected output format on +outfile+
    # (default: the result of get_outfile).
    viewer = "view_output_#{@output_format[0]}"
    send(viewer, outfile || get_outfile)
end
|
904
|
+
|
905
|
+
|
906
|
+
def edit_profile(profile=nil)
    # Run the edit command on a profile, or on each profile of an Array
    # (default: the currently selected profiles).
    target = profile || @profiles
    if target.is_a?(Array)
        target.each {|name| edit_profile name}
    else
        fn = profile_filename(target)
        $logger.debug "edit: #{fn}"
        `#{@cmd_edit % fn}`
    end
end
|
917
|
+
|
918
|
+
|
919
|
+
def profile_filename(profile_name, check_file_exists=true)
    # Find the file of a profile.
    # profile_name::      Profile name; '.rb' is appended unless present
    # check_file_exists:: If false, return the would-be filename in
    #                     @cfgdir even if no such file exists
    # The current directory is searched before @cfgdir. Returns the
    # filename of the first existing file, otherwise nil (or the
    # candidate in @cfgdir, see above).
    if File.extname(profile_name) != '.rb'
        profile_name = "#{profile_name}.rb"
    end
    candidates = ['.', @cfgdir].map {|d| File.join(d, profile_name)}
    # File.exist? rather than the deprecated alias File.exists?, which
    # no longer exists in current Ruby versions.
    found = candidates.find {|filename| File.exist?(filename)}
    return found if found
    check_file_exists ? nil : candidates.last
end
|
932
|
+
|
933
|
+
|
934
|
+
def write_file(filename, mode='w', &block)
    # Open +filename+, hand the IO to the block, close it and then
    # record the file via @mtimes.set.
    io = File.open(filename, mode)
    begin
        block.call(io)
    ensure
        io.close unless io.closed?
    end
    @mtimes.set(filename)
end
|
938
|
+
|
939
|
+
|
940
|
+
def canonic_filename(filename)
    # Pass +filename+ through :global => :canonic_filename if that option
    # is defined; otherwise return it unchanged.
    normalizer = get_optionvalue(:global, :canonic_filename)
    call_cmd(normalizer, [filename], filename)
end
|
943
|
+
|
944
|
+
|
945
|
+
private
|
946
|
+
# Fill @options with the built-in defaults and register the built-in
# shortcuts (downloaders, differs and formatters) via +shortcut+.
#
# The lambdas stored here are evaluated later in the context of this
# configuration object; they rely on helpers such as call_cmd, get,
# open_url and get_ftp.
def initialize_options
    @options = {
        :global => {
            :downloadhtml => :openuri,
        },
    }

    @options[:diff] = {
        :default => :diff,

        # Run diff(1) and keep only the added lines (without the
        # leading "+"); the first two lines of the unified diff (the
        # file headers) are dropped.
        :diff => lambda {|old, new, *args|
            opts, _ = args
            opts ||= '-d -w'
            difftext = call_cmd('diff %s -u2 "%s" "%s"', [opts, old, new])
            difftext = difftext.split("\n")[2..-1]
            difftext ? difftext.delete_if {|l| l =~ /^[^+]/}.map {|l| l[1..-1]}.join("\n") : ''
        },

        :binary => lambda {|old, new|
            call_cmd(get_optionvalue(:diff, :diff), [old, new, '--binary -d -w'])
        },

        # Report the new file name if the files differ at all.
        :new => lambda {|old, new|
            difftext = call_cmd(get_optionvalue(:diff, :binary), [old, new])
            difftext.empty? ? '' : new
        },

        :raw => :new,

        :webdiff => lambda {|old, new|
            $logger.debug "webdiff: #{File.basename(new)}"
            $logger.debug %{webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -}
            difftext = `webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -`
            # Only an exit status of 1 is treated as "there is a difference".
            $?.exitstatus == 1 ? difftext : ''
        },
    }

    @options[:format] = {
        :default => :diff,
        :diff => %{<pre class="diff">\n%s\n</pre>},
        :webdiff => "%s\n",
        :raw => lambda {|new| File.read(new)},
    }

    @options[:diffprocess] = {
        :default => :diff,
        :diff => false,
        :webdiff => false,
        :raw => false,
    }

    @options[:download] = {
        :default => :w3m,
        :raw => :openuri,
    }

    @options[:downloadformat] = {
        :w3m => 'text',
        :webdiff => 'html',
        :raw => '',
    }

    @options[:downloadprocess] = {
    }

    @options[:rss] = {
        :version => '2.0',
    }

    @options[:strip_tags] = {
        :default => ['script', 'object', 'form', 'input', 'select', 'iframe', 'head', 'meta', 'link'],
    }

    shortcut :w3m, :delegate => :diff,
        :download => 'w3m -S -F -dump "%s"'
        # :download => 'w3m -no-cookie -S -F -dump "%s"'

    shortcut :lynx, :delegate => :diff,
        :download => 'lynx -dump "%s"'

    shortcut :links, :delegate => :diff,
        :download => 'links -dump "%s"'

    shortcut :curl, :delegate => :webdiff,
        :download => 'curl --silent "%s"'

    shortcut :wget, :delegate => :webdiff,
        :download => 'wget -q -O - "%s"'

    shortcut :text, :delegate => :diff,
        :download => lambda {|url| html_to_text(open_url(url).read)}

    # Download a page and keep only the (url-rewritten, tag-stripped)
    # inner html of its body. Errors are turned into an error block so
    # that one failing url does not abort the run.
    shortcut :body_html, :delegate => :webdiff,
        :strip_tags => :default,
        :download => lambda {|url|
            begin
                doc = Hpricot(open_url(url).read)
                doc = doc.at('body')
                if doc
                    doc = rewrite_urls(url, doc)
                    doc = doc.inner_html
                    if (tags = get(url, :strip_tags))
                        doc = strip_tags(doc, :format => :hpricot, :tags => tags)
                    end
                else
                    $logger.warn 'inner html: No body'
                end
                doc.to_s
            rescue StandardError => e
                # StandardError only: Interrupt/SystemExit must still
                # be able to stop the program.
                $logger.error e.message
                $logger.debug e.backtrace
                %{<pre class="error">\n#{e.message}\n</pre>}
            end
        }

    shortcut :openuri, :delegate => :webdiff,
        :download => lambda {|url|
            begin
                open_url(url).read
            rescue StandardError => e
                $logger.error e.message
                $logger.debug e.backtrace
                %{<pre class="error">\n#{e.to_s}\n</pre>}
            end
        }

    # Compare two versions of a feed. Items are identified by the MD5
    # of their text; an item whose link is already known but whose
    # text changed is shown as an html diff of the descriptions, any
    # other unknown item is shown in full (optionally with a local
    # copy of its enclosure). Returns nil if the old feed cannot be
    # parsed.
    shortcut :rss,
        :delegate => :openuri,
        :diff => lambda {|old, new|
            ro = RSS::Parser.parse(File.read(old), false)
            if ro
                rh = {}
                ro.items.each do |item|
                    rh[Digest::MD5.hexdigest(item.to_s)] = item
                    rh[item.link] = item
                end
                rnew = []
                rn = RSS::Parser.parse(File.read(new), false)
                if rn
                    rn.items.each do |item|
                        rid = Digest::MD5.hexdigest(item.to_s)
                        if !rh[rid]
                            if (olditem = rh[item.link])
                                rss_diff = Websitary::Htmldiff.new(:oldtext => olditem.description, :newtext => item.description).process
                                rnew << format_rss_item(item, rss_diff)
                            else
                                if item.enclosure && (curl = item.enclosure.url)
                                    url = url_from_filename(new)
                                    dir = get(url, :rss_enclosure)
                                    curl = rewrite_href(curl, url, nil, nil, true)
                                    next unless curl
                                    if dir
                                        if dir == true
                                            dir = File.join(@cfgdir, 'attachments', encode(rn.channel.title))
                                        end
                                        @app.ensure_dir(dir)
                                        $logger.debug "Enclosure URL: #{curl}"
                                        fname = File.join(dir, encode(File.basename(curl) || item.title || item.pubDate.to_s || Time.now.to_s))
                                        $logger.debug "Enclosure save to: #{fname}"
                                        enc = open_url(curl).read
                                        # write, not puts: puts would append a
                                        # newline to the binary data.
                                        write_file(fname, 'wb') {|io| io.write enc}
                                        furl = file_url(fname)
                                        enclosure = %{<p class="enclosure"><a href="%s" class="enclosure">Enclosure (local copy)</a></p>} % furl
                                        if get(url, :rss_rewrite_enclosed_urls)
                                            item.description.gsub!(Regexp.new(Regexp.escape(curl))) {|t| furl}
                                        end
                                    else
                                        enclosure = %{<p class="enclosure"><a href="%s" class="enclosure">Original Enclosure</a></p>} % curl
                                    end
                                else
                                    enclosure = ''
                                end
                                rnew << format_rss_item(item, item.description, enclosure)
                            end
                        end
                    end
                    rnew.join("\n")
                end
            end
        }

    # Read an OPML file and queue every rss outline as an url of its
    # own (inheriting the options of the OPML url). The OPML document
    # itself yields no content, hence the nil.
    shortcut :opml, :delegate => :rss,
        :download => lambda {|url|
            opml = open(url) {|io| io.read}
            if opml
                xml = Hpricot(opml)
                # <+TBD+>Well, maybe we should search for outline[@type=rss]?
                xml.search('//outline[@xmlurl]').each {|elt|
                    if elt['type'] =~ /rss/
                        curl = elt['xmlurl']
                        opts = @urls[url].dup
                        opts[:download] = :rss
                        opts[:title] = elt['title'] || elt['text'] || elt['htmlurl'] || curl
                        @urls[curl] = opts
                        @todo << curl
                    else
                        $logger.warn "Unsupported type in OPML: #{elt.to_s}"
                    end
                }
            end
            nil
        }

    shortcut :website, :delegate => :webdiff,
        :download => lambda {|url| get_website(:body_html, url)}

    shortcut :website_below, :delegate => :webdiff,
        :download => lambda {|url| get_website_below(:body_html, url)}

    shortcut :website_txt, :delegate => :default,
        :download => lambda {|url| html_to_text(get_website(get(url, :downloadhtml, :openuri), url))}

    shortcut :website_txt_below, :delegate => :default,
        :download => lambda {|url| html_to_text(get_website_below(get(url, :downloadhtml, :openuri), url))}

    shortcut :ftp, :delegate => :default,
        :download => lambda {|url| get_ftp(url).join("\n")}

    # List a directory and queue its subdirectories as urls of their
    # own until the :depth option (if any) is used up.
    shortcut :ftp_recursive, :delegate => :default,
        :download => lambda {|url|
            list = get_ftp(url)
            depth = get(url, :depth)
            if !depth || depth >= 0
                dirs = list.find_all {|e| e =~ /^d/}
                dirs.each do |l|
                    # The 7th field of the listing line is the name.
                    dirname = l[/^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+ +\S+ +\S+)\s+(.+)$/, 7]
                    # Skip lines that do not look like a listing entry.
                    next unless dirname
                    curl = File.join(url, dirname, '')
                    opts = @urls[url].dup
                    opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
                    opts[:depth] = depth - 1 if depth
                    @urls[curl] = opts
                    @todo << curl
                end
            end
            list.join("\n")
        }

    shortcut :img, :delegate => :raw,
        :format => lambda {|new|
            file = file_url(new)
            %{<img src="#{file}" />}
        }

    # The page template takes the title (used twice), the table of
    # contents and the body.
    @options[:page] = {
        :format => lambda {|ti, li, bd|
            template = <<OUT
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>%s</title>
<link rel="stylesheet" href="websitary.css" type="text/css">
<link rel="alternate" href="websitary.rss" type="application/rss+xml" title="%s">
</head>
<body>
<ol class="toc">
%s
</ol>
<div class="contents">
%s
</div>
</body>
</html>
OUT
            template % [ti, ti, li, bd]
        }
    }
end
|
1217
|
+
|
1218
|
+
|
1219
|
+
# Compare the version stored in <cfgdir>/version.yml (0.1.0 if that
# file is missing) with Websitary::VERSION, run migrate_0_1_0 when the
# stored version is not greater than 0.1.0 and store the current
# version afterwards. Does nothing if the stored version is current.
def migrate
    store = File.join(@cfgdir, 'version.yml')
    stored = File.exist?(store)
    version = stored ? YAML.load_file(store) : '0.1.0'
    return if stored && version == Websitary::VERSION
    numbers = version.split(/\./).map {|part| part.to_i}
    migrate_0_1_0 unless (numbers <=> [0, 1, 0]) == 1
    write_file(store) do |io|
        YAML.dump(Websitary::VERSION, io)
    end
end
|
1231
|
+
|
1232
|
+
|
1233
|
+
# Rename every file below <cfgdir>/latest and <cfgdir>/old: the base
# name is decoded to an url and the file is moved (via @app.move) to
# the name returned by encoded_filename.
def migrate_0_1_0
    $logger.warn "Migrate data from version 0.1.0"
    ['latest', 'old'].each do |subdir|
        Dir[File.join(@cfgdir, subdir, '*')].each do |old_name|
            url = decode(File.basename(old_name))
            @app.move(old_name, encoded_filename(subdir, url, true))
        end
    end
end
|
1244
|
+
|
1245
|
+
|
1246
|
+
# Evaluate the profile source +contents+ in the scope of this object.
# While the profile runs, @current_profile holds +profile_file+; it is
# reset to nil afterwards, even if the profile raises. Returns true.
def eval_profile(contents, profile_file=nil)
    @current_profile = profile_file
    instance_eval(contents)
    true
ensure
    @current_profile = nil
end
|
1255
|
+
|
1256
|
+
|
1257
|
+
# Download +url+ with the downloader registered as +download+ and pass
# the links of the page to push_hrefs; a link is accepted if its path
# is eligible and it points to the same host. Returns the downloaded
# html, or nil if the page carries a "noindex" robots directive.
def get_website(download, url)
    downloader = get_optionvalue(:download, download)
    html = call_cmd(downloader, [url])
    return html unless html
    page = Hpricot(html)
    return html unless page
    return nil if robots?(page, 'noindex')
    push_hrefs(url, page) do |base_uri, _base_dir, link_uri, _link_dir|
        eligible_path?(url, base_uri.path, link_uri.path) &&
            link_uri.host == base_uri.host
    end
    html
end
|
1271
|
+
|
1272
|
+
|
1273
|
+
# Like get_website, but a link is only accepted if, in addition, its
# directory is "." or the same as the directory of the base url.
# Returns the downloaded html, or nil if the page carries a "noindex"
# robots directive.
def get_website_below(download, url)
    downloader = get_optionvalue(:download, download)
    html = call_cmd(downloader, [url])
    return html unless html
    page = Hpricot(html)
    return html unless page
    return nil if robots?(page, 'noindex')
    push_hrefs(url, page) do |base_uri, base_dir, link_uri, link_dir|
        eligible_path?(url, base_uri.path, link_uri.path) &&
            link_uri.host == base_uri.host &&
            (link_dir.to_s == '.' || link_dir.relative_path_from(base_dir).to_s == '.')
    end
    html
end
|
1289
|
+
|
1290
|
+
|
1291
|
+
def get_ftp(url)
|
1292
|
+
uri = URI.parse(url)
|
1293
|
+
ftp = Net::FTP.new(uri.host)
|
1294
|
+
ftp.passive = true
|
1295
|
+
begin
|
1296
|
+
ftp.login
|
1297
|
+
ftp.chdir(uri.path)
|
1298
|
+
return ftp.list('*')
|
1299
|
+
rescue Exception => e
|
1300
|
+
$logger.error e
|
1301
|
+
ensure
|
1302
|
+
ftp.close
|
1303
|
+
end
|
1304
|
+
end
|
1305
|
+
|
1306
|
+
|
1307
|
+
# Return "t" followed by the MD5 hex digest of +url+.
def html_toc_id(url)
    digest = Digest::MD5.hexdigest(url)
    "t#{digest}"
end
|
1310
|
+
|
1311
|
+
|
1312
|
+
# Return "b" followed by the MD5 hex digest of +url+.
def html_body_id(url)
    digest = Digest::MD5.hexdigest(url)
    "b#{digest}"
end
|
1315
|
+
|
1316
|
+
|
1317
|
+
# Collapse runs of slashes in +filename+ and make sure the result
# names a file: if it ends with a separator, or consists of exactly
# two parts the first of which looks like an encoded scheme (e.g.
# "http%3a"), "__WEBSITARY__" is appended as file name.
def ensure_filename(filename)
    sep = Regexp.escape(File::SEPARATOR)
    path = filename.gsub(/[\/]{2,}/, File::SEPARATOR)
    trailing_sep = path =~ /#{sep}$/
    unless trailing_sep
        parts = path.split(/#{sep}/)
        scheme_and_host = parts.size == 2 && parts[0] =~ /^\w+%3a$/
    end
    if trailing_sep || scheme_and_host
        File.join(path, '__WEBSITARY__')
    else
        path
    end
end
|
1331
|
+
|
1332
|
+
|
1333
|
+
# Open +url+ and return whatever +open+ returns. Urls without a
# specific scheme (URI::Generic) and file urls are opened as they
# are; for any other url a User-Agent header (@user_agent) merged with
# the url's :header option is passed along.
def open_url(url)
    $logger.debug "Open URL: #{url}"
    parsed = URI.parse(url)
    local = parsed.instance_of?(URI::Generic) || parsed.scheme == 'file'
    return open(url) if local
    headers = {"User-Agent" => @user_agent}.merge(get(url, :header, {}))
    open(url, headers)
end
|
1344
|
+
|
1345
|
+
|
1346
|
+
# Format the recorded mtimes of the old and the latest copy of +url+
# with the url's :format_annotation option (default: '%s >>> %s').
# Returns nil unless both copies exist.
def difftext_annotation(url)
    bak = oldname(url)
    lst = latestname(url)
    return nil unless File.exist?(bak) && File.exist?(lst)
    template = get(url, :format_annotation, '%s >>> %s')
    eval_arg(template, [@mtimes.mtime(bak), @mtimes.mtime(lst)])
end
|
1353
|
+
|
1354
|
+
|
1355
|
+
# Insert +name+ (as string) into +format_string+ and return the
# result as a symbol.
def format_symbol(name, format_string)
    formatted = format_string % name.to_s
    formatted.to_sym
end
|
1358
|
+
|
1359
|
+
|
1360
|
+
# Render a feed item as html: a linked heading made of the title, the
# author (if any) and the publication date, followed by a div holding
# +body+ and +enclosure+.
def format_rss_item(item, body, enclosure='')
    title = item.title
    byline = item.author ? " (#{item.author})" : ''
    heading = "#{title}#{byline}"
    lines = [
        %{<h2 class="rss"><a class="rss" href="#{item.link}">#{heading} -- #{item.pubDate}</a></h2>},
        %{<div class="rss">},
        "#{body}",
        "#{enclosure}",
        %{</div>},
    ]
    lines.join("\n") + "\n"
end
|
1371
|
+
|
1372
|
+
# Guess whether text is plain text or html: look for the opening of
# one of a few common tags. Returns the position of the first such
# tag or nil.
def is_html?(text)
    tags = %w(div a span body html script p table td tr th li dt br hr em b)
    text =~ /<(#{tags.join('|')})\b/
end
|
1376
|
+
|
1377
|
+
|
1378
|
+
# Convert html to plain text using hpricot. A nil/false +text+ is
# returned as it is.
def html_to_text(text)
    return text unless text
    Hpricot(text).to_plain_text
end
|
1382
|
+
|
1383
|
+
|
1384
|
+
# Check the robots meta directives of the hpricot document: return
# true if any <meta name="robots"> element lists at least one of the
# directives given in +what+ (e.g. 'noindex').
#
# Directives are compared case-insensitively and regardless of
# whitespace around the separating commas; meta elements without a
# content attribute are ignored.
def robots?(hpricot, *what)
    wanted = what.map {|w| w.to_s.downcase}
    (hpricot / '//meta[@name="robots"]').any? do |meta|
        # to_s: a missing content attribute counts as "no directives".
        directives = meta['content'].to_s.downcase.split(',').map {|d| d.strip}
        wanted.any? {|w| directives.include?(w)}
    end
end
|
1390
|
+
|
1391
|
+
|
1392
|
+
# Check whether robots are allowed to retrieve an url.
#
# +url+ is the url as string, +uri+ the corresponding URI object.
# Results are cached per url in @allow, parsed robots.txt rules per
# host in @robots. If RobotRules is not available, a warning is logged
# once and every url is allowed. An url is also allowed if it is
# relative (no robots.txt url can be derived) or if the robots.txt
# cannot be loaded.
def robots_allowed?(url, uri)
    return @allow[url] if @allow.has_key?(url)

    unless defined?(RobotRules)
        unless @robots[:warning]
            $logger.warn 'robots.txt is ignored: Please install robot_rules.rb from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589 in $RUBYLIB'
            @robots[:warning] = true
        end
        @allow[url] = true
        return true
    end

    host = uri.host
    rules = @robots[host]
    unless rules
        # Test the uri itself: nil.to_s would be "" and thus truthy.
        ruri = robots_uri(uri)
        return true unless ruri
        rurl = ruri.to_s
        begin
            robots_txt = open_url(rurl).read
            rules = RobotRules.new(@user_agent)
            rules.parse(rurl, robots_txt)
            @robots[host] = rules
            $logger.info "Loaded #{rurl} for #{@user_agent}"
            $logger.debug robots_txt
        rescue StandardError => e
            # Do not consult rules that could not be loaded or parsed.
            rules = nil
            $logger.warn "Could not load #{rurl}: #{e.message}"
            $logger.debug robots_txt if robots_txt
        end
    end

    allowed = !(rules && !rules.allowed?(url))
    $logger.info "Excluded url: #{url}" unless allowed
    @allow[url] = allowed
    allowed
end
|
1434
|
+
|
1435
|
+
|
1436
|
+
# Get the robots.txt uri for uri: the same scheme and host with the
# path "/robots.txt". Query and fragment of the original uri are
# dropped, as they belong to the page, not to robots.txt. Returns nil
# for relative uris; +uri+ itself is not modified.
def robots_uri(uri)
    return nil if uri.relative?
    ruri = uri.dup
    ruri.path = '/robots.txt'
    ruri.query = nil
    ruri.fragment = nil
    ruri
end
|
1444
|
+
|
1445
|
+
|
1446
|
+
# Sort +difftext+ in place. Each entry's first element is looked up
# via get(..., :title, ...) -- falling back to the element itself --
# and entries are ordered by the downcased result.
def sort_difftext!(difftext)
    title_of = lambda do |entry|
        url = entry[0]
        get(url, :title, url).downcase
    end
    difftext.sort! {|left, right| title_of.call(left) <=> title_of.call(right)}
end
|
1453
|
+
|
1454
|
+
|
1455
|
+
# Pass +filename+ through the :file_url command registered under the
# :global options (+filename+ itself is handed to call_cmd as third
# argument) and encode the result, leaving ":" and "/" untouched.
def file_url(filename)
    # filename = File.join(File.basename(File.dirname(filename)), File.basename(filename))
    # "file://#{encode(filename, ':/')}"
    converter = get_optionvalue(:global, :file_url)
    location = call_cmd(converter, [filename], filename)
    encode(location, ':/')
end
|
1461
|
+
|
1462
|
+
|
1463
|
+
# Percent-encode +text+: every character other than a-z, A-Z, 0-9,
# ",", ".", "_", "-" and those listed in +chars+ is replaced with a
# "%xx" escape for each of its bytes (lower-case hex).
#
# +chars+ is inserted verbatim into a regexp character class, so it
# must only contain characters that are literal there (e.g. ':/').
def encode(text, chars='')
    text.gsub(/[^a-zA-Z0-9,._#{chars}-]/) do |char|
        # Escape byte by byte; a multibyte character yields several
        # escapes. (Formatting char[0] directly fails on rubies where
        # String#[] returns a string.)
        char.unpack('C*').map {|byte| '%%%02x' % byte}.join
    end
end
|
1466
|
+
|
1467
|
+
|
1468
|
+
def decode(text)
|
1469
|
+
text.gsub(/%(..)/) {|t| "%c" % $1.hex}
|
1470
|
+
end
|
1471
|
+
|
1472
|
+
|
1473
|
+
# Return the file suffix for +outformat+ (default: the first entry of
# @output_format): the entry in @suffix if there is one, else the
# format itself.
def output_suffix(outformat)
    format = outformat || @output_format[0]
    suffix = @suffix[format]
    suffix ? suffix : format
end
|
1477
|
+
|
1478
|
+
|
1479
|
+
# Return the entries of @profiles joined with commas.
def output_basename
    separator = ','
    @profiles.join(separator)
end
|
1482
|
+
|
1483
|
+
|
1484
|
+
# Return the output file for +outformat+: the entry in @outfile if
# there is one, else <cfgdir>/<output_basename>.<output_suffix>.
def get_outfile(outformat=nil)
    explicit = @outfile[outformat]
    return explicit if explicit
    File.join(@cfgdir, "#{output_basename}.#{output_suffix(outformat)}")
end
|
1487
|
+
|
1488
|
+
|
1489
|
+
# Show +outfile+: insert it into the @view command template and run
# the resulting command via system. Does nothing (returns nil) if no
# viewer is set.
def view_output_general(outfile)
    return unless @view
    command = @view % outfile
    system(command)
end
# All output formats are viewed the same way.
alias view_output_html view_output_general
alias view_output_text view_output_general
alias view_output_rss view_output_general
|
1497
|
+
|
1498
|
+
end
|
1499
|
+
|
1500
|
+
|
1501
|
+
|
1502
|
+
|
1503
|
+
# Local Variables:
|
1504
|
+
# revisionRx: REVISION\s\+=\s\+\'
|
1505
|
+
# End:
|