websitary 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +57 -0
- data/Manifest.txt +11 -0
- data/README.txt +732 -0
- data/Rakefile +27 -0
- data/bin/websitary +43 -0
- data/lib/websitary.rb +610 -0
- data/lib/websitary/applog.rb +39 -0
- data/lib/websitary/configuration.rb +1505 -0
- data/lib/websitary/filemtimes.rb +50 -0
- data/lib/websitary/htmldiff.rb +93 -0
- data/setup.rb +1585 -0
- metadata +76 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
# applog.rb
|
2
|
+
# @Last Change: 2007-09-11.
|
3
|
+
# Author:: Thomas Link (micathom AT gmail com)
|
4
|
+
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
5
|
+
# Created:: 2007-09-08.
|
6
|
+
|
7
|
+
require 'logger'
|
8
|
+
|
9
|
+
|
10
|
+
# A simple wrapper around Logger.
|
11
|
+
# Thin wrapper around Logger. Creating an instance (re)installs the
# global $logger that the rest of the application writes to.
class Websitary::AppLog
  # output:: Destination passed to Logger.new; nil or false selects $stdout.
  def initialize(output=nil)
    @output = output || $stdout
    $logger = Logger.new(@output, 'daily')
    $logger.progname        = Websitary::APPNAME
    $logger.datetime_format = "%H:%M:%S"
    set_level
  end

  # Set the threshold of $logger.
  # level:: :debug, :verbose or :quiet; any other value selects WARN.
  def set_level(level=:default)
    severities = {
      :debug   => Logger::DEBUG,
      :verbose => Logger::INFO,
      :quiet   => Logger::ERROR
    }
    $logger.level = severities.fetch(level, Logger::WARN)
    $logger.debug "Set logger level: #{level}"
  end
end
|
35
|
+
|
36
|
+
|
37
|
+
# Local Variables:
|
38
|
+
# revisionRx: REVISION\s\+=\s\+\'
|
39
|
+
# End:
|
@@ -0,0 +1,1505 @@
|
|
1
|
+
# configuration.rb
|
2
|
+
# @Last Change: 2007-09-16.
|
3
|
+
# Author:: Thomas Link (micathom AT gmail com)
|
4
|
+
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
5
|
+
# Created:: 2007-09-08.
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
# This class defines the scope in which profiles are evaluated. Most
|
10
|
+
# of its methods are suitable for use in profiles.
|
11
|
+
class Websitary::Configuration
|
12
|
+
# Hash (key = URL, value = Hash of options)
|
13
|
+
attr_accessor :urls
|
14
|
+
# Array of urls to be downloaded.
|
15
|
+
attr_accessor :todo
|
16
|
+
# Array of downloaded urls.
|
17
|
+
attr_accessor :done
|
18
|
+
# The user configuration directory
|
19
|
+
attr_accessor :cfgdir
|
20
|
+
# What to do
|
21
|
+
attr_accessor :execute
|
22
|
+
# Global Options
|
23
|
+
attr_accessor :options
|
24
|
+
# Cached mtimes
|
25
|
+
attr_accessor :mtimes
|
26
|
+
# The name of the quicklist profile
|
27
|
+
attr_accessor :quicklist_profile
|
28
|
+
# attr_accessor :default_profiles
|
29
|
+
# attr_accessor :cmd_edit
|
30
|
+
|
31
|
+
|
32
|
+
# Set up the defaults, load the +config.rb+ profile and evaluate the
# command-line arguments.
# app::  The application object (queried for execute_*/cmdline_arg_* methods)
# args:: Array of command-line arguments
def initialize(app, args=[])
  @logger = Websitary::AppLog.new
  $logger.debug "Configuration#initialize"
  @app    = app
  @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitary') : '.'
  # The first existing directory from this list overrides the HOME-based
  # default. USERPROFILE is usually unset outside Windows; the resulting
  # nil is dropped because File.exist?(nil) raises a TypeError.
  [
    ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitary'),
    File.join(RbConfig::CONFIG['sysconfdir'], 'websitary')
  ].compact.each do |dir|
    if File.exist?(dir)
      @cfgdir = dir
      break
    end
  end

  @cmd_edit          = 'vi "%s"'
  @execute           = 'downdiff'
  @quicklist_profile = 'quicklist'
  @user_agent        = "websitary/#{Websitary::VERSION}"
  @view              = 'w3m "%s"'

  @allow            = {}
  @default_options  = {}
  @default_profiles = [@quicklist_profile]
  @done             = []
  @mtimes           = Websitary::FileMTimes.new(self)
  @outfile          = {}
  @profiles         = []
  @robots           = {}
  @todo             = []
  @urlencmap        = {}
  @urls             = {}

  @suffix = {
    'text' => 'txt'
    # 'rss' => 'xml'
  }

  migrate
  initialize_options
  profile 'config.rb'
  parse_command_line_args(args)

  # Profiles and command-line options may already have chosen a format.
  @output_format ||= ['html']
  @output_title = %{#{Websitary::APPNAME}: #{@profiles.join(", ")}}
end
|
78
|
+
|
79
|
+
|
80
|
+
# Evaluate the command-line arguments. Options update the configuration;
# the remaining arguments are taken as profile names (falling back to
# @default_profiles) and are either passed to the application's
# cmdline_arg_<command> handler, if it has one, or loaded as profiles.
# args:: Array of Strings; options are removed from it in place
# Returns self.
def parse_command_line_args(args)
  $logger.debug "parse_command_line_args: #{args}"
  # The parser gets its own name so the block parameter +opts+ no longer
  # shadows an outer local.
  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{Websitary::APPNAME} [OPTIONS] [PROFILES] > [OUT]"
    opts.separator ''
    opts.separator "#{Websitary::APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
    opts.separator 'the terms of the GNU General Public License version 2 or newer.'
    opts.separator ''

    opts.separator 'General Options:'

    opts.on('-c', '--cfg=DIR', String, 'Configuration directory') do |value|
      @cfgdir = value
    end

    opts.on('-e', '--execute=COMMAND', String, 'Define what to do (default: downdiff)') do |value|
      @execute = value
    end

    # opts.on('-E', '--edit=PROFILE', String, 'Edit a profile') do |value|
    #   edit_profile value
    #   exit 0
    # end

    opts.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |value|
      output_format(*value.split(/,/))
    end

    opts.on('--[no-]ignore-age', 'Ignore age limits') do |bool|
      set :ignore_age => bool
    end

    opts.on('--log=DESTINATION', String, 'Log destination') do |value|
      # '-' yields false, which makes AppLog fall back to $stdout.
      @logger = Websitary::AppLog.new(value != '-' && value)
    end

    opts.on('-o', '--output=FILENAME', String, 'Output') do |value|
      output_file(value)
    end

    opts.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |value|
      key, val = value.split(/=/, 2)
      # NOTE(review): VAR is evaluated as Ruby code. This is only safe
      # while the command line comes from a trusted user.
      set key.intern => eval(val)
    end

    opts.on('-t', '--timer=N', Numeric, 'Repeat every N seconds (never exit)') do |value|
      global(:timer => value)
    end

    # opts.on('--review', 'View last diff') do |value|
    #   view_output
    #   exit 0
    # end

    opts.separator ''
    opts.separator "Available commands (default: #@execute):"
    # Every execute_<name> method of the application is a command.
    commands = @app.methods.map {|m| m.to_s[/^execute_(.*)$/, 1]}.compact.sort
    opts.separator commands.join(', ')

    opts.separator ''
    opts.separator 'Available profiles:'
    opts.separator Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}.join(', ')

    opts.separator ''
    opts.separator 'Other Options:'

    opts.on('--debug', 'Show debug messages') do |v|
      $VERBOSE = $DEBUG = true
      @logger.set_level(:debug)
    end

    opts.on('-q', '--quiet', 'Be mostly quiet') do |v|
      @logger.set_level(:quiet)
    end

    opts.on('-v', '--verbose', 'Run verbosely') do |v|
      $VERBOSE = true
      @logger.set_level(:verbose)
    end

    opts.on_tail('-h', '--help', 'Show this message') do
      puts opts
      exit 1
    end
  end

  @profiles = parser.parse!(args)
  @profiles = @default_profiles if @profiles.empty?
  cla_handler = "cmdline_arg_#{@execute}"
  cla_handler = nil unless @app.respond_to?(cla_handler)
  @profiles.each do |pn|
    if cla_handler
      @app.send(cla_handler, self, pn)
    else
      profile pn
    end
  end

  self
end
|
185
|
+
|
186
|
+
|
187
|
+
# Retrieve an option for an url
|
188
|
+
# url:: String
|
189
|
+
# opt:: Symbol
|
190
|
+
# Look up a source option for a registered url.
# url::     String
# opt::     Symbol
# default:: Returned for unregistered urls and when nothing else matches
#
# A value stored under opt (or, failing that, under :use) is returned as
# is unless it is a Symbol; Symbols are resolved via get_option. If no
# default was given, the :default entry of the option type is tried.
def get(url, opt, default=nil)
  url_opts = @urls[url]
  if !url_opts
    $logger.debug "Non-registered URL: #{url}"
    return default
  end
  $logger.debug "get: opts=#{url_opts.inspect}"

  # :diffprocess and :format follow the :diff entry unless set explicitly.
  lookup = opt
  if (:diffprocess == opt || :format == opt) && !url_opts.has_key?(opt)
    lookup = :diff
  end

  $logger.debug "get: opt=#{opt} opt_=#{lookup}"
  $logger.debug "get: #{url_opts[lookup]} #{url_opts[:use]}"
  source_key = [lookup, :use].find {|k| url_opts.has_key?(k)}
  val = source_key ? url_opts[source_key] : nil

  if val.is_a?(Symbol)
    $logger.debug "get: val=#{val}"
    success, rv = get_option(opt, val)
    $logger.debug "get: #{success}, #{rv}"
    return rv if success
  elsif !val.nil?
    $logger.debug "get: return val=#{val}"
    return val
  end

  unless default
    success, fallback = get_option(opt, :default)
    default = fallback if success
  end

  $logger.debug "get: return default=#{default}"
  default
end
|
235
|
+
|
236
|
+
|
237
|
+
# Resolve val against the option type opt.
# Non-Symbol values are returned unchanged. Symbols are looked up via
# get_option; default is returned when the lookup fails.
def get_optionvalue(opt, val, default=nil)
  return val unless val.is_a?(Symbol)
  found, resolved = get_option(opt, val)
  found ? resolved : default
end
|
250
|
+
|
251
|
+
|
252
|
+
# Look up val in the table of option type opt.
# Returns [true, value] on success and [false, val] otherwise. A Symbol
# found in the table is treated as a reference to another entry of the
# same table and is resolved recursively.
def get_option(opt, val)
  table = @options[opt]
  $logger.debug "val=#{val} vals=#{table.inspect}"
  unless table and table.has_key?(val)
    $logger.debug "get_option no: #{opt} => #{val.inspect}"
    return [false, val]
  end

  found = table[val]
  $logger.debug "get_option ok: #{opt} => #{found.inspect}"
  if found.is_a?(Symbol)
    $logger.debug "get_option re: #{found}"
    return get_option(opt, found)
  end

  $logger.debug "get_option true, #{found}"
  [true, found]
end
|
271
|
+
|
272
|
+
|
273
|
+
# Configuration command:
|
274
|
+
# Set the default profiles
|
275
|
+
# Replace the list of profiles that is used when none is named on the
# command line.
# profile_names:: Any number of profile names
def default(*profile_names)
  @default_profiles = profile_names
end
|
278
|
+
|
279
|
+
|
280
|
+
# Set the name of the quicklist profile.
# profile_name:: String
def quicklist(profile_name)
  @quicklist_profile = profile_name
end
|
283
|
+
|
284
|
+
|
285
|
+
# Configuration command:
|
286
|
+
# Load a profile
|
287
|
+
# Load a profile.
# profile_name:: '-' reads one url per line from standard input and
#                registers each via source; '__END__' evaluates the text
#                after __END__ (DATA); anything else is resolved to a
#                file by profile_filename.
# Returns the value of eval_profile, or false if nothing was evaluated
# (including the '-' case).
def profile(profile_name)
  if '-' == profile_name
    readlines.map! {|l| l.chomp}.each {|url| source url}
    return false
  end

  if '__END__' == profile_name
    $logger.debug "Profile: __END__"
    return eval_profile(DATA.read)
  end

  fn = profile_filename(profile_name)
  return false unless fn
  $logger.debug "Profile: #{fn}"
  eval_profile(File.read(fn), fn)
end
|
305
|
+
|
306
|
+
|
307
|
+
# Define an options shortcut.
|
308
|
+
# Register symbol as an entry in the option tables.
# symbol:: Symbol naming the shortcut
# args::   Hash mapping option types to the shortcut's value; the special
#          key :delegate names the entry to use for every option type
#          that args does not mention
# A warning is logged if there is no :delegate and one of the core types
# is left undefined.
def shortcut(symbol, args)
  given     = args.keys
  missing   = @options.keys - given
  delegated = given.include?(:delegate)
  core      = [:download, :downloadformat, :diff, :format, :diffprocess]

  # :downloadprocess
  if !delegated and missing.any? {|e| core.include?(e)}
    $logger.warn "Shortcut #{symbol}: Undefined fields: #{missing.inspect}"
  end

  missing.each {|field| @options[field][symbol] = args[:delegate]} if delegated

  args.each do |field, val|
    next if field == :delegate
    @options[field][symbol] = val
  end
end
|
329
|
+
|
330
|
+
|
331
|
+
# Set the output format.
|
332
|
+
# Select the output formats.
# format:: One or more of 'text', 'html', 'rss'
# Any other name is logged as fatal and terminates the program with
# exit status 5.
def output_format(*format)
  known = ['text', 'html', 'rss']
  if format.any? {|e| !known.include?(e)}
    $logger.fatal "Unknown output format: #{format}"
    exit 5
  end
  @output_format = format
end
|
339
|
+
|
340
|
+
|
341
|
+
# Set the output file.
|
342
|
+
# Remember the output file for an output format.
# filename::  String
# outformat:: Key under which the filename is stored (default: nil)
def output_file(filename, outformat=nil)
  @outfile[outformat] = filename
end
|
345
|
+
|
346
|
+
|
347
|
+
# Configuration command:
|
348
|
+
# Set global options.
|
349
|
+
# type:: Symbol
|
350
|
+
# options:: Hash
|
351
|
+
# Merge options into the table of an existing option type.
# type::    Symbol
# options:: Hash
# An unknown type is logged as an error and otherwise ignored.
def option(type, options)
  $logger.info "option #{type}: #{options.inspect}"
  table = @options[type]
  return table.merge!(options) if table
  $logger.error "Unknown option type: #{type} (#{options.inspect})"
end
|
360
|
+
|
361
|
+
|
362
|
+
# Set a global option.
|
363
|
+
# Store each key/value pair of options in the :global option table.
# options:: Hash
def global(options)
  options.each do |name, setting|
    @options[:global][name] = setting
  end
end
|
368
|
+
|
369
|
+
|
370
|
+
# Configuration command:
|
371
|
+
# Set the default value for source-options.
|
372
|
+
# Add options to the defaults that every subsequently defined source
# starts with.
# options:: Hash
def set(options)
  $logger.debug "set: #{options.inspect}"
  @default_options.update(options)
end
|
376
|
+
|
377
|
+
|
378
|
+
# Configuration command:
|
379
|
+
# Unset a default source-option.
|
380
|
+
# Remove the named entries from the default source options.
# options:: Any number of option keys
def unset(*options)
  options.each {|name| @default_options.delete(name)}
end
|
385
|
+
|
386
|
+
|
387
|
+
# Configuration command:
|
388
|
+
# Define a source.
|
389
|
+
# urls:: String
|
390
|
+
# Register one or more sources.
# urls:: String; several urls are separated by newlines
# opts:: Hash of source options; merged over a copy of the current
#        default options
# Each url is also appended to @todo.
def source(urls, opts={})
  # String#split already yields a flat array of strings.
  urls.split("\n").each do |url|
    @urls[url] = @default_options.merge(opts)
    @todo << url
  end
end
|
396
|
+
|
397
|
+
|
398
|
+
# Configuration command:
|
399
|
+
# Set the default download processor. The block takes the
|
400
|
+
# downloaded text (STRING) as argument.
|
401
|
+
# Store block as the :default entry of the :downloadprocess options.
def downloadprocess(&block)
  @options[:downloadprocess][:default] = block
end
|
404
|
+
|
405
|
+
|
406
|
+
# Configuration command:
|
407
|
+
# Set the default diff processor. The block takes the
|
408
|
+
# diff text (STRING) as argument.
|
409
|
+
# Store block as the :default entry of the :diffprocess options.
# The block is stored under :diffprocess, not :diff: the :diff table
# holds the diff programs, and its :default entry must stay intact.
def diffprocess(&block)
  @options[:diffprocess][:default] = block
end
|
412
|
+
|
413
|
+
|
414
|
+
# Configuration command:
|
415
|
+
# Set the editor.
|
416
|
+
# Set the editor command.
# cmd:: Format string; edit_profile fills in the profile's filename
def edit(cmd)
  @cmd_edit = cmd
end
|
419
|
+
|
420
|
+
|
421
|
+
# Configuration command:
|
422
|
+
# Set the viewer.
|
423
|
+
# Set the viewer.
# view:: Stored as is in @view
def view(view)
  @view = view
end
|
426
|
+
|
427
|
+
|
428
|
+
# Configuration command:
|
429
|
+
# Set the default diff program.
|
430
|
+
# Set the :default entry of the :diff options.
# diff:: The new default diff program
def diff(diff)
  @options[:diff][:default] = diff
end
|
433
|
+
|
434
|
+
|
435
|
+
# Configuration command:
|
436
|
+
# Set the default downloader.
|
437
|
+
# Set the :default entry of the :download options.
# download:: The new default downloader
def download(download)
  @options[:download][:default] = download
end
|
440
|
+
|
441
|
+
|
442
|
+
# Format a diff according to URL's source options.
|
443
|
+
# Apply the url's :format option to difftext.
# Returns difftext unchanged if no format is defined for url.
def format(url, difftext)
  eval_arg(get(url, :format), [difftext], difftext)
end
|
447
|
+
|
448
|
+
|
449
|
+
# Apply some arguments to a format.
|
450
|
+
# format:: String or Proc
|
451
|
+
# args:: Array of Arguments
|
452
|
+
# Apply args to format.
# format::  nil, a Proc or a format String
# args::    Array of arguments
# default:: Returned when format is nil
# A Proc is called with args. Anything else is formatted with % and the
# result is passed through the block, if one is given.
def eval_arg(format, args, default=nil, &process_string)
  return default if format.nil?

  if format.is_a?(Proc)
    # $logger.debug "eval proc: #{format} #{args.inspect}" #DBG#
    $logger.debug "eval proc: #{format}/#{args.size}"
    return format.call(*args)
  end

  formatted = format % args
  # $logger.debug "eval string: #{formatted}" #DBG#
  process_string ? process_string.call(formatted) : formatted
end
|
470
|
+
|
471
|
+
|
472
|
+
# Apply the argument to cmd (a format String or a Proc). If a
|
473
|
+
# String, execute the command.
|
474
|
+
# Apply args to cmd (a format String or a Proc). A String is run as a
# shell command after formatting and the command's output is returned.
# NOTE(review): the formatted string goes to the shell unescaped.
def call_cmd(cmd, args, default=nil)
  eval_arg(cmd, args, default) {|command_line| `#{command_line}`}
end
|
477
|
+
|
478
|
+
|
479
|
+
# Generate & view the final output.
|
480
|
+
# difftext:: Hash
|
481
|
+
# Generate the output for every selected format and write or view it.
# difftext:: Hash
# Returns 0 if difftext is empty (only a message is logged), else 1.
# An output format without a get_output_<format> method terminates the
# program with exit status 5.
def show_output(difftext)
  if difftext.empty?
    msg = ['No news is good news']
    msg << "try again in #{@app.format_tdiff(@app.tdiff_min)}" if @app.tdiff_min
    $logger.warn msg.join('; ')
    return 0
  end

  @output_format.each do |outformat|
    producer = "get_output_#{outformat}"
    unless respond_to?(producer)
      $logger.fatal "Unknown output format: #{outformat}"
      exit 5
    end

    out = send(producer, difftext)
    next unless out

    outfile = get_outfile(outformat)
    if '-' == outfile
      # '-' prints the output instead of writing a file.
      puts out
    else
      write_file(outfile) {|io| io.puts out}
      self.send("view_output_#{outformat}", outfile)
    end
  end
  return 1
end
|
512
|
+
|
513
|
+
|
514
|
+
# Render the diffs as plain text.
# difftext:: Enumerable of url/diff pairs
# Each non-empty diff becomes a section consisting of the (possibly
# rewritten) link, the annotation, an empty line and the text. Sections
# are separated by a line of dashes. Entries whose diff is nil or empty
# are left out.
def get_output_text(difftext)
  sections = difftext.map do |url, text|
    next unless text
    text = html_to_text(text) if is_html?(text)
    # Yield nil, not false, for empty diffs: compact only removes nil,
    # so a false entry would show up as the text "false" in the output.
    next if text.empty?
    [
      eval_arg(get(url, :rewrite_link, '%s'), [url]),
      difftext_annotation(url),
      nil,
      text
    ].join("\n")
  end
  sections.compact.join("\n\n#{('-' * 68)}\n\n")
end
|
527
|
+
|
528
|
+
|
529
|
+
# Render the diffs as an RSS feed and return it as a String.
# difftext:: Enumerable of url/diff pairs
# The global option :rss[:url] is required; without it the program
# terminates with exit status 5. Entries whose text is empty after
# strip_tags are left out.
def get_output_rss(difftext)
  success, rss_url = get_option(:rss, :url)
  unless success
    $logger.fatal "Global option :rss[:url] not defined."
    exit 5
  end

  _found, rss_version = get_option(:rss, :version)
  # require "rss/#{rss_version}"

  rss  = RSS::Rss.new(rss_version)
  chan = RSS::Rss::Channel.new
  chan.title = @output_title
  # Optional channel fields are set on the channel. The previous code
  # sent them to +item+, which is not defined at this point.
  [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
    ok, val = get_option(:rss, field)
    chan.send(format_symbol(field, '%s='), val) if ok
  end
  chan.link = rss_url
  rss.channel = chan

  difftext.each do |url, text|
    rss_format = get(url, :rss_format, 'plain_text')
    text = strip_tags(text, :format => rss_format)
    next if text.empty?

    item = RSS::Rss::Channel::Item.new
    item.date  = Time.now
    item.title = get(url, :title, File.basename(url))
    item.link  = eval_arg(get(url, :rewrite_link, '%s'), [url])
    # Per-source overrides are read from the rss_<field> source options.
    [:author, :date, :enclosure, :category, :pubDate].each do |field|
      val = get(url, format_symbol(field, 'rss_%s'))
      item.send(format_symbol(field, '%s='), val) if val
    end

    annotation = difftext_annotation(url)
    annotation = "<pre>#{annotation}</pre>" if annotation
    case rss_format
    when 'plain_text'
      item.description = %{#{annotation}<pre>#{text}</pre>}
    else
      item.description = %{#{annotation}\n#{text}}
    end
    chan.items << item
  end

  return rss.to_s
end
|
579
|
+
|
580
|
+
|
581
|
+
# Render the diffs as an HTML page.
# difftext:: Enumerable of url/diff pairs
# Tags named by a source's :strip_tags option are removed first; entries
# that are empty afterwards are dropped. The remaining entries are
# sorted with sort_difftext! and turned into a table of contents and one
# section per url. Both are passed, together with the page title, to the
# page template (:page[:format], falling back to :page[:simple]).
def get_output_html(difftext)
  entries = []
  difftext.each do |url, text|
    tags = get(url, :strip_tags)
    text = strip_tags(text, :tags => tags) if tags
    entries << [url, text] unless text.empty?
  end
  sort_difftext!(entries)

  toc_items = entries.map do |url, text|
    title   = get(url, :title, File.basename(url))
    toc_id  = html_toc_id(url)
    body_id = html_body_id(url)
    %{<li id="#{toc_id}" class="toc"><a class="toc" href="\##{body_id}">#{title}</a></li>}
  end
  toc = toc_items.join("\n")

  count = 0
  sections = entries.map do |url, text|
    count += 1
    title   = get(url, :title, File.basename(url))
    body_id = html_body_id(url)
    rewrite = get(url, :rewrite_link)
    if rewrite
      link  = eval_arg(rewrite, [url])
      extra = ''
    else
      # Without a rewritten link, point to the cached copies as well.
      old   = %{<a class="old" href="#{file_url(oldname(url))}">old</a>}
      lst   = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
      extra = %{ (#{old}, #{lst})}
      link  = url
    end
    note = difftext_annotation(url)
    note = note && CGI::escapeHTML(note)
    body = format(url, text)
    <<HTML
<div id="#{body_id}" class="webpage">
<div class="count">
#{count}
</div>
<h1 class="diff">
<a class="external" href="#{link}">#{title}</a>#{extra}
</h1>
<div class="annotation">
#{note}
</div>
<div class="diff,difftext">
#{body}
</div>
</div>
HTML
  end
  cnt = sections.join(('<hr class="separator"/>') + "\n")

  success, template = get_option(:page, :format)
  success, template = get_option(:page, :simple) unless success
  return eval_arg(template, [@output_title, toc, cnt])
end
|
636
|
+
|
637
|
+
|
638
|
+
# Get the diff filename.
|
639
|
+
# Filename of the diff for url, below the 'diff' directory. The cache
# type passed on is always 'md5'.
def diffname(url, ensure_dir=false)
  encoded_filename('diff', url, ensure_dir, 'md5')
end
|
642
|
+
|
643
|
+
|
644
|
+
# Get the backup filename.
|
645
|
+
# Filename of the backup copy of url, below the 'old' directory.
# type:: Cache type passed on to encoded_filename
def oldname(url, ensure_dir=false, type=nil)
  encoded_filename('old', url, ensure_dir, type)
end
|
648
|
+
|
649
|
+
|
650
|
+
# Get the filename for the freshly downloaded copy.
|
651
|
+
# Filename of the freshly downloaded copy of url, below the 'latest'
# directory.
# type:: Cache type passed on to encoded_filename
def latestname(url, ensure_dir=false, type=nil)
  encoded_filename('latest', url, ensure_dir, type)
end
|
654
|
+
|
655
|
+
|
656
|
+
# Return the url that encoded_filename recorded for filename, or nil
# (after logging a warning) if the filename is unknown.
def url_from_filename(filename)
  url = @urlencmap[filename]
  if url
    $logger.debug "Map filename: #{filename} -> #{url}"
  else
    $logger.warn "Unmapped filename: #{filename}"
  end
  url
end
|
665
|
+
|
666
|
+
|
667
|
+
# Build the cache filename for url below @cfgdir/dir and record the
# filename => url mapping in @urlencmap.
# dir::        Subdirectory of the configuration directory
# ensure_dir:: If true, ask the application to ensure the file's directory
# type::       Cache type; defaults to the url's :cachetype option ('tree')
# Falls back to the md5 encoding if the directory could not be ensured,
# the name is longer than :global[:filename_size] (default 255) or the
# name denotes an existing directory.
def encoded_filename(dir, url, ensure_dir=false, type=nil)
  type ||= get(url, :cachetype, 'tree')
  $logger.debug "encoded_filename: type=#{type} url=#{url}"
  path = File.join(@cfgdir, dir, encoded_basename(url, type))
  $logger.debug "encoded_filename: rv0=#{path}"
  max_size = get_optionvalue(:global, :filename_size, 255)
  dir_ok   = !ensure_dir || @app.ensure_dir(File.dirname(path), false)
  unless dir_ok and path.size <= max_size and !File.directory?(path)
    # $logger.debug "Filename too long (:global=>:filename_size = #{max_size}), try md5 encoded filename instead: #{url}"
    $logger.info "Can't use filename, try 'md5' instead: #{url}"
    path = File.join(@cfgdir, dir, encoded_basename(url, :md5))
  end
  @urlencmap[path] = url
  path
end
|
684
|
+
|
685
|
+
|
686
|
+
# Encode url by means of the encoded_basename_<type> method.
# An unknown type is logged as fatal and terminates the program with
# exit status 5.
def encoded_basename(url, type='tree')
  encoder = "encoded_basename_#{type}"
  return send(encoder, url) if respond_to?(encoder)
  $logger.fatal "Unknown cache type: #{type}"
  exit 5
end
|
695
|
+
|
696
|
+
|
697
|
+
# 'tree' cache type: encode url with '/' as extra argument to encode
# and pass the result through ensure_filename.
def encoded_basename_tree(url)
  encoded = encode(url, '/')
  ensure_filename(encoded)
end
|
700
|
+
|
701
|
+
|
702
|
+
# 'flat' cache type: the plain result of encode.
def encoded_basename_flat(url)
  encode(url)
end
|
705
|
+
|
706
|
+
|
707
|
+
# 'md5' cache type: the hexadecimal MD5 digest of url.
def encoded_basename_md5(url)
  Digest::MD5.hexdigest(url)
end
|
710
|
+
|
711
|
+
|
712
|
+
# Return the extension of the url's path (including the dot), or nil
# if url cannot be parsed or has no path.
def urlextname(url)
  File.extname(URI.parse(url).path)
rescue StandardError
  # Deliberately best-effort. StandardError, unlike Exception, lets
  # interrupts and exit requests pass.
  nil
end
|
718
|
+
|
719
|
+
|
720
|
+
# Guess path's dirname.
|
721
|
+
# foo/bar -> foo
|
722
|
+
# foo/bar.txt -> foo
|
723
|
+
# foo/bar/ -> foo/bar
|
724
|
+
# Guess the directory part of path: a path with a trailing slash is
# taken to be a directory itself, anything else is handed to
# File.dirname.
def guess_dir(path)
  if path[-1, 1] == '/'
    path.chop
  else
    File.dirname(path)
  end
end
|
727
|
+
|
728
|
+
|
729
|
+
# Strip the url's last part (after #).
|
730
|
+
# Return a copy of url without its fragment (the first '#' and what
# follows it on that line).
def canonic_url(url)
  url.sub(/#.*$/) { '' }
end
|
733
|
+
|
734
|
+
|
735
|
+
# Return a copy of the :default entry of the :strip_tags options, or
# nil if there is none.
def strip_tags_default
  found, tags = get_option(:strip_tags, :default)
  found ? tags.dup : nil
end
|
739
|
+
|
740
|
+
|
741
|
+
# Remove elements from a document.
# doc::  String (parsed with Hpricot) or an already parsed document
# args:: Hash; :tags lists the selectors to remove (default:
#        strip_tags_default), :format selects the result: :hpricot
#        returns the document itself, anything else is sent to the
#        document as to_<format> (default: to_html)
def strip_tags(doc, args={})
  tags = args[:tags] || strip_tags_default
  doc  = Hpricot(doc) if doc.is_a?(String)
  tags.each {|tag| doc.search(tag).remove}

  fmt = args[:format]
  return doc if :hpricot == fmt
  doc.send("to_#{fmt || :html}")
end
|
757
|
+
|
758
|
+
|
759
|
+
# Check whether path is eligible on the basis of url or path0.
|
760
|
+
# This checks either for a :match option for url or the extensions
|
761
|
+
# of path0 and path.
|
762
|
+
# Check whether path is eligible. If url has a :match option, path is
# matched against it; otherwise path must have the same extension as
# path0.
def eligible_path?(url, path0, path)
  rx = get(url, :match)
  return path =~ rx if rx
  File.extname(path0) == File.extname(path)
end
|
770
|
+
|
771
|
+
|
772
|
+
# Scan hpricot document for hrefs and push them onto @todo if not
|
773
|
+
# already included.
|
774
|
+
# Scan a document for links and queue those accepted by the block.
# url::       String; the url of the document
# hpricot::   The parsed document
# condition:: Block called with (uri0, pn0, uri, pn): the URI and guessed
#             directory (Pathname) of the document and of the link
# Nothing is queued if the document forbids following links
# (robots 'nofollow') or the source's :depth is exhausted. Accepted links
# inherit the options of url, get an extended title and a decremented
# :depth, and are appended to @todo.
def push_hrefs(url, hpricot, &condition)
  begin
    return if robots?(hpricot, 'nofollow')
    depth = get(url, :depth)
    return if depth and depth <= 0
    uri0 = URI.parse(url)
    # pn0 = Pathname.new(guess_dir(File.expand_path(uri0.path)))
    pn0  = Pathname.new(guess_dir(uri0.path))
    (hpricot / 'a').each do |a|
      begin
        href = a['href']
        next if href.nil? or href == url or href =~ /^\s*javascript:/
        uri  = URI.parse(href)
        href = rewrite_href(href, url, uri0, pn0, true)
        # rewrite_href returns nil for links it cannot handle; check
        # before canonic_url, which needs a String.
        next if href.nil?
        curl = canonic_url(href)
        next if @done.include?(curl) or @todo.include?(curl)
        next unless robots_allowed?(curl, uri)
        # pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
        uri = URI.parse(href)
        pn  = Pathname.new(guess_dir(uri.path))
        if condition.call(uri0, pn0, uri, pn)
          opts = @urls[url].dup
          # opts[:title] = File.basename(curl)
          opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
          opts[:depth] = depth - 1 if depth and depth >= 0
          @urls[curl] = opts
          @todo << curl
        end
      rescue StandardError => e
        # A link that cannot be processed (e.g. a mailto: address, which
        # has no path) is skipped instead of aborting the whole scan.
        $logger.debug "Skipping link #{href.inspect}: #{e.message}"
      end
    end
  rescue StandardError => e
    # $logger.error e #DBG#
    $logger.error e.message
    $logger.debug e.backtrace
  end
end
|
809
|
+
|
810
|
+
|
811
|
+
# Rewrite urls in doc
|
812
|
+
# url:: String
|
813
|
+
# doc:: Hpricot document
|
814
|
+
# Rewrite the link targets in doc by means of rewrite_href.
# url:: String; the url of the document
# doc:: Hpricot document
# The href of 'a' elements is rewritten with local=true, the src of
# 'img' elements with local=false. Attributes for which rewrite_href
# returns nil are left alone. Returns doc.
def rewrite_urls(url, doc)
  uri = URI.parse(url)
  urd = guess_dir(uri.path)
  [['a', 'href', true], ['img', 'src', false]].each do |tag, attr, local|
    (doc / tag).each do |node|
      target = rewrite_href(node[attr], url, uri, urd, local)
      node[attr] = target if target
    end
  end
  doc
end
|
827
|
+
|
828
|
+
|
829
|
+
# Try to make href an absolute url.
|
830
|
+
# Try to make href an absolute url.
# href::  String or nil; the link target as found in the document
# url::   String; the url of the document
# uri::   Parsed url (default: URI.parse(url))
# urd::   Directory of the document (default: guess_dir(uri.path))
# local:: If true, a link to a page that is queued, done or already
#         cached is replaced with the filename of the cached copy
# Returns a String, or nil for missing, javascript: and unparsable
# hrefs (the latter are logged as errors).
def rewrite_href(href, url, uri=nil, urd=nil, local=false)
  begin
    return if !href or href =~ /^\s*javascript:/
    # Strip before parsing: surrounding whitespace is common in markup
    # and makes URI.parse fail.
    href = href.strip
    urh  = URI.parse(href)
    uri ||= URI.parse(url)
    urd ||= guess_dir(uri.path)

    # Only an href with a scheme is absolute. A colon elsewhere (e.g. in
    # a query string) does not make a relative link absolute.
    return href if urh.absolute?

    if uri.relative?
      # Both are relative: join the paths if the base is a plain URI.
      return uri.instance_of?(URI::Generic) ? File.join(urd, href) : nil
    end

    rv = uri.merge(href).to_s
    if local
      hf = latestname(rv)
      if @todo.include?(rv) or @done.include?(rv) or File.exist?(hf)
        rv = hf
      end
    end
    return rv
  rescue StandardError => e
    # $logger.error e #DBG#
    $logger.error e.message
    $logger.debug e.backtrace
  end
  return nil
end
|
889
|
+
|
890
|
+
|
891
|
+
# Return a Proc that takes a text as argument and highlights occurrences of rx.
|
892
|
+
# rx:: Regular expression
|
893
|
+
# color:: A string, sets the class to highlight-color (default: "yellow")
|
894
|
+
# group:: A number (default: 0)
|
895
|
+
# tag:: The HTML tag to use (default: "span")
|
896
|
+
# Build a lambda that wraps every match of rx in a highlighting tag.
# rx::    Regular expression
# color:: Suffix of the CSS class highlight-<color> (default: "yellow")
# group:: Number of the match group to keep as content (default: 0)
# tag::   The HTML tag to use (default: "span")
def highlighter(rx, color=nil, group=nil, tag='span')
  css_class   = "highlight-#{color || 'yellow'}"
  backref     = "\\#{group || 0}"
  replacement = %{<#{tag} class="#{css_class}">#{backref}</#{tag}>}
  lambda {|text| text.gsub(rx, replacement)}
end
|
899
|
+
|
900
|
+
|
901
|
+
# Call the view_output_<format> method for the first output format.
# outfile:: Filename (default: the result of get_outfile)
def view_output(outfile=nil)
  viewer = "view_output_#{@output_format.first}"
  send(viewer, outfile || get_outfile)
end
|
904
|
+
|
905
|
+
|
906
|
+
# Run the editor command (@cmd_edit) on a profile's file.
# profile:: A profile name or an Array of names (default: @profiles)
def edit_profile(profile=nil)
  profile ||= @profiles
  if profile.is_a?(Array)
    profile.each {|name| edit_profile name}
  else
    fn = profile_filename(profile)
    $logger.debug "edit: #{fn}"
    `#{@cmd_edit % fn}`
  end
end
|
917
|
+
|
918
|
+
|
919
|
+
# Find the file of a profile.
# profile_name::      Name with or without the '.rb' extension
# check_file_exists:: If false, return the candidate in the
#                     configuration directory even if it does not exist
# The current directory is searched before the configuration directory.
# Returns the filename, or nil if the file must exist but does not.
def profile_filename(profile_name, check_file_exists=true)
  profile_name = "#{profile_name}.rb" if File.extname(profile_name) != '.rb'
  candidates = ['.', @cfgdir].map {|d| File.join(d, profile_name)}
  # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
  existing = candidates.find {|filename| File.exist?(filename)}
  return existing if existing
  check_file_exists ? nil : candidates.last
end
|
932
|
+
|
933
|
+
|
934
|
+
# Open filename, pass the IO to the block and afterwards register the
# file with @mtimes.
# mode:: File mode (default: 'w')
def write_file(filename, mode='w', &block)
  File.open(filename, mode) do |handle|
    block.call(handle)
  end
  @mtimes.set(filename)
end
|
938
|
+
|
939
|
+
|
940
|
+
# Pass filename through the :global[:canonic_filename] command or Proc.
# Returns filename unchanged if that option is not set.
def canonic_filename(filename)
  converter = get_optionvalue(:global, :canonic_filename)
  call_cmd(converter, [filename], filename)
end
|
943
|
+
|
944
|
+
|
945
|
+
private
|
946
|
+
# Build the default option tables in @options and register the built-in
# shortcuts (:w3m, :lynx, :rss, :opml, :ftp, ...).
#
# Each @options[...] entry maps a name to a value that is either a string
# (a command or format template), a symbol (a reference to another entry),
# or a lambda.
#
# Changes against the previous revision:
# * the :opml downloader tested the misspelled local +oplm+ and therefore
#   raised NameError on every call;
# * rss enclosures were saved with IO#puts, which appends a newline to
#   data that does not already end in one; they are now saved with IO#write;
# * :ftp_recursive skips directory lines its listing regexp cannot parse
#   instead of failing in File.join.
def initialize_options
    @options = {
        :global => {
            :downloadhtml => :openuri,
        },
    }

    @options[:diff] = {
        :default => :diff,

        # Run diff(1) in unified format, drop the first two output lines and
        # keep only the lines starting with "+", minus that leading character.
        :diff => lambda {|old, new, *args|
            opts, _ = args
            opts ||= '-d -w'
            difftext = call_cmd('diff %s -u2 "%s" "%s"', [opts, old, new])
            difftext = difftext.split("\n")[2..-1]
            difftext ? difftext.delete_if {|l| l =~ /^[^+]/}.map {|l| l[1..-1]}.join("\n") : ''
        },

        :binary => lambda {|old, new|
            call_cmd(get_optionvalue(:diff, :diff), [old, new, '--binary -d -w'])
        },

        # Yield the name of the new file if the binary diff is non-empty.
        :new => lambda {|old, new|
            difftext = call_cmd(get_optionvalue(:diff, :binary), [old, new])
            difftext.empty? ? '' : new
        },

        :raw => :new,

        # Run the external webdiff program; its output is used only when it
        # exits with status 1.
        :webdiff => lambda {|old, new|
            $logger.debug "webdiff: #{File.basename(new)}"
            $logger.debug %{webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -}
            difftext = `webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -`
            $?.exitstatus == 1 ? difftext : ''
        },
    }

    @options[:format] = {
        :default => :diff,
        :diff    => %{<pre class="diff">\n%s\n</pre>},
        :webdiff => "%s\n",
        :raw     => lambda {|new| File.read(new)},
    }

    @options[:diffprocess] = {
        :default => :diff,
        :diff    => false,
        :webdiff => false,
        :raw     => false,
    }

    @options[:download] = {
        :default => :w3m,
        :raw     => :openuri,
    }

    @options[:downloadformat] = {
        :w3m     => 'text',
        :webdiff => 'html',
        :raw     => '',
    }

    @options[:downloadprocess] = {
    }

    @options[:rss] = {
        :version => '2.0',
    }

    @options[:strip_tags] = {
        :default => ['script', 'object', 'form', 'input', 'select', 'iframe', 'head', 'meta', 'link'],
    }

    shortcut :w3m, :delegate => :diff,
        :download => 'w3m -S -F -dump "%s"'
    # :download => 'w3m -no-cookie -S -F -dump "%s"'

    shortcut :lynx, :delegate => :diff,
        :download => 'lynx -dump "%s"'

    shortcut :links, :delegate => :diff,
        :download => 'links -dump "%s"'

    shortcut :curl, :delegate => :webdiff,
        :download => 'curl --silent "%s"'

    shortcut :wget, :delegate => :webdiff,
        :download => 'wget -q -O - "%s"'

    shortcut :text, :delegate => :diff,
        :download => lambda {|url| html_to_text(open_url(url).read)}

    # Download a page and keep the inner html of its <body>, passed through
    # rewrite_urls and, if :strip_tags is set for the url, through strip_tags.
    # Errors are logged and returned as a <pre class="error"> snippet.
    shortcut :body_html, :delegate => :webdiff,
        :strip_tags => :default,
        :download => lambda {|url|
            begin
                doc = Hpricot(open_url(url).read)
                doc = doc.at('body')
                if doc
                    doc = rewrite_urls(url, doc)
                    doc = doc.inner_html
                    if (tags = get(url, :strip_tags))
                        doc = strip_tags(doc, :format => :hpricot, :tags => tags)
                    end
                else
                    $logger.warn 'inner html: No body'
                end
                doc.to_s
            rescue Exception => e
                # $logger.error e #DBG#
                $logger.error e.message
                $logger.debug e.backtrace
                # The rescue value is the lambda's result (the original used
                # +break+ here, which has the same effect inside a lambda).
                %{<pre class="error">\n#{e.message}\n</pre>}
            end
        }

    shortcut :openuri, :delegate => :webdiff,
        :download => lambda {|url|
            begin
                open_url(url).read
            rescue Exception => e
                # $logger.error e #DBG#
                $logger.error e.message
                $logger.debug e.backtrace
                %{<pre class="error">\n#{e.to_s}\n</pre>}
            end
        }

    # Compare two saved feeds and format the items of the new feed that are
    # not in the old one. Items are matched by the MD5 of their text; an item
    # whose link is already known is rendered as an html diff of the two
    # descriptions. Returns nil if either file cannot be parsed.
    shortcut :rss,
        :delegate => :openuri,
        :diff => lambda {|old, new|
            success, rss_version = get_option(:rss, :version)
            ro = RSS::Parser.parse(File.read(old), false)
            if ro
                # Index the old items by content digest and by link.
                rh = {}
                ro.items.each do |item|
                    rh[Digest::MD5.hexdigest(item.to_s)] = item
                    rh[item.link] = item
                end
                rnew = []
                rn = RSS::Parser.parse(File.read(new), false)
                if rn
                    rn.items.each do |item|
                        rid = Digest::MD5.hexdigest(item.to_s)
                        if !rh[rid]
                            if (olditem = rh[item.link])
                                rss_diff = Websitary::Htmldiff.new(:oldtext => olditem.description, :newtext => item.description).process
                                rnew << format_rss_item(item, rss_diff)
                            else
                                if item.enclosure and (curl = item.enclosure.url)
                                    url = url_from_filename(new)
                                    dir = get(url, :rss_enclosure)
                                    curl = rewrite_href(curl, url, nil, nil, true)
                                    next unless curl
                                    if dir
                                        # :rss_enclosure => true selects a
                                        # per-channel directory below @cfgdir.
                                        if dir == true
                                            dir = File.join(@cfgdir, 'attachments', encode(rn.channel.title))
                                        end
                                        @app.ensure_dir(dir)
                                        $logger.debug "Enclosure URL: #{curl}"
                                        fname = File.join(dir, encode(File.basename(curl) || item.title || item.pubDate.to_s || Time.now.to_s))
                                        $logger.debug "Enclosure save to: #{fname}"
                                        enc = open_url(curl).read
                                        # IO#write stores the data unchanged;
                                        # IO#puts would append a newline.
                                        write_file(fname, 'wb') {|io| io.write enc}
                                        furl = file_url(fname)
                                        enclosure = %{<p class="enclosure"><a href="%s" class="enclosure" />Enclosure (local copy)</a></p>} % furl
                                        if get(url, :rss_rewrite_enclosed_urls)
                                            item.description.gsub!(Regexp.new(Regexp.escape(curl))) {|t| furl}
                                        end
                                    else
                                        enclosure = %{<p class="enclosure"><a href="%s" class="enclosure" />Original Enclosure</a></p>} % curl
                                    end
                                else
                                    enclosure = ''
                                end
                                rnew << format_rss_item(item, item.description, enclosure)
                            end
                        end
                    end
                    rnew.join("\n")
                end
            end
        }

    # Read an OPML file and queue every rss outline it lists: each feed url
    # gets a copy of the OPML url's options (with :download => :rss) in @urls
    # and is appended to @todo. Always returns nil.
    shortcut :opml, :delegate => :rss,
        :download => lambda {|url|
            opml = open(url) {|io| io.read}
            if opml
                xml = Hpricot(opml)
                # <+TBD+>Well, maybe would should search for outline[@type=rss]?
                xml.search('//outline[@xmlurl]').each {|elt|
                    if elt['type'] =~ /rss/
                        curl = elt['xmlurl']
                        opts = @urls[url].dup
                        opts[:download] = :rss
                        opts[:title] = elt['title'] || elt['text'] || elt['htmlurl'] || curl
                        @urls[curl] = opts
                        @todo << curl
                    else
                        $logger.warn "Unsupported type in OPML: #{elt.to_s}"
                    end
                }
            end
            nil
        }

    shortcut :website, :delegate => :webdiff,
        :download => lambda {|url| get_website(:body_html, url)}

    shortcut :website_below, :delegate => :webdiff,
        :download => lambda {|url| get_website_below(:body_html, url)}

    shortcut :website_txt, :delegate => :default,
        :download => lambda {|url| html_to_text(get_website(get(url, :downloadhtml, :openuri), url))}

    shortcut :website_txt_below, :delegate => :default,
        :download => lambda {|url| html_to_text(get_website_below(get(url, :downloadhtml, :openuri), url))}

    shortcut :ftp, :delegate => :default,
        :download => lambda {|url| get_ftp(url).join("\n")}

    # List an ftp directory and queue each sub-directory (listing lines that
    # start with "d") as a url of its own, with :depth decremented if set.
    shortcut :ftp_recursive, :delegate => :default,
        :download => lambda {|url|
            list = get_ftp(url)
            depth = get(url, :depth)
            if !depth or depth >= 0
                dirs = list.find_all {|e| e =~ /^d/}
                dirs.each do |l|
                    sl = l.scan(/^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+ +\S+ +\S+)\s+(.+)$/)
                    # Skip lines the listing regexp does not match; their
                    # directory name would be nil below.
                    next unless sl[0]
                    perms, type, owner, group, size, date, dirname = sl[0]
                    curl = File.join(url, dirname, '')
                    opts = @urls[url].dup
                    opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
                    opts[:depth] = depth - 1 if depth and depth >= 0
                    @urls[curl] = opts
                    @todo << curl
                end
            end
            list.join("\n")
        }

    shortcut :img, :delegate => :raw,
        :format => lambda {|new|
            file = file_url(new)
            %{<img src="#{file}" />}
        }

    # Page template; the four %s slots are filled with title, title,
    # table-of-contents items and body, in that order.
    @options[:page] = {
        :format => lambda do |ti, li, bd|
            template = <<OUT
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>%s</title>
<link rel="stylesheet" href="websitary.css" type="text/css">
<link rel="alternate" href="websitary.rss" type="application/rss+xml" title="%s">
</head>
<body>
<ol class="toc">
%s
</ol>
<div class="contents">
%s
</div>
</body>
</html>
OUT
            template % [ti, ti, li, bd]
        end
    }
end
|
1217
|
+
|
1218
|
+
|
1219
|
+
# Compare the version recorded in <cfgdir>/version.yml with
# Websitary::VERSION and run the data migrations that apply.
#
# Does nothing if the file exists and already holds the current version.
# Without a version file, version '0.1.0' is assumed. Afterwards the
# current version is written to version.yml.
def migrate
    store = File.join(@cfgdir, 'version.yml')
    recorded = File.exist?(store)
    version = recorded ? YAML.load_file(store) : '0.1.0'
    return if recorded && version == Websitary::VERSION

    numbers = version.split(/\./).map {|part| part.to_i}
    # Migrate unless the recorded version is greater than 0.1.0.
    migrate_0_1_0 unless (numbers <=> [0, 1, 0]) == 1
    write_file(store) {|io| YAML.dump(Websitary::VERSION, io)}
end
|
1231
|
+
|
1232
|
+
|
1233
|
+
# Migrate the files saved by version 0.1.0: every file in the 'latest' and
# 'old' directories below @cfgdir is moved to the name that
# encoded_filename yields for the url decoded from its basename.
def migrate_0_1_0
    $logger.warn "Migrate data from version 0.1.0"
    ['latest', 'old'].each do |subdir|
        Dir[File.join(@cfgdir, subdir, '*')].each do |path|
            original_url = decode(File.basename(path))
            target = encoded_filename(subdir, original_url, true)
            @app.move(path, target)
        end
    end
end
|
1244
|
+
|
1245
|
+
|
1246
|
+
# Evaluate the profile source +contents+ in the context of this object.
#
# While the profile is being evaluated, @current_profile holds
# +profile_file+; it is reset to nil afterwards, also when the evaluation
# raises. Returns true if the evaluation completed.
def eval_profile(contents, profile_file=nil)
    @current_profile = profile_file
    instance_eval(contents)
    true
ensure
    @current_profile = nil
end
|
1255
|
+
|
1256
|
+
|
1257
|
+
# Download +url+ with the :download command named +download+ and hand the
# parsed document to push_hrefs. The block passed to push_hrefs accepts a
# link if eligible_path? approves its path and it is on the same host as
# the page.
#
# Returns the downloaded html, or nil if the page carries a robots
# "noindex" directive.
def get_website(download, url)
    command = get_optionvalue(:download, download)
    html = call_cmd(command, [url])
    return html unless html

    doc = Hpricot(html)
    return html unless doc
    return nil if robots?(doc, 'noindex')

    push_hrefs(url, doc) do |uri0, pn0, uri, pn|
        eligible_path?(url, uri0.path, uri.path) &&
            uri.host == uri0.host
    end
    html
end
|
1271
|
+
|
1272
|
+
|
1273
|
+
# Like get_website, with one more condition in the block passed to
# push_hrefs: besides passing eligible_path? and being on the same host, a
# link is accepted only if its pathname +pn+ is '.' or is the same as the
# page's pathname +pn0+ (relative path '.').
#
# Returns the downloaded html, or nil if the page carries a robots
# "noindex" directive.
def get_website_below(download, url)
    command = get_optionvalue(:download, download)
    html = call_cmd(command, [url])
    return html unless html

    doc = Hpricot(html)
    return html unless doc
    return nil if robots?(doc, 'noindex')

    push_hrefs(url, doc) do |uri0, pn0, uri, pn|
        eligible_path?(url, uri0.path, uri.path) &&
            uri.host == uri0.host &&
            (pn.to_s == '.' || pn.relative_path_from(pn0).to_s == '.')
    end
    html
end
|
1289
|
+
|
1290
|
+
|
1291
|
+
def get_ftp(url)
|
1292
|
+
uri = URI.parse(url)
|
1293
|
+
ftp = Net::FTP.new(uri.host)
|
1294
|
+
ftp.passive = true
|
1295
|
+
begin
|
1296
|
+
ftp.login
|
1297
|
+
ftp.chdir(uri.path)
|
1298
|
+
return ftp.list('*')
|
1299
|
+
rescue Exception => e
|
1300
|
+
$logger.error e
|
1301
|
+
ensure
|
1302
|
+
ftp.close
|
1303
|
+
end
|
1304
|
+
end
|
1305
|
+
|
1306
|
+
|
1307
|
+
# Return the id used for +url+ in the table of contents: the letter "t"
# followed by the MD5 hex digest of the url.
def html_toc_id(url)
    digest = Digest::MD5.hexdigest(url)
    "t#{digest}"
end
|
1310
|
+
|
1311
|
+
|
1312
|
+
# Return the id used for +url+ in the page body: the letter "b" followed
# by the MD5 hex digest of the url.
def html_body_id(url)
    digest = Digest::MD5.hexdigest(url)
    "b#{digest}"
end
|
1315
|
+
|
1316
|
+
|
1317
|
+
# Make sure +filename+ names a file rather than a directory.
#
# Runs of two or more slashes are collapsed into one separator first. If
# the result ends with a separator, or consists of exactly two parts of
# which the first looks like an encoded scheme ("http%3a"), the
# placeholder name '__WEBSITARY__' is appended; otherwise the collapsed
# name is returned.
def ensure_filename(filename)
    separator = Regexp.escape(File::SEPARATOR)
    collapsed = filename.gsub(/[\/]{2,}/, File::SEPARATOR)
    return File.join(collapsed, '__WEBSITARY__') if collapsed =~ /#{separator}$/

    segments = collapsed.split(/#{separator}/)
    host_only = segments.size == 2 && segments[0] =~ /^\w+%3a$/
    host_only ? File.join(collapsed, '__WEBSITARY__') : collapsed
end
|
1331
|
+
|
1332
|
+
|
1333
|
+
# Open +url+ and return the resulting IO-like object.
#
# Urls that parse as a generic (scheme-less) URI or use the file scheme
# are opened without extra arguments. Everything else is opened with a
# header hash holding @user_agent as "User-Agent", merged with the url's
# :header option.
def open_url(url)
    $logger.debug "Open URL: #{url}"
    uri = URI.parse(url)
    local = uri.instance_of?(URI::Generic) || uri.scheme == 'file'
    return open(url) if local

    header = {"User-Agent" => @user_agent}
    header.merge!(get(url, :header, {}))
    open(url, header)
end
|
1344
|
+
|
1345
|
+
|
1346
|
+
# Build the annotation for the diff of +url+ from the recorded mtimes of
# its old and latest copies, using the url's :format_annotation option
# (default '%s >>> %s').
#
# Returns nil unless both copies exist.
def difftext_annotation(url)
    previous = oldname(url)
    current = latestname(url)
    return nil unless File.exist?(previous) && File.exist?(current)

    template = get(url, :format_annotation, '%s >>> %s')
    eval_arg(template, [@mtimes.mtime(previous), @mtimes.mtime(current)])
end
|
1353
|
+
|
1354
|
+
|
1355
|
+
# Insert the string form of +name+ into +format_string+ and return the
# result as a symbol.
def format_symbol(name, format_string)
    formatted = format(format_string, name.to_s)
    formatted.intern
end
|
1358
|
+
|
1359
|
+
|
1360
|
+
# Render one rss item as html: a heading that links to the item and shows
# its title, its author in parentheses (if any) and its publication date,
# followed by a div containing +body+ and +enclosure+.
def format_rss_item(item, body, enclosure='')
    byline = item.author ? " (#{item.author})" : ''
    heading = "#{item.title}#{byline}"
    return <<EOT
<h2 class="rss"><a class="rss" href="#{item.link}">#{heading} -- #{item.pubDate}</a></h2>
<div class="rss">
#{body}
#{enclosure}
</div>
EOT
end
|
1371
|
+
|
1372
|
+
# Guess whether text is plain text or html.
|
1373
|
+
# Guess whether text is plain text or html: look for the start of one of
# a fixed set of common html tags. Returns the position of the first such
# tag, or nil if there is none.
def is_html?(text)
    markup = /<(div|a|span|body|html|script|p|table|td|tr|th|li|dt|br|hr|em|b)\b/
    text =~ markup
end
|
1376
|
+
|
1377
|
+
|
1378
|
+
# Convert html to plain text using hpricot.
|
1379
|
+
# Convert html to plain text using hpricot. A nil or false +text+ is
# returned unchanged.
def html_to_text(text)
    return text unless text
    Hpricot(text).to_plain_text
end
|
1382
|
+
|
1383
|
+
|
1384
|
+
# Retrieve any robots meta directives from the hpricot document.
|
1385
|
+
# Check whether the hpricot document has a robots meta tag that lists any
# of the directives in +what+ (e.g. 'noindex').
#
# Directives are compared case-insensitively and with surrounding
# whitespace removed, so content="NOINDEX , nofollow" matches 'noindex'.
# A robots meta tag without a content attribute is treated as listing no
# directives (it used to raise NoMethodError).
def robots?(hpricot, *what)
    wanted = what.map {|w| w.to_s.downcase}
    (hpricot / '//meta[@name="robots"]').any? do |e|
        directives = e['content'].to_s.downcase.split(',').map {|d| d.strip}
        wanted.any? {|w| directives.include?(w)}
    end
end
|
1390
|
+
|
1391
|
+
|
1392
|
+
# Check whether robots are allowed to retrieve an url.
|
1393
|
+
# Check whether robots are allowed to retrieve an url.
#
# url:: the url as a string; also the key of the per-url cache @allow
# uri:: the parsed form of +url+
#
# If the RobotRules class is defined, the robots.txt of the uri's host is
# loaded (once per host, cached in @robots) and consulted. Without
# RobotRules a warning is logged once and every url is allowed.
#
# Changes against the previous revision:
# * robots_uri returns nil for relative uris; the nil check now happens
#   before the conversion to a string, so no attempt is made to open an
#   empty url;
# * a robots.txt that cannot be loaded is reported through $logger
#   instead of being printed with puts.
#
# NOTE(review): the rescue clause still catches Exception, as before;
# consider narrowing it once the supported Ruby versions are confirmed.
def robots_allowed?(url, uri)
    return @allow[url] if @allow.has_key?(url)

    if defined?(RobotRules)
        host = uri.host

        unless (rules = @robots[host])
            ruri = robots_uri(uri)
            return true unless ruri
            rurl = ruri.to_s
            begin
                robots_txt = open_url(rurl).read
                rules = RobotRules.new(@user_agent)
                rules.parse(rurl, robots_txt)
                @robots[host] = rules
                $logger.info "Loaded #{rurl} for #{@user_agent}"
                $logger.debug robots_txt
            rescue Exception => e
                $logger.warn "Cannot load #{rurl}: #{e.message}"
                $logger.debug robots_txt if robots_txt
            end
        end

        rv = if rules and !rules.allowed?(url)
                 $logger.info "Excluded url: #{url}"
                 false
             else
                 true
             end
        @allow[url] = rv
        return rv
    end

    unless @robots[:warning]
        $logger.warn 'robots.txt is ignored: Please install robot_rules.rb from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589 in $RUBYLIB'
        @robots[:warning] = true
    end
    @allow[url] = true
    return true
end
|
1434
|
+
|
1435
|
+
|
1436
|
+
# Get the robots.txt uri for uri.
|
1437
|
+
# Get the robots.txt uri for uri.
#
# Returns a copy of +uri+ whose path is '/robots.txt', or nil if +uri+ is
# relative. The query string and fragment of +uri+ are dropped; they used
# to be carried over, yielding uris like "http://host/robots.txt?page=2".
def robots_uri(uri)
    return nil if uri.relative?

    ruri = uri.dup
    ruri.path = '/robots.txt'
    ruri.query = nil
    ruri.fragment = nil
    ruri
end
|
1444
|
+
|
1445
|
+
|
1446
|
+
# Sort +difftext+ in place. Each entry is indexable and its first element
# is looked up with get(..., :title, ...), falling back to the element
# itself; entries are ordered by the downcased result.
def sort_difftext!(difftext)
    difftext.sort! do |left, right|
        left_title = get(left[0], :title, left[0]).downcase
        right_title = get(right[0], :title, right[0]).downcase
        left_title <=> right_title
    end
end
|
1453
|
+
|
1454
|
+
|
1455
|
+
# Turn +filename+ into the form used in links: pass it through the command
# configured as global option :file_url (with +filename+ as the default
# argument of call_cmd) and encode the result, leaving ':' and '/' alone.
def file_url(filename)
    mapper = get_optionvalue(:global, :file_url)
    mapped = call_cmd(mapper, [filename], filename)
    encode(mapped, ':/')
end
|
1461
|
+
|
1462
|
+
|
1463
|
+
# Percent-encode +text+: every character other than ASCII letters, digits,
# ",", ".", "_", "-" and the characters listed in +chars+ is replaced with
# "%xx" (lower-case hex), one group per byte.
#
# The bytes are obtained with String#unpack rather than by indexing the
# matched string: indexing yields an Integer only on Ruby 1.8 and made the
# format call raise on later versions.
#
# NOTE: +chars+ is interpolated into a regexp character class as is.
def encode(text, chars='')
    text.gsub(/[^a-zA-Z0-9,._#{chars}-]/) do |char|
        char.unpack('C*').map {|byte| '%%%02x' % byte}.join
    end
end
|
1466
|
+
|
1467
|
+
|
1468
|
+
# Reverse encode: replace each "%" followed by two characters with the
# character whose code is the hex value of those two characters.
def decode(text)
    text.gsub(/%(..)/) do |escape|
        format("%c", escape[1..-1].hex)
    end
end
|
1471
|
+
|
1472
|
+
|
1473
|
+
# Return the file suffix for +outformat+ (default: the first entry of
# @output_format): the value registered in @suffix, or the format itself
# if none is registered.
def output_suffix(outformat)
    selected = outformat || @output_format[0]
    @suffix[selected] || selected
end
|
1477
|
+
|
1478
|
+
|
1479
|
+
# Return the base name of the output file: the entries of @profiles
# separated by commas.
def output_basename
    separator = ','
    @profiles.join(separator)
end
|
1482
|
+
|
1483
|
+
|
1484
|
+
# Return the output file for +outformat+: the file registered in @outfile,
# or else "<output_basename>.<output_suffix>" in @cfgdir.
def get_outfile(outformat=nil)
    explicit = @outfile[outformat]
    return explicit if explicit

    basename = "#{output_basename}.#{output_suffix(outformat)}"
    File.join(@cfgdir, basename)
end
|
1487
|
+
|
1488
|
+
|
1489
|
+
# Show +outfile+ by running the @view command template with the file name
# inserted. Does nothing (and returns nil) if no viewer is set.
# Otherwise returns the result of Kernel#system.
def view_output_general(outfile)
    return nil unless @view
    command = @view % outfile
    system(command)
end
alias :view_output_html :view_output_general
alias :view_output_text :view_output_general
alias :view_output_rss :view_output_general
|
1497
|
+
|
1498
|
+
end
|
1499
|
+
|
1500
|
+
|
1501
|
+
|
1502
|
+
|
1503
|
+
# Local Variables:
|
1504
|
+
# revisionRx: REVISION\s\+=\s\+\'
|
1505
|
+
# End:
|