websitary 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ # applog.rb
2
+ # @Last Change: 2007-09-11.
3
+ # Author:: Thomas Link (micathom AT gmail com)
4
+ # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
+ # Created:: 2007-09-08.
6
+
7
+ require 'logger'
8
+
9
+
10
+ # A simple wrapper around Logger.
11
class Websitary::AppLog
  # Create the process-wide $logger.
  # output:: Log destination handed to Logger.new (default: $stdout)
  def initialize(output=nil)
    @output = output || $stdout
    $logger = Logger.new(@output, 'daily')
    $logger.progname = Websitary::APPNAME
    $logger.datetime_format = "%H:%M:%S"
    set_level
  end


  # Map a symbolic verbosity onto a Logger severity.
  # level:: :debug, :verbose or :quiet; any other value selects WARN
  def set_level(level=:default)
    severities = {
      :debug   => Logger::DEBUG,
      :verbose => Logger::INFO,
      :quiet   => Logger::ERROR
    }
    $logger.level = severities.fetch(level, Logger::WARN)
    $logger.debug "Set logger level: #{level}"
  end
end
35
+
36
+
37
+ # Local Variables:
38
+ # revisionRx: REVISION\s\+=\s\+\'
39
+ # End:
@@ -0,0 +1,1505 @@
1
+ # configuration.rb
2
+ # @Last Change: 2007-09-16.
3
+ # Author:: Thomas Link (micathom AT gmail com)
4
+ # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
+ # Created:: 2007-09-08.
6
+
7
+
8
+
9
+ # This class defines the scope in which profiles are evaluated. Most
10
+ # of its methods are suitable for use in profiles.
11
+ class Websitary::Configuration
12
+ # Hash (key = URL, value = Hash of options)
13
+ attr_accessor :urls
14
+ # Array of urls to be downloaded.
15
+ attr_accessor :todo
16
+ # Array of downloaded urls.
17
+ attr_accessor :done
18
+ # The user configuration directory
19
+ attr_accessor :cfgdir
20
+ # What to do
21
+ attr_accessor :execute
22
+ # Global Options
23
+ attr_accessor :options
24
+ # Cached mtimes
25
+ attr_accessor :mtimes
26
+ # The name of the quicklist profile
27
+ attr_accessor :quicklist_profile
28
+ # attr_accessor :default_profiles
29
+ # attr_accessor :cmd_edit
30
+
31
+
32
# Set up the defaults, choose the configuration directory, load
# 'config.rb' and evaluate the command-line arguments.
# app::  The application object (queried for execute_*/cmdline_arg_* methods)
# args:: Array of command-line arguments
def initialize(app, args=[])
  @logger = Websitary::AppLog.new
  $logger.debug "Configuration#initialize"
  @app    = app
  @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitary') : '.'

  # The Config alias of RbConfig is not available in newer rubies.
  rbconfig   = defined?(RbConfig) ? RbConfig : Config
  candidates = [
    ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitary'),
    File.join(rbconfig::CONFIG['sysconfdir'], 'websitary')
  ]
  # The first entry is nil when USERPROFILE is unset; drop it before
  # testing because File.exist?(nil) raises a TypeError.
  existing = candidates.compact.find {|dir| File.exist?(dir)}
  @cfgdir  = existing if existing

  @cmd_edit          = 'vi "%s"'
  @execute           = 'downdiff'
  @quicklist_profile = 'quicklist'
  @user_agent        = "websitary/#{Websitary::VERSION}"
  @view              = 'w3m "%s"'

  @allow            = {}
  @default_options  = {}
  @default_profiles = [@quicklist_profile]
  @done             = []
  @mtimes           = Websitary::FileMTimes.new(self)
  @outfile          = {}
  @profiles         = []
  @robots           = {}
  @todo             = []
  @urlencmap        = {}
  @urls             = {}

  @suffix = {
    'text' => 'txt'
    # 'rss' => 'xml'
  }

  migrate
  initialize_options
  profile 'config.rb'
  parse_command_line_args(args)

  @output_format ||= ['html']
  @output_title = %{#{Websitary::APPNAME}: #{@profiles.join(", ")}}
end
78
+
79
+
80
# Evaluate the command line: set options, then load every profile named
# on the command line (or the default profiles if none is given).
# If the application responds to cmdline_arg_<execute>, the profile
# names are handed to that method instead of being loaded.
# args:: Array of Strings; consumed options are removed (parse!)
# Returns self.
def parse_command_line_args(args)
  $logger.debug "parse_command_line_args: #{args}"
  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{Websitary::APPNAME} [OPTIONS] [PROFILES] > [OUT]"
    opts.separator ''
    opts.separator "#{Websitary::APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
    opts.separator 'the terms of the GNU General Public License version 2 or newer.'
    opts.separator ''

    opts.separator 'General Options:'

    opts.on('-c', '--cfg=DIR', String, 'Configuration directory') do |value|
      @cfgdir = value
    end

    opts.on('-e', '--execute=COMMAND', String, 'Define what to do (default: downdiff)') do |value|
      @execute = value
    end

    # opts.on('-E', '--edit=PROFILE', String, 'Edit a profile') do |value|
    #     edit_profile value
    #     exit 0
    # end

    opts.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |value|
      output_format(*value.split(/,/))
    end

    opts.on('--[no-]ignore-age', 'Ignore age limits') do |bool|
      set :ignore_age => bool
    end

    opts.on('--log=DESTINATION', String, 'Log destination') do |value|
      @logger = Websitary::AppLog.new(value != '-' && value)
    end

    opts.on('-o', '--output=FILENAME', String, 'Output') do |value|
      output_file(value)
    end

    opts.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |value|
      key, val = value.split(/=/, 2)
      # SECURITY: the value is evaluated as ruby code. This is only
      # acceptable because the command line is supplied by the user
      # running the program; never feed untrusted input into --set.
      set key.intern => eval(val)
    end

    opts.on('-t', '--timer=N', Numeric, 'Repeat every N seconds (never exit)') do |value|
      global(:timer => value)
    end

    # opts.on('--review', 'View last diff') do |value|
    #     view_output
    #     exit 0
    # end

    opts.separator ''
    opts.separator "Available commands (default: #@execute):"
    # Method names are Strings or Symbols depending on the ruby
    # version; to_s makes the lookup work for both.
    commands = @app.methods.map {|m| m.to_s[/^execute_(.*)$/, 1]}.compact.sort
    opts.separator commands.join(', ')

    opts.separator ''
    opts.separator 'Available profiles:'
    opts.separator Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}.join(', ')

    opts.separator ''
    opts.separator 'Other Options:'

    opts.on('--debug', 'Show debug messages') do |v|
      $VERBOSE = $DEBUG = true
      @logger.set_level(:debug)
    end

    opts.on('-q', '--quiet', 'Be mostly quiet') do |v|
      @logger.set_level(:quiet)
    end

    opts.on('-v', '--verbose', 'Run verbosely') do |v|
      $VERBOSE = true
      @logger.set_level(:verbose)
    end

    opts.on_tail('-h', '--help', 'Show this message') do
      puts opts
      exit 1
    end
  end

  @profiles = parser.parse!(args)
  @profiles = @default_profiles if @profiles.empty?
  cla_handler = "cmdline_arg_#{@execute}"
  cla_handler = nil unless @app.respond_to?(cla_handler)
  @profiles.each do |pn|
    if cla_handler
      @app.send(cla_handler, self, pn)
    else
      profile pn
    end
  end

  self
end
185
+
186
+
187
+ # Retrieve an option for an url
188
+ # url:: String
189
+ # opt:: Symbol
190
# Look up option +opt+ for a registered url.
# url::     String
# opt::     Symbol
# default:: Returned for unregistered urls or when nothing is found
#
# For :diffprocess and :format the url's :diff entry is consulted when
# the url does not set the option itself. If the option is missing, the
# url's :use entry is taken instead. Symbol values are resolved via
# get_option; if that fails (or nothing is set) and no default was
# given, the option type's :default entry is tried.
def get(url, opt, default=nil)
  url_opts = @urls[url]
  unless url_opts
    $logger.debug "Non-registered URL: #{url}"
    return default
  end
  $logger.debug "get: opts=#{url_opts.inspect}"

  key = opt
  if (opt == :diffprocess || opt == :format) && !url_opts.has_key?(opt)
    key = :diff
  end

  $logger.debug "get: opt=#{opt} opt_=#{key}"
  $logger.debug "get: #{url_opts[key]} #{url_opts[:use]}" if url_opts
  val = if url_opts.has_key?(key)
          url_opts[key]
        elsif url_opts.has_key?(:use)
          url_opts[:use]
        end

  if val.is_a?(Symbol)
    $logger.debug "get: val=#{val}"
    success, rv = get_option(opt, val)
    $logger.debug "get: #{success}, #{rv}"
    return rv if success
  elsif !val.nil?
    # Literal values (including false) are returned unchanged.
    $logger.debug "get: return val=#{val}"
    return val
  end

  unless default
    found, fallback = get_option(opt, :default)
    default = fallback if found
  end

  $logger.debug "get: return default=#{default}"
  default
end
235
+
236
+
237
# Resolve +val+ if it is a Symbol naming an entry of option type +opt+;
# any other value is returned as is.
# opt::     Symbol (option type)
# val::     Symbol or literal value
# default:: Returned when a Symbol cannot be resolved
def get_optionvalue(opt, val, default=nil)
  return val unless val.is_a?(Symbol)
  found, resolved = get_option(opt, val)
  found ? resolved : default
end
250
+
251
+
252
# Look up +val+ in the table of option type +opt+, following Symbol
# values (aliases) until a non-Symbol value is found.
# opt:: Symbol (key of @options)
# val:: Symbol (key within that option table)
# Returns [true, value] on success and [false, last_key_tried] if the
# key is unknown. A circular alias chain is reported and treated as a
# failure instead of recursing endlessly.
def get_option(opt, val)
  vals = @options[opt]
  seen = []
  loop do
    $logger.debug "val=#{val} vals=#{vals.inspect}"
    unless vals && vals.has_key?(val)
      $logger.debug "get_option no: #{opt} => #{val.inspect}"
      return [false, val]
    end
    if seen.include?(val)
      $logger.error "get_option: circular alias in #{opt}: #{(seen + [val]).inspect}"
      return [false, val]
    end
    seen << val

    rv = vals[val]
    $logger.debug "get_option ok: #{opt} => #{rv.inspect}"
    unless rv.is_a?(Symbol)
      $logger.debug "get_option true, #{rv}"
      return [true, rv]
    end
    $logger.debug "get_option re: #{rv}"
    val = rv
  end
end
271
+
272
+
273
+ # Configuration command:
274
+ # Set the default profiles
275
# profile_names:: Names of the profiles used when none is given on the
#                 command line. Returns the new list.
def default(*profile_names)
  @default_profiles = profile_names
end
278
+
279
+
280
# Set the name of the quicklist profile.
# profile_name:: String
def quicklist(profile_name)
  @quicklist_profile = profile_name
end
283
+
284
+
285
+ # Configuration command:
286
+ # Load a profile
287
# profile_name:: '-' (read one url per line from the input),
#                '__END__' (evaluate the text following __END__), or
#                the name of a profile file
# Returns the result of eval_profile, or false if nothing was
# evaluated (input read from '-', or no profile file found).
def profile(profile_name)
  if '-' == profile_name
    readlines.each {|line| source line.chomp}
    return false
  end

  if '__END__' == profile_name
    $logger.debug "Profile: __END__"
    return eval_profile(DATA.read)
  end

  fn = profile_filename(profile_name)
  return false unless fn
  $logger.debug "Profile: #{fn}"
  eval_profile(File.read(fn), fn)
end
305
+
306
+
307
+ # Define an options shortcut.
308
# symbol:: Symbol, the shortcut's name
# args::   Hash (option type => value). With :delegate => other, every
#          option type not mentioned in args gets other as the value
#          for symbol. Without :delegate a warning is logged if one of
#          the core option types is left undefined.
def shortcut(symbol, args)
  given     = args.keys
  remaining = @options.keys - given
  core      = [:download, :downloadformat, :diff, :format, :diffprocess]

  # :downloadprocess
  if given.include?(:delegate)
    remaining.each {|field| @options[field][symbol] = args[:delegate]}
  elsif remaining.any? {|field| core.include?(field)}
    $logger.warn "Shortcut #{symbol}: Undefined fields: #{remaining.inspect}"
  end

  args.each do |field, val|
    next if field == :delegate
    @options[field][symbol] = val
  end
end
329
+
330
+
331
+ # Set the output format.
332
# format:: One or more of 'text', 'html', 'rss'. Any other value is
#          logged as fatal and terminates the program (exit status 5).
def output_format(*format)
  known = ['text', 'html', 'rss']
  if format.any? {|e| !known.include?(e)}
    $logger.fatal "Unknown output format: #{format}"
    exit 5
  end
  @output_format = format
end
339
+
340
+
341
+ # Set the output file.
342
# filename::  String ('-' is printed to standard output by show_output)
# outformat:: The output format this file is used for; nil (the
#             default) stores the file under the nil key
def output_file(filename, outformat=nil)
  @outfile[outformat] = filename
end
345
+
346
+
347
+ # Configuration command:
348
+ # Set global options.
349
+ # type:: Symbol
350
+ # options:: Hash
351
# Merge +options+ into the table of option type +type+. An unknown
# type is logged as an error and otherwise ignored.
def option(type, options)
  $logger.info "option #{type}: #{options.inspect}"
  table = @options[type]
  unless table
    return $logger.error("Unknown option type: #{type} (#{options.inspect})")
  end
  table.merge!(options)
end
360
+
361
+
362
+ # Set a global option.
363
# options:: Hash (name => value); every pair is stored in the :global
#           option table.
def global(options)
  options.each {|name, value| @options[:global][name] = value}
end
368
+
369
+
370
+ # Configuration command:
371
+ # Set the default value for source-options.
372
# options:: Hash merged into the defaults applied to every source
#           defined afterwards. Returns the updated defaults.
def set(options)
  $logger.debug "set: #{options.inspect}"
  @default_options.update(options)
end
376
+
377
+
378
+ # Configuration command:
379
+ # Unset a default source-option.
380
# options:: Names of the default source-options to remove.
def unset(*options)
  options.each {|name| @default_options.delete(name)}
end
385
+
386
+
387
+ # Configuration command:
388
+ # Define a source.
389
+ # urls:: String
390
# urls:: String with one url per line
# opts:: Hash of source options; merged over a copy of the current
#        default options for each url
#
# Leading/trailing whitespace (including a trailing CR) is removed from
# each line and blank lines are skipped, so that an indented or
# blank-line separated list does not register broken or empty urls.
def source(urls, opts={})
  lines = urls.split("\n").map {|line| line.strip}
  lines.reject {|line| line.empty?}.each do |url|
    @urls[url] = @default_options.dup.update(opts)
    @todo << url
  end
end
396
+
397
+
398
+ # Configuration command:
399
+ # Set the default download processor. The block takes the
400
+ # downloaded text (STRING) as argument.
401
# block:: Stored as the :default entry of the :downloadprocess options.
def downloadprocess(&block)
  @options[:downloadprocess][:default] = block
end
404
+
405
+
406
+ # Configuration command:
407
+ # Set the default diff processor. The block takes the
408
+ # diff text (STRING) as argument.
409
# block:: Stored as the :default entry of the :diffprocess options.
#
# The block used to be stored in @options[:diff][:default], which is
# the default diff *program* (called with the old and the new file) and
# is what the #diff command sets. The processor belongs in the
# :diffprocess table, in parallel to #downloadprocess.
def diffprocess(&block)
  @options[:diffprocess][:default] = block
end
412
+
413
+
414
+ # Configuration command:
415
+ # Set the editor.
416
# cmd:: Format string of the editor command; "%s" is replaced with the
#       filename (see edit_profile).
def edit(cmd)
  @cmd_edit = cmd
end
419
+
420
+
421
+ # Configuration command:
422
+ # Set the viewer.
423
# view:: The viewer command; stored in @view.
def view(view)
  @view = view
end
426
+
427
+
428
+ # Configuration command:
429
+ # Set the default diff program.
430
# diff:: Stored as the :default entry of the :diff options.
def diff(diff)
  @options[:diff][:default] = diff
end
433
+
434
+
435
+ # Configuration command:
436
+ # Set the default downloader.
437
# download:: Stored as the :default entry of the :download options.
def download(download)
  @options[:download][:default] = download
end
440
+
441
+
442
+ # Format a diff according to URL's source options.
443
# url::      String
# difftext:: String; returned unchanged if the url has no :format
def format(url, difftext)
  formatter = get(url, :format)
  eval_arg(formatter, [difftext], difftext)
end
447
+
448
+
449
+ # Apply some arguments to a format.
450
+ # format:: String or Proc
451
+ # args:: Array of Arguments
452
# format::  nil (return default), a Proc (called with args) or a
#           format String (args are interpolated with %)
# args::    Array of arguments
# default:: Returned if format is nil
# An optional block post-processes the interpolated String; it is not
# used for Procs.
def eval_arg(format, args, default=nil, &process_string)
  return default if format.nil?

  if format.is_a?(Proc)
    # $logger.debug "eval proc: #{format} #{args.inspect}" #DBG#
    $logger.debug "eval proc: #{format}/#{args.size}"
    return format.call(*args)
  end

  text = format % args
  # $logger.debug "eval string: #{text}" #DBG#
  process_string ? process_string.call(text) : text
end
470
+
471
+
472
+ # Apply the argument to cmd (a format String or a Proc). If a
473
+ # String, execute the command.
474
# cmd::     Format String (run as a shell command after interpolation),
#           Proc (called with args) or nil
# args::    Array of arguments
# default:: Returned if cmd is nil
# For String commands the command's standard output is returned.
def call_cmd(cmd, args, default=nil)
  eval_arg(cmd, args, default) do |command_line|
    `#{command_line}`
  end
end
477
+
478
+
479
+ # Generate & view the final output.
480
+ # difftext:: Hash
481
# difftext:: Hash (url => diff text)
# Returns 0 if there is nothing to show, 1 otherwise. For every
# configured output format the text is generated by get_output_<format>
# and either printed (outfile '-') or written to the outfile and shown
# with view_output_<format>. An unknown format terminates the program
# with exit status 5.
def show_output(difftext)
  if difftext.empty?
    parts = ['No news is good news']
    parts << "try again in #{@app.format_tdiff(@app.tdiff_min)}" if @app.tdiff_min
    $logger.warn parts.join('; ')
    return 0
  end

  @output_format.each do |outformat|
    generator = "get_output_#{outformat}"
    unless respond_to?(generator)
      $logger.fatal "Unknown output format: #{outformat}"
      exit 5
    end

    out = send(generator, difftext)
    next unless out

    outfile = get_outfile(outformat)
    if '-' == outfile
      puts out
    else
      write_file(outfile) {|io| io.puts out}
      send("view_output_#{outformat}", outfile)
    end
  end
  1
end
512
+
513
+
514
# Build the plain-text report.
# difftext:: Hash (url => diff text)
# Each non-empty diff becomes a section consisting of the (possibly
# rewritten) link, the annotation, a blank line and the text; sections
# are separated by a dashed rule. HTML diffs are converted to text.
#
# Entries whose text is nil or empty are skipped. (Empty texts used to
# yield +false+, which survived #compact and showed up as a literal
# "false" section in the output.)
def get_output_text(difftext)
  separator = "\n\n#{('-' * 68)}\n\n"
  sections = difftext.map do |url, text|
    next unless text
    text = html_to_text(text) if is_html?(text)
    next if text.empty?
    [
      eval_arg(get(url, :rewrite_link, '%s'), [url]),
      difftext_annotation(url),
      nil,
      text
    ].join("\n")
  end
  sections.compact.join(separator)
end
527
+
528
+
529
# Build the RSS report.
# difftext:: Hash (url => diff text)
# Requires the global option :rss => :url; without it the program
# terminates with exit status 5. Returns the feed as a String.
def get_output_rss(difftext)
  success, rss_url = get_option(:rss, :url)
  unless success
    $logger.fatal "Global option :rss[:url] not defined."
    exit 5
  end

  has_version, rss_version = get_option(:rss, :version)
  # A failed lookup returns the key itself; fall back to the version
  # that initialize_options sets by default.
  rss_version = '2.0' unless has_version
  # require "rss/#{rss_version}"

  rss  = RSS::Rss.new(rss_version)
  chan = RSS::Rss::Channel.new
  chan.title = @output_title
  [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
    ok, val = get_option(:rss, field)
    # These are channel-level fields. They used to be sent to +item+,
    # which is not defined here and raised a NameError.
    chan.send(format_symbol(field, '%s='), val) if ok
  end
  chan.link   = rss_url
  rss.channel = chan

  difftext.each do |url, text|
    rss_format = get(url, :rss_format, 'plain_text')
    text = strip_tags(text, :format => rss_format)
    next if text.empty?

    item = RSS::Rss::Channel::Item.new
    item.date  = Time.now
    item.title = get(url, :title, File.basename(url))
    item.link  = eval_arg(get(url, :rewrite_link, '%s'), [url])
    [:author, :date, :enclosure, :category, :pubDate].each do |field|
      val = get(url, format_symbol(field, 'rss_%s'))
      item.send(format_symbol(field, '%s='), val) if val
    end

    annotation = difftext_annotation(url)
    annotation = "<pre>#{annotation}</pre>" if annotation
    case rss_format
    when 'plain_text'
      item.description = %{#{annotation}<pre>#{text}</pre>}
    else
      item.description = %{#{annotation}\n#{text}}
    end
    chan.items << item
  end

  rss.to_s
end
579
+
580
+
581
# Build the HTML report.
# difftext:: Hash (url => diff text)
# Tags listed in a url's :strip_tags option are removed, empty entries
# are dropped and the rest is sorted with sort_difftext!. The page
# template (:page => :format, falling back to :page => :simple) is
# filled with the title, the table of contents and the entries.
def get_output_html(difftext)
  entries = difftext.map do |url, text|
    tags = get(url, :strip_tags)
    text = strip_tags(text, :tags => tags) if tags
    text.empty? ? nil : [url, text]
  end
  entries.compact!
  sort_difftext!(entries)

  toc_items = entries.map do |url, text|
    title = get(url, :title, File.basename(url))
    tid   = html_toc_id(url)
    bid   = html_body_id(url)
    %{<li id="#{tid}" class="toc"><a class="toc" href="\##{bid}">#{title}</a></li>}
  end
  toc = toc_items.join("\n")

  position = 0
  bodies = entries.map do |url, text|
    position += 1
    ti      = get(url, :title, File.basename(url))
    bid     = html_body_id(url)
    rewrite = get(url, :rewrite_link)
    if rewrite
      urlr = eval_arg(rewrite, [url])
      ext  = ''
    else
      # Without a rewritten link, offer the cached copies as well.
      old  = %{<a class="old" href="#{file_url(oldname(url))}">old</a>}
      lst  = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
      ext  = %{ (#{old}, #{lst})}
      urlr = url
    end
    note = difftext_annotation(url)
    <<HTML
<div id="#{bid}" class="webpage">
<div class="count">
#{position}
</div>
<h1 class="diff">
<a class="external" href="#{urlr}">#{ti}</a>#{ext}
</h1>
<div class="annotation">
#{note && CGI::escapeHTML(note)}
</div>
<div class="diff,difftext">
#{format(url, text)}
</div>
</div>
HTML
  end
  cnt = bodies.join(('<hr class="separator"/>') + "\n")

  found, template = get_option(:page, :format)
  found, template = get_option(:page, :simple) unless found
  eval_arg(template, [@output_title, toc, cnt])
end
636
+
637
+
638
+ # Get the diff filename.
639
# url::        String
# ensure_dir:: Passed on to encoded_filename
# The name is built in the 'diff' directory with cache type 'md5'.
def diffname(url, ensure_dir=false)
  encoded_filename('diff', url, ensure_dir, 'md5')
end
642
+
643
+
644
+ # Get the backup filename.
645
# url::        String
# ensure_dir:: Passed on to encoded_filename
# type::       Cache type; nil lets encoded_filename decide
# The name is built in the 'old' directory.
def oldname(url, ensure_dir=false, type=nil)
  encoded_filename('old', url, ensure_dir, type)
end
648
+
649
+
650
+ # Get the filename for the freshly downloaded copy.
651
# url::        String
# ensure_dir:: Passed on to encoded_filename
# type::       Cache type; nil lets encoded_filename decide
# The name is built in the 'latest' directory.
def latestname(url, ensure_dir=false, type=nil)
  encoded_filename('latest', url, ensure_dir, type)
end
654
+
655
+
656
# Return the url that encoded_filename mapped to +filename+, or nil
# (with a warning) if the filename was not produced in this run.
def url_from_filename(filename)
  url = @urlencmap[filename]
  unless url
    $logger.warn "Unmapped filename: #{filename}"
    return url
  end
  $logger.debug "Map filename: #{filename} -> #{url}"
  url
end
665
+
666
+
667
# Return the cache filename for +url+ below <cfgdir>/<dir> and remember
# the filename => url mapping (see url_from_filename).
# dir::        Subdirectory of the configuration directory
# url::        String
# ensure_dir:: If true, ask the application to create the directory
# type::       Cache type (default: the url's :cachetype or 'tree')
#
# The md5 encoded name is used instead if the directory cannot be
# ensured, if the name exceeds :global => :filename_size (default 255)
# or if the name denotes an existing directory.
def encoded_filename(dir, url, ensure_dir=false, type=nil)
  type ||= get(url, :cachetype, 'tree')
  $logger.debug "encoded_filename: type=#{type} url=#{url}"
  rv = File.join(@cfgdir, dir, encoded_basename(url, type))
  rd = File.dirname(rv)
  $logger.debug "encoded_filename: rv0=#{rv}"
  fm = get_optionvalue(:global, :filename_size, 255)
  rdok = !ensure_dir || @app.ensure_dir(rd, false)
  if !rdok || rv.size > fm || File.directory?(rv)
    # $logger.debug "Filename too long (:global=>:filename_size = #{fm}), try md5 encoded filename instead: #{url}"
    $logger.info "Can't use filename, try 'md5' instead: #{url}"
    rv = File.join(@cfgdir, dir, encoded_basename(url, :md5))
    # The directory of the fallback name used to be computed but never
    # ensured, although the caller asked for an existing directory.
    @app.ensure_dir(File.dirname(rv), false) if ensure_dir
  end
  @urlencmap[rv] = url
  rv
end
684
+
685
+
686
# Dispatch to encoded_basename_<type>.
# url::  String
# type:: Cache type; an unknown type is logged as fatal and terminates
#        the program with exit status 5
def encoded_basename(url, type='tree')
  encoder = "encoded_basename_#{type}"
  return send(encoder, url) if respond_to?(encoder)
  $logger.fatal "Unknown cache type: #{type}"
  exit 5
end
695
+
696
+
697
# Cache type 'tree': encode the url with '/' as second argument to
# encode and pass the result through ensure_filename.
def encoded_basename_tree(url)
  encoded = encode(url, '/')
  ensure_filename(encoded)
end
700
+
701
+
702
# Cache type 'flat': the url as encoded by encode (single argument).
def encoded_basename_flat(url)
  encode(url)
end
705
+
706
+
707
# Cache type 'md5': the hexadecimal MD5 digest of the url.
def encoded_basename_md5(url)
  Digest::MD5.new.update(url).hexdigest
end
710
+
711
+
712
# Return the extension of the url's path (e.g. ".html"), or nil if the
# url cannot be parsed or has no path.
# Only StandardError is rescued so that interrupts and exit requests
# are no longer swallowed.
def urlextname(url)
  File.extname(URI.parse(url).path)
rescue StandardError
  nil
end
718
+
719
+
720
+ # Guess path's dirname.
721
+ # foo/bar -> foo
722
+ # foo/bar.txt -> foo
723
+ # foo/bar/ -> foo/bar
724
# A path ending in '/' is taken to be a directory and only loses the
# trailing slash; for any other path File.dirname is returned.
def guess_dir(path)
  return File.dirname(path) unless path[-1..-1] == '/'
  path[0..-2]
end
727
+
728
+
729
+ # Strip the url's last part (after #).
730
# url:: String; returns a copy without the first '#' and everything
#       following it on that line.
def canonic_url(url)
  url.sub(/#.*$/) { '' }
end
733
+
734
+
735
# Return a copy of the :strip_tags => :default option, or nil if it is
# not defined.
def strip_tags_default
  found, tags = get_option(:strip_tags, :default)
  found ? tags.dup : nil
end
739
+
740
+
741
# Remove elements from a document.
# doc::  String (parsed with Hpricot) or an already parsed document
# args:: Hash
#        :tags   => Array of search expressions (default: strip_tags_default)
#        :format => :hpricot returns the document itself; any other
#                   value X returns doc.to_X (default: to_html)
def strip_tags(doc, args={})
  tags = args[:tags] || strip_tags_default
  doc  = Hpricot(doc) if doc.is_a?(String)
  tags.each {|tag| doc.search(tag).remove}
  return doc if args[:format] == :hpricot
  doc.send("to_#{args[:format] || :html}")
end
757
+
758
+
759
+ # Check whether path is eligible on the basis of url or path0.
760
+ # This checks either for a :match option for url or the extensions
761
+ # of path0 and path.
762
# url::   String; its :match option (a regular expression), if set,
#         is matched against path
# path0:: Reference path; used only if there is no :match option
# path::  The path to check
# Returns the match position or nil when :match is set, otherwise
# true/false depending on whether both extensions are equal.
def eligible_path?(url, path0, path)
  pattern = get(url, :match)
  return path =~ pattern if pattern
  File.extname(path0) == File.extname(path)
end
770
+
771
+
772
+ # Scan hpricot document for hrefs and push them onto @todo if not
773
+ # already included.
774
# url::       String, the url of the document
# hpricot::   The parsed document
# condition:: Block called with (uri0, pn0, uri, pn): URI and directory
#             (Pathname) of the document and of the link; a link is
#             queued only if the block returns true
#
# Nothing is queued if the document is marked 'nofollow' or the url's
# :depth is exhausted. Queued links inherit the url's options; the
# title gets the link's basename appended and :depth is decremented.
#
# A link that cannot be parsed or rewritten is skipped. (The nil check
# on the rewritten href used to come after canonic_url(href), so a
# single such link raised and ended the scan of all remaining links.)
def push_hrefs(url, hpricot, &condition)
  return if robots?(hpricot, 'nofollow')
  depth = get(url, :depth)
  return if depth && depth <= 0
  uri0 = URI.parse(url)
  # pn0 = Pathname.new(guess_dir(File.expand_path(uri0.path)))
  pn0 = Pathname.new(guess_dir(uri0.path))
  (hpricot / 'a').each do |a|
    href = a['href']
    next if href.nil? || href == url || href =~ /^\s*javascript:/
    begin
      uri = URI.parse(href)
    rescue URI::InvalidURIError => e
      $logger.debug "push_hrefs: skip #{href.inspect}: #{e.message}"
      next
    end
    href = rewrite_href(href, url, uri0, pn0, true)
    next if href.nil?
    curl = canonic_url(href)
    next if @done.include?(curl) || @todo.include?(curl)
    # NOTE: robots_allowed? receives the URI of the href as found in
    # the document, i.e. before it was rewritten.
    next unless robots_allowed?(curl, uri)
    # pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
    uri = URI.parse(href)
    pn  = Pathname.new(guess_dir(uri.path))
    next unless condition.call(uri0, pn0, uri, pn)
    opts = @urls[url].dup
    # opts[:title] = File.basename(curl)
    # compact: no leading " - " if the parent has no title.
    opts[:title] = [opts[:title], File.basename(curl)].compact.join(' - ')
    opts[:depth] = depth - 1 if depth && depth >= 0
    @urls[curl] = opts
    @todo << curl
  end
rescue StandardError => e
  # $logger.error e #DBG#
  $logger.error e.message
  $logger.debug e.backtrace
end
809
+
810
+
811
+ # Rewrite urls in doc
812
+ # url:: String
813
+ # doc:: Hpricot document
814
# Rewrite the href of every <a> and the src of every <img> via
# rewrite_href; attributes that cannot be rewritten are left alone.
# Links are rewritten with local=true, images with local=false.
# Returns doc.
def rewrite_urls(url, doc)
  base     = URI.parse(url)
  base_dir = guess_dir(base.path)
  [['a', 'href', true], ['img', 'src', false]].each do |tag, attribute, local|
    (doc / tag).each do |elt|
      target = rewrite_href(elt[attribute], url, base, base_dir, local)
      elt[attribute] = target if target
    end
  end
  doc
end
827
+
828
+
829
+ # Try to make href an absolute url.
830
# href::  String as found in the document (may be nil)
# url::   String, the url of the document
# uri::   URI of the document (default: parsed from url)
# urd::   Directory of the document's path (default: guessed from uri);
#         only used if the document's url is itself relative
# local:: If true, a resolved url that is queued, done or already
#         cached is replaced with the name of the cached file
# Returns a String, or nil if the href is missing, a javascript: link
# or cannot be parsed/resolved.
#
# Changes to the former behaviour:
# * whitespace is stripped before the href is parsed, so padded hrefs
#   are no longer rejected as invalid URIs;
# * only a leading scheme ("http:", "mailto:", ...) marks an absolute
#   href -- a colon elsewhere (e.g. in a query) no longer prevents a
#   relative href from being resolved;
# * StandardError is rescued instead of Exception.
def rewrite_href(href, url, uri=nil, urd=nil, local=false)
  return nil if !href || href =~ /^\s*javascript:/
  href = href.strip
  urh  = URI.parse(href)
  uri ||= URI.parse(url)

  # Absolute hrefs are passed through unchanged.
  return href if href =~ /\A[A-Za-z][A-Za-z0-9+.\-]*:/ || !urh.relative?

  if uri.relative?
    # Both relative: only a plain path can be joined.
    return nil unless uri.instance_of?(URI::Generic)
    return File.join(urd || guess_dir(uri.path), href)
  end

  rv = uri.merge(href).to_s
  if local
    hf = latestname(rv)
    rv = hf if @todo.include?(rv) || @done.include?(rv) || File.exist?(hf)
  end
  rv
rescue StandardError => e
  # $logger.error e #DBG#
  $logger.error e.message
  $logger.debug e.backtrace
  nil
end
889
+
890
+
891
+ # Return a Proc that takes a text as argument and highlights occurrences of rx.
892
+ # rx:: Regular expression
893
+ # color:: A string, sets the class to highlight-color (default: "yellow")
894
+ # group:: A number (default: 0)
895
+ # tag:: The HTML tag to use (default: "span")
896
# The returned Proc wraps every match of rx (or the given match group)
# in <tag class="highlight-COLOR">...</tag>.
def highlighter(rx, color=nil, group=nil, tag='span')
  css_class   = "highlight-#{color || 'yellow'}"
  # "\\N" is a back-reference to match group N in the replacement.
  replacement = %{<#{tag} class="#{css_class}">\\#{group || 0}</#{tag}>}
  lambda {|text| text.gsub(rx, replacement)}
end
899
+
900
+
901
# Show an output file with the viewer of the first output format.
# outfile:: Filename (default: the result of get_outfile)
def view_output(outfile=nil)
  viewer = "view_output_#{@output_format[0]}"
  send(viewer, outfile || get_outfile)
end
904
+
905
+
906
# Run the editor command (@cmd_edit) on a profile.
# profile:: Profile name or Array of names (default: the profiles
#           selected on the command line)
def edit_profile(profile=nil)
  profile ||= @profiles
  return profile.each {|p| edit_profile p} if profile.is_a?(Array)

  fn = profile_filename(profile)
  $logger.debug "edit: #{fn}"
  `#{@cmd_edit % fn}`
end
917
+
918
+
919
# Locate a profile file, first in the current directory, then in the
# configuration directory.
# profile_name::      Name with or without the '.rb' suffix
# check_file_exists:: If true (default), return nil when no such file
#                     exists; otherwise return the name the file would
#                     have in the configuration directory
#
# Uses File.exist? (as rewrite_href does); File.exists? is deprecated
# and no longer available in current rubies.
def profile_filename(profile_name, check_file_exists=true)
  profile_name = "#{profile_name}.rb" unless File.extname(profile_name) == '.rb'
  candidates = ['.', @cfgdir].map {|dir| File.join(dir, profile_name)}
  found = candidates.find {|filename| File.exist?(filename)}
  return found if found
  check_file_exists ? nil : candidates.last
end
932
+
933
+
934
# Open +filename+ with +mode+, pass the IO to the block and register
# the file with the mtimes cache afterwards.
def write_file(filename, mode='w', &block)
  File.open(filename, mode) do |io|
    block.call(io)
  end
  @mtimes.set(filename)
end
938
+
939
+
940
# Pass +filename+ through the command defined as
# :global => :canonic_filename; without such a command the filename is
# returned unchanged.
def canonic_filename(filename)
  converter = get_optionvalue(:global, :canonic_filename)
  call_cmd(converter, [filename], filename)
end
943
+
944
+
945
+ private
946
+ def initialize_options
947
+ @options = {
948
+ :global => {
949
+ :downloadhtml => :openuri,
950
+ },
951
+ }
952
+
953
+ @options[:diff] = {
954
+ :default => :diff,
955
+
956
+ :diff => lambda {|old, new, *args|
957
+ opts, _ = args
958
+ opts ||= '-d -w'
959
+ difftext = call_cmd('diff %s -u2 "%s" "%s"', [opts, old, new])
960
+ difftext = difftext.split("\n")[2..-1]
961
+ difftext ? difftext.delete_if {|l| l =~ /^[^+]/}.map {|l| l[1..-1]}.join("\n") : ''
962
+ },
963
+
964
+ :binary => lambda {|old, new|
965
+ call_cmd(get_optionvalue(:diff, :diff), [old, new, '--binary -d -w'])
966
+ },
967
+
968
+ :new => lambda {|old, new|
969
+ difftext = call_cmd(get_optionvalue(:diff, :binary), [old, new])
970
+ difftext.empty? ? '' : new
971
+ },
972
+
973
+ :raw => :new,
974
+
975
+ :webdiff => lambda {|old, new|
976
+ $logger.debug "webdiff: #{File.basename(new)}"
977
+ $logger.debug %{webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -}
978
+ difftext = `webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -`
979
+ $?.exitstatus == 1 ? difftext : ''
980
+ },
981
+ }
982
+
983
+ @options[:format] = {
984
+ :default => :diff,
985
+ :diff => %{<pre class="diff">\n%s\n</pre>},
986
+ :webdiff => "%s\n",
987
+ :raw => lambda {|new| File.read(new)},
988
+ }
989
+
990
+ @options[:diffprocess] = {
991
+ :default => :diff,
992
+ :diff => false,
993
+ :webdiff => false,
994
+ :raw => false,
995
+ }
996
+
997
+ @options[:download] = {
998
+ :default => :w3m,
999
+ :raw => :openuri,
1000
+ }
1001
+
1002
+ @options[:downloadformat] = {
1003
+ :w3m => 'text',
1004
+ :webdiff => 'html',
1005
+ :raw => '',
1006
+ }
1007
+
1008
+ @options[:downloadprocess] = {
1009
+ }
1010
+
1011
+ @options[:rss] = {
1012
+ :version => '2.0',
1013
+ }
1014
+
1015
+ @options[:strip_tags] = {
1016
+ :default => ['script', 'object', 'form', 'input', 'select', 'iframe', 'head', 'meta', 'link'],
1017
+ }
1018
+
1019
+ shortcut :w3m, :delegate => :diff,
1020
+ :download => 'w3m -S -F -dump "%s"'
1021
+ # :download => 'w3m -no-cookie -S -F -dump "%s"'
1022
+
1023
+ shortcut :lynx, :delegate => :diff,
1024
+ :download => 'lynx -dump "%s"'
1025
+
1026
+ shortcut :links, :delegate => :diff,
1027
+ :download => 'links -dump "%s"'
1028
+
1029
+ shortcut :curl, :delegate => :webdiff,
1030
+ :download => 'curl --silent "%s"'
1031
+
1032
+ shortcut :wget, :delegate => :webdiff,
1033
+ :download => 'wget -q -O - "%s"'
1034
+
1035
+ shortcut :text, :delegate => :diff,
1036
+ :download => lambda {|url| html_to_text(open_url(url).read)}
1037
+
1038
+ shortcut :body_html, :delegate => :webdiff,
1039
+ :strip_tags => :default,
1040
+ :download => lambda {|url|
1041
+ begin
1042
+ doc = Hpricot(open_url(url).read)
1043
+ doc = doc.at('body')
1044
+ if doc
1045
+ doc = rewrite_urls(url, doc)
1046
+ doc = doc.inner_html
1047
+ if (tags = get(url, :strip_tags))
1048
+ doc = strip_tags(doc, :format => :hpricot, :tags => tags)
1049
+ end
1050
+ else
1051
+ $logger.warn 'inner html: No body'
1052
+ end
1053
+ doc.to_s
1054
+ rescue Exception => e
1055
+ # $logger.error e #DBG#
1056
+ $logger.error e.message
1057
+ $logger.debug e.backtrace
1058
+ break %{<pre class="error">\n#{e.message}\n</pre>}
1059
+ end
1060
+ }
1061
+
1062
+ shortcut :openuri, :delegate => :webdiff,
1063
+ :download => lambda {|url|
1064
+ begin
1065
+ open_url(url).read
1066
+ rescue Exception => e
1067
+ # $logger.error e #DBG#
1068
+ $logger.error e.message
1069
+ $logger.debug e.backtrace
1070
+ %{<pre class="error">\n#{e.to_s}\n</pre>}
1071
+ end
1072
+ }
1073
+
1074
+ shortcut :rss,
1075
+ :delegate => :openuri,
1076
+ :diff => lambda {|old, new|
1077
+ success, rss_version = get_option(:rss, :version)
1078
+ ro = RSS::Parser.parse(File.read(old), false)
1079
+ if ro
1080
+ rh = {}
1081
+ ro.items.each do |item|
1082
+ rh[Digest::MD5.hexdigest(item.to_s)] = item
1083
+ rh[item.link] = item
1084
+ end
1085
+ rnew = []
1086
+ rn = RSS::Parser.parse(File.read(new), false)
1087
+ if rn
1088
+ rn.items.each do |item|
1089
+ rid = Digest::MD5.hexdigest(item.to_s)
1090
+ if !rh[rid]
1091
+ if (olditem = rh[item.link])
1092
+ rss_diff = Websitary::Htmldiff.new(:oldtext => olditem.description, :newtext => item.description).process
1093
+ rnew << format_rss_item(item, rss_diff)
1094
+ else
1095
+ if item.enclosure and (curl = item.enclosure.url)
1096
+ url = url_from_filename(new)
1097
+ dir = get(url, :rss_enclosure)
1098
+ curl = rewrite_href(curl, url, nil, nil, true)
1099
+ next unless curl
1100
+ if dir
1101
+ if dir == true
1102
+ dir = File.join(@cfgdir, 'attachments', encode(rn.channel.title))
1103
+ end
1104
+ @app.ensure_dir(dir)
1105
+ $logger.debug "Enclosure URL: #{curl}"
1106
+ fname = File.join(dir, encode(File.basename(curl) || item.title || item.pubDate.to_s || Time.now.to_s))
1107
+ $logger.debug "Enclosure save to: #{fname}"
1108
+ enc = open_url(curl).read
1109
+ write_file(fname, 'wb') {|io| io.puts enc}
1110
+ furl = file_url(fname)
1111
+ enclosure = %{<p class="enclosure"><a href="%s" class="enclosure" />Enclosure (local copy)</a></p>} % furl
1112
+ if get(url, :rss_rewrite_enclosed_urls)
1113
+ item.description.gsub!(Regexp.new(Regexp.escape(curl))) {|t| furl}
1114
+ end
1115
+ else
1116
+ enclosure = %{<p class="enclosure"><a href="%s" class="enclosure" />Original Enclosure</a></p>} % curl
1117
+ end
1118
+ else
1119
+ enclosure = ''
1120
+ end
1121
+ rnew << format_rss_item(item, item.description, enclosure)
1122
+ end
1123
+ end
1124
+ end
1125
+ rnew.join("\n")
1126
+ end
1127
+ end
1128
+ }
1129
+
1130
# Shortcut :opml -- the url points to an OPML outline. Every outline
# element that has an xmlurl attribute and whose type matches /rss/ is
# registered as a url of its own: it gets a copy of the options of the
# OPML url, :download is set to :rss, a title is taken from the outline
# element, and the feed url is appended to @todo. Other outline types are
# reported with a warning. The download itself always yields nil.
shortcut :opml, :delegate => :rss,
  :download => lambda {|url|
    opml = open(url) {|io| io.read}
    # Fixed: this guard used to test the misspelled name "oplm", which
    # is undefined and made every call fail with a NameError.
    if opml
      xml = Hpricot(opml)
      # <+TBD+>Well, maybe we should search for outline[@type=rss]?
      xml.search('//outline[@xmlurl]').each {|elt|
        if elt['type'] =~ /rss/
          curl = elt['xmlurl']
          # Each feed starts from a copy of the options of the OPML url.
          opts = @urls[url].dup
          opts[:download] = :rss
          opts[:title] = elt['title'] || elt['text'] || elt['htmlurl'] || curl
          @urls[curl] = opts
          @todo << curl
        else
          $logger.warn "Unsupported type in OPML: #{elt.to_s}"
        end
      }
    end
    nil
  }
1151
+
1152
+ shortcut :website, :delegate => :webdiff,
1153
+ :download => lambda {|url| get_website(:body_html, url)}
1154
+
1155
+ shortcut :website_below, :delegate => :webdiff,
1156
+ :download => lambda {|url| get_website_below(:body_html, url)}
1157
+
1158
+ shortcut :website_txt, :delegate => :default,
1159
+ :download => lambda {|url| html_to_text(get_website(get(url, :downloadhtml, :openuri), url))}
1160
+
1161
+ shortcut :website_txt_below, :delegate => :default,
1162
+ :download => lambda {|url| html_to_text(get_website_below(get(url, :downloadhtml, :openuri), url))}
1163
+
1164
+ shortcut :ftp, :delegate => :default,
1165
+ :download => lambda {|url| get_ftp(url).join("\n")}
1166
+
1167
+ shortcut :ftp_recursive, :delegate => :default,
1168
+ :download => lambda {|url|
1169
+ list = get_ftp(url)
1170
+ depth = get(url, :depth)
1171
+ if !depth or depth >= 0
1172
+ dirs = list.find_all {|e| e =~ /^d/}
1173
+ dirs.each do |l|
1174
+ sl = l.scan(/^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+ +\S+ +\S+)\s+(.+)$/)
1175
+ perms, type, owner, group, size, date, dirname = sl[0]
1176
+ curl = File.join(url, dirname, '')
1177
+ opts = @urls[url].dup
1178
+ opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
1179
+ opts[:depth] = depth - 1 if depth and depth >= 0
1180
+ @urls[curl] = opts
1181
+ @todo << curl
1182
+ end
1183
+ end
1184
+ list.join("\n")
1185
+ }
1186
+
1187
+ shortcut :img, :delegate => :raw,
1188
+ :format => lambda {|new|
1189
+ file = file_url(new)
1190
+ %{<img src="#{file}" />}
1191
+ }
1192
+
1193
+ @options[:page] = {
1194
+ :format => lambda do |ti, li, bd|
1195
+ template = <<OUT
1196
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
1197
+ <html>
1198
+ <head>
1199
+ <title>%s</title>
1200
+ <link rel="stylesheet" href="websitary.css" type="text/css">
1201
+ <link rel="alternate" href="websitary.rss" type="application/rss+xml" title="%s">
1202
+ </head>
1203
+ <body>
1204
+ <ol class="toc">
1205
+ %s
1206
+ </ol>
1207
+ <div class="contents">
1208
+ %s
1209
+ </div>
1210
+ </body>
1211
+ </html>
1212
+ OUT
1213
+ template % [ti, ti, li, bd]
1214
+ end
1215
+ }
1216
+ end
1217
+
1218
+
1219
# Bring the data below @cfgdir up to the running version.
#
# The version the data was written with is read from 'version.yml' in
# @cfgdir; if that file is missing, '0.1.0' is assumed. Nothing is done
# when the stored version equals Websitary::VERSION. Otherwise
# migrate_0_1_0 is run for stored versions up to and including 0.1.0 and
# the running version is written back to 'version.yml'.
def migrate
  version_file = File.join(@cfgdir, 'version.yml')
  recorded = File.exist?(version_file)
  stored = recorded ? YAML.load_file(version_file) : '0.1.0'
  return if recorded && stored == Websitary::VERSION

  numbers = stored.split(/\./).map {|part| part.to_i}
  # Array#<=> yields 1 only when the stored version is newer than 0.1.0.
  migrate_0_1_0 unless (numbers <=> [0, 1, 0]) == 1
  write_file(version_file) {|io| YAML.dump(Websitary::VERSION, io)}
end
1231
+
1232
+
1233
# Move the files in the 'latest' and 'old' directories below @cfgdir to
# the names computed by encoded_filename. The url of each file is
# recovered by decoding its basename.
def migrate_0_1_0
  $logger.warn "Migrate data from version 0.1.0"
  %w[latest old].each do |subdir|
    Dir[File.join(@cfgdir, subdir, '*')].each do |path|
      source_url = decode(File.basename(path))
      target = encoded_filename(subdir, source_url, true)
      @app.move(path, target)
    end
  end
end
1244
+
1245
+
1246
# Evaluate the text of a profile in the scope of this object.
#
# @current_profile is set to profile_file while the profile runs and is
# reset to nil afterwards, also when the profile raises. Returns true if
# the evaluation finished without an exception.
def eval_profile(contents, profile_file=nil)
  @current_profile = profile_file
  instance_eval(contents)
  true
ensure
  @current_profile = nil
end
1255
+
1256
+
1257
# Download url with the :download command named by +download+ and pass
# the links found in the page to push_hrefs.
#
# A link is accepted when eligible_path? agrees and it points to the
# same host as the page. Returns the downloaded html, or nil when the
# page carries a robots "noindex" directive.
def get_website(download, url)
  html = call_cmd(get_optionvalue(:download, download), [url])
  return html unless html

  doc = Hpricot(html)
  return html unless doc
  return if robots?(doc, 'noindex')

  push_hrefs(url, doc) do |uri0, pn0, uri, pn|
    eligible_path?(url, uri0.path, uri.path) && uri.host == uri0.host
  end
  html
end
1271
+
1272
+
1273
# Download url with the :download command named by +download+ and pass
# the links found in the page to push_hrefs.
#
# A link is accepted when eligible_path? agrees, it points to the same
# host as the page, and its pathname is either '.' or '.' relative to
# the page's pathname. Returns the downloaded html, or nil when the page
# carries a robots "noindex" directive.
def get_website_below(download, url)
  downloader = get_optionvalue(:download, download)
  html = call_cmd(downloader, [url])
  return html unless html

  doc = Hpricot(html)
  return html unless doc
  return if robots?(doc, 'noindex')

  push_hrefs(url, doc) do |uri0, pn0, uri, pn|
    eligible_path?(url, uri0.path, uri.path) &&
      uri.host == uri0.host &&
      (pn.to_s == '.' || pn.relative_path_from(pn0).to_s == '.')
  end
  html
end
1289
+
1290
+
1291
# Return the directory listing (an array of lines as delivered by
# Net::FTP#list) of the ftp directory url. The connection uses passive
# mode and the login is done without credentials.
#
# Errors raised while logging in, changing the directory or listing are
# logged and an empty array is returned, so that callers can always
# treat the result as a list (they call join and find_all on it).
# Previously the return value of the logger call was returned in that
# case.
def get_ftp(url)
  uri = URI.parse(url)
  ftp = Net::FTP.new(uri.host)
  ftp.passive = true
  begin
    ftp.login
    ftp.chdir(uri.path)
    ftp.list('*')
  rescue StandardError => e
    # StandardError rather than Exception: interrupts and exit requests
    # must not be swallowed here.
    $logger.error e
    []
  ensure
    ftp.close
  end
end
1305
+
1306
+
1307
# The id used for the url's entry in the table of contents: the letter
# 't' followed by the MD5 hex digest of the url.
def html_toc_id(url)
  "t#{Digest::MD5.hexdigest(url)}"
end
1310
+
1311
+
1312
# The id used for the url's section in the body: the letter 'b'
# followed by the MD5 hex digest of the url.
def html_body_id(url)
  "b#{Digest::MD5.hexdigest(url)}"
end
1315
+
1316
+
1317
# Turn filename into a name that can be used for a regular file.
#
# Runs of slashes are collapsed into a single File::SEPARATOR. If the
# result ends with a separator, or consists of exactly two parts the
# first of which looks like an encoded scheme ("http%3a"), the
# placeholder '__WEBSITARY__' is appended as file name. Otherwise the
# collapsed name is returned.
def ensure_filename(filename)
  sep = Regexp.escape(File::SEPARATOR)
  collapsed = filename.gsub(/[\/]{2,}/, File::SEPARATOR)
  return File.join(collapsed, '__WEBSITARY__') if collapsed =~ /#{sep}$/

  segments = collapsed.split(/#{sep}/)
  scheme_and_host_only = segments.size == 2 && segments[0] =~ /^\w+%3a$/
  scheme_and_host_only ? File.join(collapsed, '__WEBSITARY__') : collapsed
end
1331
+
1332
+
1333
# Open url and return whatever open returns for it.
#
# Urls that parse as a plain URI::Generic or that use the file scheme
# are opened without any options. All other urls are opened with a
# header hash made of the configured User-Agent and the url's :header
# option.
def open_url(url)
  $logger.debug "Open URL: #{url}"
  parsed = URI.parse(url)
  return open(url) if parsed.instance_of?(URI::Generic) || parsed.scheme == 'file'

  request_header = {"User-Agent" => @user_agent}.merge(get(url, :header, {}))
  open(url, request_header)
end
1344
+
1345
+
1346
# Build the annotation shown with the diff of url.
#
# The url's :format_annotation option (default '%s >>> %s') is evaluated
# with the modification times of the backup copy and of the latest copy.
# Returns nil unless both files exist.
def difftext_annotation(url)
  previous = oldname(url)
  current = latestname(url)
  return unless File.exist?(previous) && File.exist?(current)

  template = get(url, :format_annotation, '%s >>> %s')
  eval_arg(template, [@mtimes.mtime(previous), @mtimes.mtime(current)])
end
1353
+
1354
+
1355
# Insert the string form of name into format_string and return the
# result as a symbol.
def format_symbol(name, format_string)
  format(format_string, name.to_s).to_sym
end
1358
+
1359
+
1360
# Render one rss item as html: a heading that links to the item and
# shows its title, the author (if any) and the publication date,
# followed by a div that contains body and enclosure.
def format_rss_item(item, body, enclosure='')
  heading = item.author ? "#{item.title} (#{item.author})" : "#{item.title}"
  [
    %{<h2 class="rss"><a class="rss" href="#{item.link}">#{heading} -- #{item.pubDate}</a></h2>},
    %{<div class="rss">},
    "#{body}",
    "#{enclosure}",
    %{</div>},
    ''
  ].join("\n")
end
1371
+
1372
# Guess whether text is html rather than plain text: it is taken for
# html if it contains the start of one of a few common tags. Returns
# the position of the first such tag, or nil.
def is_html?(text)
  tag_start = /<(div|a|span|body|html|script|p|table|td|tr|th|li|dt|br|hr|em|b)\b/
  text =~ tag_start
end
1376
+
1377
+
1378
# Convert html to plain text using hpricot. A nil or false argument is
# returned unchanged.
def html_to_text(text)
  return text unless text

  Hpricot(text).to_plain_text
end
1382
+
1383
+
1384
+ # Retrieve any robots meta directives from the hpricot document.
1385
+ def robots?(hpricot, *what)
1386
+ (hpricot / '//meta[@name="robots"]').any? do |e|
1387
+ what.any? {|w| e['content'].split(/,\s*/).include?(w)}
1388
+ end
1389
+ end
1390
+
1391
+
1392
# Check whether robots are allowed to retrieve an url.
#
# url is the url as string, uri the parsed form of it. The answer is
# cached per url in @allow. If RobotRules is not available, a warning is
# logged once and every url is allowed. Otherwise the robots.txt of the
# url's host is loaded (the parsed rules are kept per host in @robots)
# and consulted. A url is allowed when no robots.txt uri can be built
# for it or when loading robots.txt fails.
def robots_allowed?(url, uri)
  return @allow[url] if @allow.has_key?(url)

  unless defined?(RobotRules)
    unless @robots[:warning]
      $logger.warn 'robots.txt is ignored: Please install robot_rules.rb from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589 in $RUBYLIB'
      @robots[:warning] = true
    end
    @allow[url] = true
    return true
  end

  host = uri.host
  unless (rules = @robots[host])
    # Test the uri itself: converting nil with to_s yields "", which is
    # true in a condition and used to defeat this guard.
    ruri = robots_uri(uri)
    return true unless ruri

    rurl = ruri.to_s
    begin
      robots_txt = open_url(rurl).read
      rules = RobotRules.new(@user_agent)
      rules.parse(rurl, robots_txt)
      @robots[host] = rules
      $logger.info "Loaded #{rurl} for #{@user_agent}"
      $logger.debug robots_txt
    rescue StandardError => e
      # Report through the logger like the rest of the class instead of
      # printing to stdout.
      $logger.warn "Could not load #{rurl}: #{e.message}"
      $logger.debug robots_txt if robots_txt
    end
  end

  rv = if rules and !rules.allowed?(url)
    $logger.info "Excluded url: #{url}"
    false
  else
    true
  end
  @allow[url] = rv
  rv
end
1434
+
1435
+
1436
# Get the robots.txt uri for uri.
#
# Returns a copy of uri whose path is '/robots.txt', or nil if uri is
# relative. Query and fragment of the page uri are dropped, since they
# do not belong to the robots.txt location. uri itself is not modified.
def robots_uri(uri)
  return if uri.relative?

  ruri = uri.dup
  ruri.path = '/robots.txt'
  ruri.query = nil
  ruri.fragment = nil
  ruri
end
1444
+
1445
+
1446
# Sort difftext in place. The entries are ordered by the downcased
# :title option of their first element; entries without a title are
# ordered by that first element itself.
def sort_difftext!(difftext)
  sort_key = lambda {|entry| get(entry[0], :title, entry[0]).downcase}
  difftext.sort! {|left, right| sort_key.call(left) <=> sort_key.call(right)}
end
1453
+
1454
+
1455
# Convert filename to the form used in links: the name is passed
# through the global :file_url option (with the name itself as the
# fallback argument of call_cmd) and then encoded, leaving ':' and '/'
# as they are.
def file_url(filename)
  converter = get_optionvalue(:global, :file_url)
  encode(call_cmd(converter, [filename], filename), ':/')
end
1461
+
1462
+
1463
# Replace every character of text that is not a letter, a digit, one of
# ",._-" or one of the characters in chars with the %xx escapes
# (lowercase hex) of its bytes.
#
# The bytes are obtained with unpack, which does not depend on what
# String#[] returns: formatting t[0] with %02x only works where that
# expression is an Integer. Characters made of several bytes yield one
# escape per byte.
def encode(text, chars='')
  text.gsub(/[^a-zA-Z0-9,._#{chars}-]/) do |char|
    char.unpack('C*').map {|byte| '%%%02x' % byte}.join
  end
end
1466
+
1467
+
1468
# Replace every '%' that is followed by two characters with the
# character whose code those two characters denote in hex.
def decode(text)
  text.gsub(/%(..)/) { format("%c", Regexp.last_match(1).hex) }
end
1471
+
1472
+
1473
# The file suffix for outformat. Without an outformat, the first entry
# of @output_format is used. The suffix is looked up in @suffix; formats
# without an entry there serve as their own suffix.
def output_suffix(outformat)
  format_name = outformat || @output_format[0]
  mapped = @suffix[format_name]
  mapped ? mapped : format_name
end
1477
+
1478
+
1479
# The base name of the output files: the entries of @profiles joined
# with commas.
def output_basename
  separator = ','
  @profiles.join(separator)
end
1482
+
1483
+
1484
# The name of the output file for outformat: the entry of @outfile for
# that format if there is one, else a file in @cfgdir named after
# output_basename and output_suffix.
def get_outfile(outformat=nil)
  explicit = @outfile[outformat]
  return explicit if explicit

  File.join(@cfgdir, "#{output_basename}.#{output_suffix(outformat)}")
end
1487
+
1488
+
1489
# Show outfile with the viewer command in @view: outfile is inserted
# into the command string, which is then run via system. Nothing is
# done (and nil is returned) when no viewer is set.
# The html, text and rss variants are aliases of this method.
def view_output_general(outfile)
  return unless @view

  system(@view % outfile)
end
alias view_output_html view_output_general
alias view_output_text view_output_general
alias view_output_rss view_output_general
1497
+
1498
+ end
1499
+
1500
+
1501
+
1502
+
1503
+ # Local Variables:
1504
+ # revisionRx: REVISION\s\+=\s\+\'
1505
+ # End: