websitary 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,39 @@
1
+ # applog.rb
2
+ # @Last Change: 2007-09-11.
3
+ # Author:: Thomas Link (micathom AT gmail com)
4
+ # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
+ # Created:: 2007-09-08.
6
+
7
+ require 'logger'
8
+
9
+
10
+ # A simple wrapper around Logger.
11
# A simple wrapper around Logger. Creating an instance (re)assigns the
# global $logger; #set_level translates a symbolic verbosity name into
# a Logger severity.
class Websitary::AppLog
  # output:: Log destination passed to Logger.new ($stdout when nil or false)
  def initialize(output=nil)
    @output = output || $stdout
    log = Logger.new(@output, 'daily')
    log.progname = Websitary::APPNAME
    log.datetime_format = "%H:%M:%S"
    $logger = log
    set_level
  end


  # Set the severity threshold of $logger.
  # level:: :debug, :verbose or :quiet; any other value selects WARN
  def set_level(level=:default)
    severities = {
      :debug   => Logger::DEBUG,
      :verbose => Logger::INFO,
      :quiet   => Logger::ERROR
    }
    $logger.level = severities.fetch(level, Logger::WARN)
    $logger.debug "Set logger level: #{level}"
  end
end
35
+
36
+
37
+ # Local Variables:
38
+ # revisionRx: REVISION\s\+=\s\+\'
39
+ # End:
@@ -0,0 +1,1505 @@
1
+ # configuration.rb
2
+ # @Last Change: 2007-09-16.
3
+ # Author:: Thomas Link (micathom AT gmail com)
4
+ # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
+ # Created:: 2007-09-08.
6
+
7
+
8
+
9
+ # This class defines the scope in which profiles are evaluated. Most
10
+ # of its methods are suitable for use in profiles.
11
+ class Websitary::Configuration
12
+ # Hash (key = URL, value = Hash of options)
13
+ attr_accessor :urls
14
+ # Array of urls to be downloaded.
15
+ attr_accessor :todo
16
+ # Array of downloaded urls.
17
+ attr_accessor :done
18
+ # The user configuration directory
19
+ attr_accessor :cfgdir
20
+ # What to do
21
+ attr_accessor :execute
22
+ # Global Options
23
+ attr_accessor :options
24
+ # Cached mtimes
25
+ attr_accessor :mtimes
26
+ # The name of the quicklist profile
27
+ attr_accessor :quicklist_profile
28
+ # attr_accessor :default_profiles
29
+ # attr_accessor :cmd_edit
30
+
31
+
32
+ def initialize(app, args=[])
33
+ @logger = Websitary::AppLog.new
34
+ $logger.debug "Configuration#initialize"
35
+ @app = app
36
+ @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitary') : '.'
37
+ [
38
+ ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitary'),
39
+ File.join(Config::CONFIG['sysconfdir'], 'websitary')
40
+ ].each do |dir|
41
+ if File.exists?(dir)
42
+ @cfgdir = dir
43
+ break
44
+ end
45
+ end
46
+
47
+ @cmd_edit = 'vi "%s"'
48
+ @execute = 'downdiff'
49
+ @quicklist_profile = 'quicklist'
50
+ @user_agent = "websitary/#{Websitary::VERSION}"
51
+ @view = 'w3m "%s"'
52
+
53
+ @allow = {}
54
+ @default_options = {}
55
+ @default_profiles = [@quicklist_profile]
56
+ @done = []
57
+ @mtimes = Websitary::FileMTimes.new(self)
58
+ @outfile = {}
59
+ @profiles = []
60
+ @robots = {}
61
+ @todo = []
62
+ @urlencmap = {}
63
+ @urls = {}
64
+
65
+ @suffix = {
66
+ 'text' => 'txt'
67
+ # 'rss' => 'xml'
68
+ }
69
+
70
+ migrate
71
+ initialize_options
72
+ profile 'config.rb'
73
+ parse_command_line_args(args)
74
+
75
+ @output_format ||= ['html']
76
+ @output_title = %{#{Websitary::APPNAME}: #{@profiles.join(", ")}}
77
+ end
78
+
79
+
80
+ def parse_command_line_args(args)
81
+ $logger.debug "parse_command_line_args: #{args}"
82
+ opts = OptionParser.new do |opts|
83
+ opts.banner = "Usage: #{Websitary::APPNAME} [OPTIONS] [PROFILES] > [OUT]"
84
+ opts.separator ''
85
+ opts.separator "#{Websitary::APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
86
+ opts.separator 'the terms of the GNU General Public License version 2 or newer.'
87
+ opts.separator ''
88
+
89
+ opts.separator 'General Options:'
90
+
91
+ opts.on('-c', '--cfg=DIR', String, 'Configuration directory') do |value|
92
+ @cfgdir = value
93
+ end
94
+
95
+ opts.on('-e', '--execute=COMMAND', String, 'Define what to do (default: downdiff)') do |value|
96
+ @execute = value
97
+ end
98
+
99
+ # opts.on('-E', '--edit=PROFILE', String, 'Edit a profile') do |value|
100
+ # edit_profile value
101
+ # exit 0
102
+ # end
103
+
104
+ opts.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |value|
105
+ output_format(*value.split(/,/))
106
+ end
107
+
108
+ opts.on('--[no-]ignore-age', 'Ignore age limits') do |bool|
109
+ set :ignore_age => bool
110
+ end
111
+
112
+ opts.on('--log=DESTINATION', String, 'Log destination') do |value|
113
+ @logger = Websitary::AppLog.new(value != '-' && value)
114
+ end
115
+
116
+ opts.on('-o', '--output=FILENAME', String, 'Output') do |value|
117
+ output_file(value)
118
+ end
119
+
120
+ opts.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |value|
121
+ key, val = value.split(/=/, 2)
122
+ set key.intern => eval(val)
123
+ end
124
+
125
+ opts.on('-t', '--timer=N', Numeric, 'Repeat every N seconds (never exit)') do |value|
126
+ global(:timer => value)
127
+ end
128
+
129
+ # opts.on('--review', 'View last diff') do |value|
130
+ # view_output
131
+ # exit 0
132
+ # end
133
+
134
+ opts.separator ''
135
+ opts.separator "Available commands (default: #@execute):"
136
+ commands = @app.methods.map do |m|
137
+ mt = m.match(/^execute_(.*)$/)
138
+ mt && mt[1]
139
+ end
140
+ commands.compact!
141
+ commands.sort!
142
+ opts.separator commands.join(', ')
143
+
144
+ opts.separator ''
145
+ opts.separator 'Available profiles:'
146
+ opts.separator Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}.join(', ')
147
+
148
+ opts.separator ''
149
+ opts.separator 'Other Options:'
150
+
151
+ opts.on('--debug', 'Show debug messages') do |v|
152
+ $VERBOSE = $DEBUG = true
153
+ @logger.set_level(:debug)
154
+ end
155
+
156
+ opts.on('-q', '--quiet', 'Be mostly quiet') do |v|
157
+ @logger.set_level(:quiet)
158
+ end
159
+
160
+ opts.on('-v', '--verbose', 'Run verbosely') do |v|
161
+ $VERBOSE = true
162
+ @logger.set_level(:verbose)
163
+ end
164
+
165
+ opts.on_tail('-h', '--help', 'Show this message') do
166
+ puts opts
167
+ exit 1
168
+ end
169
+ end
170
+
171
+ @profiles = opts.parse!(args)
172
+ @profiles = @default_profiles if @profiles.empty?
173
+ cla_handler = "cmdline_arg_#{@execute}"
174
+ cla_handler = nil unless @app.respond_to?(cla_handler)
175
+ for pn in @profiles
176
+ if cla_handler
177
+ @app.send(cla_handler, self, pn)
178
+ else
179
+ profile pn
180
+ end
181
+ end
182
+
183
+ self
184
+ end
185
+
186
+
187
+ # Retrieve an option for an url
188
+ # url:: String
189
+ # opt:: Symbol
190
+ def get(url, opt, default=nil)
191
+ opts = @urls[url]
192
+ unless opts
193
+ $logger.debug "Non-registered URL: #{url}"
194
+ return default
195
+ end
196
+ $logger.debug "get: opts=#{opts.inspect}"
197
+ case opt
198
+ when :diffprocess, :format
199
+ opt_ = opts.has_key?(opt) ? opt : :diff
200
+ else
201
+ opt_ = opt
202
+ end
203
+
204
+ $logger.debug "get: opt=#{opt} opt_=#{opt_}"
205
+ $logger.debug "get: #{opts[opt_]} #{opts[:use]}" if opts
206
+ if opts.has_key?(opt_)
207
+ val = opts[opt_]
208
+ elsif opts.has_key?(:use)
209
+ val = opts[:use]
210
+ else
211
+ val = nil
212
+ end
213
+
214
+ case val
215
+ when nil
216
+ when Symbol
217
+ $logger.debug "get: val=#{val}"
218
+ success, rv = get_option(opt, val)
219
+ $logger.debug "get: #{success}, #{rv}"
220
+ if success
221
+ return rv
222
+ end
223
+ else
224
+ $logger.debug "get: return val=#{val}"
225
+ return val
226
+ end
227
+ unless default
228
+ success, default1 = get_option(opt, :default)
229
+ default = default1 if success
230
+ end
231
+
232
+ $logger.debug "get: return default=#{default}"
233
+ return default
234
+ end
235
+
236
+
237
+ def get_optionvalue(opt, val, default=nil)
238
+ case val
239
+ when Symbol
240
+ ok, val = get_option(opt, val)
241
+ if ok
242
+ val
243
+ else
244
+ default
245
+ end
246
+ else
247
+ val
248
+ end
249
+ end
250
+
251
+
252
+ def get_option(opt, val)
253
+ vals = @options[opt]
254
+ $logger.debug "val=#{val} vals=#{vals.inspect}"
255
+ if vals and vals.has_key?(val)
256
+ rv = vals[val]
257
+ $logger.debug "get_option ok: #{opt} => #{rv.inspect}"
258
+ case rv
259
+ when Symbol
260
+ $logger.debug "get_option re: #{rv}"
261
+ return get_option(opt, rv)
262
+ else
263
+ $logger.debug "get_option true, #{rv}"
264
+ return [true, rv]
265
+ end
266
+ else
267
+ $logger.debug "get_option no: #{opt} => #{val.inspect}"
268
+ return [false, val]
269
+ end
270
+ end
271
+
272
+
273
+ # Configuration command:
274
+ # Set the default profiles
275
+ def default(*profile_names)
276
+ @default_profiles = profile_names
277
+ end
278
+
279
+
280
+ def quicklist(profile_name)
281
+ @quicklist_profile = profile_name
282
+ end
283
+
284
+
285
+ # Configuration command:
286
+ # Load a profile
287
+ def profile(profile_name)
288
+ case profile_name
289
+ when '-'
290
+ readlines.map! {|l| l.chomp}.each {|url| source url}
291
+ when '__END__'
292
+ $logger.debug "Profile: __END__"
293
+ contents = DATA.read
294
+ return eval_profile(contents)
295
+ else
296
+ fn = profile_filename(profile_name)
297
+ if fn
298
+ $logger.debug "Profile: #{fn}"
299
+ contents = File.read(fn)
300
+ return eval_profile(contents, fn)
301
+ end
302
+ end
303
+ return false
304
+ end
305
+
306
+
307
+ # Define an options shortcut.
308
+ def shortcut(symbol, args)
309
+ ak = args.keys
310
+ ok = @options.keys
311
+ dk = ok - ak
312
+
313
+ # :downloadprocess
314
+ if !ak.include?(:delegate) and
315
+ dk.any? {|e| [:download, :downloadformat, :diff, :format, :diffprocess].include?(e)}
316
+ $logger.warn "Shortcut #{symbol}: Undefined fields: #{dk.inspect}"
317
+ end
318
+
319
+ if ak.include?(:delegate)
320
+ dk.each do |field|
321
+ @options[field][symbol] = args[:delegate]
322
+ end
323
+ end
324
+
325
+ args.each do |field, val|
326
+ @options[field][symbol] = val unless field == :delegate
327
+ end
328
+ end
329
+
330
+
331
+ # Set the output format.
332
+ def output_format(*format)
333
+ unless format.all? {|e| ['text', 'html', 'rss'].include?(e)}
334
+ $logger.fatal "Unknown output format: #{format}"
335
+ exit 5
336
+ end
337
+ @output_format = format
338
+ end
339
+
340
+
341
+ # Set the output file.
342
+ def output_file(filename, outformat=nil)
343
+ @outfile[outformat] = filename
344
+ end
345
+
346
+
347
+ # Configuration command:
348
+ # Set global options.
349
+ # type:: Symbol
350
+ # options:: Hash
351
+ def option(type, options)
352
+ $logger.info "option #{type}: #{options.inspect}"
353
+ o = @options[type]
354
+ if o
355
+ o.merge!(options)
356
+ else
357
+ $logger.error "Unknown option type: #{type} (#{options.inspect})"
358
+ end
359
+ end
360
+
361
+
362
+ # Set a global option.
363
+ def global(options)
364
+ options.each do |type, value|
365
+ @options[:global][type] = value
366
+ end
367
+ end
368
+
369
+
370
+ # Configuration command:
371
+ # Set the default value for source-options.
372
+ def set(options)
373
+ $logger.debug "set: #{options.inspect}"
374
+ @default_options.merge!(options)
375
+ end
376
+
377
+
378
+ # Configuration command:
379
+ # Unset a default source-option.
380
+ def unset(*options)
381
+ for option in options
382
+ @default_options.delete(option)
383
+ end
384
+ end
385
+
386
+
387
+ # Configuration command:
388
+ # Define a source.
389
+ # urls:: String
390
+ def source(urls, opts={})
391
+ urls.split("\n").flatten.compact.each do |url|
392
+ @urls[url] = @default_options.dup.update(opts)
393
+ @todo << url
394
+ end
395
+ end
396
+
397
+
398
+ # Configuration command:
399
+ # Set the default download processor. The block takes the
400
+ # downloaded text (STRING) as argument.
401
+ def downloadprocess(&block)
402
+ @options[:downloadprocess][:default] = block
403
+ end
404
+
405
+
406
+ # Configuration command:
407
+ # Set the default diff processor. The block takes the
408
+ # diff text (STRING) as argument.
409
+ def diffprocess(&block)
410
+ @options[:diff][:default] = block
411
+ end
412
+
413
+
414
+ # Configuration command:
415
+ # Set the editor.
416
+ def edit(cmd)
417
+ @cmd_edit = cmd
418
+ end
419
+
420
+
421
+ # Configuration command:
422
+ # Set the viewer.
423
+ def view(view)
424
+ @view = view
425
+ end
426
+
427
+
428
+ # Configuration command:
429
+ # Set the default diff program.
430
+ def diff(diff)
431
+ @options[:diff][:default] = diff
432
+ end
433
+
434
+
435
+ # Configuration command:
436
+ # Set the default downloader.
437
+ def download(download)
438
+ @options[:download][:default] = download
439
+ end
440
+
441
+
442
+ # Format a diff according to URL's source options.
443
+ def format(url, difftext)
444
+ fmt = get(url, :format)
445
+ eval_arg(fmt, [difftext], difftext)
446
+ end
447
+
448
+
449
+ # Apply some arguments to a format.
450
+ # format:: String or Proc
451
+ # args:: Array of Arguments
452
+ def eval_arg(format, args, default=nil, &process_string)
453
+ case format
454
+ when nil
455
+ return default
456
+ when Proc
457
+ # $logger.debug "eval proc: #{format} #{args.inspect}" #DBG#
458
+ $logger.debug "eval proc: #{format}/#{args.size}"
459
+ return format.call(*args)
460
+ else
461
+ ca = format % args
462
+ # $logger.debug "eval string: #{ca}" #DBG#
463
+ if process_string
464
+ return process_string.call(ca)
465
+ else
466
+ return ca
467
+ end
468
+ end
469
+ end
470
+
471
+
472
+ # Apply the argument to cmd (a format String or a Proc). If a
473
+ # String, execute the command.
474
+ def call_cmd(cmd, args, default=nil)
475
+ eval_arg(cmd, args, default) {|cmd| `#{cmd}`}
476
+ end
477
+
478
+
479
+ # Generate & view the final output.
480
+ # difftext:: Hash
481
+ def show_output(difftext)
482
+ if difftext.empty?
483
+ msg = ['No news is good news']
484
+ msg << "try again in #{@app.format_tdiff(@app.tdiff_min)}" if @app.tdiff_min
485
+ $logger.warn msg.join('; ')
486
+ return 0
487
+ end
488
+
489
+ @output_format.each do |outformat|
490
+ meth = "get_output_#{outformat}"
491
+
492
+ unless respond_to?(meth)
493
+ $logger.fatal "Unknown output format: #{outformat}"
494
+ exit 5
495
+ end
496
+
497
+ out = send(meth, difftext)
498
+ if out
499
+ outfile = get_outfile(outformat)
500
+ case outfile
501
+ when '-'
502
+ puts out
503
+ else
504
+ write_file(outfile) {|io| io.puts out}
505
+ meth = "view_output_#{outformat}"
506
+ self.send(meth, outfile)
507
+ end
508
+ end
509
+ end
510
+ return 1
511
+ end
512
+
513
+
514
+ def get_output_text(difftext)
515
+ difftext.map do |url, difftext|
516
+ if difftext
517
+ difftext = html_to_text(difftext) if is_html?(difftext)
518
+ !difftext.empty? && [
519
+ eval_arg(get(url, :rewrite_link, '%s'), [url]),
520
+ difftext_annotation(url),
521
+ nil,
522
+ difftext
523
+ ].join("\n")
524
+ end
525
+ end.compact.join("\n\n#{('-' * 68)}\n\n")
526
+ end
527
+
528
+
529
+ def get_output_rss(difftext)
530
+ success, rss_url = get_option(:rss, :url)
531
+ if success
532
+ success, rss_version = get_option(:rss, :version)
533
+ # require "rss/#{rss_version}"
534
+
535
+ rss = RSS::Rss.new(rss_version)
536
+ chan = RSS::Rss::Channel.new
537
+ chan.title = @output_title
538
+ [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
539
+ ok, val = get_option(:rss, field)
540
+ item.send(format_symbol(field, '%s='), val) if ok
541
+ end
542
+ chan.link = rss_url
543
+ rss.channel = chan
544
+
545
+ cnt = difftext.map do |url, text|
546
+ rss_format = get(url, :rss_format, 'plain_text')
547
+ text = strip_tags(text, :format => rss_format)
548
+ next if text.empty?
549
+
550
+ item = RSS::Rss::Channel::Item.new
551
+ item.date = Time.now
552
+ item.title = get(url, :title, File.basename(url))
553
+ item.link = eval_arg(get(url, :rewrite_link, '%s'), [url])
554
+ [:author, :date, :enclosure, :category, :pubDate].each do |field|
555
+ val = get(url, format_symbol(field, 'rss_%s'))
556
+ item.send(format_symbol(field, '%s='), val) if val
557
+ end
558
+
559
+ annotation = difftext_annotation(url)
560
+ annotation = "<pre>#{annotation}</pre>" if annotation
561
+ case rss_format
562
+ when 'plain_text'
563
+ item.description = %{#{annotation}<pre>#{text}</pre>}
564
+ else
565
+ item.description = %{#{annotation}\n#{text}}
566
+ end
567
+ chan.items << item
568
+ end
569
+
570
+ return rss.to_s
571
+
572
+ else
573
+
574
+ $logger.fatal "Global option :rss[:url] not defined."
575
+ exit 5
576
+
577
+ end
578
+ end
579
+
580
+
581
+ def get_output_html(difftext)
582
+ difftext = difftext.map do |url, text|
583
+ tags = get(url, :strip_tags)
584
+ text = strip_tags(text, :tags => tags) if tags
585
+ text.empty? ? nil : [url, text]
586
+ end
587
+ difftext.compact!
588
+ sort_difftext!(difftext)
589
+
590
+ toc = difftext.map do |url, text|
591
+ ti = get(url, :title, File.basename(url))
592
+ tid = html_toc_id(url)
593
+ bid = html_body_id(url)
594
+ %{<li id="#{tid}" class="toc"><a class="toc" href="\##{bid}">#{ti}</a></li>}
595
+ end.join("\n")
596
+
597
+ idx = 0
598
+ cnt = difftext.map do |url, text|
599
+ idx += 1
600
+ ti = get(url, :title, File.basename(url))
601
+ bid = html_body_id(url)
602
+ if (rewrite = get(url, :rewrite_link))
603
+ urlr = eval_arg(rewrite, [url])
604
+ ext = ''
605
+ else
606
+ old = %{<a class="old" href="#{file_url(oldname(url))}">old</a>}
607
+ lst = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
608
+ ext = %{ (#{old}, #{lst})}
609
+ urlr = url
610
+ end
611
+ note = difftext_annotation(url)
612
+ <<HTML
613
+ <div id="#{bid}" class="webpage">
614
+ <div class="count">
615
+ #{idx}
616
+ </div>
617
+ <h1 class="diff">
618
+ <a class="external" href="#{urlr}">#{ti}</a>#{ext}
619
+ </h1>
620
+ <div class="annotation">
621
+ #{note && CGI::escapeHTML(note)}
622
+ </div>
623
+ <div class="diff,difftext">
624
+ #{format(url, text)}
625
+ </div>
626
+ </div>
627
+ HTML
628
+ end.join(('<hr class="separator"/>') + "\n")
629
+
630
+ success, template = get_option(:page, :format)
631
+ unless success
632
+ success, template = get_option(:page, :simple)
633
+ end
634
+ return eval_arg(template, [@output_title, toc, cnt])
635
+ end
636
+
637
+
638
+ # Get the diff filename.
639
+ def diffname(url, ensure_dir=false)
640
+ encoded_filename('diff', url, ensure_dir, 'md5')
641
+ end
642
+
643
+
644
+ # Get the backup filename.
645
+ def oldname(url, ensure_dir=false, type=nil)
646
+ encoded_filename('old', url, ensure_dir, type)
647
+ end
648
+
649
+
650
+ # Get the filename for the freshly downloaded copy.
651
+ def latestname(url, ensure_dir=false, type=nil)
652
+ encoded_filename('latest', url, ensure_dir, type)
653
+ end
654
+
655
+
656
+ def url_from_filename(filename)
657
+ rv = @urlencmap[filename]
658
+ if rv
659
+ $logger.debug "Map filename: #{filename} -> #{rv}"
660
+ else
661
+ $logger.warn "Unmapped filename: #{filename}"
662
+ end
663
+ rv
664
+ end
665
+
666
+
667
+ def encoded_filename(dir, url, ensure_dir=false, type=nil)
668
+ type ||= get(url, :cachetype, 'tree')
669
+ $logger.debug "encoded_filename: type=#{type} url=#{url}"
670
+ rv = File.join(@cfgdir, dir, encoded_basename(url, type))
671
+ rd = File.dirname(rv)
672
+ $logger.debug "encoded_filename: rv0=#{rv}"
673
+ fm = get_optionvalue(:global, :filename_size, 255)
674
+ rdok = !ensure_dir || @app.ensure_dir(rd, false)
675
+ if !rdok or rv.size > fm or File.directory?(rv)
676
+ # $logger.debug "Filename too long (:global=>:filename_size = #{fm}), try md5 encoded filename instead: #{url}"
677
+ $logger.info "Can't use filename, try 'md5' instead: #{url}"
678
+ rv = File.join(@cfgdir, dir, encoded_basename(url, :md5))
679
+ rd = File.dirname(rv)
680
+ end
681
+ @urlencmap[rv] = url
682
+ return rv
683
+ end
684
+
685
+
686
+ def encoded_basename(url, type='tree')
687
+ m = "encoded_basename_#{type}"
688
+ if respond_to?(m)
689
+ return send(m, url)
690
+ else
691
+ $logger.fatal "Unknown cache type: #{type}"
692
+ exit 5
693
+ end
694
+ end
695
+
696
+
697
+ def encoded_basename_tree(url)
698
+ ensure_filename(encode(url, '/'))
699
+ end
700
+
701
+
702
+ def encoded_basename_flat(url)
703
+ encode(url)
704
+ end
705
+
706
+
707
+ def encoded_basename_md5(url)
708
+ Digest::MD5.hexdigest(url)
709
+ end
710
+
711
+
712
+ def urlextname(url)
713
+ begin
714
+ return File.extname(URI.parse(url).path)
715
+ rescue Exception => e
716
+ end
717
+ end
718
+
719
+
720
+ # Guess path's dirname.
721
+ # foo/bar -> foo
722
+ # foo/bar.txt -> foo
723
+ # foo/bar/ -> foo/bar
724
+ def guess_dir(path)
725
+ path[-1..-1] == '/' ? path[0..-2] : File.dirname(path)
726
+ end
727
+
728
+
729
+ # Strip the url's last part (after #).
730
+ def canonic_url(url)
731
+ url.sub(/#.*$/, '')
732
+ end
733
+
734
+
735
+ def strip_tags_default
736
+ success, tags = get_option(:strip_tags, :default)
737
+ tags.dup if success
738
+ end
739
+
740
+
741
+ def strip_tags(doc, args={})
742
+ tags = args[:tags] || strip_tags_default
743
+ case doc
744
+ when String
745
+ doc = Hpricot(doc)
746
+ end
747
+ tags.each do |tag|
748
+ doc.search(tag).remove
749
+ end
750
+ case args[:format]
751
+ when :hpricot
752
+ doc
753
+ else
754
+ doc.send("to_#{args[:format] || :html}")
755
+ end
756
+ end
757
+
758
+
759
+ # Check whether path is eligible on the basis of url or path0.
760
+ # This checks either for a :match option for url or the extensions
761
+ # of path0 and path.
762
+ def eligible_path?(url, path0, path)
763
+ rx = get(url, :match)
764
+ if rx
765
+ return path =~ rx
766
+ else
767
+ return File.extname(path0) == File.extname(path)
768
+ end
769
+ end
770
+
771
+
772
+ # Scan hpricot document for hrefs and push them onto @todo if not
773
+ # already included.
774
+ def push_hrefs(url, hpricot, &condition)
775
+ begin
776
+ return if robots?(hpricot, 'nofollow')
777
+ depth = get(url, :depth)
778
+ return if depth and depth <= 0
779
+ uri0 = URI.parse(url)
780
+ # pn0 = Pathname.new(guess_dir(File.expand_path(uri0.path)))
781
+ pn0 = Pathname.new(guess_dir(uri0.path))
782
+ (hpricot / 'a').each do |a|
783
+ href = a['href']
784
+ next if href.nil? or href == url or href =~ /^\s*javascript:/
785
+ uri = URI.parse(href)
786
+ pn = guess_dir(uri.path)
787
+ href = rewrite_href(href, url, uri0, pn0, true)
788
+ curl = canonic_url(href)
789
+ next if href.nil? or @done.include?(curl) or @todo.include?(curl)
790
+ next unless robots_allowed?(curl, uri)
791
+ # pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
792
+ uri = URI.parse(href)
793
+ pn = Pathname.new(guess_dir(uri.path))
794
+ if condition.call(uri0, pn0, uri, pn)
795
+ opts = @urls[url].dup
796
+ # opts[:title] = File.basename(curl)
797
+ opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
798
+ opts[:depth] = depth - 1 if depth and depth >= 0
799
+ @urls[curl] = opts
800
+ @todo << curl
801
+ end
802
+ end
803
+ rescue Exception => e
804
+ # $logger.error e #DBG#
805
+ $logger.error e.message
806
+ $logger.debug e.backtrace
807
+ end
808
+ end
809
+
810
+
811
+ # Rewrite urls in doc
812
+ # url:: String
813
+ # doc:: Hpricot document
814
+ def rewrite_urls(url, doc)
815
+ uri = URI.parse(url)
816
+ urd = guess_dir(uri.path)
817
+ (doc / 'a').each do |a|
818
+ href = rewrite_href(a['href'], url, uri, urd, true)
819
+ a['href'] = href if href
820
+ end
821
+ (doc / 'img').each do |a|
822
+ href = rewrite_href(a['src'], url, uri, urd, false)
823
+ a['src'] = href if href
824
+ end
825
+ doc
826
+ end
827
+
828
+
829
+ # Try to make href an absolute url.
830
+ def rewrite_href(href, url, uri=nil, urd=nil, local=false)
831
+ begin
832
+ return if !href or href =~ /^\s*javascript:/
833
+ urh = URI.parse(href)
834
+ uri ||= URI.parse(url)
835
+ urd ||= guess_dir(uri.path)
836
+ rv = nil
837
+ href = href.strip
838
+
839
+ # $logger.debug "DBG", uri, urh, #DBG#
840
+ if href =~ /\w+:/
841
+ # $logger.debug "DBG href=#$0" #DBG#
842
+ rv = href
843
+ elsif urh.relative?
844
+ # $logger.debug "DBG urh relative" #DBG#
845
+ if uri.relative?
846
+ # $logger.debug "DBG both relative" #DBG#
847
+ if uri.instance_of?(URI::Generic)
848
+ rv = File.join(urd, href)
849
+ # $logger.debug "DBG rv=#{rv}" #DBG#
850
+ end
851
+ else
852
+ rv = uri.merge(href).to_s
853
+ # $logger.debug "DBG relativ rv=#{rv}" #DBG#
854
+ if local
855
+ hf = latestname(rv)
856
+ if @todo.include?(rv) or @done.include?(rv) or File.exist?(hf)
857
+ rv = hf
858
+ # $logger.debug "DBG relativ, local rv=#{rv}" #DBG#
859
+ end
860
+ end
861
+ end
862
+ elsif href[0..0] == '#'
863
+ # $logger.debug "DBG anchor" #DBG#
864
+ rv = url + href
865
+ elsif uri.host == urh.host
866
+ # $logger.debug "DBG merge" #DBG#
867
+ rv = uri.merge(href).to_s
868
+ else
869
+ # $logger.debug "as is" #DBG#
870
+ rv = href
871
+ end
872
+
873
+ case rv
874
+ when String
875
+ return rv
876
+ when nil
877
+ else
878
+ $logger.error "Internal error: href=#{href}"
879
+ $logger.debug caller.join("\n")
880
+ end
881
+ return
882
+ rescue Exception => e
883
+ # $logger.error e #DBG#
884
+ $logger.error e.message
885
+ $logger.debug e.backtrace
886
+ end
887
+ return nil
888
+ end
889
+
890
+
891
+ # Return a Proc that takes a text as argument and highlights occurrences of rx.
892
+ # rx:: Regular expression
893
+ # color:: A string, sets the class to highlight-color (default: "yellow")
894
+ # group:: A number (default: 0)
895
+ # tag:: The HTML tag to use (default: "span")
896
+ def highlighter(rx, color=nil, group=nil, tag='span')
897
+ lambda {|text| text.gsub(rx, %{<#{tag} class="highlight-#{color || 'yellow'}">\\#{group || 0}</#{tag}>})}
898
+ end
899
+
900
+
901
+ def view_output(outfile=nil)
902
+ send("view_output_#{@output_format[0]}", outfile || get_outfile)
903
+ end
904
+
905
+
906
+ def edit_profile(profile=nil)
907
+ profile ||= @profiles
908
+ case profile
909
+ when Array
910
+ profile.each {|p| edit_profile p}
911
+ else
912
+ fn = profile_filename(profile)
913
+ $logger.debug "edit: #{fn}"
914
+ `#{@cmd_edit % fn}`
915
+ end
916
+ end
917
+
918
+
919
+ def profile_filename(profile_name, check_file_exists=true)
920
+ if File.extname(profile_name) != '.rb'
921
+ profile_name = "#{profile_name}.rb"
922
+ end
923
+ filename = nil
924
+ ['.', @cfgdir].each do |d|
925
+ filename = File.join(d, profile_name)
926
+ if File.exists?(filename)
927
+ return filename
928
+ end
929
+ end
930
+ return check_file_exists ? nil : filename
931
+ end
932
+
933
+
934
+ def write_file(filename, mode='w', &block)
935
+ File.open(filename, mode) {|io| block.call(io)}
936
+ @mtimes.set(filename)
937
+ end
938
+
939
+
940
+ def canonic_filename(filename)
941
+ call_cmd(get_optionvalue(:global, :canonic_filename), [filename], filename)
942
+ end
943
+
944
+
945
+ private
946
+ def initialize_options
947
+ @options = {
948
+ :global => {
949
+ :downloadhtml => :openuri,
950
+ },
951
+ }
952
+
953
+ @options[:diff] = {
954
+ :default => :diff,
955
+
956
+ :diff => lambda {|old, new, *args|
957
+ opts, _ = args
958
+ opts ||= '-d -w'
959
+ difftext = call_cmd('diff %s -u2 "%s" "%s"', [opts, old, new])
960
+ difftext = difftext.split("\n")[2..-1]
961
+ difftext ? difftext.delete_if {|l| l =~ /^[^+]/}.map {|l| l[1..-1]}.join("\n") : ''
962
+ },
963
+
964
+ :binary => lambda {|old, new|
965
+ call_cmd(get_optionvalue(:diff, :diff), [old, new, '--binary -d -w'])
966
+ },
967
+
968
+ :new => lambda {|old, new|
969
+ difftext = call_cmd(get_optionvalue(:diff, :binary), [old, new])
970
+ difftext.empty? ? '' : new
971
+ },
972
+
973
+ :raw => :new,
974
+
975
+ :webdiff => lambda {|old, new|
976
+ $logger.debug "webdiff: #{File.basename(new)}"
977
+ $logger.debug %{webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -}
978
+ difftext = `webdiff --hicolor=yellow -archive "#{old}" -current "#{new}" -out -`
979
+ $?.exitstatus == 1 ? difftext : ''
980
+ },
981
+ }
982
+
983
+ @options[:format] = {
984
+ :default => :diff,
985
+ :diff => %{<pre class="diff">\n%s\n</pre>},
986
+ :webdiff => "%s\n",
987
+ :raw => lambda {|new| File.read(new)},
988
+ }
989
+
990
+ @options[:diffprocess] = {
991
+ :default => :diff,
992
+ :diff => false,
993
+ :webdiff => false,
994
+ :raw => false,
995
+ }
996
+
997
+ @options[:download] = {
998
+ :default => :w3m,
999
+ :raw => :openuri,
1000
+ }
1001
+
1002
+ @options[:downloadformat] = {
1003
+ :w3m => 'text',
1004
+ :webdiff => 'html',
1005
+ :raw => '',
1006
+ }
1007
+
1008
+ @options[:downloadprocess] = {
1009
+ }
1010
+
1011
+ @options[:rss] = {
1012
+ :version => '2.0',
1013
+ }
1014
+
1015
+ @options[:strip_tags] = {
1016
+ :default => ['script', 'object', 'form', 'input', 'select', 'iframe', 'head', 'meta', 'link'],
1017
+ }
1018
+
1019
+ shortcut :w3m, :delegate => :diff,
1020
+ :download => 'w3m -S -F -dump "%s"'
1021
+ # :download => 'w3m -no-cookie -S -F -dump "%s"'
1022
+
1023
+ shortcut :lynx, :delegate => :diff,
1024
+ :download => 'lynx -dump "%s"'
1025
+
1026
+ shortcut :links, :delegate => :diff,
1027
+ :download => 'links -dump "%s"'
1028
+
1029
+ shortcut :curl, :delegate => :webdiff,
1030
+ :download => 'curl --silent "%s"'
1031
+
1032
+ shortcut :wget, :delegate => :webdiff,
1033
+ :download => 'wget -q -O - "%s"'
1034
+
1035
+ shortcut :text, :delegate => :diff,
1036
+ :download => lambda {|url| html_to_text(open_url(url).read)}
1037
+
1038
+ shortcut :body_html, :delegate => :webdiff,
1039
+ :strip_tags => :default,
1040
+ :download => lambda {|url|
1041
+ begin
1042
+ doc = Hpricot(open_url(url).read)
1043
+ doc = doc.at('body')
1044
+ if doc
1045
+ doc = rewrite_urls(url, doc)
1046
+ doc = doc.inner_html
1047
+ if (tags = get(url, :strip_tags))
1048
+ doc = strip_tags(doc, :format => :hpricot, :tags => tags)
1049
+ end
1050
+ else
1051
+ $logger.warn 'inner html: No body'
1052
+ end
1053
+ doc.to_s
1054
+ rescue Exception => e
1055
+ # $logger.error e #DBG#
1056
+ $logger.error e.message
1057
+ $logger.debug e.backtrace
1058
+ break %{<pre class="error">\n#{e.message}\n</pre>}
1059
+ end
1060
+ }
1061
+
1062
+ shortcut :openuri, :delegate => :webdiff,
1063
+ :download => lambda {|url|
1064
+ begin
1065
+ open_url(url).read
1066
+ rescue Exception => e
1067
+ # $logger.error e #DBG#
1068
+ $logger.error e.message
1069
+ $logger.debug e.backtrace
1070
+ %{<pre class="error">\n#{e.to_s}\n</pre>}
1071
+ end
1072
+ }
1073
+
1074
# Shortcut for RSS feeds. Downloading is delegated to :openuri; the :diff
# lambda compares two stored snapshots of the feed item by item.
#
# old, new:: file names of the previous and the current snapshot.
# Returns the formatted new or changed items joined by newlines, or nil when
# either snapshot cannot be parsed.
shortcut :rss,
    :delegate => :openuri,
    :diff => lambda {|old, new|
        # NOTE(review): success and rss_version are not used below.
        success, rss_version = get_option(:rss, :version)
        # NOTE(review): the second argument is presumably RSS::Parser's
        # do_validate flag -- confirm against the rss library docs.
        ro = RSS::Parser.parse(File.read(old), false)
        if ro
            # Index the old items twice: by the MD5 of their full text (to
            # detect unchanged items) and by link (to detect changed items).
            rh = {}
            ro.items.each do |item|
                rh[Digest::MD5.hexdigest(item.to_s)] = item
                rh[item.link] = item
            end
            rnew = []
            rn = RSS::Parser.parse(File.read(new), false)
            if rn
                rn.items.each do |item|
                    rid = Digest::MD5.hexdigest(item.to_s)
                    # Items whose text is identical to an old item are skipped.
                    if !rh[rid]
                        if (olditem = rh[item.link])
                            # Same link, different text: show a diff of the descriptions.
                            rss_diff = Websitary::Htmldiff.new(:oldtext => olditem.description, :newtext => item.description).process
                            rnew << format_rss_item(item, rss_diff)
                        else
                            # Unknown link: a new item, possibly with an enclosure.
                            if item.enclosure and (curl = item.enclosure.url)
                                url = url_from_filename(new)
                                dir = get(url, :rss_enclosure)
                                curl = rewrite_href(curl, url, nil, nil, true)
                                # A rejected enclosure url drops the whole item from the output.
                                next unless curl
                                if dir
                                    # :rss_enclosure => true selects a default directory
                                    # named after the channel title.
                                    if dir == true
                                        dir = File.join(@cfgdir, 'attachments', encode(rn.channel.title))
                                    end
                                    @app.ensure_dir(dir)
                                    $logger.debug "Enclosure URL: #{curl}"
                                    fname = File.join(dir, encode(File.basename(curl) || item.title || item.pubDate.to_s || Time.now.to_s))
                                    $logger.debug "Enclosure save to: #{fname}"
                                    enc = open_url(curl).read
                                    # NOTE(review): io.puts appends a newline when enc does not
                                    # end with one -- confirm this is intended for binary data.
                                    write_file(fname, 'wb') {|io| io.puts enc}
                                    furl = file_url(fname)
                                    # NOTE(review): the anchor is self-closed ("/>") and then
                                    # closed again with </a> -- confirm the intended markup.
                                    enclosure = %{<p class="enclosure"><a href="%s" class="enclosure" />Enclosure (local copy)</a></p>} % furl
                                    if get(url, :rss_rewrite_enclosed_urls)
                                        # Point references to the enclosure at the local copy.
                                        # NOTE(review): assumes item.description is a String here.
                                        item.description.gsub!(Regexp.new(Regexp.escape(curl))) {|t| furl}
                                    end
                                else
                                    enclosure = %{<p class="enclosure"><a href="%s" class="enclosure" />Original Enclosure</a></p>} % curl
                                end
                            else
                                enclosure = ''
                            end
                            rnew << format_rss_item(item, item.description, enclosure)
                        end
                    end
                end
                rnew.join("\n")
            end
        end
    }
1129
+
1130
# Shortcut for OPML subscription lists. Diffing is delegated to :rss.
#
# The :download lambda reads the OPML document at url and, for every outline
# element that has an xmlurl attribute and an rss type, registers the feed in
# @urls (options copied from the OPML url, :download set to :rss, :title taken
# from the outline) and appends it to @todo. Outlines of other types are
# logged as unsupported. The lambda always returns nil.
shortcut :opml, :delegate => :rss,
    :download => lambda {|url|
        opml = open(url) {|io| io.read}
        # Fixed: the original tested the misspelled name "oplm", which raised
        # a NameError on every call.
        if opml
            xml = Hpricot(opml)
            # <+TBD+>Well, maybe we should search for outline[@type=rss]?
            xml.search('//outline[@xmlurl]').each {|elt|
                if elt['type'] =~ /rss/
                    curl = elt['xmlurl']
                    # Each feed gets its own copy of the OPML url's options.
                    opts = @urls[url].dup
                    opts[:download] = :rss
                    opts[:title] = elt['title'] || elt['text'] || elt['htmlurl'] || curl
                    @urls[curl] = opts
                    @todo << curl
                else
                    $logger.warn "Unsupported type in OPML: #{elt.to_s}"
                end
            }
        end
        nil
    }
1151
+
1152
# Shortcut :website -- download via get_website using the :body_html
# downloader; everything else is delegated to :webdiff.
shortcut :website, :delegate => :webdiff,
    :download => lambda {|url| get_website(:body_html, url)}

# Shortcut :website_below -- like :website, but via get_website_below.
shortcut :website_below, :delegate => :webdiff,
    :download => lambda {|url| get_website_below(:body_html, url)}

# Shortcut :website_txt -- fetch the page via get_website with the url's
# :downloadhtml downloader (default :openuri) and convert it to plain text.
shortcut :website_txt, :delegate => :default,
    :download => lambda {|url| html_to_text(get_website(get(url, :downloadhtml, :openuri), url))}

# Shortcut :website_txt_below -- like :website_txt, but via get_website_below.
shortcut :website_txt_below, :delegate => :default,
    :download => lambda {|url| html_to_text(get_website_below(get(url, :downloadhtml, :openuri), url))}

# Shortcut :ftp -- the download is the directory listing returned by get_ftp,
# one entry per line.
# NOTE(review): get_ftp does not return an Array when the FTP session fails;
# join would then raise -- confirm how callers handle that.
shortcut :ftp, :delegate => :default,
    :download => lambda {|url| get_ftp(url).join("\n")}
1166
+
1167
# Shortcut :ftp_recursive -- like :ftp, but every directory found in the
# listing is registered as an url of its own (options copied from the parent,
# title extended by the directory name) and appended to @todo.
#
# The url's :depth option limits the recursion: without :depth every
# directory is followed; otherwise sub-directories are registered while
# depth >= 0, each with depth - 1.
#
# Returns the listing joined by newlines, or nil when get_ftp did not deliver
# a listing (get_ftp logs FTP errors instead of raising them).
shortcut :ftp_recursive, :delegate => :default,
    :download => lambda {|url|
        list = get_ftp(url)
        # Fixed: a failed FTP session used to crash here with a NoMethodError
        # because the result was used as an Array unconditionally.
        if list.kind_of?(Array)
            depth = get(url, :depth)
            if !depth or depth >= 0
                list.each do |l|
                    # Only directory entries are followed.
                    next unless l =~ /^d/
                    # perms, type, owner, group, size, date, dirname
                    fields = l.scan(/^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+ +\S+ +\S+)\s+(.+)$/)[0]
                    # Fixed: entries in an unexpected format used to yield a nil
                    # dirname, which made File.join raise a TypeError.
                    unless fields
                        $logger.warn "Cannot parse FTP listing entry: #{l}"
                        next
                    end
                    dirname = fields[6]
                    # The trailing '' makes the new url end with a slash.
                    curl = File.join(url, dirname, '')
                    opts = @urls[url].dup
                    opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
                    opts[:depth] = depth - 1 if depth and depth >= 0
                    @urls[curl] = opts
                    @todo << curl
                end
            end
            list.join("\n")
        end
    }
1186
+
1187
# Shortcut :img -- delegates to :raw; the :format lambda renders the new
# file as an <img> tag whose src is the file's url (see file_url).
shortcut :img, :delegate => :raw,
    :format => lambda {|new| %{<img src="#{file_url(new)}" />}}
1192
+
1193
# Page-level options. :format builds the complete HTML report.
#
# ti:: page title; used for <title> and as the title of the RSS link.
# li:: markup placed inside the <ol class="toc"> list.
# bd:: markup placed inside the <div class="contents"> element.
#
# The template is filled with String#%, so it must contain exactly the four
# %s placeholders and no other % characters.
@options[:page] = {
    :format => lambda do |ti, li, bd|
        template = <<OUT
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>%s</title>
<link rel="stylesheet" href="websitary.css" type="text/css">
<link rel="alternate" href="websitary.rss" type="application/rss+xml" title="%s">
</head>
<body>
<ol class="toc">
%s
</ol>
<div class="contents">
%s
</div>
</body>
</html>
OUT
        template % [ti, ti, li, bd]
    end
}
1216
+ end
1217
+
1218
+
1219
# Bring the data in the configuration directory up to date.
#
# The version that last wrote the data is read from version.yml in @cfgdir;
# a missing file counts as version 0.1.0. Nothing is done when the stored
# version equals Websitary::VERSION. Otherwise data written by version 0.1.0
# or older is migrated, and the running version is recorded in version.yml.
def migrate
    version_file = File.join(@cfgdir, 'version.yml')
    stored = '0.1.0'
    if File.exist?(version_file)
        stored = YAML.load_file(version_file)
        return if stored == Websitary::VERSION
    end
    # Compare versions numerically, component by component.
    numbers = stored.split('.').map {|part| part.to_i}
    migrate_0_1_0 unless (numbers <=> [0, 1, 0]) == 1
    write_file(version_file) {|io| YAML.dump(Websitary::VERSION, io)}
end
1231
+
1232
+
1233
# Migrate the data written by version 0.1.0: every file in the 'latest' and
# 'old' sub-directories of @cfgdir is moved (via @app.move) to the name that
# encoded_filename computes for the url decoded from the file's basename.
def migrate_0_1_0
    $logger.warn "Migrate data from version 0.1.0"
    %w[latest old].each do |subdir|
        Dir[File.join(@cfgdir, subdir, '*')].each do |path|
            url = decode(File.basename(path))
            @app.move(path, encoded_filename(subdir, url, true))
        end
    end
end
1244
+
1245
+
1246
# Evaluate the text of a profile in the scope of this object.
#
# contents::     the profile's ruby source.
# profile_file:: stored in @current_profile while the profile is evaluated.
#
# Returns true. Errors raised by the profile propagate to the caller;
# @current_profile is reset to nil in either case.
def eval_profile(contents, profile_file=nil)
    @current_profile = profile_file
    instance_eval(contents)
    true
ensure
    @current_profile = nil
end
1255
+
1256
+
1257
# Download url with the downloader registered as download (looked up with
# get_optionvalue(:download, ...)) and hand the parsed page to push_hrefs.
# The block passed to push_hrefs accepts links on the same host whose path
# passes eligible_path?.
#
# Returns the downloaded html, or nil when the page carries a robots
# 'noindex' meta directive.
def get_website(download, url)
    html = call_cmd(get_optionvalue(:download, download), [url])
    return html unless html
    doc = Hpricot(html)
    return html unless doc
    return if robots?(doc, 'noindex')
    push_hrefs(url, doc) do |uri0, _pn0, uri, _pn|
        eligible_path?(url, uri0.path, uri.path) && uri.host == uri0.host
    end
    html
end
1271
+
1272
+
1273
# Like get_website, but the block passed to push_hrefs additionally requires
# that the link's pathname is '.' or that its path relative to the first
# pathname (pn0) is '.'.
#
# Returns the downloaded html, or nil when the page carries a robots
# 'noindex' meta directive.
def get_website_below(download, url)
    downloader = get_optionvalue(:download, download)
    html = call_cmd(downloader, [url])
    return html unless html
    doc = Hpricot(html)
    return html unless doc
    return if robots?(doc, 'noindex')
    push_hrefs(url, doc) do |uri0, pn0, uri, pn|
        same_site = eligible_path?(url, uri0.path, uri.path) && uri.host == uri0.host
        same_site && (pn.to_s == '.' || pn.relative_path_from(pn0).to_s == '.')
    end
    html
end
1289
+
1290
+
1291
# Retrieve the listing of the directory an ftp url points to.
#
# Connects to the url's host in passive mode, logs in without credentials,
# changes to the url's path and lists '*'.
#
# Returns the listing as an array of lines, or nil when the session fails;
# the error is logged in that case. Only StandardError is handled, so
# Interrupt and SystemExit are no longer swallowed. The connection is closed
# in either case.
def get_ftp(url)
    uri = URI.parse(url)
    ftp = Net::FTP.new(uri.host)
    ftp.passive = true
    begin
        ftp.login
        ftp.chdir(uri.path)
        ftp.list('*')
    rescue StandardError => e
        $logger.error e
        # Explicit failure value; the logger's return value used to leak out.
        nil
    ensure
        ftp.close
    end
end
1305
+
1306
+
1307
# Return the id used for url's entry in the table of contents: 't' followed
# by the MD5 hex digest of the url.
def html_toc_id(url)
    digest = Digest::MD5.hexdigest(url)
    "t#{digest}"
end
1310
+
1311
+
1312
# Return the id used for url's section in the body: 'b' followed by the MD5
# hex digest of the url.
def html_body_id(url)
    digest = Digest::MD5.hexdigest(url)
    "b#{digest}"
end
1315
+
1316
+
1317
# Normalize a file name derived from an url.
#
# Runs of two or more slashes are collapsed into a single separator. The
# component '__WEBSITARY__' is appended when the name ends with a separator,
# or when it consists of exactly two components of which the first is an
# encoded scheme (word characters followed by '%3a').
def ensure_filename(filename)
    sep = Regexp.escape(File::SEPARATOR)
    cleaned = filename.gsub(/[\/]{2,}/, File::SEPARATOR)
    needs_marker = cleaned =~ /#{sep}$/
    unless needs_marker
        parts = cleaned.split(/#{sep}/)
        needs_marker = parts.size == 2 && parts[0] =~ /^\w+%3a$/
    end
    needs_marker ? File.join(cleaned, '__WEBSITARY__') : cleaned
end
1331
+
1332
+
1333
# Open url and return whatever open returns for it.
#
# Generic (scheme-less) and file urls are opened as they are. Any other url
# is opened with request headers: a User-Agent set to @user_agent, merged
# with the url's :header option.
def open_url(url)
    $logger.debug "Open URL: #{url}"
    uri = URI.parse(url)
    local = uri.instance_of?(URI::Generic) || uri.scheme == 'file'
    return open(url) if local
    header = {"User-Agent" => @user_agent}.merge(get(url, :header, {}))
    open(url, header)
end
1344
+
1345
+
1346
# Return the annotation for url's diff: the url's :format_annotation option
# (default '%s >>> %s') evaluated by eval_arg with the cached mtimes of the
# old and the latest copy. Returns nil unless both copies exist.
def difftext_annotation(url)
    previous = oldname(url)
    current = latestname(url)
    return unless File.exist?(previous) and File.exist?(current)
    template = get(url, :format_annotation, '%s >>> %s')
    eval_arg(template, [@mtimes.mtime(previous), @mtimes.mtime(current)])
end
1353
+
1354
+
1355
# Insert name into format_string and return the result as a symbol.
def format_symbol(name, format_string)
    format(format_string, name.to_s).to_sym
end
1358
+
1359
+
1360
# Render one rss item as html.
#
# item::      responds to title, author, link and pubDate.
# body::      markup shown as the item's text.
# enclosure:: optional markup appended after the body.
#
# The heading links to the item and shows its title, the author in
# parentheses when there is one, and the publication date.
def format_rss_item(item, body, enclosure='')
    heading = item.author ? "#{item.title} (#{item.author})" : "#{item.title}"
    <<EOT
<h2 class="rss"><a class="rss" href="#{item.link}">#{heading} -- #{item.pubDate}</a></h2>
<div class="rss">
#{body}
#{enclosure}
</div>
EOT
end
1371
+
1372
# Guess whether text is html rather than plain text: look for the opening
# of one of a few common html tags. Returns the position of the first such
# tag, or nil when there is none.
def is_html?(text)
    tags = %w[div a span body html script p table td tr th li dt br hr em b]
    text =~ /<(#{tags.join('|')})\b/
end
1376
+
1377
+
1378
# Convert html to plain text using hpricot. A nil or false text is returned
# unchanged.
def html_to_text(text)
    return text unless text
    Hpricot(text).to_plain_text
end
1382
+
1383
+
1384
# Check whether the hpricot document carries a robots meta tag that lists at
# least one of the given directives (e.g. 'noindex', 'nofollow').
#
# Directives are compared case-insensitively and without surrounding
# whitespace. A robots meta tag without a content attribute is ignored
# (it used to raise a NoMethodError).
def robots?(hpricot, *what)
    wanted = what.map {|w| w.to_s.downcase}
    (hpricot / '//meta[@name="robots"]').any? do |meta|
        directives = meta['content'].to_s.downcase.split(',').map {|d| d.strip}
        wanted.any? {|w| directives.include?(w)}
    end
end
1390
+
1391
+
1392
# Check whether robots are allowed to retrieve an url.
#
# url:: the url as a string; used as the cache key and passed to the rules.
# uri:: the parsed url; its host selects the robots.txt to consult.
#
# Results are cached per url in @allow, parsed rules per host in @robots.
# When RobotRules is not available, a warning is logged once and every url
# is allowed. Urls without a robots.txt location (relative uris) are allowed
# as well. Returns true or false.
def robots_allowed?(url, uri)
    return @allow[url] if @allow.has_key?(url)

    if defined?(RobotRules)
        host = uri.host

        unless (rules = @robots[host])
            # Fixed: the nil returned for relative uris used to be turned into
            # '' by to_s before the check, so the guard never fired and a
            # download of '' was attempted.
            ruri = robots_uri(uri)
            return true unless ruri
            rurl = ruri.to_s
            begin
                robots_txt = open_url(rurl).read
                rules = RobotRules.new(@user_agent)
                rules.parse(rurl, robots_txt)
                @robots[host] = rules
                $logger.info "Loaded #{rurl} for #{@user_agent}"
                $logger.debug robots_txt
            rescue StandardError => e
                # Best effort: a robots.txt that cannot be loaded or parsed
                # does not block the url. Report it through the logger.
                $logger.warn "Cannot load #{rurl}: #{e}"
                $logger.debug robots_txt
            end
        end

        rv = if rules and !rules.allowed?(url)
                 $logger.info "Excluded url: #{url}"
                 false
             else
                 true
             end
        @allow[url] = rv
        return rv
    end

    unless @robots[:warning]
        $logger.warn 'robots.txt is ignored: Please install robot_rules.rb from http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589 in $RUBYLIB'
        @robots[:warning] = true
    end
    @allow[url] = true
    return true
end
1434
+
1435
+
1436
# Get the robots.txt uri for uri: same scheme, host and port, with the path
# '/robots.txt'. Query and fragment of the original uri are dropped (they
# used to be carried over into the robots.txt uri). The argument is not
# modified. Returns nil for relative uris.
def robots_uri(uri)
    return nil if uri.relative?
    ruri = uri.dup
    ruri.path = '/robots.txt'
    ruri.query = nil
    ruri.fragment = nil
    ruri
end
1444
+
1445
+
1446
# Sort difftext in place by the lower-cased :title option of each entry's
# first element; that element itself is used where no title is set.
# Returns difftext.
def sort_difftext!(difftext)
    difftext.sort! do |left, right|
        left_key, right_key = left[0], right[0]
        left_title = get(left_key, :title, left_key).downcase
        right_title = get(right_key, :title, right_key).downcase
        left_title <=> right_title
    end
end
1453
+
1454
+
1455
# Return the url for a local file: filename is passed through the global
# :file_url option (call_cmd is given filename itself as third argument) and
# the result is encoded, leaving ':' and '/' untouched.
def file_url(filename)
    converter = get_optionvalue(:global, :file_url)
    encode(call_cmd(converter, [filename], filename), ':/')
end
1461
+
1462
+
1463
# Percent-encode text.
#
# text::  the string to encode.
# chars:: additional characters to leave untouched; inserted verbatim into
#         a regexp character class.
#
# Every character other than a-z, A-Z, 0-9, ',', '.', '_', '-' and those in
# chars is replaced by '%xx' (two lower-case hex digits) for each of its
# bytes. Encoding per byte keeps this working on ruby >= 1.9, where indexing
# a string yields a string, not a byte, and the original formatting raised
# an ArgumentError.
def encode(text, chars='')
    text.gsub(/[^a-zA-Z0-9,._#{chars}-]/) do |t|
        t.unpack('C*').map {|byte| '%%%02x' % byte}.join
    end
end
1466
+
1467
+
1468
# Reverse encode: every '%' followed by two characters is replaced by the
# character whose code is the hex value of those two characters.
def decode(text)
    text.gsub(/%(..)/) do
        code = Regexp.last_match(1).hex
        format("%c", code)
    end
end
1471
+
1472
+
1473
# Return the file suffix for an output format: the entry for the format in
# @suffix, or the format itself when there is none. Without outformat the
# first element of @output_format is used.
def output_suffix(outformat)
    format_name = outformat || @output_format[0]
    mapped = @suffix[format_name]
    mapped || format_name
end
1477
+
1478
+
1479
# Return the base name of the output files: the elements of @profiles
# joined by commas.
def output_basename
    separator = ','
    @profiles.join(separator)
end
1482
+
1483
+
1484
# Return the name of the output file for outformat: the entry in @outfile
# when one is set for the format, otherwise
# '<output_basename>.<output_suffix>' inside @cfgdir.
def get_outfile(outformat=nil)
    explicit = @outfile[outformat]
    return explicit if explicit
    File.join(@cfgdir, "#{output_basename}.#{output_suffix(outformat)}")
end
1487
+
1488
+
1489
# Show outfile with the viewer command in @view: the file name is inserted
# into the command with String#% and the result is run with system.
# Returns system's result, or nil when no viewer is configured.
def view_output_general(outfile)
    return unless @view
    command = @view % outfile
    system(command)
end
# The html, text and rss outputs are all shown the same way.
alias :view_output_html :view_output_general
alias :view_output_text :view_output_general
alias :view_output_rss :view_output_general
1497
+
1498
+ end
1499
+
1500
+
1501
+
1502
+
1503
+ # Local Variables:
1504
+ # revisionRx: REVISION\s\+=\s\+\'
1505
+ # End: