websitary 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,9 @@
1
+ = 0.2.1
2
+
3
+ * Use URI.merge for constructing the robots.txt URI.
4
+ * Fixed a minor show-stopper.
5
+
6
+
1
7
  = 0.2.0
2
8
 
3
9
  * Renamed the project from websitiary to websitary (without the
@@ -51,6 +57,7 @@
51
57
  :months => N (calculated on basis of the calendar month, not the
52
58
  number of days)
53
59
 
60
+
54
61
  == 0.1.0 / 2007-07-16
55
62
 
56
63
  * Initial release
@@ -1,5 +1,5 @@
1
1
  # websitary.rb
2
- # @Last Change: 2007-09-11.
2
+ # @Last Change: 2007-09-16.
3
3
  # Author:: Thomas Link (micathom AT gmail com)
4
4
  # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
5
  # Created:: 2007-09-08.
@@ -37,8 +37,8 @@ end
37
37
 
38
38
  module Websitary
39
39
  APPNAME = 'websitary'
40
- VERSION = '0.2.0'
41
- REVISION = '2403'
40
+ VERSION = '0.2.1'
41
+ REVISION = '2405'
42
42
  end
43
43
 
44
44
  require 'websitary/applog'
@@ -55,6 +55,7 @@ class Websitary::Configuration
55
55
  @default_profiles = [@quicklist_profile]
56
56
  @done = []
57
57
  @mtimes = Websitary::FileMTimes.new(self)
58
+ @options = {}
58
59
  @outfile = {}
59
60
  @profiles = []
60
61
  @robots = {}
@@ -162,6 +163,11 @@ class Websitary::Configuration
162
163
  @logger.set_level(:verbose)
163
164
  end
164
165
 
166
+ opts.on('--version', 'Run verbosely') do |v|
167
+ puts Websitary::VERSION
168
+ exit 1
169
+ end
170
+
165
171
  opts.on_tail('-h', '--help', 'Show this message') do
166
172
  puts opts
167
173
  exit 1
@@ -786,7 +792,7 @@ HTML
786
792
  pn = guess_dir(uri.path)
787
793
  href = rewrite_href(href, url, uri0, pn0, true)
788
794
  curl = canonic_url(href)
789
- next if href.nil? or @done.include?(curl) or @todo.include?(curl)
795
+ next if !href or href.nil? or @done.include?(curl) or @todo.include?(curl)
790
796
  next unless robots_allowed?(curl, uri)
791
797
  # pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
792
798
  uri = URI.parse(href)
@@ -1278,7 +1284,8 @@ OUT
1278
1284
  if doc
1279
1285
  return if robots?(doc, 'noindex')
1280
1286
  push_hrefs(url, doc) do |uri0, pn0, uri, pn|
1281
- eligible_path?(url, uri0.path, uri.path) &&
1287
+ uri.host && uri0.host &&
1288
+ eligible_path?(url, uri0.path, uri.path) &&
1282
1289
  uri.host == uri0.host &&
1283
1290
  (pn.to_s == '.' || pn.relative_path_from(pn0).to_s == '.')
1284
1291
  end
@@ -1331,6 +1338,11 @@ OUT
1331
1338
 
1332
1339
 
1333
1340
  def open_url(url)
1341
+ if url.nil? or url.empty?
1342
+ $logger.fatal "Internal error: url is nil"
1343
+ puts caller.join("\n")
1344
+ exit 5
1345
+ end
1334
1346
  $logger.debug "Open URL: #{url}"
1335
1347
  uri = URI.parse(url)
1336
1348
  if uri.instance_of?(URI::Generic) or uri.scheme == 'file'
@@ -1400,7 +1412,7 @@ EOT
1400
1412
 
1401
1413
  unless (rules = @robots[host])
1402
1414
  rurl = robots_uri(uri).to_s
1403
- return true unless rurl
1415
+ return true if rurl.nil? or rurl.empty?
1404
1416
  begin
1405
1417
  robots_txt = open_url(rurl).read
1406
1418
  rules = RobotRules.new(@user_agent)
@@ -1409,8 +1421,7 @@ EOT
1409
1421
  $logger.info "Loaded #{rurl} for #{@user_agent}"
1410
1422
  $logger.debug robots_txt
1411
1423
  rescue Exception => e
1412
- puts e
1413
- puts robots_txt
1424
+ $logger.info "#{rurl}: #{e}"
1414
1425
  end
1415
1426
  end
1416
1427
 
@@ -1436,9 +1447,11 @@ EOT
1436
1447
  # Get the robots.txt uri for uri.
1437
1448
  def robots_uri(uri)
1438
1449
  unless uri.relative?
1439
- ruri = uri.dup
1440
- ruri.path = '/robots.txt'
1441
- ruri
1450
+ # ruri = uri.dup
1451
+ # ruri.path = '/robots.txt'
1452
+ # ruri.query = nil
1453
+ # ruri
1454
+ uri.merge '/robots.txt'
1442
1455
  end
1443
1456
  end
1444
1457
 
@@ -20,6 +20,12 @@ class Websitary::FileMTimes
20
20
  def swap_in
21
21
  if File.exist?(@store)
22
22
  @data = YAML.load_file(@store)
23
+ case @data
24
+ when Hash
25
+ else
26
+ $logger.error 'mtime.yml stored malformed data'
27
+ @data = {}
28
+ end
23
29
  File.delete(@store)
24
30
  end
25
31
  end
@@ -40,6 +46,8 @@ class Websitary::FileMTimes
40
46
  @data[filenamec] = mtime
41
47
  $logger.debug "Set mtime: #{filename} -> #{mtime.to_s}"
42
48
  mtime
49
+ else
50
+ nil
43
51
  end
44
52
  end
45
53
  end
metadata CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: websitary
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.0
6
+ version: 0.2.1
7
7
  date: 2007-09-16 00:00:00 +02:00
8
8
  summary: A unified website news, rss feed, podcast monitor
9
9
  require_paths: