websitary 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,9 @@
1
+ = 0.2.1
2
+
3
+ * Switched to URI.merge for constructing the robots.txt URI.
4
+ * Fixed minor show-stopper.
5
+
6
+
1
7
  = 0.2.0
2
8
 
3
9
  * Renamed the project from websitiary to websitary (without the
@@ -51,6 +57,7 @@
51
57
  :months => N (calculated on basis of the calendar month, not the
52
58
  number of days)
53
59
 
60
+
54
61
  == 0.1.0 / 2007-07-16
55
62
 
56
63
  * Initial release
@@ -1,5 +1,5 @@
1
1
  # websitary.rb
2
- # @Last Change: 2007-09-11.
2
+ # @Last Change: 2007-09-16.
3
3
  # Author:: Thomas Link (micathom AT gmail com)
4
4
  # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
5
  # Created:: 2007-09-08.
@@ -37,8 +37,8 @@ end
37
37
 
38
38
  module Websitary
39
39
  APPNAME = 'websitary'
40
- VERSION = '0.2.0'
41
- REVISION = '2403'
40
+ VERSION = '0.2.1'
41
+ REVISION = '2405'
42
42
  end
43
43
 
44
44
  require 'websitary/applog'
@@ -55,6 +55,7 @@ class Websitary::Configuration
55
55
  @default_profiles = [@quicklist_profile]
56
56
  @done = []
57
57
  @mtimes = Websitary::FileMTimes.new(self)
58
+ @options = {}
58
59
  @outfile = {}
59
60
  @profiles = []
60
61
  @robots = {}
@@ -162,6 +163,11 @@ class Websitary::Configuration
162
163
  @logger.set_level(:verbose)
163
164
  end
164
165
 
166
+ opts.on('--version', 'Run verbosely') do |v|
167
+ puts Websitary::VERSION
168
+ exit 1
169
+ end
170
+
165
171
  opts.on_tail('-h', '--help', 'Show this message') do
166
172
  puts opts
167
173
  exit 1
@@ -786,7 +792,7 @@ HTML
786
792
  pn = guess_dir(uri.path)
787
793
  href = rewrite_href(href, url, uri0, pn0, true)
788
794
  curl = canonic_url(href)
789
- next if href.nil? or @done.include?(curl) or @todo.include?(curl)
795
+ next if !href or href.nil? or @done.include?(curl) or @todo.include?(curl)
790
796
  next unless robots_allowed?(curl, uri)
791
797
  # pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
792
798
  uri = URI.parse(href)
@@ -1278,7 +1284,8 @@ OUT
1278
1284
  if doc
1279
1285
  return if robots?(doc, 'noindex')
1280
1286
  push_hrefs(url, doc) do |uri0, pn0, uri, pn|
1281
- eligible_path?(url, uri0.path, uri.path) &&
1287
+ uri.host && uri0.host &&
1288
+ eligible_path?(url, uri0.path, uri.path) &&
1282
1289
  uri.host == uri0.host &&
1283
1290
  (pn.to_s == '.' || pn.relative_path_from(pn0).to_s == '.')
1284
1291
  end
@@ -1331,6 +1338,11 @@ OUT
1331
1338
 
1332
1339
 
1333
1340
  def open_url(url)
1341
+ if url.nil? or url.empty?
1342
+ $logger.fatal "Internal error: url is nil"
1343
+ puts caller.join("\n")
1344
+ exit 5
1345
+ end
1334
1346
  $logger.debug "Open URL: #{url}"
1335
1347
  uri = URI.parse(url)
1336
1348
  if uri.instance_of?(URI::Generic) or uri.scheme == 'file'
@@ -1400,7 +1412,7 @@ EOT
1400
1412
 
1401
1413
  unless (rules = @robots[host])
1402
1414
  rurl = robots_uri(uri).to_s
1403
- return true unless rurl
1415
+ return true if rurl.nil? or rurl.empty?
1404
1416
  begin
1405
1417
  robots_txt = open_url(rurl).read
1406
1418
  rules = RobotRules.new(@user_agent)
@@ -1409,8 +1421,7 @@ EOT
1409
1421
  $logger.info "Loaded #{rurl} for #{@user_agent}"
1410
1422
  $logger.debug robots_txt
1411
1423
  rescue Exception => e
1412
- puts e
1413
- puts robots_txt
1424
+ $logger.info "#{rurl}: #{e}"
1414
1425
  end
1415
1426
  end
1416
1427
 
@@ -1436,9 +1447,11 @@ EOT
1436
1447
  # Get the robots.txt uri for uri.
1437
1448
  def robots_uri(uri)
1438
1449
  unless uri.relative?
1439
- ruri = uri.dup
1440
- ruri.path = '/robots.txt'
1441
- ruri
1450
+ # ruri = uri.dup
1451
+ # ruri.path = '/robots.txt'
1452
+ # ruri.query = nil
1453
+ # ruri
1454
+ uri.merge '/robots.txt'
1442
1455
  end
1443
1456
  end
1444
1457
 
@@ -20,6 +20,12 @@ class Websitary::FileMTimes
20
20
  def swap_in
21
21
  if File.exist?(@store)
22
22
  @data = YAML.load_file(@store)
23
+ case @data
24
+ when Hash
25
+ else
26
+ $logger.error 'mtime.yml stored malformed data'
27
+ @data = {}
28
+ end
23
29
  File.delete(@store)
24
30
  end
25
31
  end
@@ -40,6 +46,8 @@ class Websitary::FileMTimes
40
46
  @data[filenamec] = mtime
41
47
  $logger.debug "Set mtime: #{filename} -> #{mtime.to_s}"
42
48
  mtime
49
+ else
50
+ nil
43
51
  end
44
52
  end
45
53
  end
metadata CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: websitary
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.0
6
+ version: 0.2.1
7
7
  date: 2007-09-16 00:00:00 +02:00
8
8
  summary: A unified website news, rss feed, podcast monitor
9
9
  require_paths: