websitary 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/lib/websitary.rb +3 -3
- data/lib/websitary/configuration.rb +21 -8
- data/lib/websitary/filemtimes.rb +8 -0
- metadata +1 -1
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
= 0.2.1
|
2
|
+
|
3
|
+
* Use URI.merge for constructing robots.txt uri.
|
4
|
+
* Fixed minor show-stopper.
|
5
|
+
|
6
|
+
|
1
7
|
= 0.2.0
|
2
8
|
|
3
9
|
* Renamed the project from websitiary to websitary (without the
|
@@ -51,6 +57,7 @@
|
|
51
57
|
:months => N (calculated on basis of the calendar month, not the
|
52
58
|
number of days)
|
53
59
|
|
60
|
+
|
54
61
|
== 0.1.0 / 2007-07-16
|
55
62
|
|
56
63
|
* Initial release
|
data/lib/websitary.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# websitary.rb
|
2
|
-
# @Last Change: 2007-09-
|
2
|
+
# @Last Change: 2007-09-16.
|
3
3
|
# Author:: Thomas Link (micathom AT gmail com)
|
4
4
|
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
5
5
|
# Created:: 2007-09-08.
|
@@ -37,8 +37,8 @@ end
|
|
37
37
|
|
38
38
|
module Websitary
|
39
39
|
APPNAME = 'websitary'
|
40
|
-
VERSION = '0.2.0'
|
41
|
-
REVISION = '
|
40
|
+
VERSION = '0.2.1'
|
41
|
+
REVISION = '2405'
|
42
42
|
end
|
43
43
|
|
44
44
|
require 'websitary/applog'
|
@@ -55,6 +55,7 @@ class Websitary::Configuration
|
|
55
55
|
@default_profiles = [@quicklist_profile]
|
56
56
|
@done = []
|
57
57
|
@mtimes = Websitary::FileMTimes.new(self)
|
58
|
+
@options = {}
|
58
59
|
@outfile = {}
|
59
60
|
@profiles = []
|
60
61
|
@robots = {}
|
@@ -162,6 +163,11 @@ class Websitary::Configuration
|
|
162
163
|
@logger.set_level(:verbose)
|
163
164
|
end
|
164
165
|
|
166
|
+
opts.on('--version', 'Run verbosely') do |v|
|
167
|
+
puts Websitary::VERSION
|
168
|
+
exit 1
|
169
|
+
end
|
170
|
+
|
165
171
|
opts.on_tail('-h', '--help', 'Show this message') do
|
166
172
|
puts opts
|
167
173
|
exit 1
|
@@ -786,7 +792,7 @@ HTML
|
|
786
792
|
pn = guess_dir(uri.path)
|
787
793
|
href = rewrite_href(href, url, uri0, pn0, true)
|
788
794
|
curl = canonic_url(href)
|
789
|
-
next if href.nil? or @done.include?(curl) or @todo.include?(curl)
|
795
|
+
next if !href or href.nil? or @done.include?(curl) or @todo.include?(curl)
|
790
796
|
next unless robots_allowed?(curl, uri)
|
791
797
|
# pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
|
792
798
|
uri = URI.parse(href)
|
@@ -1278,7 +1284,8 @@ OUT
|
|
1278
1284
|
if doc
|
1279
1285
|
return if robots?(doc, 'noindex')
|
1280
1286
|
push_hrefs(url, doc) do |uri0, pn0, uri, pn|
|
1281
|
-
|
1287
|
+
uri.host && uri0.host &&
|
1288
|
+
eligible_path?(url, uri0.path, uri.path) &&
|
1282
1289
|
uri.host == uri0.host &&
|
1283
1290
|
(pn.to_s == '.' || pn.relative_path_from(pn0).to_s == '.')
|
1284
1291
|
end
|
@@ -1331,6 +1338,11 @@ OUT
|
|
1331
1338
|
|
1332
1339
|
|
1333
1340
|
def open_url(url)
|
1341
|
+
if url.nil? or url.empty?
|
1342
|
+
$logger.fatal "Internal error: url is nil"
|
1343
|
+
puts caller.join("\n")
|
1344
|
+
exit 5
|
1345
|
+
end
|
1334
1346
|
$logger.debug "Open URL: #{url}"
|
1335
1347
|
uri = URI.parse(url)
|
1336
1348
|
if uri.instance_of?(URI::Generic) or uri.scheme == 'file'
|
@@ -1400,7 +1412,7 @@ EOT
|
|
1400
1412
|
|
1401
1413
|
unless (rules = @robots[host])
|
1402
1414
|
rurl = robots_uri(uri).to_s
|
1403
|
-
return true
|
1415
|
+
return true if rurl.nil? or rurl.empty?
|
1404
1416
|
begin
|
1405
1417
|
robots_txt = open_url(rurl).read
|
1406
1418
|
rules = RobotRules.new(@user_agent)
|
@@ -1409,8 +1421,7 @@ EOT
|
|
1409
1421
|
$logger.info "Loaded #{rurl} for #{@user_agent}"
|
1410
1422
|
$logger.debug robots_txt
|
1411
1423
|
rescue Exception => e
|
1412
|
-
|
1413
|
-
puts robots_txt
|
1424
|
+
$logger.info "#{rurl}: #{e}"
|
1414
1425
|
end
|
1415
1426
|
end
|
1416
1427
|
|
@@ -1436,9 +1447,11 @@ EOT
|
|
1436
1447
|
# Get the robots.txt uri for uri.
|
1437
1448
|
def robots_uri(uri)
|
1438
1449
|
unless uri.relative?
|
1439
|
-
ruri = uri.dup
|
1440
|
-
ruri.path = '/robots.txt'
|
1441
|
-
ruri
|
1450
|
+
# ruri = uri.dup
|
1451
|
+
# ruri.path = '/robots.txt'
|
1452
|
+
# ruri.query = nil
|
1453
|
+
# ruri
|
1454
|
+
uri.merge '/robots.txt'
|
1442
1455
|
end
|
1443
1456
|
end
|
1444
1457
|
|
data/lib/websitary/filemtimes.rb
CHANGED
@@ -20,6 +20,12 @@ class Websitary::FileMTimes
|
|
20
20
|
def swap_in
|
21
21
|
if File.exist?(@store)
|
22
22
|
@data = YAML.load_file(@store)
|
23
|
+
case @data
|
24
|
+
when Hash
|
25
|
+
else
|
26
|
+
$logger.error 'mtime.yml stored malformed data'
|
27
|
+
@data = {}
|
28
|
+
end
|
23
29
|
File.delete(@store)
|
24
30
|
end
|
25
31
|
end
|
@@ -40,6 +46,8 @@ class Websitary::FileMTimes
|
|
40
46
|
@data[filenamec] = mtime
|
41
47
|
$logger.debug "Set mtime: #{filename} -> #{mtime.to_s}"
|
42
48
|
mtime
|
49
|
+
else
|
50
|
+
nil
|
43
51
|
end
|
44
52
|
end
|
45
53
|
end
|
metadata
CHANGED