websitary 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/lib/websitary.rb +3 -3
- data/lib/websitary/configuration.rb +21 -8
- data/lib/websitary/filemtimes.rb +8 -0
- metadata +1 -1
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
= 0.2.1
|
2
|
+
|
3
|
+
* Use URI.merge for constructing robots.txt uri.
|
4
|
+
* Fixed minor show-stopper.
|
5
|
+
|
6
|
+
|
1
7
|
= 0.2.0
|
2
8
|
|
3
9
|
* Renamed the project from websitiary to websitary (without the
|
@@ -51,6 +57,7 @@
|
|
51
57
|
:months => N (calculated on basis of the calendar month, not the
|
52
58
|
number of days)
|
53
59
|
|
60
|
+
|
54
61
|
== 0.1.0 / 2007-07-16
|
55
62
|
|
56
63
|
* Initial release
|
data/lib/websitary.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# websitary.rb
|
2
|
-
# @Last Change: 2007-09-
|
2
|
+
# @Last Change: 2007-09-16.
|
3
3
|
# Author:: Thomas Link (micathom AT gmail com)
|
4
4
|
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
5
5
|
# Created:: 2007-09-08.
|
@@ -37,8 +37,8 @@ end
|
|
37
37
|
|
38
38
|
module Websitary
|
39
39
|
APPNAME = 'websitary'
|
40
|
-
VERSION = '0.2.0'
|
41
|
-
REVISION = '
|
40
|
+
VERSION = '0.2.1'
|
41
|
+
REVISION = '2405'
|
42
42
|
end
|
43
43
|
|
44
44
|
require 'websitary/applog'
|
@@ -55,6 +55,7 @@ class Websitary::Configuration
|
|
55
55
|
@default_profiles = [@quicklist_profile]
|
56
56
|
@done = []
|
57
57
|
@mtimes = Websitary::FileMTimes.new(self)
|
58
|
+
@options = {}
|
58
59
|
@outfile = {}
|
59
60
|
@profiles = []
|
60
61
|
@robots = {}
|
@@ -162,6 +163,11 @@ class Websitary::Configuration
|
|
162
163
|
@logger.set_level(:verbose)
|
163
164
|
end
|
164
165
|
|
166
|
+
opts.on('--version', 'Run verbosely') do |v|
|
167
|
+
puts Websitary::VERSION
|
168
|
+
exit 1
|
169
|
+
end
|
170
|
+
|
165
171
|
opts.on_tail('-h', '--help', 'Show this message') do
|
166
172
|
puts opts
|
167
173
|
exit 1
|
@@ -786,7 +792,7 @@ HTML
|
|
786
792
|
pn = guess_dir(uri.path)
|
787
793
|
href = rewrite_href(href, url, uri0, pn0, true)
|
788
794
|
curl = canonic_url(href)
|
789
|
-
next if href.nil? or @done.include?(curl) or @todo.include?(curl)
|
795
|
+
next if !href or href.nil? or @done.include?(curl) or @todo.include?(curl)
|
790
796
|
next unless robots_allowed?(curl, uri)
|
791
797
|
# pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
|
792
798
|
uri = URI.parse(href)
|
@@ -1278,7 +1284,8 @@ OUT
|
|
1278
1284
|
if doc
|
1279
1285
|
return if robots?(doc, 'noindex')
|
1280
1286
|
push_hrefs(url, doc) do |uri0, pn0, uri, pn|
|
1281
|
-
|
1287
|
+
uri.host && uri0.host &&
|
1288
|
+
eligible_path?(url, uri0.path, uri.path) &&
|
1282
1289
|
uri.host == uri0.host &&
|
1283
1290
|
(pn.to_s == '.' || pn.relative_path_from(pn0).to_s == '.')
|
1284
1291
|
end
|
@@ -1331,6 +1338,11 @@ OUT
|
|
1331
1338
|
|
1332
1339
|
|
1333
1340
|
def open_url(url)
|
1341
|
+
if url.nil? or url.empty?
|
1342
|
+
$logger.fatal "Internal error: url is nil"
|
1343
|
+
puts caller.join("\n")
|
1344
|
+
exit 5
|
1345
|
+
end
|
1334
1346
|
$logger.debug "Open URL: #{url}"
|
1335
1347
|
uri = URI.parse(url)
|
1336
1348
|
if uri.instance_of?(URI::Generic) or uri.scheme == 'file'
|
@@ -1400,7 +1412,7 @@ EOT
|
|
1400
1412
|
|
1401
1413
|
unless (rules = @robots[host])
|
1402
1414
|
rurl = robots_uri(uri).to_s
|
1403
|
-
return true
|
1415
|
+
return true if rurl.nil? or rurl.empty?
|
1404
1416
|
begin
|
1405
1417
|
robots_txt = open_url(rurl).read
|
1406
1418
|
rules = RobotRules.new(@user_agent)
|
@@ -1409,8 +1421,7 @@ EOT
|
|
1409
1421
|
$logger.info "Loaded #{rurl} for #{@user_agent}"
|
1410
1422
|
$logger.debug robots_txt
|
1411
1423
|
rescue Exception => e
|
1412
|
-
|
1413
|
-
puts robots_txt
|
1424
|
+
$logger.info "#{rurl}: #{e}"
|
1414
1425
|
end
|
1415
1426
|
end
|
1416
1427
|
|
@@ -1436,9 +1447,11 @@ EOT
|
|
1436
1447
|
# Get the robots.txt uri for uri.
|
1437
1448
|
def robots_uri(uri)
|
1438
1449
|
unless uri.relative?
|
1439
|
-
ruri = uri.dup
|
1440
|
-
ruri.path = '/robots.txt'
|
1441
|
-
ruri
|
1450
|
+
# ruri = uri.dup
|
1451
|
+
# ruri.path = '/robots.txt'
|
1452
|
+
# ruri.query = nil
|
1453
|
+
# ruri
|
1454
|
+
uri.merge '/robots.txt'
|
1442
1455
|
end
|
1443
1456
|
end
|
1444
1457
|
|
data/lib/websitary/filemtimes.rb
CHANGED
@@ -20,6 +20,12 @@ class Websitary::FileMTimes
|
|
20
20
|
def swap_in
|
21
21
|
if File.exist?(@store)
|
22
22
|
@data = YAML.load_file(@store)
|
23
|
+
case @data
|
24
|
+
when Hash
|
25
|
+
else
|
26
|
+
$logger.error 'mtime.yml stored malformed data'
|
27
|
+
@data = {}
|
28
|
+
end
|
23
29
|
File.delete(@store)
|
24
30
|
end
|
25
31
|
end
|
@@ -40,6 +46,8 @@ class Websitary::FileMTimes
|
|
40
46
|
@data[filenamec] = mtime
|
41
47
|
$logger.debug "Set mtime: #{filename} -> #{mtime.to_s}"
|
42
48
|
mtime
|
49
|
+
else
|
50
|
+
nil
|
43
51
|
end
|
44
52
|
end
|
45
53
|
end
|
metadata
CHANGED