simplecrawler 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/examples/accessibility_report.rb +32 -19
- data/examples/find_broken_links.rb +21 -0
- data/examples/result.htm +1282 -0
- data/examples/riksdagen.txt +66 -0
- data/lib/simplecrawler.rb +13 -7
- metadata +5 -2
@@ -0,0 +1,66 @@
|
|
1
|
+
Added http://www.riksdagen.se/
|
2
|
+
Opening http://www.riksdagen.se/
|
3
|
+
Loading data from http://www.riksdagen.se/
|
4
|
+
3 http://www.riksdagen.se/
|
5
|
+
Queuing links for http://www.riksdagen.se/
|
6
|
+
Added http://www.riksdagen.se/om
|
7
|
+
Added http://www.riksdagen.se/innehall
|
8
|
+
Added http://www.riksdagen.se/fragor
|
9
|
+
Added http://www.riksdagen.se/anvander
|
10
|
+
Added http://www.riksdagen.se/kontakt
|
11
|
+
Added /default____4.aspx
|
12
|
+
Added /templates/R_Page____283.aspx
|
13
|
+
Added /templates/R_OptionPage____285.aspx
|
14
|
+
Added /shopping/R_Default____2508.aspx
|
15
|
+
Added /webbnav/index.aspx?nid=53
|
16
|
+
Added /templates/R_Page____275.aspx
|
17
|
+
Added /templates/R_Page____2161.aspx
|
18
|
+
Added /templates/R_Page____273.aspx
|
19
|
+
Added /templates/R_Page____3618.aspx
|
20
|
+
Added /templates/R_Page____271.aspx
|
21
|
+
Added /templates/R_SubStartPage____5029.aspx
|
22
|
+
Added /templates/R_SubStartPage____6694.aspx
|
23
|
+
Added /templates/R_SubStartPage____272.aspx
|
24
|
+
Added /templates/R_SubStartPage____282.aspx
|
25
|
+
Added /templates/R_SubStartPage____238.aspx
|
26
|
+
Added /templates/R_Page____1068.aspx
|
27
|
+
Added /templates/R_Page____6531.aspx
|
28
|
+
Added /templates/R_CustomCalendar____1765.aspx
|
29
|
+
Added /Webbnav/index.aspx?nid=7800
|
30
|
+
Added /templates/R_SubStartPage____4935.aspx
|
31
|
+
Added /templates/R_Page____4335.aspx
|
32
|
+
Added /templates/R_Page____13951.aspx
|
33
|
+
Added /templates/R_Page____1595.aspx
|
34
|
+
Added /templates/R_Page____16886.aspx
|
35
|
+
Added /templates/R_Page____14748.aspx
|
36
|
+
Added /templates/R_Page____12861.aspx
|
37
|
+
Added #
|
38
|
+
Added /templates/R_ExternalPage____3383.aspx?op=search&search_freetext=&search_parlamentary_session=2008%2F09&search_type=Interpellationsdebatt&search_committee=&search_speaker=&search_party=&search_sdate1=2009%2D01%2D16&search_sdate2=2009%2D01%2D16&rpage=0
|
39
|
+
Added http://www.riksdagen.se/templates/R_HtmlCallPage____17356.aspx
|
40
|
+
Added http://www.riksdagen.se/templates/R_Page____17944.aspx
|
41
|
+
Added http://www.riksdagen.se/templates/R_Page____17958.aspx
|
42
|
+
Added http://www.riksdagen.se/templates/R_HtmlCallPage____17361.aspx
|
43
|
+
Added /templates/R_LopsedelArkiv____3388.aspx
|
44
|
+
Added /templates/R_Page____1977.aspx
|
45
|
+
Added /templates/R_LLSubStartPage____4303.aspx
|
46
|
+
Added /default____56.aspx
|
47
|
+
Added /templates/R_Page____1928.aspx
|
48
|
+
Added /templates/R_Page____6546.aspx
|
49
|
+
Added /templates/R_SubStartPage____257.aspx
|
50
|
+
Added /templates/R_SubStartPage____448.aspx
|
51
|
+
Added /templates/R_SubStartPage____4492.aspx
|
52
|
+
Added /templates/R_Page____498.aspx
|
53
|
+
Added /templates/R_Page____734.aspx
|
54
|
+
Opening http://www.riksdagen.se/om
|
55
|
+
Loading data from http://www.riksdagen.se/om
|
56
|
+
3 http://www.riksdagen.se/om
|
57
|
+
Queuing links for http://www.riksdagen.se/om
|
58
|
+
Added /templates/R_Page____735.aspx
|
59
|
+
Added /templates/R_Page____5915.aspx
|
60
|
+
Added /templates/R_Page____1791.aspx
|
61
|
+
Added /templates/R_Page____1788.aspx
|
62
|
+
Added /templates/R_PageFull____6580.aspx
|
63
|
+
Added /templates/R_Page____6558.aspx
|
64
|
+
Added /templates/R_Page____5674.aspx
|
65
|
+
Opening http://www.riksdagen.se/innehall
|
66
|
+
Loading data from http://www.riksdagen.se/innehall
|
data/lib/simplecrawler.rb
CHANGED
@@ -23,7 +23,7 @@ module SimpleCrawler
|
|
23
23
|
require File.dirname(__FILE__) + '/document'
|
24
24
|
|
25
25
|
MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
|
26
|
-
VERSION = "0.1.6"
|
26
|
+
VERSION = "0.1.7"
|
27
27
|
|
28
28
|
class Crawler
|
29
29
|
|
@@ -107,6 +107,7 @@ module SimpleCrawler
|
|
107
107
|
@queue.push uri.path
|
108
108
|
@current_count = @current_count + 1
|
109
109
|
@visited[uri.path] = false
|
110
|
+
log(" Added #{uri}")
|
110
111
|
end
|
111
112
|
|
112
113
|
end
|
@@ -118,6 +119,7 @@ module SimpleCrawler
|
|
118
119
|
uri = @site_uri.clone
|
119
120
|
uri.path = path if path != "/"
|
120
121
|
doc.uri = uri
|
122
|
+
doc.fetched_at = Time.now
|
121
123
|
|
122
124
|
log("Opening #{uri}")
|
123
125
|
|
@@ -135,10 +137,15 @@ module SimpleCrawler
|
|
135
137
|
|
136
138
|
doc.headers = file.meta
|
137
139
|
doc.http_status = file.status
|
138
|
-
|
139
|
-
rescue
|
140
|
-
log("Error fetching
|
141
|
-
|
140
|
+
|
141
|
+
rescue => error
|
142
|
+
log("Error fetching #{uri}: #{error.message}")
|
143
|
+
if error.message[0..2] =~ /\d\d\d/ then
|
144
|
+
doc.http_status = [error.message[0..2], error.message[3..-1]]
|
145
|
+
return doc
|
146
|
+
else
|
147
|
+
raise error
|
148
|
+
end
|
142
149
|
end
|
143
150
|
return doc
|
144
151
|
end
|
@@ -147,7 +154,7 @@ module SimpleCrawler
|
|
147
154
|
def queue_local_links(doc)
|
148
155
|
return if doc.data == nil
|
149
156
|
log("Queuing links for #{doc.uri}")
|
150
|
-
Hpricot.buffer_size =
|
157
|
+
Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
|
151
158
|
doc = Hpricot(doc.data)
|
152
159
|
links = doc.search("a[@href]")
|
153
160
|
for link in links
|
@@ -155,7 +162,6 @@ module SimpleCrawler
|
|
155
162
|
begin
|
156
163
|
uri = URI.parse(link.attributes["href"])
|
157
164
|
add_uri(uri)
|
158
|
-
log(" Added #{uri}")
|
159
165
|
rescue
|
160
166
|
#skip this link
|
161
167
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simplecrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Krantz
|
@@ -9,7 +9,7 @@ autorequire: simplecrawler
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-05-04 00:00:00 +02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -37,8 +37,11 @@ files:
|
|
37
37
|
- tests/simplecrawler_test.rb
|
38
38
|
- examples/accessibility_report.rb
|
39
39
|
- examples/crawl.rb
|
40
|
+
- examples/find_broken_links.rb
|
40
41
|
- examples/find_pdfs.rb
|
41
42
|
- examples/list_site_links.rb
|
43
|
+
- examples/result.htm
|
44
|
+
- examples/riksdagen.txt
|
42
45
|
has_rdoc: true
|
43
46
|
homepage: http://www.peterkrantz.com/simplecrawler/wiki/
|
44
47
|
post_install_message:
|