simplecrawler 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/examples/accessibility_report.rb +32 -19
- data/examples/find_broken_links.rb +21 -0
- data/examples/result.htm +1282 -0
- data/examples/riksdagen.txt +66 -0
- data/lib/simplecrawler.rb +13 -7
- metadata +5 -2
@@ -0,0 +1,66 @@
|
|
1
|
+
Added http://www.riksdagen.se/
|
2
|
+
Opening http://www.riksdagen.se/
|
3
|
+
Loading data from http://www.riksdagen.se/
|
4
|
+
3 http://www.riksdagen.se/
|
5
|
+
Queuing links for http://www.riksdagen.se/
|
6
|
+
Added http://www.riksdagen.se/om
|
7
|
+
Added http://www.riksdagen.se/innehall
|
8
|
+
Added http://www.riksdagen.se/fragor
|
9
|
+
Added http://www.riksdagen.se/anvander
|
10
|
+
Added http://www.riksdagen.se/kontakt
|
11
|
+
Added /default____4.aspx
|
12
|
+
Added /templates/R_Page____283.aspx
|
13
|
+
Added /templates/R_OptionPage____285.aspx
|
14
|
+
Added /shopping/R_Default____2508.aspx
|
15
|
+
Added /webbnav/index.aspx?nid=53
|
16
|
+
Added /templates/R_Page____275.aspx
|
17
|
+
Added /templates/R_Page____2161.aspx
|
18
|
+
Added /templates/R_Page____273.aspx
|
19
|
+
Added /templates/R_Page____3618.aspx
|
20
|
+
Added /templates/R_Page____271.aspx
|
21
|
+
Added /templates/R_SubStartPage____5029.aspx
|
22
|
+
Added /templates/R_SubStartPage____6694.aspx
|
23
|
+
Added /templates/R_SubStartPage____272.aspx
|
24
|
+
Added /templates/R_SubStartPage____282.aspx
|
25
|
+
Added /templates/R_SubStartPage____238.aspx
|
26
|
+
Added /templates/R_Page____1068.aspx
|
27
|
+
Added /templates/R_Page____6531.aspx
|
28
|
+
Added /templates/R_CustomCalendar____1765.aspx
|
29
|
+
Added /Webbnav/index.aspx?nid=7800
|
30
|
+
Added /templates/R_SubStartPage____4935.aspx
|
31
|
+
Added /templates/R_Page____4335.aspx
|
32
|
+
Added /templates/R_Page____13951.aspx
|
33
|
+
Added /templates/R_Page____1595.aspx
|
34
|
+
Added /templates/R_Page____16886.aspx
|
35
|
+
Added /templates/R_Page____14748.aspx
|
36
|
+
Added /templates/R_Page____12861.aspx
|
37
|
+
Added #
|
38
|
+
Added /templates/R_ExternalPage____3383.aspx?op=search&search_freetext=&search_parlamentary_session=2008%2F09&search_type=Interpellationsdebatt&search_committee=&search_speaker=&search_party=&search_sdate1=2009%2D01%2D16&search_sdate2=2009%2D01%2D16&rpage=0
|
39
|
+
Added http://www.riksdagen.se/templates/R_HtmlCallPage____17356.aspx
|
40
|
+
Added http://www.riksdagen.se/templates/R_Page____17944.aspx
|
41
|
+
Added http://www.riksdagen.se/templates/R_Page____17958.aspx
|
42
|
+
Added http://www.riksdagen.se/templates/R_HtmlCallPage____17361.aspx
|
43
|
+
Added /templates/R_LopsedelArkiv____3388.aspx
|
44
|
+
Added /templates/R_Page____1977.aspx
|
45
|
+
Added /templates/R_LLSubStartPage____4303.aspx
|
46
|
+
Added /default____56.aspx
|
47
|
+
Added /templates/R_Page____1928.aspx
|
48
|
+
Added /templates/R_Page____6546.aspx
|
49
|
+
Added /templates/R_SubStartPage____257.aspx
|
50
|
+
Added /templates/R_SubStartPage____448.aspx
|
51
|
+
Added /templates/R_SubStartPage____4492.aspx
|
52
|
+
Added /templates/R_Page____498.aspx
|
53
|
+
Added /templates/R_Page____734.aspx
|
54
|
+
Opening http://www.riksdagen.se/om
|
55
|
+
Loading data from http://www.riksdagen.se/om
|
56
|
+
3 http://www.riksdagen.se/om
|
57
|
+
Queuing links for http://www.riksdagen.se/om
|
58
|
+
Added /templates/R_Page____735.aspx
|
59
|
+
Added /templates/R_Page____5915.aspx
|
60
|
+
Added /templates/R_Page____1791.aspx
|
61
|
+
Added /templates/R_Page____1788.aspx
|
62
|
+
Added /templates/R_PageFull____6580.aspx
|
63
|
+
Added /templates/R_Page____6558.aspx
|
64
|
+
Added /templates/R_Page____5674.aspx
|
65
|
+
Opening http://www.riksdagen.se/innehall
|
66
|
+
Loading data from http://www.riksdagen.se/innehall
|
data/lib/simplecrawler.rb
CHANGED
@@ -23,7 +23,7 @@ module SimpleCrawler
|
|
23
23
|
require File.dirname(__FILE__) + '/document'
|
24
24
|
|
25
25
|
MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
|
26
|
-
VERSION = "0.1.6"
|
26
|
+
VERSION = "0.1.7"
|
27
27
|
|
28
28
|
class Crawler
|
29
29
|
|
@@ -107,6 +107,7 @@ module SimpleCrawler
|
|
107
107
|
@queue.push uri.path
|
108
108
|
@current_count = @current_count + 1
|
109
109
|
@visited[uri.path] = false
|
110
|
+
log(" Added #{uri}")
|
110
111
|
end
|
111
112
|
|
112
113
|
end
|
@@ -118,6 +119,7 @@ module SimpleCrawler
|
|
118
119
|
uri = @site_uri.clone
|
119
120
|
uri.path = path if path != "/"
|
120
121
|
doc.uri = uri
|
122
|
+
doc.fetched_at = Time.now
|
121
123
|
|
122
124
|
log("Opening #{uri}")
|
123
125
|
|
@@ -135,10 +137,15 @@ module SimpleCrawler
|
|
135
137
|
|
136
138
|
doc.headers = file.meta
|
137
139
|
doc.http_status = file.status
|
138
|
-
|
139
|
-
rescue
|
140
|
-
log("Error fetching
|
141
|
-
|
140
|
+
|
141
|
+
rescue => error
|
142
|
+
log("Error fetching #{uri}: #{error.message}")
|
143
|
+
if error.message[0..2] =~ /\d\d\d/ then
|
144
|
+
doc.http_status = [error.message[0..2], error.message[3..-1]]
|
145
|
+
return doc
|
146
|
+
else
|
147
|
+
raise error
|
148
|
+
end
|
142
149
|
end
|
143
150
|
return doc
|
144
151
|
end
|
@@ -147,7 +154,7 @@ module SimpleCrawler
|
|
147
154
|
def queue_local_links(doc)
|
148
155
|
return if doc.data == nil
|
149
156
|
log("Queuing links for #{doc.uri}")
|
150
|
-
Hpricot.buffer_size =
|
157
|
+
Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
|
151
158
|
doc = Hpricot(doc.data)
|
152
159
|
links = doc.search("a[@href]")
|
153
160
|
for link in links
|
@@ -155,7 +162,6 @@ module SimpleCrawler
|
|
155
162
|
begin
|
156
163
|
uri = URI.parse(link.attributes["href"])
|
157
164
|
add_uri(uri)
|
158
|
-
log(" Added #{uri}")
|
159
165
|
rescue
|
160
166
|
#skip this link
|
161
167
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simplecrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Krantz
|
@@ -9,7 +9,7 @@ autorequire: simplecrawler
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-05-04 00:00:00 +02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -37,8 +37,11 @@ files:
|
|
37
37
|
- tests/simplecrawler_test.rb
|
38
38
|
- examples/accessibility_report.rb
|
39
39
|
- examples/crawl.rb
|
40
|
+
- examples/find_broken_links.rb
|
40
41
|
- examples/find_pdfs.rb
|
41
42
|
- examples/list_site_links.rb
|
43
|
+
- examples/result.htm
|
44
|
+
- examples/riksdagen.txt
|
42
45
|
has_rdoc: true
|
43
46
|
homepage: http://www.peterkrantz.com/simplecrawler/wiki/
|
44
47
|
post_install_message:
|