simplecrawler 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,66 @@
1
+ Added http://www.riksdagen.se/
2
+ Opening http://www.riksdagen.se/
3
+ Loading data from http://www.riksdagen.se/
4
+ 3 http://www.riksdagen.se/
5
+ Queuing links for http://www.riksdagen.se/
6
+ Added http://www.riksdagen.se/om
7
+ Added http://www.riksdagen.se/innehall
8
+ Added http://www.riksdagen.se/fragor
9
+ Added http://www.riksdagen.se/anvander
10
+ Added http://www.riksdagen.se/kontakt
11
+ Added /default____4.aspx
12
+ Added /templates/R_Page____283.aspx
13
+ Added /templates/R_OptionPage____285.aspx
14
+ Added /shopping/R_Default____2508.aspx
15
+ Added /webbnav/index.aspx?nid=53
16
+ Added /templates/R_Page____275.aspx
17
+ Added /templates/R_Page____2161.aspx
18
+ Added /templates/R_Page____273.aspx
19
+ Added /templates/R_Page____3618.aspx
20
+ Added /templates/R_Page____271.aspx
21
+ Added /templates/R_SubStartPage____5029.aspx
22
+ Added /templates/R_SubStartPage____6694.aspx
23
+ Added /templates/R_SubStartPage____272.aspx
24
+ Added /templates/R_SubStartPage____282.aspx
25
+ Added /templates/R_SubStartPage____238.aspx
26
+ Added /templates/R_Page____1068.aspx
27
+ Added /templates/R_Page____6531.aspx
28
+ Added /templates/R_CustomCalendar____1765.aspx
29
+ Added /Webbnav/index.aspx?nid=7800
30
+ Added /templates/R_SubStartPage____4935.aspx
31
+ Added /templates/R_Page____4335.aspx
32
+ Added /templates/R_Page____13951.aspx
33
+ Added /templates/R_Page____1595.aspx
34
+ Added /templates/R_Page____16886.aspx
35
+ Added /templates/R_Page____14748.aspx
36
+ Added /templates/R_Page____12861.aspx
37
+ Added #
38
+ Added /templates/R_ExternalPage____3383.aspx?op=search&search_freetext=&search_parlamentary_session=2008%2F09&search_type=Interpellationsdebatt&search_committee=&search_speaker=&search_party=&search_sdate1=2009%2D01%2D16&search_sdate2=2009%2D01%2D16&rpage=0
39
+ Added http://www.riksdagen.se/templates/R_HtmlCallPage____17356.aspx
40
+ Added http://www.riksdagen.se/templates/R_Page____17944.aspx
41
+ Added http://www.riksdagen.se/templates/R_Page____17958.aspx
42
+ Added http://www.riksdagen.se/templates/R_HtmlCallPage____17361.aspx
43
+ Added /templates/R_LopsedelArkiv____3388.aspx
44
+ Added /templates/R_Page____1977.aspx
45
+ Added /templates/R_LLSubStartPage____4303.aspx
46
+ Added /default____56.aspx
47
+ Added /templates/R_Page____1928.aspx
48
+ Added /templates/R_Page____6546.aspx
49
+ Added /templates/R_SubStartPage____257.aspx
50
+ Added /templates/R_SubStartPage____448.aspx
51
+ Added /templates/R_SubStartPage____4492.aspx
52
+ Added /templates/R_Page____498.aspx
53
+ Added /templates/R_Page____734.aspx
54
+ Opening http://www.riksdagen.se/om
55
+ Loading data from http://www.riksdagen.se/om
56
+ 3 http://www.riksdagen.se/om
57
+ Queuing links for http://www.riksdagen.se/om
58
+ Added /templates/R_Page____735.aspx
59
+ Added /templates/R_Page____5915.aspx
60
+ Added /templates/R_Page____1791.aspx
61
+ Added /templates/R_Page____1788.aspx
62
+ Added /templates/R_PageFull____6580.aspx
63
+ Added /templates/R_Page____6558.aspx
64
+ Added /templates/R_Page____5674.aspx
65
+ Opening http://www.riksdagen.se/innehall
66
+ Loading data from http://www.riksdagen.se/innehall
data/lib/simplecrawler.rb CHANGED
@@ -23,7 +23,7 @@ module SimpleCrawler
23
23
  require File.dirname(__FILE__) + '/document'
24
24
 
25
25
  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
26
- VERSION = "0.1.6"
26
+ VERSION = "0.1.7"
27
27
 
28
28
  class Crawler
29
29
 
@@ -107,6 +107,7 @@ module SimpleCrawler
107
107
  @queue.push uri.path
108
108
  @current_count = @current_count + 1
109
109
  @visited[uri.path] = false
110
+ log(" Added #{uri}")
110
111
  end
111
112
 
112
113
  end
@@ -118,6 +119,7 @@ module SimpleCrawler
118
119
  uri = @site_uri.clone
119
120
  uri.path = path if path != "/"
120
121
  doc.uri = uri
122
+ doc.fetched_at = Time.now
121
123
 
122
124
  log("Opening #{uri}")
123
125
 
@@ -135,10 +137,15 @@ module SimpleCrawler
135
137
 
136
138
  doc.headers = file.meta
137
139
  doc.http_status = file.status
138
- doc.fetched_at = Time.now
139
- rescue Exception
140
- log("Error fetching [#{uri}]: #{$!}")
141
- return doc
140
+
141
+ rescue => error
142
+ log("Error fetching #{uri}: #{error.message}")
143
+ if error.message[0..2] =~ /\d\d\d/ then
144
+ doc.http_status = [error.message[0..2], error.message[3..-1]]
145
+ return doc
146
+ else
147
+ raise error
148
+ end
142
149
  end
143
150
  return doc
144
151
  end
@@ -147,7 +154,7 @@ module SimpleCrawler
147
154
  def queue_local_links(doc)
148
155
  return if doc.data == nil
149
156
  log("Queuing links for #{doc.uri}")
150
- Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
157
+ Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
151
158
  doc = Hpricot(doc.data)
152
159
  links = doc.search("a[@href]")
153
160
  for link in links
@@ -155,7 +162,6 @@ module SimpleCrawler
155
162
  begin
156
163
  uri = URI.parse(link.attributes["href"])
157
164
  add_uri(uri)
158
- log(" Added #{uri}")
159
165
  rescue
160
166
  #skip this link
161
167
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simplecrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Krantz
@@ -9,7 +9,7 @@ autorequire: simplecrawler
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-11-28 00:00:00 +01:00
12
+ date: 2009-05-04 00:00:00 +02:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -37,8 +37,11 @@ files:
37
37
  - tests/simplecrawler_test.rb
38
38
  - examples/accessibility_report.rb
39
39
  - examples/crawl.rb
40
+ - examples/find_broken_links.rb
40
41
  - examples/find_pdfs.rb
41
42
  - examples/list_site_links.rb
43
+ - examples/result.htm
44
+ - examples/riksdagen.txt
42
45
  has_rdoc: true
43
46
  homepage: http://www.peterkrantz.com/simplecrawler/wiki/
44
47
  post_install_message: