simplecrawler 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ Added http://www.riksdagen.se/
2
+ Opening http://www.riksdagen.se/
3
+ Loading data from http://www.riksdagen.se/
4
+ 3 http://www.riksdagen.se/
5
+ Queuing links for http://www.riksdagen.se/
6
+ Added http://www.riksdagen.se/om
7
+ Added http://www.riksdagen.se/innehall
8
+ Added http://www.riksdagen.se/fragor
9
+ Added http://www.riksdagen.se/anvander
10
+ Added http://www.riksdagen.se/kontakt
11
+ Added /default____4.aspx
12
+ Added /templates/R_Page____283.aspx
13
+ Added /templates/R_OptionPage____285.aspx
14
+ Added /shopping/R_Default____2508.aspx
15
+ Added /webbnav/index.aspx?nid=53
16
+ Added /templates/R_Page____275.aspx
17
+ Added /templates/R_Page____2161.aspx
18
+ Added /templates/R_Page____273.aspx
19
+ Added /templates/R_Page____3618.aspx
20
+ Added /templates/R_Page____271.aspx
21
+ Added /templates/R_SubStartPage____5029.aspx
22
+ Added /templates/R_SubStartPage____6694.aspx
23
+ Added /templates/R_SubStartPage____272.aspx
24
+ Added /templates/R_SubStartPage____282.aspx
25
+ Added /templates/R_SubStartPage____238.aspx
26
+ Added /templates/R_Page____1068.aspx
27
+ Added /templates/R_Page____6531.aspx
28
+ Added /templates/R_CustomCalendar____1765.aspx
29
+ Added /Webbnav/index.aspx?nid=7800
30
+ Added /templates/R_SubStartPage____4935.aspx
31
+ Added /templates/R_Page____4335.aspx
32
+ Added /templates/R_Page____13951.aspx
33
+ Added /templates/R_Page____1595.aspx
34
+ Added /templates/R_Page____16886.aspx
35
+ Added /templates/R_Page____14748.aspx
36
+ Added /templates/R_Page____12861.aspx
37
+ Added #
38
+ Added /templates/R_ExternalPage____3383.aspx?op=search&search_freetext=&search_parlamentary_session=2008%2F09&search_type=Interpellationsdebatt&search_committee=&search_speaker=&search_party=&search_sdate1=2009%2D01%2D16&search_sdate2=2009%2D01%2D16&rpage=0
39
+ Added http://www.riksdagen.se/templates/R_HtmlCallPage____17356.aspx
40
+ Added http://www.riksdagen.se/templates/R_Page____17944.aspx
41
+ Added http://www.riksdagen.se/templates/R_Page____17958.aspx
42
+ Added http://www.riksdagen.se/templates/R_HtmlCallPage____17361.aspx
43
+ Added /templates/R_LopsedelArkiv____3388.aspx
44
+ Added /templates/R_Page____1977.aspx
45
+ Added /templates/R_LLSubStartPage____4303.aspx
46
+ Added /default____56.aspx
47
+ Added /templates/R_Page____1928.aspx
48
+ Added /templates/R_Page____6546.aspx
49
+ Added /templates/R_SubStartPage____257.aspx
50
+ Added /templates/R_SubStartPage____448.aspx
51
+ Added /templates/R_SubStartPage____4492.aspx
52
+ Added /templates/R_Page____498.aspx
53
+ Added /templates/R_Page____734.aspx
54
+ Opening http://www.riksdagen.se/om
55
+ Loading data from http://www.riksdagen.se/om
56
+ 3 http://www.riksdagen.se/om
57
+ Queuing links for http://www.riksdagen.se/om
58
+ Added /templates/R_Page____735.aspx
59
+ Added /templates/R_Page____5915.aspx
60
+ Added /templates/R_Page____1791.aspx
61
+ Added /templates/R_Page____1788.aspx
62
+ Added /templates/R_PageFull____6580.aspx
63
+ Added /templates/R_Page____6558.aspx
64
+ Added /templates/R_Page____5674.aspx
65
+ Opening http://www.riksdagen.se/innehall
66
+ Loading data from http://www.riksdagen.se/innehall
data/lib/simplecrawler.rb CHANGED
@@ -23,7 +23,7 @@ module SimpleCrawler
23
23
  require File.dirname(__FILE__) + '/document'
24
24
 
25
25
  MARKUP_MIME_TYPES = ["text/html", "text/xml", "application/xml", "application/xhtml+xml"]
26
- VERSION = "0.1.6"
26
+ VERSION = "0.1.7"
27
27
 
28
28
  class Crawler
29
29
 
@@ -107,6 +107,7 @@ module SimpleCrawler
107
107
  @queue.push uri.path
108
108
  @current_count = @current_count + 1
109
109
  @visited[uri.path] = false
110
+ log(" Added #{uri}")
110
111
  end
111
112
 
112
113
  end
@@ -118,6 +119,7 @@ module SimpleCrawler
118
119
  uri = @site_uri.clone
119
120
  uri.path = path if path != "/"
120
121
  doc.uri = uri
122
+ doc.fetched_at = Time.now
121
123
 
122
124
  log("Opening #{uri}")
123
125
 
@@ -135,10 +137,15 @@ module SimpleCrawler
135
137
 
136
138
  doc.headers = file.meta
137
139
  doc.http_status = file.status
138
- doc.fetched_at = Time.now
139
- rescue Exception
140
- log("Error fetching [#{uri}]: #{$!}")
141
- return doc
140
+
141
+ rescue => error
142
+ log("Error fetching #{uri}: #{error.message}")
143
+ if error.message[0..2] =~ /\d\d\d/ then
144
+ doc.http_status = [error.message[0..2], error.message[3..-1]]
145
+ return doc
146
+ else
147
+ raise error
148
+ end
142
149
  end
143
150
  return doc
144
151
  end
@@ -147,7 +154,7 @@ module SimpleCrawler
147
154
  def queue_local_links(doc)
148
155
  return if doc.data == nil
149
156
  log("Queuing links for #{doc.uri}")
150
- Hpricot.buffer_size = 262144 #Allow for asp.net bastard-sized viewstate attributes...
157
+ Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
151
158
  doc = Hpricot(doc.data)
152
159
  links = doc.search("a[@href]")
153
160
  for link in links
@@ -155,7 +162,6 @@ module SimpleCrawler
155
162
  begin
156
163
  uri = URI.parse(link.attributes["href"])
157
164
  add_uri(uri)
158
- log(" Added #{uri}")
159
165
  rescue
160
166
  #skip this link
161
167
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simplecrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Krantz
@@ -9,7 +9,7 @@ autorequire: simplecrawler
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-11-28 00:00:00 +01:00
12
+ date: 2009-05-04 00:00:00 +02:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -37,8 +37,11 @@ files:
37
37
  - tests/simplecrawler_test.rb
38
38
  - examples/accessibility_report.rb
39
39
  - examples/crawl.rb
40
+ - examples/find_broken_links.rb
40
41
  - examples/find_pdfs.rb
41
42
  - examples/list_site_links.rb
43
+ - examples/result.htm
44
+ - examples/riksdagen.txt
42
45
  has_rdoc: true
43
46
  homepage: http://www.peterkrantz.com/simplecrawler/wiki/
44
47
  post_install_message: