rcrawl 0.3.0 → 0.3.5

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ end
18
18
 
19
19
  spec = Gem::Specification.new do |s|
20
20
  s.name = "rcrawl"
21
- s.version = "0.3.0"
21
+ s.version = "0.3.5"
22
22
  s.author = "Digital Duckies"
23
23
  s.email = "rcrawl@digitalduckies.net"
24
24
  s.homepage = "http://digitalduckies.net"
@@ -1,8 +1,61 @@
1
1
  module Rcrawl
2
+
2
3
  module Process
4
+
3
5
  module HTML
4
- def link_extractor
6
+
7
+ # HTML processing module for extracting links
8
+ def HTML.link_extractor(document)
9
+ print "."
10
+ # Parse all links from HTML into an array
11
+ # Set up the scrAPI (http://labnotes.org)
12
+ links = Scraper.define do
13
+ array :urls
14
+ process "a[href]", :urls => "@href"
15
+ result :urls
16
+ end
17
+
18
+ urls = links.scrape(document)
19
+
20
+ urls.each { |url|
21
+ uri = URI.parse(url)
22
+
23
+ # Derelativeize links if necessary
24
+ if uri.relative?
25
+ url = @site.merge(url).to_s
26
+ uri = URI.parse(url)
27
+ end
28
+
29
+ # Check domain, if in same domain, keep link, else trash it
30
+ if uri.host != @site.host
31
+ @external_links << url
32
+ @external_links.uniq!
33
+ next
34
+ end
35
+
36
+ # Find out if we've seen this link already
37
+ if (@visited_links.include? url) || (@links_to_visit.include? url)
38
+ next
39
+ end
40
+
41
+ @links_to_visit << url
42
+ }
5
43
  end
44
+
45
+ # HTML processing module for raw HTML storage
46
+ def HTML.process_html(document)
47
+
48
+ # Add link and raw HTML to a hash as key/value
49
+ # for later storage in database
50
+ unless @raw_html.has_value?(document)
51
+ print "."
52
+ @raw_html[document.base_uri] = document
53
+ end
54
+
55
+ end
56
+
6
57
  end
58
+
7
59
  end
60
+
8
61
  end
data/lib/rcrawl.rb CHANGED
@@ -77,63 +77,13 @@ module Rcrawl
77
77
  # Based on MIME type, invoke the proper processing modules
78
78
  case document.content_type
79
79
  when "text/html"
80
- link_extractor(document)
81
- process_html(document)
80
+ Rcrawl::Process::HTML.link_extractor(document)
81
+ Rcrawl::Process::HTML.process_html(document)
82
82
  else
83
83
  print "... not HTML, skipping..."
84
84
  end
85
85
  end
86
86
 
87
- # HTML processing module for extracting links
88
- def link_extractor(document)
89
- print "."
90
-
91
- # Parse all links from HTML into an array
92
- # Set up the scrAPI (http://labnotes.org)
93
- links = Scraper.define do
94
- array :urls
95
- process "a[href]", :urls => "@href"
96
- result :urls
97
- end
98
-
99
- urls = links.scrape(document)
100
-
101
- urls.each { |url|
102
- uri = URI.parse(url)
103
-
104
- # Derelativeize links if necessary
105
- if uri.relative?
106
- url = @site.merge(url).to_s
107
- uri = URI.parse(url)
108
- end
109
-
110
- # Check domain, if in same domain, keep link, else trash it
111
- if uri.host != @site.host
112
- @external_links << url
113
- @external_links.uniq!
114
- next
115
- end
116
-
117
- # Find out if we've seen this link already
118
- if (@visited_links.include? url) || (@links_to_visit.include? url)
119
- next
120
- end
121
-
122
- @links_to_visit << url
123
- }
124
-
125
- end
126
-
127
- # HTML processing module for raw HTML storage
128
- def process_html(document)
129
- # Add link and raw HTML to a hash as key/value
130
- # for later storage in database
131
- unless @raw_html.has_value?(document)
132
- print "."
133
- @raw_html[document.base_uri] = document
134
- end
135
- end
136
-
137
87
  # robots.txt parsing
138
88
  def robot_safe?(url)
139
89
  uri = URI.parse(url)
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: rcrawl
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2006-09-22 00:00:00 -05:00
6
+ version: 0.3.5
7
+ date: 2006-09-23 00:00:00 -05:00
8
8
  summary: A web crawler written in ruby
9
9
  require_paths:
10
10
  - lib