rcrawl 0.3.0 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/lib/rcrawl/process/html.rb +54 -1
- data/lib/rcrawl.rb +2 -52
- metadata +2 -2
data/Rakefile
CHANGED
data/lib/rcrawl/process/html.rb
CHANGED
@@ -1,8 +1,61 @@
|
|
1
1
|
module Rcrawl
|
2
|
+
|
2
3
|
module Process
|
4
|
+
|
3
5
|
module HTML
|
4
|
-
|
6
|
+
|
7
|
+
# HTML processing module for extracting links
|
8
|
+
def HTML.link_extractor(document)
|
9
|
+
print "."
|
10
|
+
# Parse all links from HTML into an array
|
11
|
+
# Set up the scrAPI (http://labnotes.org)
|
12
|
+
links = Scraper.define do
|
13
|
+
array :urls
|
14
|
+
process "a[href]", :urls => "@href"
|
15
|
+
result :urls
|
16
|
+
end
|
17
|
+
|
18
|
+
urls = links.scrape(document)
|
19
|
+
|
20
|
+
urls.each { |url|
|
21
|
+
uri = URI.parse(url)
|
22
|
+
|
23
|
+
# Derelativeize links if necessary
|
24
|
+
if uri.relative?
|
25
|
+
url = @site.merge(url).to_s
|
26
|
+
uri = URI.parse(url)
|
27
|
+
end
|
28
|
+
|
29
|
+
# Check domain, if in same domain, keep link, else trash it
|
30
|
+
if uri.host != @site.host
|
31
|
+
@external_links << url
|
32
|
+
@external_links.uniq!
|
33
|
+
next
|
34
|
+
end
|
35
|
+
|
36
|
+
# Find out if we've seen this link already
|
37
|
+
if (@visited_links.include? url) || (@links_to_visit.include? url)
|
38
|
+
next
|
39
|
+
end
|
40
|
+
|
41
|
+
@links_to_visit << url
|
42
|
+
}
|
5
43
|
end
|
44
|
+
|
45
|
+
# HTML processing module for raw HTML storage
|
46
|
+
def HTML.process_html(document)
|
47
|
+
|
48
|
+
# Add link and raw HTML to a hash as key/value
|
49
|
+
# for later storage in database
|
50
|
+
unless @raw_html.has_value?(document)
|
51
|
+
print "."
|
52
|
+
@raw_html[document.base_uri] = document
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
6
57
|
end
|
58
|
+
|
7
59
|
end
|
60
|
+
|
8
61
|
end
|
data/lib/rcrawl.rb
CHANGED
@@ -77,63 +77,13 @@ module Rcrawl
|
|
77
77
|
# Based on MIME type, invoke the proper processing modules
|
78
78
|
case document.content_type
|
79
79
|
when "text/html"
|
80
|
-
link_extractor(document)
|
81
|
-
process_html(document)
|
80
|
+
Rcrawl::Process::HTML.link_extractor(document)
|
81
|
+
Rcrawl::Process::HTML.process_html(document)
|
82
82
|
else
|
83
83
|
print "... not HTML, skipping..."
|
84
84
|
end
|
85
85
|
end
|
86
86
|
|
87
|
-
# HTML processing module for extracting links
|
88
|
-
def link_extractor(document)
|
89
|
-
print "."
|
90
|
-
|
91
|
-
# Parse all links from HTML into an array
|
92
|
-
# Set up the scrAPI (http://labnotes.org)
|
93
|
-
links = Scraper.define do
|
94
|
-
array :urls
|
95
|
-
process "a[href]", :urls => "@href"
|
96
|
-
result :urls
|
97
|
-
end
|
98
|
-
|
99
|
-
urls = links.scrape(document)
|
100
|
-
|
101
|
-
urls.each { |url|
|
102
|
-
uri = URI.parse(url)
|
103
|
-
|
104
|
-
# Derelativeize links if necessary
|
105
|
-
if uri.relative?
|
106
|
-
url = @site.merge(url).to_s
|
107
|
-
uri = URI.parse(url)
|
108
|
-
end
|
109
|
-
|
110
|
-
# Check domain, if in same domain, keep link, else trash it
|
111
|
-
if uri.host != @site.host
|
112
|
-
@external_links << url
|
113
|
-
@external_links.uniq!
|
114
|
-
next
|
115
|
-
end
|
116
|
-
|
117
|
-
# Find out if we've seen this link already
|
118
|
-
if (@visited_links.include? url) || (@links_to_visit.include? url)
|
119
|
-
next
|
120
|
-
end
|
121
|
-
|
122
|
-
@links_to_visit << url
|
123
|
-
}
|
124
|
-
|
125
|
-
end
|
126
|
-
|
127
|
-
# HTML processing module for raw HTML storage
|
128
|
-
def process_html(document)
|
129
|
-
# Add link and raw HTML to a hash as key/value
|
130
|
-
# for later storage in database
|
131
|
-
unless @raw_html.has_value?(document)
|
132
|
-
print "."
|
133
|
-
@raw_html[document.base_uri] = document
|
134
|
-
end
|
135
|
-
end
|
136
|
-
|
137
87
|
# robots.txt parsing
|
138
88
|
def robot_safe?(url)
|
139
89
|
uri = URI.parse(url)
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rcrawl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.0
|
7
|
-
date: 2006-09-
|
6
|
+
version: 0.3.5
|
7
|
+
date: 2006-09-23 00:00:00 -05:00
|
8
8
|
summary: A web crawler written in ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|