super_crawler 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +19 -4
- data/lib/super_crawler/crawl.rb +29 -71
- data/lib/super_crawler/render.rb +89 -0
- data/lib/super_crawler/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ef7ae8a5bb3aac480832a0024b70363788e268cb
+  data.tar.gz: c03d33300045b2e2053606d616b8e5f82af2e941
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 23e645e55ad85462fbfb81d0e518961b54e2986e16eaa256799dea2a2fe40759eae3bcbfef74261e87e59e495b3aa5598812622ca87c0e7448d0fb1ebcaf7504
+  data.tar.gz: 82fab161a1147e538b5e4f6324a9cae1a8d2bc2815834d2d7ecedc58aeafd96e2ec7145bdb5c62195374a316d7bc57efbf1157101ab00816c1b25ddb8bbc1026
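The new digests can be checked locally. Below is a minimal verification sketch, assuming the 0.2.1 `.gem` file has been fetched and unpacked in the current directory (a `.gem` is a plain tar archive containing `metadata.gz` and `data.tar.gz`); the printed values should match the `+` lines above.

    # $ gem fetch super_crawler --version 0.2.1
    # $ tar -xf super_crawler-0.2.1.gem
    require 'digest'

    %w(metadata.gz data.tar.gz).each do |file|
      puts "#{file} SHA1:   #{Digest::SHA1.file(file).hexdigest}"
      puts "#{file} SHA512: #{Digest::SHA512.file(file).hexdigest}"
    end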
data/README.md
CHANGED
@@ -79,7 +79,7 @@ Where `url` should be the URL of the page you would like to scrap.
 Run
 
     page.url
-
+
 to get the encoded URL.
 
 #### Get internal links of a page
@@ -87,7 +87,7 @@ to get the encoded URL.
 Run
 
     page.get_links
-
+
 to get the list of internal links in the page. An internal link is a link that _has the same schame and host than the provided URL_. Subdomains are rejected.
 
 This method searches in the `href` attribute of all `<a>` anchor tags.
@@ -154,7 +154,7 @@ where `url` is the URL of the website to crawl.
 Next, start the crawler:
 
     sc.start(number_of_threads)
-
+
 where `number_of_threads` is the number of threads that will perform the job (10 by default.) **This can take some time, depending on the site to crawl.**
 
 To access the crawl results, use the following:
@@ -166,7 +166,7 @@ To see the crawling as a sitemap, use:
 
     sc.render(5) # Will render the sitemap of the first 5 pages
 
-_TODO:
+_TODO: Make more sophisticated rendering methods, that can render within files of different formats (HTML, XML, JSON,...)_
 
 #### Tips on searching assets and links
 
@@ -187,6 +187,21 @@ After `sc.start`, you can access all collected resources (links and assets) using
 
 You can use `sc.crawl_results.select{ |resource| ... }` to select a particular resource.
 
+Example:
+
+    images = sc.crawl_results.map{ |page| page[:assets][:images] }.flatten.uniq
+    # => Returns an array of all unique images found during the crawling
+
+#### Get assets of a whole crawling
+
+You can collect in a single array any assets of a crawling, by using the following:
+
+    images = sc.get_assets :images # => Returns an array of unique images
+    stylesheets = sc.get_assets :stylesheets # => Returns an array of unique stylesheets
+    scripts = sc.get_assets :scripts # => Returns an array of unique scripts
+
+It is important to note that all the given arrays contain unique absolute URLs. As said before, the assets are not necessarily internal assets.
+
 ## Limitations
 
 Actually, the gem has the following limitations:
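Taken together, the README additions describe a small query layer over the crawl results. A short usage sketch combining them (assuming, as in the earlier README sections, that `sc` was built with `SuperCrawler::Crawl.new(url)` and that the crawl has finished; the URL and path fragment are illustrative):

    require 'super_crawler'

    sc = SuperCrawler::Crawl.new('http://example.com/')
    sc.start(10) # crawl with 10 threads

    # Pick a single page's record out of the array of hashes
    about_page = sc.crawl_results.select { |resource| resource[:url].include?('/about') }.first

    # Collect every unique asset of a given type across the whole crawl
    images      = sc.get_assets :images
    stylesheets = sc.get_assets :stylesheets
    scripts     = sc.get_assets :scripts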
data/lib/super_crawler/crawl.rb
CHANGED
@@ -1,11 +1,13 @@
 require 'thread'
 
 require 'super_crawler/scrap'
+require 'super_crawler/render'
 
 module SuperCrawler
 
   ###
   # Crawl a whole website
+  # For each new link detected, scrap the corresponding page.
   #
   class Crawl
 
@@ -25,11 +27,12 @@ module SuperCrawler
     #
     def start threads_count = 10
 
-      crawling_start_notice( @start_url, threads_count ) # Show message on what will happen
-
-
-      @
-      @
+      SuperCrawler::Render.crawling_start_notice( @start_url, threads_count ) if @option_debug # Show message on what will happen
+
+      threads = [] # Will contain our n-threads
+      @links_queue = Queue.new # Will contain the links queue that the threads will use
+      @links = [@start_url] # Re-init the links list
+      @crawl_results = [] # Re-init the crawling results
 
       start_time = Time.now if @option_debug # Start the timer
 
@@ -51,42 +54,40 @@ module SuperCrawler
       end
 
       threads.map(&:join) # Activate the threads
-      crawling_summary_notice(
+      SuperCrawler::Render.crawling_summary_notice(Time.now - start_time, threads_count, @links.count) if @option_debug # Display crawling summary
 
       return true
     end
 
     ###
-    # Render sitemap
-    # Show, for each link, internal links and assets
-    # We will limit pages to display, because some sites have more than 1,000 pages
+    # Render the crawling result as a sitemap in the console
     #
     def render max_pages = 10
-
-
-      @crawl_results[0..(max_pages-1)].each_with_index do |result, index|
-        puts "[#{index+1}] Content of #{result[:url]}\n"
-
-        puts " + Internal links: #{'None' if result[:links].empty?}"
-        result[:links].each { |link| puts " - #{link}" }
-
-        puts " + Internal images: #{'None' if result[:assets][:images].empty?}"
-        result[:assets][:images].each { |link| puts " - #{link}" }
-
-        puts " + Internal stylesheets: #{'None' if result[:assets][:stylesheets].empty?}"
-        result[:assets][:stylesheets].each { |link| puts " - #{link}" }
+      SuperCrawler::Render.console( @crawl_results, max_pages )
+    end
 
-
-
-
+    ###
+    # Get specific assets (images, stylesheets and scripts)
+    #
+    def get_assets asset
+      return [] if @crawl_results.empty? # No crawling yet? Return empty search
+
+      # The asset parameter can only be images, stylesheets or scripts
+      unless %w(images stylesheets scripts).include? asset.to_s
+        # Display error message in this case.
+        SuperCrawler::Render.error "`asset` parameter can only be `images`, `stylesheets` or `scripts`"
+        return [] # Return empty array
       end
-
+
+      # Good! Return flatten array of unique assets
+      return @crawl_results.map{ |cr| cr[:assets][asset.to_sym] }.flatten.uniq
     end
 
     private
 
     ###
-    # Process a page by extracting information and updating links queue,
+    # Process a page by extracting information and updating links queue,
+    # links list and results.
     #
     def process_page page_url
       page = SuperCrawler::Scrap.new(page_url) # Scrap the current page
@@ -102,50 +103,7 @@ module SuperCrawler
         assets: page.get_assets # Its assets
       }
 
-      log_status( page_url ) if @option_debug # Display site crawling status
-    end
-
-    ###
-    # Display a notice when starting a site crawl
-    #
-    def crawling_start_notice start_url, threads
-      draw_line
-      puts "Start crawling #{start_url} using #{threads} threads. Crawling rules:"
-      puts "1. Keep only internal links"
-      puts "2. Links with different scheme are agnored"
-      puts "3. Remove the fragment part from the links (#...)"
-      puts "4. Keep paths with different parameters (?...)"
-      draw_line
-    end
-
-    ###
-    # Log current search status (crawled links / total links)
-    #
-    def log_status url
-      text = "Crawled #{@crawl_results.length.to_s}/#{@links.length.to_s}: #{url}"
-      print "\r#{" "*100}\r" # Clean the previous text
-      print (text.length <= 50) ? text : "#{text[0..46]}..."
-      STDOUT.flush
-    end
-
-    ###
-    # Display final crawling summary after site crawling complete
-    #
-    def crawling_summary_notice time_start, time_end, threads
-      total_time = time_end - time_start
-      puts ""
-      draw_line
-      puts "Crawled #{@links.count} links in #{total_time.to_f.to_s} seconds using #{threads} threads."
-      puts "Use .crawl_results to access the crawl results as an array of hashes."
-      puts "Use .render to see the crawl_results as a sitemap."
-      draw_line
-    end
-
-    ###
-    # Draw a line (because readability is also important!!)
-    #
-    def draw_line
-      puts "#{'-' * 80}"
+      SuperCrawler::Render.log_status( page_url, @crawl_results.length, @links.length ) if @option_debug # Display site crawling status
     end
 
   end
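In short, `Crawl` now delegates all console output to the new `SuperCrawler::Render` class and gains a `get_assets` helper that validates its argument. A hedged sketch of the resulting behavior, using only calls visible in the diff above (the constructor call and URL are assumptions for illustration):

    sc = SuperCrawler::Crawl.new('http://example.com/')
    sc.start              # defaults to 10 threads
    sc.render(5)          # now delegates to SuperCrawler::Render.console

    sc.get_assets :images # => array of unique image URLs
    sc.get_assets :fonts  # prints a Render.error message and returns []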
data/lib/super_crawler/render.rb
ADDED
@@ -0,0 +1,89 @@
+module SuperCrawler
+
+  ##
+  # Render crawl results and processing.
+  #
+  class Render
+
+    ###
+    # Display error message in the console.
+    #
+    def self.error message
+      puts "\e[31m[ERROR]\e[0m #{message}"
+    end
+
+    ###
+    # Display a notice when starting a site crawl
+    #
+    def self.crawling_start_notice start_url, threads
+      self.draw_line
+      puts "Start crawling #{start_url} using #{threads} threads. Crawling rules:"
+      puts "1. Consider only links starting with #{start_url}"
+      puts "2. Remove the fragment part from the links (#...)"
+      puts "3. Keep paths with different parameters (?...)"
+      puts "4. Assets can be internal or external to the site"
+      self.draw_line
+    end
+
+    ###
+    # Render sitemap in console
+    # Show, for each link, internal links and assets
+    # We will limit pages to display, because some sites have more than 1,000 pages
+    #
+    def self.console crawl_results, max_pages
+      self.draw_line
+      puts "Showing first #{max_pages} crawled pages and their contents:\n\n"
+      crawl_results[0..(max_pages-1)].each_with_index do |result, index|
+        puts "[#{index+1}] Content of #{result[:url]}\n"
+
+        puts " + Internal links: #{'None' if result[:links].empty?}"
+        result[:links].each { |link| puts " - #{link}" }
+
+        puts " + Internal images: #{'None' if result[:assets][:images].empty?}"
+        result[:assets][:images].each { |link| puts " - #{link}" }
+
+        puts " + Internal stylesheets: #{'None' if result[:assets][:stylesheets].empty?}"
+        result[:assets][:stylesheets].each { |link| puts " - #{link}" }
+
+        puts " + Internal scripts: #{'None' if result[:assets][:scripts].empty?}"
+        result[:assets][:scripts].each { |link| puts " - #{link}" }
+        puts ""
+      end
+      self.draw_line
+    end
+
+    ###
+    # Log current search status (crawled links / total links)
+    #
+    def self.log_status url, crawl_results_length, links_length
+      text = "Crawled #{crawl_results_length.to_s}/#{links_length.to_s}: #{url}"
+      print "\r#{" "*100}\r" # Clean the previous text
+      print (text.length <= 50) ? text : "#{text[0..46]}..."
+      STDOUT.flush
+    end
+
+    ###
+    # Display final crawling summary after site crawling complete
+    #
+    def self.crawling_summary_notice total_time, threads_count, links_count
+      puts
+      self.draw_line
+      puts "\e[33m[SUCCESS]\e[0m Crawled #{links_count} links in #{total_time.to_f.to_s} seconds using #{threads_count} threads."
+      puts "Use .crawl_results to access the crawl results as an array of hashes."
+      puts "Use .render to see the crawl_results as a sitemap."
+      self.draw_line
+    end
+
+    private
+
+    ###
+    # Draw a line (because readability is also important!!)
+    #
+    def self.draw_line
+      puts "#{'-' * 80}"
+    end
+
+
+  end
+
+end
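Because every method on `Render` is a class method, the helpers can also be called directly, which is exactly how `Crawl` uses them above. A small sketch with illustrative values:

    require 'super_crawler/render'

    SuperCrawler::Render.error "`asset` parameter can only be `images`, `stylesheets` or `scripts`"
    # prints "[ERROR] ..." with the tag colored red

    SuperCrawler::Render.log_status('http://example.com/about', 12, 48)
    # prints "Crawled 12/48: http://example.com/about" on a single, refreshed line

    SuperCrawler::Render.crawling_summary_notice(3.2, 10, 48)
    # prints the "[SUCCESS] Crawled 48 links in 3.2 seconds using 10 threads." summary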
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: super_crawler
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Hassen Taidirt
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-07-
+date: 2016-07-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -100,6 +100,7 @@ files:
 - bin/setup
 - lib/super_crawler.rb
 - lib/super_crawler/crawl.rb
+- lib/super_crawler/render.rb
 - lib/super_crawler/scrap.rb
 - lib/super_crawler/version.rb
 - super_crawler.gemspec
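For consumers, picking up the new `render.rb` file and the refactored API is a one-line Bundler change (a standard Gemfile entry, not something specific to this diff):

    gem 'super_crawler', '~> 0.2.1'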