super_crawler 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +255 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/super_crawler/crawl_page.rb +160 -0
- data/lib/super_crawler/crawl_site.rb +154 -0
- data/lib/super_crawler/version.rb +3 -0
- data/lib/super_crawler.rb +4 -0
- data/super_crawler.gemspec +37 -0
- metadata +145 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 4f566869c9f06df215b047291bbc08a83e25a176
  data.tar.gz: ee35c2dcee2dae0289e70a8fb0195c63859bf793
SHA512:
  metadata.gz: e7f5c9db1479c83af91879d41f9ea6480142ddf11fba9fcc3bc944f46de90d8b821929582526f245cd6752a4506019f8b2489d08175e5c6fe30faea620c1ea61
  data.tar.gz: 1c81fc244e6679cfe6782b9651782944306290a9281277658a220155ef6c4a7ccee59b9b7a2f058c0862d52027e7f8453ba01a21c953d2f31ff5980e9fea87ad
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
# Contributor Code of Conduct

As contributors and maintainers of this project, and in the interest of
fostering an open and welcoming community, we pledge to respect all people who
contribute through reporting issues, posting feature requests, updating
documentation, submitting pull requests or patches, and other activities.

We are committed to making participation in this project a harassment-free
experience for everyone, regardless of level of experience, gender, gender
identity and expression, sexual orientation, disability, personal appearance,
body size, race, ethnicity, age, religion, or nationality.

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery
* Personal attacks
* Trolling or insulting/derogatory comments
* Public or private harassment
* Publishing other's private information, such as physical or electronic
  addresses, without explicit permission
* Other unethical or unprofessional conduct

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

By adopting this Code of Conduct, project maintainers commit themselves to
fairly and consistently applying these principles to every aspect of managing
this project. Project maintainers who do not follow or enforce the Code of
Conduct may be permanently removed from the project team.

This code of conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community.

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting a project maintainer at htaidirt@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. Maintainers are
obligated to maintain confidentiality with regard to the reporter of an
incident.

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 1.3.0, available at
[http://contributor-covenant.org/version/1/3/0/][version]

[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2016 Hassen Taidirt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,255 @@
# SuperCrawler

Easy (yet efficient) ruby gem to crawl your favorite website.

## Quick Start

Open your terminal, then:

```bash
$ git clone https://github.com/htaidirt/super_crawler

$ cd super_crawler

$ bundle

$ ./bin/console
```

```ruby
> sc = SuperCrawler::CrawlSite.new('https://gocardless.com')

> sc.start # => Start crawling the website

> sc.render(5) # => Show the first 5 results of the crawl as a sitemap
```

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'super_crawler'
```

And then execute:

```bash
bundle install
```

Or install it yourself as:

```bash
gem install super_crawler
```

Want to experiment with the gem without installing it? Clone the repo (see Quick Start above) and run `bin/console` for an interactive prompt that will allow you to experiment.

## Warning!

This gem is an experiment and shouldn't be used in production. Use it with caution in your own projects.

It also has a number of limitations that weren't addressed due to time constraints. You'll find more details in the Limitations section below.

The SuperCrawler gem has only been tested on MRI, Ruby 2.3.1.

## Philosophy

Starting from a URL, extract all the internal links and assets within the page. Add all unique links to an array for future exploration of these links. Repeat for each link in the list until no new link is discovered.

Because fetching each page's content takes time, we use threads to perform near-parallel processing.

To keep the code readable and structured, the gem defines two classes:

- `SuperCrawler::CrawlPage`, responsible for crawling a single page and extracting all relevant information (internal links and assets)
- `SuperCrawler::CrawlSite`, responsible for crawling a whole website by collecting links and calling `SuperCrawler::CrawlPage` within threads. This class is also responsible for rendering results.
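
To make the idea concrete, here is a minimal, single-threaded sketch of that loop. This is an illustration only, not the gem's actual code: `SuperCrawler::CrawlSite#start` (shown later in this diff) does the same work with a `Queue` and multiple threads.

```ruby
require 'super_crawler'

# Breadth-first crawl sketch: keep a list of discovered links and a queue of
# links still to visit; stop when no unvisited link remains.
def crawl(start_url)
  links   = [start_url]   # All unique internal links discovered so far
  queue   = [start_url]   # Links waiting to be crawled
  results = []            # One hash per crawled page

  while (url = queue.shift)
    page       = SuperCrawler::CrawlPage.new(url)
    page_links = page.get_links
    new_links  = page_links - links  # Keep only links we haven't seen yet

    links += new_links
    queue += new_links
    results << { url: page.url, links: page_links, assets: page.get_assets }
  end

  results
end
```
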
## More detailed use

Open your favorite Ruby console and require the gem:

```ruby
require 'super_crawler'
```

### Crawling a single web page

Read the following if you would like to crawl a single web page and extract relevant information (internal links and assets).

```ruby
page = SuperCrawler::CrawlPage.new( url )
```

Where `url` should be the URL of the page you would like to crawl.

**Nota:** When the scheme (`http://` or `https://`) is missing, SuperCrawler will prepend `http://` to the URL.

#### Get the encoded URL

Run

```ruby
page.url
```

to get the encoded URL provided.

#### Get internal links of a page

Run

```ruby
page.get_links
```

to get a list of internal links within the crawled page. An internal link is a link that _has the same host as the page URL_. Subdomains are rejected.

This method searches the `href` attribute of all `<a>` anchor tags.

**Nota:** This method returns an array of absolute URLs (all internal links).

**Nota 2:** Bad links and special links (like mailto and javascript) are discarded.
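
As an illustration (the URLs below are hypothetical), here is how `get_links` treats different kinds of `href` values found on a page:

```ruby
page = SuperCrawler::CrawlPage.new('http://example.com/about')

# Hypothetical href values found on the page and how they are treated:
#   '/team'                        -> kept, resolved to 'http://example.com/team'
#   'http://example.com/pricing'   -> kept (same host)
#   'http://blog.example.com/'     -> rejected (subdomain, different host)
#   'https://twitter.com/example'  -> rejected (external host)
#   'mailto:hello@example.com'     -> rejected (special link)
page.get_links # => array of absolute internal URLs, without duplicates
```
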
#### Get images of a page

Run

```ruby
page.get_images
```

to get a list of image links within the page. The image links are extracted from the `src="..."` attribute of all `<img>` tags.

**Nota:** Images included using CSS or JavaScript aren't detected by the method.

**Nota 2:** This method returns an array of absolute URLs.

#### Get stylesheets of a page

Run

```ruby
page.get_stylesheets
```

to get a list of stylesheet links within the page. The stylesheet links are extracted from the `href="..."` attribute of all `<link rel="stylesheet">` tags.

**Nota:** Inline styling isn't yet detected by the method.

**Nota 2:** This method returns an array of absolute URLs.

#### Get scripts of a page

Run

```ruby
page.get_scripts
```

to get a list of script links within the page. The script links are extracted from the `src="..."` attribute of all `<script>` tags.

**Nota:** Inline scripts aren't yet detected by the method.

**Nota 2:** This method returns an array of absolute URLs.

#### List all assets of a page

Run

```ruby
page.get_assets
```

to get a list of all assets (image, stylesheet and script links) as a hash of arrays.
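
As a rough illustration (the URLs are hypothetical), the returned hash groups the absolute URLs by asset type:

```ruby
page.get_assets
# => {
#      images:      ['http://example.com/logo.png', ...],
#      stylesheets: ['http://example.com/assets/main.css', ...],
#      scripts:     ['http://example.com/assets/app.js', ...]
#    }
```
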
### Crawling a whole web site

First, instantiate the site crawler:

```ruby
sc = SuperCrawler::CrawlSite.new(url, count_threads)
```

where `url` is the URL of the page to start crawling from, and `count_threads` the number of threads used to handle the job (10 by default).

Next, start the crawler:

```ruby
sc.start
```

This can take some time, depending on the site to crawl.

To access the crawl results, you can use the following:

```ruby
sc.links # The array of internal links

sc.crawl_results # Array of hashes containing links and assets for every link crawled
```

To see the crawl as a sitemap, use:

```ruby
sc.render(5) # Will render the sitemap of the first 5 pages
```

TODO: Build a more sophisticated rendering class that can render to files in different formats (HTML, XML, JSON, ...).

#### Tips on searching assets and links

After `sc.start`, you can access all collected resources (links and assets) using `sc.crawl_results`. It has the following structure:

```ruby
[
  {
    url: 'http://example.com/',
    links: [...array of internal links...],
    assets: {
      images: [...array of image links...],
      stylesheets: [...array of stylesheet links...],
      scripts: [...array of script links...]
    }
  },
  ...
]
```

You can use `sc.crawl_results.select{ |resource| ... }` to select particular resources.
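
For example (hypothetical URLs), you could filter the crawled pages by the assets they reference, or look up a single page:

```ruby
# Pages whose stylesheets include a given (hypothetical) CSS file
pages_using_main_css = sc.crawl_results.select do |resource|
  resource[:assets][:stylesheets].include?('http://example.com/assets/main.css')
end

# Or fetch the crawl result for one specific page
about_page = sc.crawl_results.find { |resource| resource[:url] == 'http://example.com/about' }
```
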
## Limitations

Currently, the gem has the following limitations:

- Subdomains are not considered internal links
- Both HTTP and HTTPS pages are taken into account. This can increase the number of links found, but we keep it this way because some sites don't duplicate all content between HTTP and HTTPS
- Only links within `<a href="...">` tags are extracted
- Only image links within `<img src="..."/>` tags are extracted
- Only stylesheet links within `<link rel="stylesheet" href="..." />` tags are extracted
- Only script links within `<script src="...">` tags are extracted
- A page that is not accessible (e.g. a 404 error) is not retried later

## Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/htaidirt/super_crawler. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.

Want to contribute? Please follow this process:

1. Fork it
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create a new Pull Request

## License

The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).

## Don't forget to have fun coding Ruby...
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

require "bundler/setup"
require "super_crawler"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
data/bin/setup
ADDED
data/lib/super_crawler/crawl_page.rb
ADDED
@@ -0,0 +1,160 @@
require "open-uri"
require "open_uri_redirections"
require "nokogiri"

module SuperCrawler

  ###
  # Crawl a single HTML page
  # Responsible for extracting all relevant information within a page
  #
  class CrawlPage

    attr_reader :url

    def initialize url
      # Normalize the URL, by adding http(s) if not present in the URL
      # NOTA: By default, add http:// scheme to an URL that doesn't have one
      @url = URI.encode( !!(url =~ /^(http(s)?:\/\/)/) ? url : ('http://' + url) )
    end

    ###
    # Get INTERNAL links of the page (same host)
    #
    def get_links
      return [] unless page_exists?

      # Get all the links that are within <a> tags, using Nokogiri
      links = get_doc.css('a').map{ |link| link['href'] }.compact

      # Select only internal links (relative links, or absolute links with the same host)
      links.select!{ |link| URI.parse(URI.encode link).host.nil? || URI.parse(URI.encode link).host == URI.parse(@url).host }

      # Reject bad and special links (like mailto, tel and javascript)
      links.reject!{ |link| !!(link =~ /^(mailto:|tel:|javascript:)/) }

      # Clean the links
      links.map!{ |link| create_absolute_url( link ) }            # Make all links absolute
           .map!{ |link| link.split('#')[0] }                     # Remove the fragment part from the links (...#...) if any
           .map!{ |link| URI(URI.encode link).normalize().to_s }  # Normalize links

      return links.uniq # Return links without duplicates
    end

    ###
    # Get all the images within a page
    # NOTA: These are images within <img src="..." /> tags.
    #
    def get_images
      return [] unless page_exists?

      # Get all the image sources (URLs), using Nokogiri
      images_links = get_doc.css('img').map{ |image| image['src'] }.compact

      # Create the absolute path of the images
      images_links.map!{ |image| create_absolute_url( image ) }

      return images_links.uniq # Return links to images without duplicates
    end

    ###
    # Get all the CSS links within a page
    # NOTA: These are links within <link href="..." /> tags.
    #
    def get_stylesheets
      return [] unless page_exists?

      # Get all the stylesheet links (URLs), using Nokogiri
      css_links = get_doc.css('link').select{ |css_link| css_link['rel'] == 'stylesheet' }
                         .map{ |css_link| css_link['href'] }
                         .compact

      # Create the absolute path of the CSS links
      css_links.map!{ |css_link| create_absolute_url( css_link ) }

      return css_links.uniq # Return links to CSS files without duplicates
    end

    ###
    # Get all the JS scripts within a page
    # NOTA: These are scripts within <script src="..."> tags.
    #
    def get_scripts
      return [] unless page_exists?

      # Get all the script sources (URLs), using Nokogiri
      scripts_links = get_doc.css('script').map{ |script| script['src'] }.compact

      # Create the absolute path of the scripts
      scripts_links.map!{ |script| create_absolute_url( script ) }

      return scripts_links.uniq # Return links to scripts without duplicates
    end

    ###
    # Get all assets within a page
    # Returns a hash of images, stylesheets and scripts URLs
    #
    def get_assets
      {
        'images': get_images,
        'stylesheets': get_stylesheets,
        'scripts': get_scripts
      }
    end

    ###
    # Get links and assets within a page
    # Returns a hash of links, images, stylesheets and scripts URLs
    #
    def get_all
      {
        'links': get_links,
        'images': get_images,
        'stylesheets': get_stylesheets,
        'scripts': get_scripts
      }
    end

    ###
    # Check if the page exists
    #
    def page_exists?
      !!( get_doc rescue false )
    end

    private

    ###
    # Get the page `doc` (document) from Nokogiri.
    # Cache it for performance.
    #
    def get_doc
      begin
        @doc ||= Nokogiri(open( @url , allow_redirections: :all ))
      rescue Exception => e
        raise "Problem with URL #{@url}: #{e}"
      end
    end

    ###
    # Extract the base URL (scheme and host only)
    #
    # eg:
    # http://mysite.com/abc -> http://mysite.com
    # https://dev.mysite.co.uk/mylink -> https://dev.mysite.co.uk
    def base_url
      "#{URI.parse(@url).scheme}://#{URI.parse(@url).host}"
    end

    ###
    # Given a URL, return the absolute URL
    #
    def create_absolute_url url
      # Prepend the base URL (scheme+host) if the provided URL is relative
      URI.parse(URI.encode url).host.nil? ? (base_url + url) : url
    end

  end

end
data/lib/super_crawler/crawl_site.rb
ADDED
@@ -0,0 +1,154 @@
require 'thread'

require 'super_crawler/crawl_page'

module SuperCrawler

  ###
  # Crawl a whole website
  #
  class CrawlSite

    attr_reader :links, :crawl_results

    def initialize start_url, threads = 10, options = {}
      @start_url = URI(URI.encode start_url).normalize().to_s # Normalize the given URL
      @links = [@start_url]   # Will contain the list of all links found
      @crawl_results = []     # Will contain the crawl results (links and assets), as an array of hashes
      @threads = threads      # How many threads to use? Default: 10

      @option_debug = options[:debug].nil? ? true : !!(options[:debug]) # Debug by default
    end

    ###
    # Start crawling the site
    # Could take a while. Use threads to speed up crawling and logging to inform the user.
    #
    def start

      crawling_start_notice           # Show a message on what will happen
      threads = []                    # Will contain our threads
      @links_queue = Queue.new        # Will contain the links queue that the threads will use
      @links = [@start_url]           # Re-init the links list
      @crawl_results = []             # Re-init the crawling results

      start_time = Time.now if @option_debug # Start the timer

      # Let's populate our queue with links and resources from the source url
      process_page( @start_url )

      # Create threads to handle new links
      @threads.times do # Create as many threads as requested

        threads << Thread.new do # Add a new thread
          begin
            while current_link = @links_queue.pop(true) # Pop the links one after another
              process_page( current_link ) # Get links and assets
            end
          rescue ThreadError # Stop when the links queue is empty
          end
        end

      end

      threads.map(&:join) # Wait for all threads to finish
      crawling_summary_notice(start_time, Time.now) if @option_debug # Display the crawling summary

      return true
    end

    ###
    # Render the sitemap
    # Show, for each link, internal links and assets
    # We limit the pages to display, because some sites have more than 1,000 pages
    #
    def render max_pages = 10
      draw_line
      puts "Showing first #{max_pages} crawled pages and their contents:\n\n"
      @crawl_results[0..(max_pages-1)].each_with_index do |result, index|
        puts "[#{index+1}] Content of #{result[:url]}\n"

        puts "  + Internal links: #{'None' if result[:links].empty?}"
        result[:links].each { |link| puts "    - #{link}" }

        puts "  + Internal images: #{'None' if result[:assets][:images].empty?}"
        result[:assets][:images].each { |link| puts "    - #{link}" }

        puts "  + Internal stylesheets: #{'None' if result[:assets][:stylesheets].empty?}"
        result[:assets][:stylesheets].each { |link| puts "    - #{link}" }

        puts "  + Internal scripts: #{'None' if result[:assets][:scripts].empty?}"
        result[:assets][:scripts].each { |link| puts "    - #{link}" }
        puts ""
      end
      draw_line
    end

    private

    ###
    # Process a page by extracting information and updating the links queue, links list and results.
    #
    def process_page page_url
      page = SuperCrawler::CrawlPage.new(page_url) # Crawl the current page

      current_page_links = page.get_links          # Get the current page's internal links
      new_links = current_page_links - @links      # Select new links

      new_links.each { |link| @links_queue.push(link) } # Add new links to the queue
      @links += new_links                               # Add new links to the total links list
      @crawl_results << {           # Provide the current page crawl result as a hash
        url: page.url,              # The crawled page
        links: current_page_links,  # Its internal links
        assets: page.get_assets     # Its assets
      }

      log_status( page_url ) if @option_debug # Display site crawling status
    end

    ###
    # Display a notice when starting a site crawl
    #
    def crawling_start_notice
      draw_line
      puts "Start crawling #{@start_url} using #{@threads} threads. Crawling rules:"
      puts "1. Keep only internal links"
      puts "2. http and https links are considered different"
      puts "3. Remove the fragment part from the links (#...)"
      puts "4. Keep paths with different parameters (?...)"
      draw_line
    end

    ###
    # Log current search status (crawled links / total links)
    #
    def log_status url
      text = "Crawled #{@crawl_results.length.to_s}/#{@links.length.to_s}: #{url}"
      print "\r#{" "*100}\r" # Clean the previous text
      print (text.length <= 50) ? text : "#{text[0..46]}..."
      STDOUT.flush
    end

    ###
    # Display the final crawling summary after the site crawl completes
    #
    def crawling_summary_notice time_start, time_end
      total_time = time_end - time_start
      puts ""
      draw_line
      puts "Crawled #{@links.count} links in #{total_time.to_f.to_s} seconds using #{@threads} threads."
      puts "Use .crawl_results to access the crawl results as an array of hashes."
      puts "Use .render to see the crawl_results as a sitemap."
      draw_line
    end

    ###
    # Draw a line (because readability is also important!!)
    #
    def draw_line
      puts "#{'-' * 80}"
    end

  end

end
data/super_crawler.gemspec
ADDED
@@ -0,0 +1,37 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'super_crawler/version'

Gem::Specification.new do |spec|
  spec.name          = "super_crawler"
  spec.version       = SuperCrawler::VERSION
  spec.authors       = ["Hassen Taidirt"]
  spec.email         = ["htaidirt@gmail.com"]

  spec.summary       = %q{Easy (yet efficient) ruby gem to crawl your favorite website.}
  spec.description   = %q{SuperCrawler allows you to easily crawl full web sites or web pages (extracting internal links and assets) in few seconds.}
  spec.homepage      = "https://github.com/htaidirt/super_crawler"
  spec.license       = "MIT"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  if spec.respond_to?(:metadata)
    spec.metadata['allowed_push_host'] = "https://rubygems.org"
  else
    raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
  end

  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency "nokogiri", "~> 1"
  spec.add_dependency "open_uri_redirections", "~> 0.2"
  spec.add_dependency "thread", "~> 0.2"

  spec.add_development_dependency "bundler", "~> 1.10"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
end
metadata
ADDED
@@ -0,0 +1,145 @@
--- !ruby/object:Gem::Specification
name: super_crawler
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- Hassen Taidirt
autorequire:
bindir: exe
cert_chain: []
date: 2016-07-09 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1'
- !ruby/object:Gem::Dependency
  name: open_uri_redirections
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.2'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.2'
- !ruby/object:Gem::Dependency
  name: thread
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.2'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.2'
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.10'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.10'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
- !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '3.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '3.0'
description: SuperCrawler allows you to easily crawl full web sites or web pages (extracting
  internal links and assets) in few seconds.
email:
- htaidirt@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- ".rspec"
- ".travis.yml"
- CODE_OF_CONDUCT.md
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
- bin/console
- bin/setup
- lib/super_crawler.rb
- lib/super_crawler/crawl_page.rb
- lib/super_crawler/crawl_site.rb
- lib/super_crawler/version.rb
- super_crawler.gemspec
homepage: https://github.com/htaidirt/super_crawler
licenses:
- MIT
metadata:
  allowed_push_host: https://rubygems.org
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.5.1
signing_key:
specification_version: 4
summary: Easy (yet efficient) ruby gem to crawl your favorite website.
test_files: []