iron-crawler 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +9 -1
- data/VERSION +1 -1
- data/bin/iron-crawler +1 -2
- data/iron-crawler.gemspec +96 -0
- data/lib/iron-crawler/crawler.rb +25 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6357b1e7e083df930cf9e58bee810de58908c88
|
4
|
+
data.tar.gz: 7f402f09862bca26320f8af26c11b97225e00f7f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 07c7fcda49f067c91a85bfc8982592d30c53af316d4415c8551b38523915ca165576a2cdd23e1923be0ad985361e5a839a360fbacc12eb0dd005b26ad68595a9
|
7
|
+
data.tar.gz: 8e497245dde2de5f4e46e60362531f260808072f6b586775dfc476984a3790d2e39159b044a9c946135b5b2b0011464ad2d744af227c298f9511c5db6ac46a3c
|
data/README.md
CHANGED
@@ -14,10 +14,18 @@ From a starting URL, crawl all links on that URL and print a list of URLs visite
|
|
14
14
|
|
15
15
|
It's easy to get started!
|
16
16
|
|
17
|
+
## Install
|
18
|
+
|
19
|
+
```
|
20
|
+
gem install iron-crawler
|
21
|
+
```
|
22
|
+
|
23
|
+
## Run
|
24
|
+
|
17
25
|
```
|
18
26
|
iron-crawler <url>
|
19
27
|
```
|
20
28
|
|
21
|
-
will crawl any site for you.
|
29
|
+
The above command will crawl any site for you.
|
22
30
|
|
23
31
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.0
|
1
|
+
1.1.0
|
data/bin/iron-crawler
CHANGED
(the +1 -2 hunk for this file is missing from this capture)
data/iron-crawler.gemspec
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
# stub: iron-crawler 1.1.0 ruby lib
|
6
|
+
|
7
|
+
Gem::Specification.new do |s|
|
8
|
+
s.name = "iron-crawler"
|
9
|
+
s.version = "1.1.0"
|
10
|
+
|
11
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
+
s.require_paths = ["lib"]
|
13
|
+
s.authors = ["Ben Visser"]
|
14
|
+
s.date = "2016-02-08"
|
15
|
+
s.description = "A generic web crawler that doesn't crawl outside URLs."
|
16
|
+
s.email = "theodore.r.visser@gmail.com"
|
17
|
+
s.executables = ["iron-crawler"]
|
18
|
+
s.extra_rdoc_files = [
|
19
|
+
"LICENSE.txt",
|
20
|
+
"README.md"
|
21
|
+
]
|
22
|
+
s.files = [
|
23
|
+
".document",
|
24
|
+
"Gemfile",
|
25
|
+
"Gemfile.lock",
|
26
|
+
"LICENSE.txt",
|
27
|
+
"README.md",
|
28
|
+
"Rakefile",
|
29
|
+
"VERSION",
|
30
|
+
"bin/iron-crawler",
|
31
|
+
"iron-crawler.gemspec",
|
32
|
+
"lib/iron-crawler.rb",
|
33
|
+
"lib/iron-crawler/crawler.rb",
|
34
|
+
"tasks/jeweler.rake",
|
35
|
+
"tasks/qa.rake"
|
36
|
+
]
|
37
|
+
s.homepage = "http://github.com/noqcks/iron-crawler"
|
38
|
+
s.licenses = ["MIT"]
|
39
|
+
s.rubygems_version = "2.4.6"
|
40
|
+
s.summary = "A generic web crawler."
|
41
|
+
|
42
|
+
if s.respond_to? :specification_version then
|
43
|
+
s.specification_version = 4
|
44
|
+
|
45
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
46
|
+
s.add_runtime_dependency(%q<mechanize>, [">= 0"])
|
47
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
48
|
+
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
49
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0"])
|
50
|
+
s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
|
51
|
+
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
52
|
+
s.add_development_dependency(%q<rspec>, ["~> 3.4.0"])
|
53
|
+
s.add_development_dependency(%q<reek>, ["~> 3.10.0"])
|
54
|
+
s.add_development_dependency(%q<roodi>, ["~> 5.0.0"])
|
55
|
+
s.add_development_dependency(%q<yard>, ["~> 0.8.0"])
|
56
|
+
s.add_development_dependency(%q<churn>, ["~> 1.0.2"])
|
57
|
+
s.add_development_dependency(%q<flay>, ["~> 2.7.0"])
|
58
|
+
s.add_development_dependency(%q<flog>, ["~> 4.3.2"])
|
59
|
+
s.add_development_dependency(%q<inch>, ["~> 0.7.0"])
|
60
|
+
s.add_development_dependency(%q<rubocop>, ["~> 0.37.0"])
|
61
|
+
else
|
62
|
+
s.add_dependency(%q<mechanize>, [">= 0"])
|
63
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
64
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
65
|
+
s.add_dependency(%q<bundler>, ["~> 1.0"])
|
66
|
+
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
67
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
68
|
+
s.add_dependency(%q<rspec>, ["~> 3.4.0"])
|
69
|
+
s.add_dependency(%q<reek>, ["~> 3.10.0"])
|
70
|
+
s.add_dependency(%q<roodi>, ["~> 5.0.0"])
|
71
|
+
s.add_dependency(%q<yard>, ["~> 0.8.0"])
|
72
|
+
s.add_dependency(%q<churn>, ["~> 1.0.2"])
|
73
|
+
s.add_dependency(%q<flay>, ["~> 2.7.0"])
|
74
|
+
s.add_dependency(%q<flog>, ["~> 4.3.2"])
|
75
|
+
s.add_dependency(%q<inch>, ["~> 0.7.0"])
|
76
|
+
s.add_dependency(%q<rubocop>, ["~> 0.37.0"])
|
77
|
+
end
|
78
|
+
else
|
79
|
+
s.add_dependency(%q<mechanize>, [">= 0"])
|
80
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
81
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
82
|
+
s.add_dependency(%q<bundler>, ["~> 1.0"])
|
83
|
+
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
84
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
85
|
+
s.add_dependency(%q<rspec>, ["~> 3.4.0"])
|
86
|
+
s.add_dependency(%q<reek>, ["~> 3.10.0"])
|
87
|
+
s.add_dependency(%q<roodi>, ["~> 5.0.0"])
|
88
|
+
s.add_dependency(%q<yard>, ["~> 0.8.0"])
|
89
|
+
s.add_dependency(%q<churn>, ["~> 1.0.2"])
|
90
|
+
s.add_dependency(%q<flay>, ["~> 2.7.0"])
|
91
|
+
s.add_dependency(%q<flog>, ["~> 4.3.2"])
|
92
|
+
s.add_dependency(%q<inch>, ["~> 0.7.0"])
|
93
|
+
s.add_dependency(%q<rubocop>, ["~> 0.37.0"])
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
data/lib/iron-crawler/crawler.rb
CHANGED
@@ -14,17 +14,41 @@ class Crawler < Mechanize
|
|
14
14
|
#
|
15
15
|
def spiderize(url)
|
16
16
|
@mech.max_history = nil
|
17
|
-
|
17
|
+
page = @mech.get(url)
|
18
|
+
stack = page.links
|
19
|
+
stack.push(*src_links(page))
|
20
|
+
|
18
21
|
while link = stack.pop
|
19
22
|
next if reject(link)
|
20
23
|
puts "crawling #{link.uri}"
|
21
24
|
begin
|
22
25
|
page = link.click
|
23
26
|
next unless Mechanize::Page === page
|
27
|
+
stack.push(*src_links(page))
|
24
28
|
stack.push(*page.links)
|
25
29
|
rescue Mechanize::ResponseCodeError
|
26
30
|
end
|
27
31
|
end
|
32
|
+
return @mech.history
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
# Since mechanize doesn't treat src elements as links, this will
|
37
|
+
# return all src links from a page.
|
38
|
+
# @param [Mechanize::Page] A mechanize page object.
|
39
|
+
# @return [Array] An array of created Mechanize::Page::Link objects.
|
40
|
+
#
|
41
|
+
def src_links(page)
|
42
|
+
links = []
|
43
|
+
page.search("script").each do |element|
|
44
|
+
next if element.attributes['src'].nil?
|
45
|
+
doc = Nokogiri::HTML::Document.new
|
46
|
+
node = Nokogiri::XML::Node.new('foo', doc)
|
47
|
+
node['href'] = element.attributes['src'].value
|
48
|
+
link = Mechanize::Page::Link.new(node, @mech, page)
|
49
|
+
links.push(link)
|
50
|
+
end
|
51
|
+
return links
|
28
52
|
end
|
29
53
|
|
30
54
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iron-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Visser
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -237,6 +237,7 @@ files:
|
|
237
237
|
- Rakefile
|
238
238
|
- VERSION
|
239
239
|
- bin/iron-crawler
|
240
|
+
- iron-crawler.gemspec
|
240
241
|
- lib/iron-crawler.rb
|
241
242
|
- lib/iron-crawler/crawler.rb
|
242
243
|
- tasks/jeweler.rake
|