iron-crawler 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -1
- data/VERSION +1 -1
- data/bin/iron-crawler +1 -2
- data/iron-crawler.gemspec +96 -0
- data/lib/iron-crawler/crawler.rb +25 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6357b1e7e083df930cf9e58bee810de58908c88
|
4
|
+
data.tar.gz: 7f402f09862bca26320f8af26c11b97225e00f7f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 07c7fcda49f067c91a85bfc8982592d30c53af316d4415c8551b38523915ca165576a2cdd23e1923be0ad985361e5a839a360fbacc12eb0dd005b26ad68595a9
|
7
|
+
data.tar.gz: 8e497245dde2de5f4e46e60362531f260808072f6b586775dfc476984a3790d2e39159b044a9c946135b5b2b0011464ad2d744af227c298f9511c5db6ac46a3c
|
data/README.md
CHANGED
@@ -14,10 +14,18 @@ From a starting URL, crawl all links on that URL and print a list of URLs visite
|
|
14
14
|
|
15
15
|
It's easy to get started!
|
16
16
|
|
17
|
+
## Install
|
18
|
+
|
19
|
+
```
|
20
|
+
gem install iron-crawler
|
21
|
+
```
|
22
|
+
|
23
|
+
## Run
|
24
|
+
|
17
25
|
```
|
18
26
|
iron-crawler <url>
|
19
27
|
```
|
20
28
|
|
21
|
-
will crawl any site for you.
|
29
|
+
The above command will crawl any site for you.
|
22
30
|
|
23
31
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.0
|
1
|
+
1.1.0
|
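data/bin/iron-crawler
CHANGED
[hunk for data/bin/iron-crawler (+1 -2) is not present in this extract]
data/iron-crawler.gemspec
ADDED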
@@ -0,0 +1,96 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
# stub: iron-crawler 1.1.0 ruby lib
|
6
|
+
|
7
|
+
Gem::Specification.new do |s|
|
8
|
+
s.name = "iron-crawler"
|
9
|
+
s.version = "1.1.0"
|
10
|
+
|
11
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
+
s.require_paths = ["lib"]
|
13
|
+
s.authors = ["Ben Visser"]
|
14
|
+
s.date = "2016-02-08"
|
15
|
+
s.description = "A generic web crawler that doesn't crawl outside URLs."
|
16
|
+
s.email = "theodore.r.visser@gmail.com"
|
17
|
+
s.executables = ["iron-crawler"]
|
18
|
+
s.extra_rdoc_files = [
|
19
|
+
"LICENSE.txt",
|
20
|
+
"README.md"
|
21
|
+
]
|
22
|
+
s.files = [
|
23
|
+
".document",
|
24
|
+
"Gemfile",
|
25
|
+
"Gemfile.lock",
|
26
|
+
"LICENSE.txt",
|
27
|
+
"README.md",
|
28
|
+
"Rakefile",
|
29
|
+
"VERSION",
|
30
|
+
"bin/iron-crawler",
|
31
|
+
"iron-crawler.gemspec",
|
32
|
+
"lib/iron-crawler.rb",
|
33
|
+
"lib/iron-crawler/crawler.rb",
|
34
|
+
"tasks/jeweler.rake",
|
35
|
+
"tasks/qa.rake"
|
36
|
+
]
|
37
|
+
s.homepage = "http://github.com/noqcks/iron-crawler"
|
38
|
+
s.licenses = ["MIT"]
|
39
|
+
s.rubygems_version = "2.4.6"
|
40
|
+
s.summary = "A generic web crawler."
|
41
|
+
|
42
|
+
if s.respond_to? :specification_version then
|
43
|
+
s.specification_version = 4
|
44
|
+
|
45
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
46
|
+
s.add_runtime_dependency(%q<mechanize>, [">= 0"])
|
47
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
48
|
+
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
49
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0"])
|
50
|
+
s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
|
51
|
+
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
52
|
+
s.add_development_dependency(%q<rspec>, ["~> 3.4.0"])
|
53
|
+
s.add_development_dependency(%q<reek>, ["~> 3.10.0"])
|
54
|
+
s.add_development_dependency(%q<roodi>, ["~> 5.0.0"])
|
55
|
+
s.add_development_dependency(%q<yard>, ["~> 0.8.0"])
|
56
|
+
s.add_development_dependency(%q<churn>, ["~> 1.0.2"])
|
57
|
+
s.add_development_dependency(%q<flay>, ["~> 2.7.0"])
|
58
|
+
s.add_development_dependency(%q<flog>, ["~> 4.3.2"])
|
59
|
+
s.add_development_dependency(%q<inch>, ["~> 0.7.0"])
|
60
|
+
s.add_development_dependency(%q<rubocop>, ["~> 0.37.0"])
|
61
|
+
else
|
62
|
+
s.add_dependency(%q<mechanize>, [">= 0"])
|
63
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
64
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
65
|
+
s.add_dependency(%q<bundler>, ["~> 1.0"])
|
66
|
+
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
67
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
68
|
+
s.add_dependency(%q<rspec>, ["~> 3.4.0"])
|
69
|
+
s.add_dependency(%q<reek>, ["~> 3.10.0"])
|
70
|
+
s.add_dependency(%q<roodi>, ["~> 5.0.0"])
|
71
|
+
s.add_dependency(%q<yard>, ["~> 0.8.0"])
|
72
|
+
s.add_dependency(%q<churn>, ["~> 1.0.2"])
|
73
|
+
s.add_dependency(%q<flay>, ["~> 2.7.0"])
|
74
|
+
s.add_dependency(%q<flog>, ["~> 4.3.2"])
|
75
|
+
s.add_dependency(%q<inch>, ["~> 0.7.0"])
|
76
|
+
s.add_dependency(%q<rubocop>, ["~> 0.37.0"])
|
77
|
+
end
|
78
|
+
else
|
79
|
+
s.add_dependency(%q<mechanize>, [">= 0"])
|
80
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
81
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
82
|
+
s.add_dependency(%q<bundler>, ["~> 1.0"])
|
83
|
+
s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
|
84
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
85
|
+
s.add_dependency(%q<rspec>, ["~> 3.4.0"])
|
86
|
+
s.add_dependency(%q<reek>, ["~> 3.10.0"])
|
87
|
+
s.add_dependency(%q<roodi>, ["~> 5.0.0"])
|
88
|
+
s.add_dependency(%q<yard>, ["~> 0.8.0"])
|
89
|
+
s.add_dependency(%q<churn>, ["~> 1.0.2"])
|
90
|
+
s.add_dependency(%q<flay>, ["~> 2.7.0"])
|
91
|
+
s.add_dependency(%q<flog>, ["~> 4.3.2"])
|
92
|
+
s.add_dependency(%q<inch>, ["~> 0.7.0"])
|
93
|
+
s.add_dependency(%q<rubocop>, ["~> 0.37.0"])
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
data/lib/iron-crawler/crawler.rb
CHANGED
@@ -14,17 +14,41 @@ class Crawler < Mechanize
|
|
14
14
|
#
|
15
15
|
def spiderize(url)
|
16
16
|
@mech.max_history = nil
|
17
|
-
|
17
|
+
page = @mech.get(url)
|
18
|
+
stack = page.links
|
19
|
+
stack.push(*src_links(page))
|
20
|
+
|
18
21
|
while link = stack.pop
|
19
22
|
next if reject(link)
|
20
23
|
puts "crawling #{link.uri}"
|
21
24
|
begin
|
22
25
|
page = link.click
|
23
26
|
next unless Mechanize::Page === page
|
27
|
+
stack.push(*src_links(page))
|
24
28
|
stack.push(*page.links)
|
25
29
|
rescue Mechanize::ResponseCodeError
|
26
30
|
end
|
27
31
|
end
|
32
|
+
return @mech.history
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
# Since mechanize doesn't treat src elements as links, this will
|
37
|
+
# return all src links from a page.
|
38
|
+
# @param [Mechanize::Page] A mechanize page object.
|
39
|
+
# @return [Array] An array of created Mechanize::Page::Link objects.
|
40
|
+
#
|
41
|
+
def src_links(page)
|
42
|
+
links = []
|
43
|
+
page.search("script").each do |element|
|
44
|
+
next if element.attributes['src'].nil?
|
45
|
+
doc = Nokogiri::HTML::Document.new
|
46
|
+
node = Nokogiri::XML::Node.new('foo', doc)
|
47
|
+
node['href'] = element.attributes['src'].value
|
48
|
+
link = Mechanize::Page::Link.new(node, @mech, page)
|
49
|
+
links.push(link)
|
50
|
+
end
|
51
|
+
return links
|
28
52
|
end
|
29
53
|
|
30
54
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iron-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Visser
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -237,6 +237,7 @@ files:
|
|
237
237
|
- Rakefile
|
238
238
|
- VERSION
|
239
239
|
- bin/iron-crawler
|
240
|
+
- iron-crawler.gemspec
|
240
241
|
- lib/iron-crawler.rb
|
241
242
|
- lib/iron-crawler/crawler.rb
|
242
243
|
- tasks/jeweler.rake
|