iron-crawler 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8a2dc8e70c7883e03bbc90ec200e0f80ef59d059
4
- data.tar.gz: a348eb768509de52f3406d1fa23d52ddea636c62
3
+ metadata.gz: f6357b1e7e083df930cf9e58bee810de58908c88
4
+ data.tar.gz: 7f402f09862bca26320f8af26c11b97225e00f7f
5
5
  SHA512:
6
- metadata.gz: b5f54a41ccbdadccecf7d522dcd81c0a13e424e3f3064f9aa9e45775e1f597b0b073d0ae6212d7cc163db3db4bd238031f13e4a86801d35eb84c6cc113f8440c
7
- data.tar.gz: 522d5666aa95be4477ffb6d4922606fe6c02fe4f5afec4adacd5338ae36ea641f5bd852c68fe04188a6e590537c10e2b9b16a1ac3ee05390b24a1fde1a1516f3
6
+ metadata.gz: 07c7fcda49f067c91a85bfc8982592d30c53af316d4415c8551b38523915ca165576a2cdd23e1923be0ad985361e5a839a360fbacc12eb0dd005b26ad68595a9
7
+ data.tar.gz: 8e497245dde2de5f4e46e60362531f260808072f6b586775dfc476984a3790d2e39159b044a9c946135b5b2b0011464ad2d744af227c298f9511c5db6ac46a3c
data/README.md CHANGED
@@ -14,10 +14,18 @@ From a starting URL, crawl all links on that URL and print a list of URLs visite
14
14
 
15
15
  It's easy to get started!
16
16
 
17
+ ## Install
18
+
19
+ ```
20
+ gem install iron-crawler
21
+ ```
22
+
23
+ ## Run
24
+
17
25
  ```
18
26
  iron-crawler <url>
19
27
  ```
20
28
 
21
- will crawl any site for you.
29
+ The above command will crawl any site for you.
22
30
 
23
31
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.0
1
+ 1.1.0
data/bin/iron-crawler CHANGED
@@ -12,6 +12,5 @@ url = ARGV.first
12
12
 
13
13
  Announce.info "Crawling #{url}..."
14
14
  agent = Crawler.new
15
- agent.spiderize(url)
15
+ puts agent.spiderize(url)
16
16
 
17
- puts agent.history
@@ -0,0 +1,96 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+ # stub: iron-crawler 1.1.0 ruby lib
6
+
7
+ Gem::Specification.new do |s|
8
+ s.name = "iron-crawler"
9
+ s.version = "1.1.0"
10
+
11
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
+ s.require_paths = ["lib"]
13
+ s.authors = ["Ben Visser"]
14
+ s.date = "2016-02-08"
15
+ s.description = "A generic web crawler that doesn't crawl outside URLs."
16
+ s.email = "theodore.r.visser@gmail.com"
17
+ s.executables = ["iron-crawler"]
18
+ s.extra_rdoc_files = [
19
+ "LICENSE.txt",
20
+ "README.md"
21
+ ]
22
+ s.files = [
23
+ ".document",
24
+ "Gemfile",
25
+ "Gemfile.lock",
26
+ "LICENSE.txt",
27
+ "README.md",
28
+ "Rakefile",
29
+ "VERSION",
30
+ "bin/iron-crawler",
31
+ "iron-crawler.gemspec",
32
+ "lib/iron-crawler.rb",
33
+ "lib/iron-crawler/crawler.rb",
34
+ "tasks/jeweler.rake",
35
+ "tasks/qa.rake"
36
+ ]
37
+ s.homepage = "http://github.com/noqcks/iron-crawler"
38
+ s.licenses = ["MIT"]
39
+ s.rubygems_version = "2.4.6"
40
+ s.summary = "A generic web crawler."
41
+
42
+ if s.respond_to? :specification_version then
43
+ s.specification_version = 4
44
+
45
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
46
+ s.add_runtime_dependency(%q<mechanize>, [">= 0"])
47
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
48
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
49
+ s.add_development_dependency(%q<bundler>, ["~> 1.0"])
50
+ s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
51
+ s.add_development_dependency(%q<simplecov>, [">= 0"])
52
+ s.add_development_dependency(%q<rspec>, ["~> 3.4.0"])
53
+ s.add_development_dependency(%q<reek>, ["~> 3.10.0"])
54
+ s.add_development_dependency(%q<roodi>, ["~> 5.0.0"])
55
+ s.add_development_dependency(%q<yard>, ["~> 0.8.0"])
56
+ s.add_development_dependency(%q<churn>, ["~> 1.0.2"])
57
+ s.add_development_dependency(%q<flay>, ["~> 2.7.0"])
58
+ s.add_development_dependency(%q<flog>, ["~> 4.3.2"])
59
+ s.add_development_dependency(%q<inch>, ["~> 0.7.0"])
60
+ s.add_development_dependency(%q<rubocop>, ["~> 0.37.0"])
61
+ else
62
+ s.add_dependency(%q<mechanize>, [">= 0"])
63
+ s.add_dependency(%q<shoulda>, [">= 0"])
64
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
65
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
66
+ s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
67
+ s.add_dependency(%q<simplecov>, [">= 0"])
68
+ s.add_dependency(%q<rspec>, ["~> 3.4.0"])
69
+ s.add_dependency(%q<reek>, ["~> 3.10.0"])
70
+ s.add_dependency(%q<roodi>, ["~> 5.0.0"])
71
+ s.add_dependency(%q<yard>, ["~> 0.8.0"])
72
+ s.add_dependency(%q<churn>, ["~> 1.0.2"])
73
+ s.add_dependency(%q<flay>, ["~> 2.7.0"])
74
+ s.add_dependency(%q<flog>, ["~> 4.3.2"])
75
+ s.add_dependency(%q<inch>, ["~> 0.7.0"])
76
+ s.add_dependency(%q<rubocop>, ["~> 0.37.0"])
77
+ end
78
+ else
79
+ s.add_dependency(%q<mechanize>, [">= 0"])
80
+ s.add_dependency(%q<shoulda>, [">= 0"])
81
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
82
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
83
+ s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
84
+ s.add_dependency(%q<simplecov>, [">= 0"])
85
+ s.add_dependency(%q<rspec>, ["~> 3.4.0"])
86
+ s.add_dependency(%q<reek>, ["~> 3.10.0"])
87
+ s.add_dependency(%q<roodi>, ["~> 5.0.0"])
88
+ s.add_dependency(%q<yard>, ["~> 0.8.0"])
89
+ s.add_dependency(%q<churn>, ["~> 1.0.2"])
90
+ s.add_dependency(%q<flay>, ["~> 2.7.0"])
91
+ s.add_dependency(%q<flog>, ["~> 4.3.2"])
92
+ s.add_dependency(%q<inch>, ["~> 0.7.0"])
93
+ s.add_dependency(%q<rubocop>, ["~> 0.37.0"])
94
+ end
95
+ end
96
+
@@ -14,17 +14,41 @@ class Crawler < Mechanize
14
14
  #
15
15
  def spiderize(url)
16
16
  @mech.max_history = nil
17
- stack = @mech.get(url).links
17
+ page = @mech.get(url)
18
+ stack = page.links
19
+ stack.push(*src_links(page))
20
+
18
21
  while link = stack.pop
19
22
  next if reject(link)
20
23
  puts "crawling #{link.uri}"
21
24
  begin
22
25
  page = link.click
23
26
  next unless Mechanize::Page === page
27
+ stack.push(*src_links(page))
24
28
  stack.push(*page.links)
25
29
  rescue Mechanize::ResponseCodeError
26
30
  end
27
31
  end
32
+ return @mech.history
33
+ end
34
+
35
+
36
+ # Mechanize doesn't treat src attributes as links, so this builds
37
+ # a link for the src of every script element on the page.
38
+ # @param page [Mechanize::Page] A mechanize page object.
39
+ # @return [Array] An array of created Mechanize::Page::Link objects.
40
+ #
41
+ def src_links(page)
42
+ links = []
43
+ page.search("script").each do |element|
44
+ next if element.attributes['src'].nil?
45
+ doc = Nokogiri::HTML::Document.new
46
+ node = Nokogiri::XML::Node.new('foo', doc)
47
+ node['href'] = element.attributes['src'].value
48
+ link = Mechanize::Page::Link.new(node, @mech, page)
49
+ links.push(link)
50
+ end
51
+ return links
28
52
  end
29
53
 
30
54
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iron-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Visser
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-07 00:00:00.000000000 Z
11
+ date: 2016-02-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -237,6 +237,7 @@ files:
237
237
  - Rakefile
238
238
  - VERSION
239
239
  - bin/iron-crawler
240
+ - iron-crawler.gemspec
240
241
  - lib/iron-crawler.rb
241
242
  - lib/iron-crawler/crawler.rb
242
243
  - tasks/jeweler.rake