iron-crawler 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8a2dc8e70c7883e03bbc90ec200e0f80ef59d059
4
- data.tar.gz: a348eb768509de52f3406d1fa23d52ddea636c62
3
+ metadata.gz: f6357b1e7e083df930cf9e58bee810de58908c88
4
+ data.tar.gz: 7f402f09862bca26320f8af26c11b97225e00f7f
5
5
  SHA512:
6
- metadata.gz: b5f54a41ccbdadccecf7d522dcd81c0a13e424e3f3064f9aa9e45775e1f597b0b073d0ae6212d7cc163db3db4bd238031f13e4a86801d35eb84c6cc113f8440c
7
- data.tar.gz: 522d5666aa95be4477ffb6d4922606fe6c02fe4f5afec4adacd5338ae36ea641f5bd852c68fe04188a6e590537c10e2b9b16a1ac3ee05390b24a1fde1a1516f3
6
+ metadata.gz: 07c7fcda49f067c91a85bfc8982592d30c53af316d4415c8551b38523915ca165576a2cdd23e1923be0ad985361e5a839a360fbacc12eb0dd005b26ad68595a9
7
+ data.tar.gz: 8e497245dde2de5f4e46e60362531f260808072f6b586775dfc476984a3790d2e39159b044a9c946135b5b2b0011464ad2d744af227c298f9511c5db6ac46a3c
data/README.md CHANGED
@@ -14,10 +14,18 @@ From a starting URL, crawl all links on that URL and print a list of URLs visite
14
14
 
15
15
  It's easy to get started!
16
16
 
17
+ ## Install
18
+
19
+ ```
20
+ gem install iron-crawler
21
+ ```
22
+
23
+ ## Run
24
+
17
25
  ```
18
26
  iron-crawler <url>
19
27
  ```
20
28
 
21
- will crawl any site for you.
29
+ The above command will crawl any site for you.
22
30
 
23
31
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.0
1
+ 1.1.0
data/bin/iron-crawler CHANGED
@@ -12,6 +12,5 @@ url = ARGV.first
12
12
 
13
13
  Announce.info "Crawling #{url}..."
14
14
  agent = Crawler.new
15
- agent.spiderize(url)
15
+ puts agent.spiderize(url)
16
16
 
17
- puts agent.history
@@ -0,0 +1,96 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+ # stub: iron-crawler 1.1.0 ruby lib
6
+
7
+ Gem::Specification.new do |s|
8
+ s.name = "iron-crawler"
9
+ s.version = "1.1.0"
10
+
11
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
+ s.require_paths = ["lib"]
13
+ s.authors = ["Ben Visser"]
14
+ s.date = "2016-02-08"
15
+ s.description = "A generic web crawler that doesn't crawl outside URLs."
16
+ s.email = "theodore.r.visser@gmail.com"
17
+ s.executables = ["iron-crawler"]
18
+ s.extra_rdoc_files = [
19
+ "LICENSE.txt",
20
+ "README.md"
21
+ ]
22
+ s.files = [
23
+ ".document",
24
+ "Gemfile",
25
+ "Gemfile.lock",
26
+ "LICENSE.txt",
27
+ "README.md",
28
+ "Rakefile",
29
+ "VERSION",
30
+ "bin/iron-crawler",
31
+ "iron-crawler.gemspec",
32
+ "lib/iron-crawler.rb",
33
+ "lib/iron-crawler/crawler.rb",
34
+ "tasks/jeweler.rake",
35
+ "tasks/qa.rake"
36
+ ]
37
+ s.homepage = "http://github.com/noqcks/iron-crawler"
38
+ s.licenses = ["MIT"]
39
+ s.rubygems_version = "2.4.6"
40
+ s.summary = "A generic web crawler."
41
+
42
+ if s.respond_to? :specification_version then
43
+ s.specification_version = 4
44
+
45
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
46
+ s.add_runtime_dependency(%q<mechanize>, [">= 0"])
47
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
48
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
49
+ s.add_development_dependency(%q<bundler>, ["~> 1.0"])
50
+ s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
51
+ s.add_development_dependency(%q<simplecov>, [">= 0"])
52
+ s.add_development_dependency(%q<rspec>, ["~> 3.4.0"])
53
+ s.add_development_dependency(%q<reek>, ["~> 3.10.0"])
54
+ s.add_development_dependency(%q<roodi>, ["~> 5.0.0"])
55
+ s.add_development_dependency(%q<yard>, ["~> 0.8.0"])
56
+ s.add_development_dependency(%q<churn>, ["~> 1.0.2"])
57
+ s.add_development_dependency(%q<flay>, ["~> 2.7.0"])
58
+ s.add_development_dependency(%q<flog>, ["~> 4.3.2"])
59
+ s.add_development_dependency(%q<inch>, ["~> 0.7.0"])
60
+ s.add_development_dependency(%q<rubocop>, ["~> 0.37.0"])
61
+ else
62
+ s.add_dependency(%q<mechanize>, [">= 0"])
63
+ s.add_dependency(%q<shoulda>, [">= 0"])
64
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
65
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
66
+ s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
67
+ s.add_dependency(%q<simplecov>, [">= 0"])
68
+ s.add_dependency(%q<rspec>, ["~> 3.4.0"])
69
+ s.add_dependency(%q<reek>, ["~> 3.10.0"])
70
+ s.add_dependency(%q<roodi>, ["~> 5.0.0"])
71
+ s.add_dependency(%q<yard>, ["~> 0.8.0"])
72
+ s.add_dependency(%q<churn>, ["~> 1.0.2"])
73
+ s.add_dependency(%q<flay>, ["~> 2.7.0"])
74
+ s.add_dependency(%q<flog>, ["~> 4.3.2"])
75
+ s.add_dependency(%q<inch>, ["~> 0.7.0"])
76
+ s.add_dependency(%q<rubocop>, ["~> 0.37.0"])
77
+ end
78
+ else
79
+ s.add_dependency(%q<mechanize>, [">= 0"])
80
+ s.add_dependency(%q<shoulda>, [">= 0"])
81
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
82
+ s.add_dependency(%q<bundler>, ["~> 1.0"])
83
+ s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
84
+ s.add_dependency(%q<simplecov>, [">= 0"])
85
+ s.add_dependency(%q<rspec>, ["~> 3.4.0"])
86
+ s.add_dependency(%q<reek>, ["~> 3.10.0"])
87
+ s.add_dependency(%q<roodi>, ["~> 5.0.0"])
88
+ s.add_dependency(%q<yard>, ["~> 0.8.0"])
89
+ s.add_dependency(%q<churn>, ["~> 1.0.2"])
90
+ s.add_dependency(%q<flay>, ["~> 2.7.0"])
91
+ s.add_dependency(%q<flog>, ["~> 4.3.2"])
92
+ s.add_dependency(%q<inch>, ["~> 0.7.0"])
93
+ s.add_dependency(%q<rubocop>, ["~> 0.37.0"])
94
+ end
95
+ end
96
+
@@ -14,17 +14,41 @@ class Crawler < Mechanize
14
14
  #
15
15
  def spiderize(url)
16
16
  @mech.max_history = nil
17
- stack = @mech.get(url).links
17
+ page = @mech.get(url)
18
+ stack = page.links
19
+ stack.push(*src_links(page))
20
+
18
21
  while link = stack.pop
19
22
  next if reject(link)
20
23
  puts "crawling #{link.uri}"
21
24
  begin
22
25
  page = link.click
23
26
  next unless Mechanize::Page === page
27
+ stack.push(*src_links(page))
24
28
  stack.push(*page.links)
25
29
  rescue Mechanize::ResponseCodeError
26
30
  end
27
31
  end
32
+ return @mech.history
33
+ end
34
+
35
+
36
+ # Since mechanize doesn't treat src elements as links, this will
37
+ # return the src links of all script elements on a page.
38
+ # @param page [Mechanize::Page] A mechanize page object.
39
+ # @return [Array] An array of created Mechanize::Page::Link objects.
40
+ #
41
+ def src_links(page)
42
+ links = []
43
+ page.search("script").each do |element|
44
+ next if element.attributes['src'].nil?
45
+ doc = Nokogiri::HTML::Document.new
46
+ node = Nokogiri::XML::Node.new('foo', doc)
47
+ node['href'] = element.attributes['src'].value
48
+ link = Mechanize::Page::Link.new(node, @mech, page)
49
+ links.push(link)
50
+ end
51
+ return links
28
52
  end
29
53
 
30
54
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iron-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Visser
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-07 00:00:00.000000000 Z
11
+ date: 2016-02-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -237,6 +237,7 @@ files:
237
237
  - Rakefile
238
238
  - VERSION
239
239
  - bin/iron-crawler
240
+ - iron-crawler.gemspec
240
241
  - lib/iron-crawler.rb
241
242
  - lib/iron-crawler/crawler.rb
242
243
  - tasks/jeweler.rake