snapcrawl 0.2.8 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2500895cfd465dd633ac9650fc702f87b684ae4a81aa2feb979a16fad17bedc2
4
- data.tar.gz: 042f0e14f8139d3b33bdd0061731c7973f7c445e57e2c474d32c3b226d333acb
3
+ metadata.gz: e0dba7d3febfccf88e22194125387c6219cf500f0cb0e00fe4ed343a57fe9034
4
+ data.tar.gz: d05440bccd6ddde9773b24c425f3420b1970c3985805f6eb685171237e382cda
5
5
  SHA512:
6
- metadata.gz: 7603f985467c47f26b5bbf8c3cebf5654de708d0f7039c0268b4c4824089c259c17b687be482ed3e5813f44fef2274407542c76e3291f064aed1b06c2af1837a
7
- data.tar.gz: 3246ea728fa90fc65da1732964492da9383ae0c99a47746d1194f5c991b2efadd333c69478e2c97fd99274acb47fc38773285a53c9fd5505e7a605af08f58cf2
6
+ metadata.gz: 71f65748af4339bf606765e2f1ea0aab90f4099ff986ce59318c6c966363ca6386a92d08ce175373cfa8e83c096af844983da2cde22ccbbebd2ab151514b6e72
7
+ data.tar.gz: a641b038a18114117375249d260895d73d08cd2cf47c7f668641c0891cd4b4723db80d4313a6a64cd89de02be0ea72aa9fa7b15f74d1ba6e0724017486c2d9c4
data/lib/snapcrawl/crawler.rb CHANGED
@@ -1,10 +1,11 @@
1
1
  require 'colsole'
2
2
  require 'docopt'
3
3
  require 'fileutils'
4
+ require 'httparty'
4
5
  require 'nokogiri'
5
- require 'open-uri'
6
6
  require 'ostruct'
7
7
  require 'pstore'
8
+ require 'uri'
8
9
  require 'webshot'
9
10
 
10
11
  module Snapcrawl
@@ -131,15 +132,18 @@ module Snapcrawl
131
132
  say " !txtblu!Crawl!!txtrst! Extracting links... "
132
133
 
133
134
  begin
134
- doc = Nokogiri::HTML open url
135
- links = doc.css('a')
136
- links = normalize_links links
137
- @store.transaction { @store[url] = links }
138
- say "done"
139
- rescue OpenURI::HTTPError => e
140
- links = []
141
- say "!txtred!FAILED"
142
- say "!txtred! ! HTTP Error: #{e.message} at #{url}"
135
+ response = HTTParty.get url
136
+ if response.success?
137
+ doc = Nokogiri::HTML response.body
138
+ links = doc.css('a')
139
+ links = normalize_links links
140
+ @store.transaction { @store[url] = links }
141
+ say "done"
142
+ else
143
+ links = []
144
+ say "!txtred!FAILED"
145
+ say "!txtred! ! HTTP Error: #{response.code} #{response.message.strip} at #{url}"
146
+ end
143
147
  end
144
148
  links
145
149
  end
@@ -181,7 +185,7 @@ module Snapcrawl
181
185
 
182
186
  links_array = []
183
187
 
184
- links.each_with_index do |link|
188
+ links.each do |link|
185
189
  link = link.attribute('href').to_s
186
190
 
187
191
  # Remove #hash
@@ -191,10 +195,12 @@ module Snapcrawl
191
195
  # Remove links to specific extensions and protocols
192
196
  next if link =~ /\.(#{extensions})(\?.*)?$/
193
197
  next if link =~ /^(#{beginnings})/
194
-
195
- # Add the base domain to relative URLs
196
- link = link =~ /^http/ ? link : "#{@opts.base}#{link}"
197
- link = "http://#{link}" unless link =~ /^http/
198
+
199
+ # Strip spaces
200
+ link.strip!
201
+
202
+ # Convert relative links to absolute
203
+ link = URI.join( @opts.base, link ).to_s
198
204
 
199
205
  # Keep only links in our base domain
200
206
  next unless link.include? @opts.base
data/lib/snapcrawl/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Snapcrawl
2
- VERSION = "0.2.8"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/snapcrawl.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'snapcrawl/version'
2
2
  require 'snapcrawl/crawler'
3
3
 
4
+ require 'byebug' if ENV['BYEBUG']
5
+
4
6
  self.extend Snapcrawl
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snapcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danny Ben Shitrit
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-06-14 00:00:00.000000000 Z
11
+ date: 2019-09-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: colsole
@@ -72,6 +72,20 @@ dependencies:
72
72
  - - "~>"
73
73
  - !ruby/object:Gem::Version
74
74
  version: '0.1'
75
+ - !ruby/object:Gem::Dependency
76
+ name: httparty
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.17'
82
+ type: :runtime
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.17'
75
89
  description: Snapcrawl is a command line utility for crawling a website and saving
76
90
  screenshots.
77
91
  email: db@dannyben.com
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
105
119
  - !ruby/object:Gem::Version
106
120
  version: '0'
107
121
  requirements: []
108
- rubygems_version: 3.0.3
122
+ rubygems_version: 3.0.4
109
123
  signing_key:
110
124
  specification_version: 4
111
125
  summary: Crawl a website and take screenshots (CLI + Library)