snapcrawl 0.2.8 → 0.3.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2500895cfd465dd633ac9650fc702f87b684ae4a81aa2feb979a16fad17bedc2
4
- data.tar.gz: 042f0e14f8139d3b33bdd0061731c7973f7c445e57e2c474d32c3b226d333acb
3
+ metadata.gz: e0dba7d3febfccf88e22194125387c6219cf500f0cb0e00fe4ed343a57fe9034
4
+ data.tar.gz: d05440bccd6ddde9773b24c425f3420b1970c3985805f6eb685171237e382cda
5
5
  SHA512:
6
- metadata.gz: 7603f985467c47f26b5bbf8c3cebf5654de708d0f7039c0268b4c4824089c259c17b687be482ed3e5813f44fef2274407542c76e3291f064aed1b06c2af1837a
7
- data.tar.gz: 3246ea728fa90fc65da1732964492da9383ae0c99a47746d1194f5c991b2efadd333c69478e2c97fd99274acb47fc38773285a53c9fd5505e7a605af08f58cf2
6
+ metadata.gz: 71f65748af4339bf606765e2f1ea0aab90f4099ff986ce59318c6c966363ca6386a92d08ce175373cfa8e83c096af844983da2cde22ccbbebd2ab151514b6e72
7
+ data.tar.gz: a641b038a18114117375249d260895d73d08cd2cf47c7f668641c0891cd4b4723db80d4313a6a64cd89de02be0ea72aa9fa7b15f74d1ba6e0724017486c2d9c4
@@ -1,10 +1,11 @@
1
1
  require 'colsole'
2
2
  require 'docopt'
3
3
  require 'fileutils'
4
+ require 'httparty'
4
5
  require 'nokogiri'
5
- require 'open-uri'
6
6
  require 'ostruct'
7
7
  require 'pstore'
8
+ require 'uri'
8
9
  require 'webshot'
9
10
 
10
11
  module Snapcrawl
@@ -131,15 +132,18 @@ module Snapcrawl
131
132
  say " !txtblu!Crawl!!txtrst! Extracting links... "
132
133
 
133
134
  begin
134
- doc = Nokogiri::HTML open url
135
- links = doc.css('a')
136
- links = normalize_links links
137
- @store.transaction { @store[url] = links }
138
- say "done"
139
- rescue OpenURI::HTTPError => e
140
- links = []
141
- say "!txtred!FAILED"
142
- say "!txtred! ! HTTP Error: #{e.message} at #{url}"
135
+ response = HTTParty.get url
136
+ if response.success?
137
+ doc = Nokogiri::HTML response.body
138
+ links = doc.css('a')
139
+ links = normalize_links links
140
+ @store.transaction { @store[url] = links }
141
+ say "done"
142
+ else
143
+ links = []
144
+ say "!txtred!FAILED"
145
+ say "!txtred! ! HTTP Error: #{response.code} #{response.message.strip} at #{url}"
146
+ end
143
147
  end
144
148
  links
145
149
  end
@@ -181,7 +185,7 @@ module Snapcrawl
181
185
 
182
186
  links_array = []
183
187
 
184
- links.each_with_index do |link|
188
+ links.each do |link|
185
189
  link = link.attribute('href').to_s
186
190
 
187
191
  # Remove #hash
@@ -191,10 +195,12 @@ module Snapcrawl
191
195
  # Remove links to specific extensions and protocols
192
196
  next if link =~ /\.(#{extensions})(\?.*)?$/
193
197
  next if link =~ /^(#{beginnings})/
194
-
195
- # Add the base domain to relative URLs
196
- link = link =~ /^http/ ? link : "#{@opts.base}#{link}"
197
- link = "http://#{link}" unless link =~ /^http/
198
+
199
+ # Strip spaces
200
+ link.strip!
201
+
202
+ # Convert relative links to absolute
203
+ link = URI.join( @opts.base, link ).to_s
198
204
 
199
205
  # Keep only links in our base domain
200
206
  next unless link.include? @opts.base
@@ -1,3 +1,3 @@
1
1
  module Snapcrawl
2
- VERSION = "0.2.8"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/snapcrawl.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'snapcrawl/version'
2
2
  require 'snapcrawl/crawler'
3
3
 
4
+ require 'byebug' if ENV['BYEBUG']
5
+
4
6
  self.extend Snapcrawl
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snapcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danny Ben Shitrit
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-06-14 00:00:00.000000000 Z
11
+ date: 2019-09-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: colsole
@@ -72,6 +72,20 @@ dependencies:
72
72
  - - "~>"
73
73
  - !ruby/object:Gem::Version
74
74
  version: '0.1'
75
+ - !ruby/object:Gem::Dependency
76
+ name: httparty
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.17'
82
+ type: :runtime
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.17'
75
89
  description: Snapcrawl is a command line utility for crawling a website and saving
76
90
  screenshots.
77
91
  email: db@dannyben.com
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
105
119
  - !ruby/object:Gem::Version
106
120
  version: '0'
107
121
  requirements: []
108
- rubygems_version: 3.0.3
122
+ rubygems_version: 3.0.4
109
123
  signing_key:
110
124
  specification_version: 4
111
125
  summary: Crawl a website and take screenshots (CLI + Library)