snapcrawl 0.2.8 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/snapcrawl/crawler.rb +21 -15
- data/lib/snapcrawl/version.rb +1 -1
- data/lib/snapcrawl.rb +2 -0
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0dba7d3febfccf88e22194125387c6219cf500f0cb0e00fe4ed343a57fe9034
|
4
|
+
data.tar.gz: d05440bccd6ddde9773b24c425f3420b1970c3985805f6eb685171237e382cda
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71f65748af4339bf606765e2f1ea0aab90f4099ff986ce59318c6c966363ca6386a92d08ce175373cfa8e83c096af844983da2cde22ccbbebd2ab151514b6e72
|
7
|
+
data.tar.gz: a641b038a18114117375249d260895d73d08cd2cf47c7f668641c0891cd4b4723db80d4313a6a64cd89de02be0ea72aa9fa7b15f74d1ba6e0724017486c2d9c4
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'colsole'
|
2
2
|
require 'docopt'
|
3
3
|
require 'fileutils'
|
4
|
+
require 'httparty'
|
4
5
|
require 'nokogiri'
|
5
|
-
require 'open-uri'
|
6
6
|
require 'ostruct'
|
7
7
|
require 'pstore'
|
8
|
+
require 'uri'
|
8
9
|
require 'webshot'
|
9
10
|
|
10
11
|
module Snapcrawl
|
@@ -131,15 +132,18 @@ module Snapcrawl
|
|
131
132
|
say " !txtblu!Crawl!!txtrst! Extracting links... "
|
132
133
|
|
133
134
|
begin
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
135
|
+
response = HTTParty.get url
|
136
|
+
if response.success?
|
137
|
+
doc = Nokogiri::HTML response.body
|
138
|
+
links = doc.css('a')
|
139
|
+
links = normalize_links links
|
140
|
+
@store.transaction { @store[url] = links }
|
141
|
+
say "done"
|
142
|
+
else
|
143
|
+
links = []
|
144
|
+
say "!txtred!FAILED"
|
145
|
+
say "!txtred! ! HTTP Error: #{response.code} #{response.message.strip} at #{url}"
|
146
|
+
end
|
143
147
|
end
|
144
148
|
links
|
145
149
|
end
|
@@ -181,7 +185,7 @@ module Snapcrawl
|
|
181
185
|
|
182
186
|
links_array = []
|
183
187
|
|
184
|
-
links.
|
188
|
+
links.each do |link|
|
185
189
|
link = link.attribute('href').to_s
|
186
190
|
|
187
191
|
# Remove #hash
|
@@ -191,10 +195,12 @@ module Snapcrawl
|
|
191
195
|
# Remove links to specific extensions and protocols
|
192
196
|
next if link =~ /\.(#{extensions})(\?.*)?$/
|
193
197
|
next if link =~ /^(#{beginnings})/
|
194
|
-
|
195
|
-
#
|
196
|
-
link
|
197
|
-
|
198
|
+
|
199
|
+
# Strip spaces
|
200
|
+
link.strip!
|
201
|
+
|
202
|
+
# Convert relative links to absolute
|
203
|
+
link = URI.join( @opts.base, link ).to_s
|
198
204
|
|
199
205
|
# Keep only links in our base domain
|
200
206
|
next unless link.include? @opts.base
|
data/lib/snapcrawl/version.rb
CHANGED
data/lib/snapcrawl.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.8
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|
@@ -72,6 +72,20 @@ dependencies:
|
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
74
|
version: '0.1'
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: httparty
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0.17'
|
82
|
+
type: :runtime
|
83
|
+
prerelease: false
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0.17'
|
75
89
|
description: Snapcrawl is a command line utility for crawling a website and saving
|
76
90
|
screenshots.
|
77
91
|
email: db@dannyben.com
|
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
105
119
|
- !ruby/object:Gem::Version
|
106
120
|
version: '0'
|
107
121
|
requirements: []
|
108
|
-
rubygems_version: 3.0.
|
122
|
+
rubygems_version: 3.0.4
|
109
123
|
signing_key:
|
110
124
|
specification_version: 4
|
111
125
|
summary: Crawl a website and take screenshots (CLI + Library)
|