snapcrawl 0.2.8 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/snapcrawl/crawler.rb +21 -15
- data/lib/snapcrawl/version.rb +1 -1
- data/lib/snapcrawl.rb +2 -0
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0dba7d3febfccf88e22194125387c6219cf500f0cb0e00fe4ed343a57fe9034
|
4
|
+
data.tar.gz: d05440bccd6ddde9773b24c425f3420b1970c3985805f6eb685171237e382cda
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71f65748af4339bf606765e2f1ea0aab90f4099ff986ce59318c6c966363ca6386a92d08ce175373cfa8e83c096af844983da2cde22ccbbebd2ab151514b6e72
|
7
|
+
data.tar.gz: a641b038a18114117375249d260895d73d08cd2cf47c7f668641c0891cd4b4723db80d4313a6a64cd89de02be0ea72aa9fa7b15f74d1ba6e0724017486c2d9c4
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'colsole'
|
2
2
|
require 'docopt'
|
3
3
|
require 'fileutils'
|
4
|
+
require 'httparty'
|
4
5
|
require 'nokogiri'
|
5
|
-
require 'open-uri'
|
6
6
|
require 'ostruct'
|
7
7
|
require 'pstore'
|
8
|
+
require 'uri'
|
8
9
|
require 'webshot'
|
9
10
|
|
10
11
|
module Snapcrawl
|
@@ -131,15 +132,18 @@ module Snapcrawl
|
|
131
132
|
say " !txtblu!Crawl!!txtrst! Extracting links... "
|
132
133
|
|
133
134
|
begin
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
135
|
+
response = HTTParty.get url
|
136
|
+
if response.success?
|
137
|
+
doc = Nokogiri::HTML response.body
|
138
|
+
links = doc.css('a')
|
139
|
+
links = normalize_links links
|
140
|
+
@store.transaction { @store[url] = links }
|
141
|
+
say "done"
|
142
|
+
else
|
143
|
+
links = []
|
144
|
+
say "!txtred!FAILED"
|
145
|
+
say "!txtred! ! HTTP Error: #{response.code} #{response.message.strip} at #{url}"
|
146
|
+
end
|
143
147
|
end
|
144
148
|
links
|
145
149
|
end
|
@@ -181,7 +185,7 @@ module Snapcrawl
|
|
181
185
|
|
182
186
|
links_array = []
|
183
187
|
|
184
|
-
links.
|
188
|
+
links.each do |link|
|
185
189
|
link = link.attribute('href').to_s
|
186
190
|
|
187
191
|
# Remove #hash
|
@@ -191,10 +195,12 @@ module Snapcrawl
|
|
191
195
|
# Remove links to specific extensions and protocols
|
192
196
|
next if link =~ /\.(#{extensions})(\?.*)?$/
|
193
197
|
next if link =~ /^(#{beginnings})/
|
194
|
-
|
195
|
-
#
|
196
|
-
link
|
197
|
-
|
198
|
+
|
199
|
+
# Strip spaces
|
200
|
+
link.strip!
|
201
|
+
|
202
|
+
# Convert relative links to absolute
|
203
|
+
link = URI.join( @opts.base, link ).to_s
|
198
204
|
|
199
205
|
# Keep only links in our base domain
|
200
206
|
next unless link.include? @opts.base
|
data/lib/snapcrawl/version.rb
CHANGED
data/lib/snapcrawl.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.8
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|
@@ -72,6 +72,20 @@ dependencies:
|
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
74
|
version: '0.1'
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: httparty
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0.17'
|
82
|
+
type: :runtime
|
83
|
+
prerelease: false
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0.17'
|
75
89
|
description: Snapcrawl is a command line utility for crawling a website and saving
|
76
90
|
screenshots.
|
77
91
|
email: db@dannyben.com
|
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
105
119
|
- !ruby/object:Gem::Version
|
106
120
|
version: '0'
|
107
121
|
requirements: []
|
108
|
-
rubygems_version: 3.0.
|
122
|
+
rubygems_version: 3.0.4
|
109
123
|
signing_key:
|
110
124
|
specification_version: 4
|
111
125
|
summary: Crawl a website and take screenshots (CLI + Library)
|