webstract 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/webstract.rb +4 -1
- data/lib/webstract/errors.rb +4 -0
- data/lib/webstract/screen_capture.rb +60 -0
- data/lib/webstract/screenshot.rb +3 -7
- data/lib/webstract/screenshot_backend.rb +64 -0
- data/lib/webstract/version.rb +1 -1
- metadata +63 -16
- data/.gitignore +0 -14
- data/Gemfile +0 -4
- data/Rakefile +0 -2
- data/webstract.gemspec +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6243396588bd377a63b7bde8eda4753f74580d2e
|
4
|
+
data.tar.gz: 1a3a31eb9db5c4e83b197798d00a8c049ca6acb6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4a1b8514db48474677b8a2c51e59a4ad2da3a61d4f482372d2cc467dedfc98fbb8ad34abaee5fdca140b45afcf9f32cceefb473932da24a25761571e462c2a7e
|
7
|
+
data.tar.gz: 463d81732822cb8dea73e1c8055d71bc9f9824d8dc335d78ed1ef8557a5f48e2149da8132e74acc373e787e5c3658e4f05231a4816364304ee33d4e96e3db166
|
data/lib/webstract.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
module Webstract
|
2
2
|
|
3
|
+
autoload :Errors, 'webstract/errors'
|
4
|
+
autoload :ScreenCapture, 'webstract/screen_capture'
|
5
|
+
autoload :ScreenshotBackend, 'webstract/screenshot_backend'
|
3
6
|
autoload :Screenshot, 'webstract/screenshot'
|
4
|
-
autoload :Favicon,
|
7
|
+
autoload :Favicon, 'webstract/favicon'
|
5
8
|
|
6
9
|
def self.screenshot(options = {})
|
7
10
|
Webstract::Screenshot.new(options)
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'capybara/dsl'
|
2
|
+
|
3
|
+
module Webstract
|
4
|
+
class ScreenCapture
|
5
|
+
include Capybara::DSL
|
6
|
+
|
7
|
+
attr_reader :width, :height, :user_agent, :accept_language, :path
|
8
|
+
|
9
|
+
def initialize(opts = {})
|
10
|
+
Webstract::ScreenshotBackend.capybara_setup!
|
11
|
+
@width = opts.fetch(:width, Webstract::ScreenshotBackend.width)
|
12
|
+
@height = opts.fetch(:height, Webstract::ScreenshotBackend.height)
|
13
|
+
@user_agent = opts.fetch(:user_agent, Webstract::ScreenshotBackend.user_agent)
|
14
|
+
@accept_language = opts.fetch(:accept_language, Webstract::ScreenshotBackend.accept_language)
|
15
|
+
|
16
|
+
# Browser settings
|
17
|
+
page.driver.resize(@width, @height)
|
18
|
+
page.driver.headers = {
|
19
|
+
"User-Agent" => @user_agent,
|
20
|
+
'Accept-Language' => @accept_language
|
21
|
+
}
|
22
|
+
end
|
23
|
+
|
24
|
+
def start_session(&block)
|
25
|
+
Capybara.reset_sessions!
|
26
|
+
Capybara.current_session.instance_eval(&block) if block_given?
|
27
|
+
@session_started = true
|
28
|
+
self
|
29
|
+
end
|
30
|
+
|
31
|
+
# Captures a screenshot of +url+ saving it to +path+.
|
32
|
+
def capture(url, path, opts = {})
|
33
|
+
begin
|
34
|
+
# Default settings
|
35
|
+
@width = opts.fetch(:width, 120) if opts[:width]
|
36
|
+
@height = opts.fetch(:height, 90) if opts[:width]
|
37
|
+
|
38
|
+
# Reset session before visiting url
|
39
|
+
Capybara.reset_sessions! unless @session_started
|
40
|
+
@session_started = false
|
41
|
+
|
42
|
+
# Open page
|
43
|
+
visit(url)
|
44
|
+
|
45
|
+
# Timeout
|
46
|
+
sleep opts[:timeout] if opts[:timeout]
|
47
|
+
|
48
|
+
# Check response code
|
49
|
+
if page.driver.status_code.to_i == 200 || page.driver.status_code.to_i / 100 == 3
|
50
|
+
page.driver.save_screenshot(path, :full => true)
|
51
|
+
else
|
52
|
+
raise Webstract::Error.new("Could not fetch page: #{url.inspect}, error code: #{page.driver.status_code}")
|
53
|
+
end
|
54
|
+
rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient, Capybara::Poltergeist::TimeoutError, Errno::EPIPE => e
|
55
|
+
# TODO: Handle Errno::EPIPE and Errno::ECONNRESET
|
56
|
+
raise Webstract::Error.new("Capybara error: #{e.message.inspect}")
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
data/lib/webstract/screenshot.rb
CHANGED
@@ -1,23 +1,19 @@
|
|
1
|
-
require 'webshot'
|
2
|
-
|
3
1
|
module Webstract
|
4
2
|
class Screenshot
|
5
|
-
attr_accessor :url, :path, :width, :height, :
|
3
|
+
attr_accessor :url, :path, :width, :height, :user_agent, :accept_language
|
6
4
|
attr_reader :handle
|
7
5
|
|
8
6
|
def initialize(options = {})
|
9
|
-
@handle =
|
7
|
+
@handle = Webstract::ScreenCapture.new(options)
|
10
8
|
|
11
9
|
options.each do |k, value|
|
12
10
|
setter = "#{k}="
|
13
11
|
self.public_send(setter, value) if self.respond_to?(setter)
|
14
12
|
end
|
15
|
-
|
16
|
-
@quality = 85 unless self.quality
|
17
13
|
end
|
18
14
|
|
19
15
|
def capture
|
20
|
-
handle.capture(url, path, width: width, height: height,
|
16
|
+
handle.capture(url, path, width: width, height: height, user_agent: user_agent, accept_language: accept_language)
|
21
17
|
end
|
22
18
|
|
23
19
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "capybara/dsl"
|
2
|
+
require "capybara/poltergeist"
|
3
|
+
require "active_support"
|
4
|
+
require "active_support/core_ext"
|
5
|
+
|
6
|
+
module Webstract
|
7
|
+
module ScreenshotBackend
|
8
|
+
|
9
|
+
USER_AGENTS = {
|
10
|
+
web: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31',
|
11
|
+
android: 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19',
|
12
|
+
ios: 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3'
|
13
|
+
}
|
14
|
+
|
15
|
+
## Browser settings
|
16
|
+
# Width
|
17
|
+
mattr_accessor :width
|
18
|
+
@@width = 1024
|
19
|
+
|
20
|
+
# Height
|
21
|
+
mattr_accessor :height
|
22
|
+
@@height = 768
|
23
|
+
|
24
|
+
mattr_accessor :accept_language
|
25
|
+
@@accept_language = 'en-us,en;q=0.5'
|
26
|
+
|
27
|
+
|
28
|
+
# User agent
|
29
|
+
class << self
|
30
|
+
|
31
|
+
def user_agent
|
32
|
+
@user_agent ||= USER_AGENT[:web]
|
33
|
+
end
|
34
|
+
def user_agent=(ua)
|
35
|
+
agent_string = USER_AGENT[ua]
|
36
|
+
raise(ArgumentError.new('must be one of #{USER_AGENTS.inspect}')) unless agent_string
|
37
|
+
@user_agent = agent_string
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
# Customize settings
|
43
|
+
def self.setup
|
44
|
+
yield(self)
|
45
|
+
end
|
46
|
+
|
47
|
+
# Capibara setup
|
48
|
+
def self.capybara_setup!
|
49
|
+
# By default Capybara will try to boot a rack application
|
50
|
+
# automatically. You might want to switch off Capybara's
|
51
|
+
# rack server if you are running against a remote application
|
52
|
+
Capybara.run_server = false
|
53
|
+
Capybara.register_driver :poltergeist do |app|
|
54
|
+
Capybara::Poltergeist::Driver.new(app, {
|
55
|
+
# Raise JavaScript errors to Ruby
|
56
|
+
js_errors: false,
|
57
|
+
# Additional command line options for PhantomJS
|
58
|
+
phantomjs_options: ['--ignore-ssl-errors=yes'],
|
59
|
+
})
|
60
|
+
end
|
61
|
+
Capybara.current_driver = :poltergeist
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
data/lib/webstract/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,63 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webstract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Faucett
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '4.1'
|
20
|
+
- - "<"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '5'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '4.1'
|
30
|
+
- - "<"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '5'
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: poltergeist
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '1.5'
|
40
|
+
type: :runtime
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '1.5'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: faviconduit
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
13
61
|
- !ruby/object:Gem::Dependency
|
14
62
|
name: bundler
|
15
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,33 +87,33 @@ dependencies:
|
|
39
87
|
- !ruby/object:Gem::Version
|
40
88
|
version: '10.0'
|
41
89
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
90
|
+
name: rspec
|
43
91
|
requirement: !ruby/object:Gem::Requirement
|
44
92
|
requirements:
|
45
|
-
- - "
|
93
|
+
- - "~>"
|
46
94
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
48
|
-
type: :
|
95
|
+
version: '3'
|
96
|
+
type: :development
|
49
97
|
prerelease: false
|
50
98
|
version_requirements: !ruby/object:Gem::Requirement
|
51
99
|
requirements:
|
52
|
-
- - "
|
100
|
+
- - "~>"
|
53
101
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
102
|
+
version: '3'
|
55
103
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
104
|
+
name: pry
|
57
105
|
requirement: !ruby/object:Gem::Requirement
|
58
106
|
requirements:
|
59
107
|
- - ">="
|
60
108
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
62
|
-
type: :
|
109
|
+
version: 0.10.0
|
110
|
+
type: :development
|
63
111
|
prerelease: false
|
64
112
|
version_requirements: !ruby/object:Gem::Requirement
|
65
113
|
requirements:
|
66
114
|
- - ">="
|
67
115
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
116
|
+
version: 0.10.0
|
69
117
|
description: Extract images, favicons, and meta info from websites
|
70
118
|
email:
|
71
119
|
- jwaterfaucett@gmail.com
|
@@ -73,16 +121,15 @@ executables: []
|
|
73
121
|
extensions: []
|
74
122
|
extra_rdoc_files: []
|
75
123
|
files:
|
76
|
-
- ".gitignore"
|
77
|
-
- Gemfile
|
78
124
|
- LICENSE.txt
|
79
125
|
- README.md
|
80
|
-
- Rakefile
|
81
126
|
- lib/webstract.rb
|
127
|
+
- lib/webstract/errors.rb
|
82
128
|
- lib/webstract/favicon.rb
|
129
|
+
- lib/webstract/screen_capture.rb
|
83
130
|
- lib/webstract/screenshot.rb
|
131
|
+
- lib/webstract/screenshot_backend.rb
|
84
132
|
- lib/webstract/version.rb
|
85
|
-
- webstract.gemspec
|
86
133
|
homepage:
|
87
134
|
licenses:
|
88
135
|
- MIT
|
data/.gitignore
DELETED
data/Gemfile
DELETED
data/Rakefile
DELETED
data/webstract.gemspec
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'webstract/version'
|
5
|
-
|
6
|
-
Gem::Specification.new do |spec|
|
7
|
-
spec.name = "webstract"
|
8
|
-
spec.version = Webstract::VERSION
|
9
|
-
spec.authors = ["John Faucett"]
|
10
|
-
spec.email = ["jwaterfaucett@gmail.com"]
|
11
|
-
spec.summary = 'Extract information from websites'
|
12
|
-
spec.description = 'Extract images, favicons, and meta info from websites'
|
13
|
-
spec.license = 'MIT'
|
14
|
-
|
15
|
-
spec.files = `git ls-files -z`.split("\x0")
|
16
|
-
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
17
|
-
spec.require_paths = ['lib']
|
18
|
-
|
19
|
-
spec.add_development_dependency 'bundler', '~> 1.7'
|
20
|
-
spec.add_development_dependency 'rake', '~> 10.0'
|
21
|
-
|
22
|
-
spec.add_runtime_dependency 'webshot'
|
23
|
-
spec.add_runtime_dependency 'faviconduit'
|
24
|
-
end
|