scraped_page_archive 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +16 -0
- data/lib/scraped_page_archive.rb +45 -16
- data/lib/scraped_page_archive/capybara.rb +65 -0
- data/lib/scraped_page_archive/version.rb +2 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 85882d28526d19f24b3b6d6dc4761dfa4114ef15
|
4
|
+
data.tar.gz: ceb803c8e19b29ff312d81008601b7a702f719de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 95da4b3e80ee2ad5704a1c432d2d32e84806324e38150c49a4c2389d95f4215f0529e491892d943863879abf1d52710dcc01e194c32c8fbc6d16a898e56cd563
|
7
|
+
data.tar.gz: 2cbe81f2f74c8245b4d5d1b1fd2900224b5d0261da02e1f8a7251f6ee74e12f749cece5d7ebce0cb3df55ac2dffee3c6fcbe0b0180aca8794c49fddc3c1cfbdf
|
data/CHANGELOG.md
CHANGED
@@ -5,6 +5,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
5
5
|
|
6
6
|
## [Unreleased]
|
7
7
|
|
8
|
+
## [0.4.0] - 2016-08-04
|
9
|
+
|
10
|
+
### Features
|
11
|
+
|
12
|
+
- Added support for Capybara Poltergeist driver
|
13
|
+
- You can now use the `ScrapedPageArchive#open_from_archive` method to retrieve a page from the archive.
|
14
|
+
|
8
15
|
## [0.3.1] - 2016-07-29
|
9
16
|
|
10
17
|
### Fixes
|
@@ -35,3 +42,4 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
35
42
|
[0.2.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.1.0...v0.2.0
|
36
43
|
[0.3.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.2.0...v0.3.0
|
37
44
|
[0.3.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.0...v0.3.1
|
45
|
+
[0.4.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.1...v0.4.0
|
data/README.md
CHANGED
@@ -60,12 +60,28 @@ response = open('http://example.com/')
|
|
60
60
|
# Use the response...
|
61
61
|
```
|
62
62
|
|
63
|
+
### Use with the Capybara Poltergeist driver
|
64
|
+
|
65
|
+
If you would like to have your HTTP requests automatically recorded when using the Poltergeist driver in Capybara, do the following:
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
require 'scraped_page_archive/capybara'
|
69
|
+
visit('http://example.com/')
|
70
|
+
# Use the response...
|
71
|
+
```
|
72
|
+
|
73
|
+
It should be possible to adapt this to work with other Capybara drivers
|
74
|
+
fairly easily.
|
75
|
+
|
63
76
|
## Development
|
64
77
|
|
65
78
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
66
79
|
|
67
80
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
68
81
|
|
82
|
+
Note that this does not install Capybara or any drivers, so if you want
|
83
|
+
to work on the Capybara integration you will need to install them yourself.
|
84
|
+
|
69
85
|
## Contributing
|
70
86
|
|
71
87
|
Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.
|
data/lib/scraped_page_archive.rb
CHANGED
@@ -11,29 +11,21 @@ VCR.configure do |config|
|
|
11
11
|
config.allow_http_connections_when_no_cassette = true
|
12
12
|
end
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
class ScrapedPageArchive
|
15
|
+
class Error < StandardError; end
|
16
16
|
|
17
17
|
attr_writer :github_repo_url
|
18
18
|
|
19
|
+
def self.record(*args, &block)
|
20
|
+
new.record(*args, &block)
|
21
|
+
end
|
22
|
+
|
19
23
|
def record(&block)
|
20
24
|
if github_repo_url.nil?
|
21
25
|
warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
|
22
26
|
"See https://github.com/everypolitician/scraped_page_archive#usage for details."
|
23
27
|
return block.call
|
24
28
|
end
|
25
|
-
VCR::Archive::Persister.storage_location = git.dir.path
|
26
|
-
if git.branches[branch_name] || git.branches["origin/#{branch_name}"]
|
27
|
-
git.checkout(branch_name)
|
28
|
-
else
|
29
|
-
git.chdir do
|
30
|
-
# FIXME: It's not currently possible to create an orphan branch with ruby-git
|
31
|
-
# @see https://github.com/schacon/ruby-git/pull/140
|
32
|
-
system("git checkout --orphan #{branch_name}")
|
33
|
-
system("git rm --quiet -rf .")
|
34
|
-
end
|
35
|
-
git.commit("Initial commit", allow_empty: true)
|
36
|
-
end
|
37
29
|
ret = VCR.use_cassette('', &block)
|
38
30
|
|
39
31
|
# NOTE: This is a workaround for a ruby-git bug.
|
@@ -47,13 +39,38 @@ module ScrapedPageArchive
|
|
47
39
|
interaction = git.chdir { YAML.load_file(f) }
|
48
40
|
message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
|
49
41
|
git.add([f, f.sub(/\.yml$/, '.html')])
|
50
|
-
git.commit(message)
|
42
|
+
git.commit(message)
|
51
43
|
end
|
52
44
|
# FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
|
53
45
|
git.push('origin', branch_name)
|
54
46
|
ret
|
55
47
|
end
|
56
48
|
|
49
|
+
def open_from_archive(url, *args)
|
50
|
+
git.chdir do
|
51
|
+
filename = filename_from_url(url.to_s)
|
52
|
+
meta = YAML.load_file(filename + '.yml') if File.exist?(filename + '.yml')
|
53
|
+
response_body = File.read(filename + '.html') if File.exist?(filename + '.html')
|
54
|
+
unless meta && response_body
|
55
|
+
fail Error, "No archived copy of #{url} found."
|
56
|
+
end
|
57
|
+
response_from(meta, response_body)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def filename_from_url(url)
|
62
|
+
File.join(URI.parse(url).host, Digest::SHA1.hexdigest(url))
|
63
|
+
end
|
64
|
+
|
65
|
+
def response_from(meta, response_body)
|
66
|
+
StringIO.new(response_body).tap do |response|
|
67
|
+
OpenURI::Meta.init(response)
|
68
|
+
meta['response']['headers'].each { |k, v| response.meta_add_field(k, v) }
|
69
|
+
response.status = meta['response']['status'].values.map(&:to_s)
|
70
|
+
response.base_uri = URI.parse(meta['request']['uri'])
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
57
74
|
# TODO: This should be configurable.
|
58
75
|
def branch_name
|
59
76
|
@branch_name ||= 'scraped-pages-archive'
|
@@ -63,6 +80,18 @@ module ScrapedPageArchive
|
|
63
80
|
@git ||= Git.clone(git_url, tmpdir).tap do |g|
|
64
81
|
g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
|
65
82
|
g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
|
83
|
+
VCR::Archive::Persister.storage_location = g.dir.path
|
84
|
+
if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
|
85
|
+
g.checkout(branch_name)
|
86
|
+
else
|
87
|
+
g.chdir do
|
88
|
+
# FIXME: It's not currently possible to create an orphan branch with ruby-git
|
89
|
+
# @see https://github.com/schacon/ruby-git/pull/140
|
90
|
+
system("git checkout --orphan #{branch_name}")
|
91
|
+
system("git rm --quiet -rf .")
|
92
|
+
end
|
93
|
+
g.commit("Initial commit", allow_empty: true)
|
94
|
+
end
|
66
95
|
end
|
67
96
|
end
|
68
97
|
|
@@ -79,7 +108,7 @@ module ScrapedPageArchive
|
|
79
108
|
end
|
80
109
|
|
81
110
|
def github_repo_url
|
82
|
-
@github_repo_url ||= (
|
111
|
+
@github_repo_url ||= (ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] || git_remote_get_url_origin)
|
83
112
|
end
|
84
113
|
|
85
114
|
def git_remote_get_url_origin
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Monkey patch capybara poltergiest driver to record http requests automatically.
|
2
|
+
require 'capybara/poltergeist'
|
3
|
+
require 'scraped_page_archive'
|
4
|
+
|
5
|
+
module Capybara::Poltergeist
|
6
|
+
class Browser
|
7
|
+
alias __command command
|
8
|
+
|
9
|
+
def sha_url(url)
|
10
|
+
Digest::SHA1.hexdigest url
|
11
|
+
end
|
12
|
+
|
13
|
+
def base_dir_for_url(url)
|
14
|
+
dir = File.join(VCR::Archive::Persister.storage_location, URI(url).host)
|
15
|
+
FileUtils.mkdir_p(dir)
|
16
|
+
dir
|
17
|
+
end
|
18
|
+
|
19
|
+
def get_paths(url)
|
20
|
+
base_path = File.join(base_dir_for_url(url), sha_url(url))
|
21
|
+
|
22
|
+
['.html', '.yml'].map { |x| base_path + x }
|
23
|
+
end
|
24
|
+
|
25
|
+
def get_details(url)
|
26
|
+
status_code = page.status_code
|
27
|
+
{
|
28
|
+
'request' => {
|
29
|
+
'method' => 'get', # assume this as no way to access it
|
30
|
+
'uri' => url
|
31
|
+
},
|
32
|
+
'response' => {
|
33
|
+
'status' => {
|
34
|
+
'message' => status_code == 200 ? 'OK' : 'NOT OK',
|
35
|
+
'code' => status_code
|
36
|
+
},
|
37
|
+
'date' => [ page.response_headers['Date'] ]
|
38
|
+
}
|
39
|
+
}
|
40
|
+
end
|
41
|
+
|
42
|
+
def save_request(html, details, url)
|
43
|
+
html_path, yaml_path = get_paths(url)
|
44
|
+
|
45
|
+
File.open(html_path,"w") do |f|
|
46
|
+
f.write(html)
|
47
|
+
end
|
48
|
+
File.open(yaml_path,"w") do |f|
|
49
|
+
f.write(YAML.dump(details))
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def command(name, *args)
|
54
|
+
result = __command(name, *args)
|
55
|
+
# We skip these commands because they are called a lot, don't cause the page
|
56
|
+
# to change, and wrapping them in `record` slows things down quite a bit.
|
57
|
+
return result if ['tag_name', 'visible', 'property', 'find', 'body', 'set_js_errors', 'current_url', 'status_code', 'response_headers'].include?(name)
|
58
|
+
current_url = page.current_url.to_s
|
59
|
+
ScrapedPageArchive.record do
|
60
|
+
save_request(page.html, get_details(current_url), current_url)
|
61
|
+
end
|
62
|
+
result
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = '0.3.1'
|
1
|
+
class ScrapedPageArchive
|
2
|
+
VERSION = '0.4.0'.freeze
|
3
3
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraped_page_archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Mytton
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: vcr-archive
|
@@ -111,6 +111,7 @@ files:
|
|
111
111
|
- bin/console
|
112
112
|
- bin/setup
|
113
113
|
- lib/scraped_page_archive.rb
|
114
|
+
- lib/scraped_page_archive/capybara.rb
|
114
115
|
- lib/scraped_page_archive/open-uri.rb
|
115
116
|
- lib/scraped_page_archive/version.rb
|
116
117
|
- scraped_page_archive.gemspec
|