scraped_page_archive 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +16 -0
- data/lib/scraped_page_archive.rb +45 -16
- data/lib/scraped_page_archive/capybara.rb +65 -0
- data/lib/scraped_page_archive/version.rb +2 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 85882d28526d19f24b3b6d6dc4761dfa4114ef15
|
4
|
+
data.tar.gz: ceb803c8e19b29ff312d81008601b7a702f719de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 95da4b3e80ee2ad5704a1c432d2d32e84806324e38150c49a4c2389d95f4215f0529e491892d943863879abf1d52710dcc01e194c32c8fbc6d16a898e56cd563
|
7
|
+
data.tar.gz: 2cbe81f2f74c8245b4d5d1b1fd2900224b5d0261da02e1f8a7251f6ee74e12f749cece5d7ebce0cb3df55ac2dffee3c6fcbe0b0180aca8794c49fddc3c1cfbdf
|
data/CHANGELOG.md
CHANGED
@@ -5,6 +5,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
5
5
|
|
6
6
|
## [Unreleased]
|
7
7
|
|
8
|
+
## [0.4.0] - 2016-08-04
|
9
|
+
|
10
|
+
### Features
|
11
|
+
|
12
|
+
- Added support for Capybara Poltergeist driver
|
13
|
+
- You can now use the `ScrapedPageArchive#open_from_archive` method to retrieve a page from the archive.
|
14
|
+
|
8
15
|
## [0.3.1] - 2016-07-29
|
9
16
|
|
10
17
|
### Fixes
|
@@ -35,3 +42,4 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
35
42
|
[0.2.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.1.0...v0.2.0
|
36
43
|
[0.3.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.2.0...v0.3.0
|
37
44
|
[0.3.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.0...v0.3.1
|
45
|
+
[0.4.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.1...v0.4.0
|
data/README.md
CHANGED
@@ -60,12 +60,28 @@ response = open('http://example.com/')
|
|
60
60
|
# Use the response...
|
61
61
|
```
|
62
62
|
|
63
|
+
### Use with the Capybara Poltergeist driver
|
64
|
+
|
65
|
+
If you would like to have your HTTP requests automatically recorded when using the Poltergeist driver in Capybara, do the following:
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
require 'scraped_page_archive/capybara'
|
69
|
+
visit('http://example.com/')
|
70
|
+
# Use the response...
|
71
|
+
```
|
72
|
+
|
73
|
+
It should be possible to adapt this to work with other Capybara drivers
|
74
|
+
fairly easily.
|
75
|
+
|
63
76
|
## Development
|
64
77
|
|
65
78
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
66
79
|
|
67
80
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
68
81
|
|
82
|
+
Note that this does not install Capybara or any drivers, so if you want
|
83
|
+
to work on the Capybara integration you will need to install them yourself.
|
84
|
+
|
69
85
|
## Contributing
|
70
86
|
|
71
87
|
Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.
|
data/lib/scraped_page_archive.rb
CHANGED
@@ -11,29 +11,21 @@ VCR.configure do |config|
|
|
11
11
|
config.allow_http_connections_when_no_cassette = true
|
12
12
|
end
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
class ScrapedPageArchive
|
15
|
+
class Error < StandardError; end
|
16
16
|
|
17
17
|
attr_writer :github_repo_url
|
18
18
|
|
19
|
+
def self.record(*args, &block)
|
20
|
+
new.record(*args, &block)
|
21
|
+
end
|
22
|
+
|
19
23
|
def record(&block)
|
20
24
|
if github_repo_url.nil?
|
21
25
|
warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
|
22
26
|
"See https://github.com/everypolitician/scraped_page_archive#usage for details."
|
23
27
|
return block.call
|
24
28
|
end
|
25
|
-
VCR::Archive::Persister.storage_location = git.dir.path
|
26
|
-
if git.branches[branch_name] || git.branches["origin/#{branch_name}"]
|
27
|
-
git.checkout(branch_name)
|
28
|
-
else
|
29
|
-
git.chdir do
|
30
|
-
# FIXME: It's not currently possible to create an orphan branch with ruby-git
|
31
|
-
# @see https://github.com/schacon/ruby-git/pull/140
|
32
|
-
system("git checkout --orphan #{branch_name}")
|
33
|
-
system("git rm --quiet -rf .")
|
34
|
-
end
|
35
|
-
git.commit("Initial commit", allow_empty: true)
|
36
|
-
end
|
37
29
|
ret = VCR.use_cassette('', &block)
|
38
30
|
|
39
31
|
# NOTE: This is a workaround for a ruby-git bug.
|
@@ -47,13 +39,38 @@ module ScrapedPageArchive
|
|
47
39
|
interaction = git.chdir { YAML.load_file(f) }
|
48
40
|
message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
|
49
41
|
git.add([f, f.sub(/\.yml$/, '.html')])
|
50
|
-
git.commit(message)
|
42
|
+
git.commit(message)
|
51
43
|
end
|
52
44
|
# FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
|
53
45
|
git.push('origin', branch_name)
|
54
46
|
ret
|
55
47
|
end
|
56
48
|
|
49
|
+
def open_from_archive(url, *args)
|
50
|
+
git.chdir do
|
51
|
+
filename = filename_from_url(url.to_s)
|
52
|
+
meta = YAML.load_file(filename + '.yml') if File.exist?(filename + '.yml')
|
53
|
+
response_body = File.read(filename + '.html') if File.exist?(filename + '.html')
|
54
|
+
unless meta && response_body
|
55
|
+
fail Error, "No archived copy of #{url} found."
|
56
|
+
end
|
57
|
+
response_from(meta, response_body)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def filename_from_url(url)
|
62
|
+
File.join(URI.parse(url).host, Digest::SHA1.hexdigest(url))
|
63
|
+
end
|
64
|
+
|
65
|
+
def response_from(meta, response_body)
|
66
|
+
StringIO.new(response_body).tap do |response|
|
67
|
+
OpenURI::Meta.init(response)
|
68
|
+
meta['response']['headers'].each { |k, v| response.meta_add_field(k, v) }
|
69
|
+
response.status = meta['response']['status'].values.map(&:to_s)
|
70
|
+
response.base_uri = URI.parse(meta['request']['uri'])
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
57
74
|
# TODO: This should be configurable.
|
58
75
|
def branch_name
|
59
76
|
@branch_name ||= 'scraped-pages-archive'
|
@@ -63,6 +80,18 @@ module ScrapedPageArchive
|
|
63
80
|
@git ||= Git.clone(git_url, tmpdir).tap do |g|
|
64
81
|
g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
|
65
82
|
g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
|
83
|
+
VCR::Archive::Persister.storage_location = g.dir.path
|
84
|
+
if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
|
85
|
+
g.checkout(branch_name)
|
86
|
+
else
|
87
|
+
g.chdir do
|
88
|
+
# FIXME: It's not currently possible to create an orphan branch with ruby-git
|
89
|
+
# @see https://github.com/schacon/ruby-git/pull/140
|
90
|
+
system("git checkout --orphan #{branch_name}")
|
91
|
+
system("git rm --quiet -rf .")
|
92
|
+
end
|
93
|
+
g.commit("Initial commit", allow_empty: true)
|
94
|
+
end
|
66
95
|
end
|
67
96
|
end
|
68
97
|
|
@@ -79,7 +108,7 @@ module ScrapedPageArchive
|
|
79
108
|
end
|
80
109
|
|
81
110
|
def github_repo_url
|
82
|
-
@github_repo_url ||= (
|
111
|
+
@github_repo_url ||= (ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] || git_remote_get_url_origin)
|
83
112
|
end
|
84
113
|
|
85
114
|
def git_remote_get_url_origin
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Monkey patch the Capybara Poltergeist driver to record HTTP requests automatically.
|
2
|
+
require 'capybara/poltergeist'
|
3
|
+
require 'scraped_page_archive'
|
4
|
+
|
5
|
+
module Capybara::Poltergeist
|
6
|
+
class Browser
|
7
|
+
alias __command command
|
8
|
+
|
9
|
+
def sha_url(url)
|
10
|
+
Digest::SHA1.hexdigest url
|
11
|
+
end
|
12
|
+
|
13
|
+
def base_dir_for_url(url)
|
14
|
+
dir = File.join(VCR::Archive::Persister.storage_location, URI(url).host)
|
15
|
+
FileUtils.mkdir_p(dir)
|
16
|
+
dir
|
17
|
+
end
|
18
|
+
|
19
|
+
def get_paths(url)
|
20
|
+
base_path = File.join(base_dir_for_url(url), sha_url(url))
|
21
|
+
|
22
|
+
['.html', '.yml'].map { |x| base_path + x }
|
23
|
+
end
|
24
|
+
|
25
|
+
def get_details(url)
|
26
|
+
status_code = page.status_code
|
27
|
+
{
|
28
|
+
'request' => {
|
29
|
+
'method' => 'get', # assume this as no way to access it
|
30
|
+
'uri' => url
|
31
|
+
},
|
32
|
+
'response' => {
|
33
|
+
'status' => {
|
34
|
+
'message' => status_code == 200 ? 'OK' : 'NOT OK',
|
35
|
+
'code' => status_code
|
36
|
+
},
|
37
|
+
'date' => [ page.response_headers['Date'] ]
|
38
|
+
}
|
39
|
+
}
|
40
|
+
end
|
41
|
+
|
42
|
+
def save_request(html, details, url)
|
43
|
+
html_path, yaml_path = get_paths(url)
|
44
|
+
|
45
|
+
File.open(html_path,"w") do |f|
|
46
|
+
f.write(html)
|
47
|
+
end
|
48
|
+
File.open(yaml_path,"w") do |f|
|
49
|
+
f.write(YAML.dump(details))
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def command(name, *args)
|
54
|
+
result = __command(name, *args)
|
55
|
+
# we skip these methods because they are called a lot, don't cause the page
|
56
|
+
# to change, and wrapping them in `record` slows things down quite a bit.
|
57
|
+
return result if ['tag_name', 'visible', 'property', 'find', 'body', 'set_js_errors', 'current_url', 'status_code', 'response_headers'].include?(name)
|
58
|
+
current_url = page.current_url.to_s
|
59
|
+
ScrapedPageArchive.record do
|
60
|
+
save_request(page.html, get_details(current_url), current_url)
|
61
|
+
end
|
62
|
+
result
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = '0.3.1'.freeze
|
1
|
+
class ScrapedPageArchive
|
2
|
+
VERSION = '0.4.0'.freeze
|
3
3
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraped_page_archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Mytton
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: vcr-archive
|
@@ -111,6 +111,7 @@ files:
|
|
111
111
|
- bin/console
|
112
112
|
- bin/setup
|
113
113
|
- lib/scraped_page_archive.rb
|
114
|
+
- lib/scraped_page_archive/capybara.rb
|
114
115
|
- lib/scraped_page_archive/open-uri.rb
|
115
116
|
- lib/scraped_page_archive/version.rb
|
116
117
|
- scraped_page_archive.gemspec
|