scraped_page_archive 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5e7f709fa26f4b28c6c7248f9a98b0ee28ad0a2e
4
- data.tar.gz: 20547cc3ef320e8e06d59a97d4b7af65514b5beb
3
+ metadata.gz: 85882d28526d19f24b3b6d6dc4761dfa4114ef15
4
+ data.tar.gz: ceb803c8e19b29ff312d81008601b7a702f719de
5
5
  SHA512:
6
- metadata.gz: 9c8457c7070d41f5d5897b7e1ac0ef6f90aa03c61b12b8ff6ec3cf7c2c3e323e63edf75e140417140af82fe2f5f926b32af265d7e88b63d23377d67f2469c3fa
7
- data.tar.gz: 4525301d13f21a8f883bd3f2962b873d6cc0021b0e11446d0791036b2da65021c0aa7fa8cba4296b0f3316fab696e74580aa45872b02f9f5ffe2872a0e23887d
6
+ metadata.gz: 95da4b3e80ee2ad5704a1c432d2d32e84806324e38150c49a4c2389d95f4215f0529e491892d943863879abf1d52710dcc01e194c32c8fbc6d16a898e56cd563
7
+ data.tar.gz: 2cbe81f2f74c8245b4d5d1b1fd2900224b5d0261da02e1f8a7251f6ee74e12f749cece5d7ebce0cb3df55ac2dffee3c6fcbe0b0180aca8794c49fddc3c1cfbdf
data/CHANGELOG.md CHANGED
@@ -5,6 +5,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
5
5
 
6
6
  ## [Unreleased]
7
7
 
8
+ ## [0.4.0] - 2016-08-04
9
+
10
+ ### Features
11
+
12
+ - Added support for Capybara Poltergeist driver
13
+ - You can now use the `ScrapedPageArchive#open_from_archive` method to retrieve a page from the archive.
14
+
8
15
  ## [0.3.1] - 2016-07-29
9
16
 
10
17
  ### Fixes
@@ -35,3 +42,4 @@ This project adheres to [Semantic Versioning](http://semver.org/).
35
42
  [0.2.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.1.0...v0.2.0
36
43
  [0.3.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.2.0...v0.3.0
37
44
  [0.3.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.0...v0.3.1
45
+ [0.4.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.1...v0.4.0
data/README.md CHANGED
@@ -60,12 +60,28 @@ response = open('http://example.com/')
60
60
  # Use the response...
61
61
  ```
62
62
 
63
+ ### Use with the Capybara Poltergeist driver
64
+
65
+ If you would like to have your HTTP requests automatically recorded when using the Poltergeist driver in Capybara, do the following:
66
+
67
+ ```ruby
68
+ require 'scraped_page_archive/capybara'
69
+ visit('http://example.com/')
70
+ # Use the response...
71
+ ```
72
+
73
+ It should be possible to adapt this to work with other Capybara drivers
74
+ fairly easily.
75
+
63
76
  ## Development
64
77
 
65
78
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
66
79
 
67
80
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
68
81
 
82
+ Note that this does not install Capybara or any drivers, so if you want
83
+ to work on the Capybara integration you will need to install those yourself.
84
+
69
85
  ## Contributing
70
86
 
71
87
  Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.
@@ -11,29 +11,21 @@ VCR.configure do |config|
11
11
  config.allow_http_connections_when_no_cassette = true
12
12
  end
13
13
 
14
- module ScrapedPageArchive
15
- extend self
14
+ class ScrapedPageArchive
15
+ class Error < StandardError; end
16
16
 
17
17
  attr_writer :github_repo_url
18
18
 
19
+ def self.record(*args, &block)
20
+ new.record(*args, &block)
21
+ end
22
+
19
23
  def record(&block)
20
24
  if github_repo_url.nil?
21
25
  warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
22
26
  "See https://github.com/everypolitician/scraped_page_archive#usage for details."
23
27
  return block.call
24
28
  end
25
- VCR::Archive::Persister.storage_location = git.dir.path
26
- if git.branches[branch_name] || git.branches["origin/#{branch_name}"]
27
- git.checkout(branch_name)
28
- else
29
- git.chdir do
30
- # FIXME: It's not currently possible to create an orphan branch with ruby-git
31
- # @see https://github.com/schacon/ruby-git/pull/140
32
- system("git checkout --orphan #{branch_name}")
33
- system("git rm --quiet -rf .")
34
- end
35
- git.commit("Initial commit", allow_empty: true)
36
- end
37
29
  ret = VCR.use_cassette('', &block)
38
30
 
39
31
  # NOTE: This is a workaround for a ruby-git bug.
@@ -47,13 +39,38 @@ module ScrapedPageArchive
47
39
  interaction = git.chdir { YAML.load_file(f) }
48
40
  message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
49
41
  git.add([f, f.sub(/\.yml$/, '.html')])
50
- git.commit(message) rescue binding.pry
42
+ git.commit(message)
51
43
  end
52
44
  # FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
53
45
  git.push('origin', branch_name)
54
46
  ret
55
47
  end
56
48
 
49
+ def open_from_archive(url, *args)
50
+ git.chdir do
51
+ filename = filename_from_url(url.to_s)
52
+ meta = YAML.load_file(filename + '.yml') if File.exist?(filename + '.yml')
53
+ response_body = File.read(filename + '.html') if File.exist?(filename + '.html')
54
+ unless meta && response_body
55
+ fail Error, "No archived copy of #{url} found."
56
+ end
57
+ response_from(meta, response_body)
58
+ end
59
+ end
60
+
61
+ def filename_from_url(url)
62
+ File.join(URI.parse(url).host, Digest::SHA1.hexdigest(url))
63
+ end
64
+
65
+ def response_from(meta, response_body)
66
+ StringIO.new(response_body).tap do |response|
67
+ OpenURI::Meta.init(response)
68
+ meta['response']['headers'].each { |k, v| response.meta_add_field(k, v) }
69
+ response.status = meta['response']['status'].values.map(&:to_s)
70
+ response.base_uri = URI.parse(meta['request']['uri'])
71
+ end
72
+ end
73
+
57
74
  # TODO: This should be configurable.
58
75
  def branch_name
59
76
  @branch_name ||= 'scraped-pages-archive'
@@ -63,6 +80,18 @@ module ScrapedPageArchive
63
80
  @git ||= Git.clone(git_url, tmpdir).tap do |g|
64
81
  g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
65
82
  g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
83
+ VCR::Archive::Persister.storage_location = g.dir.path
84
+ if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
85
+ g.checkout(branch_name)
86
+ else
87
+ g.chdir do
88
+ # FIXME: It's not currently possible to create an orphan branch with ruby-git
89
+ # @see https://github.com/schacon/ruby-git/pull/140
90
+ system("git checkout --orphan #{branch_name}")
91
+ system("git rm --quiet -rf .")
92
+ end
93
+ g.commit("Initial commit", allow_empty: true)
94
+ end
66
95
  end
67
96
  end
68
97
 
@@ -79,7 +108,7 @@ module ScrapedPageArchive
79
108
  end
80
109
 
81
110
  def github_repo_url
82
- @github_repo_url ||= (git_remote_get_url_origin || ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'])
111
+ @github_repo_url ||= (ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] || git_remote_get_url_origin)
83
112
  end
84
113
 
85
114
  def git_remote_get_url_origin
@@ -0,0 +1,65 @@
1
+ # Monkey patch the Capybara Poltergeist driver to record HTTP requests automatically.
2
+ require 'capybara/poltergeist'
3
+ require 'scraped_page_archive'
4
+
5
+ module Capybara::Poltergeist
6
+ class Browser
7
+ alias __command command
8
+
9
+ def sha_url(url)
10
+ Digest::SHA1.hexdigest url
11
+ end
12
+
13
+ def base_dir_for_url(url)
14
+ dir = File.join(VCR::Archive::Persister.storage_location, URI(url).host)
15
+ FileUtils.mkdir_p(dir)
16
+ dir
17
+ end
18
+
19
+ def get_paths(url)
20
+ base_path = File.join(base_dir_for_url(url), sha_url(url))
21
+
22
+ ['.html', '.yml'].map { |x| base_path + x }
23
+ end
24
+
25
+ def get_details(url)
26
+ status_code = page.status_code
27
+ {
28
+ 'request' => {
29
+ 'method' => 'get', # assume this as no way to access it
30
+ 'uri' => url
31
+ },
32
+ 'response' => {
33
+ 'status' => {
34
+ 'message' => status_code == 200 ? 'OK' : 'NOT OK',
35
+ 'code' => status_code
36
+ },
37
+ 'date' => [ page.response_headers['Date'] ]
38
+ }
39
+ }
40
+ end
41
+
42
+ def save_request(html, details, url)
43
+ html_path, yaml_path = get_paths(url)
44
+
45
+ File.open(html_path,"w") do |f|
46
+ f.write(html)
47
+ end
48
+ File.open(yaml_path,"w") do |f|
49
+ f.write(YAML.dump(details))
50
+ end
51
+ end
52
+
53
+ def command(name, *args)
54
+ result = __command(name, *args)
55
+ # We skip these methods because they are called a lot, don't cause the page
56
+ # to change, and wrapping them in `record` slows things down quite a bit.
57
+ return result if ['tag_name', 'visible', 'property', 'find', 'body', 'set_js_errors', 'current_url', 'status_code', 'response_headers'].include?(name)
58
+ current_url = page.current_url.to_s
59
+ ScrapedPageArchive.record do
60
+ save_request(page.html, get_details(current_url), current_url)
61
+ end
62
+ result
63
+ end
64
+ end
65
+ end
@@ -1,3 +1,3 @@
1
- module ScrapedPageArchive
2
- VERSION = '0.3.1'.freeze
1
+ class ScrapedPageArchive
2
+ VERSION = '0.4.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraped_page_archive
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Mytton
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-07-29 00:00:00.000000000 Z
11
+ date: 2016-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: vcr-archive
@@ -111,6 +111,7 @@ files:
111
111
  - bin/console
112
112
  - bin/setup
113
113
  - lib/scraped_page_archive.rb
114
+ - lib/scraped_page_archive/capybara.rb
114
115
  - lib/scraped_page_archive/open-uri.rb
115
116
  - lib/scraped_page_archive/version.rb
116
117
  - scraped_page_archive.gemspec