RubyGems - scraped_page_archive - Versions diffs - 0.3.1 → 0.4.0 - Mend

scraped_page_archive 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +8 -0
data/README.md +16 -0
data/lib/scraped_page_archive.rb +45 -16
data/lib/scraped_page_archive/capybara.rb +65 -0
data/lib/scraped_page_archive/version.rb +2 -2
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5e7f709fa26f4b28c6c7248f9a98b0ee28ad0a2e
-  data.tar.gz: 20547cc3ef320e8e06d59a97d4b7af65514b5beb
+  metadata.gz: 85882d28526d19f24b3b6d6dc4761dfa4114ef15
+  data.tar.gz: ceb803c8e19b29ff312d81008601b7a702f719de
 SHA512:
-  metadata.gz: 9c8457c7070d41f5d5897b7e1ac0ef6f90aa03c61b12b8ff6ec3cf7c2c3e323e63edf75e140417140af82fe2f5f926b32af265d7e88b63d23377d67f2469c3fa
-  data.tar.gz: 4525301d13f21a8f883bd3f2962b873d6cc0021b0e11446d0791036b2da65021c0aa7fa8cba4296b0f3316fab696e74580aa45872b02f9f5ffe2872a0e23887d
+  metadata.gz: 95da4b3e80ee2ad5704a1c432d2d32e84806324e38150c49a4c2389d95f4215f0529e491892d943863879abf1d52710dcc01e194c32c8fbc6d16a898e56cd563
+  data.tar.gz: 2cbe81f2f74c8245b4d5d1b1fd2900224b5d0261da02e1f8a7251f6ee74e12f749cece5d7ebce0cb3df55ac2dffee3c6fcbe0b0180aca8794c49fddc3c1cfbdf

data/CHANGELOG.md CHANGED Viewed

@@ -5,6 +5,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 ## [Unreleased]
+## [0.4.0] - 2016-08-04
+### Features
+- Added support for Capybara Poltergeist driver
+- You can now use the `ScrapedPageArchive#open_from_archive` method to retrieve a page from the archive.
 ## [0.3.1] - 2016-07-29
 ### Fixes
@@ -35,3 +42,4 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 [0.2.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.1.0...v0.2.0
 [0.3.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.2.0...v0.3.0
 [0.3.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.0...v0.3.1
+[0.4.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.1...v0.4.0

data/README.md CHANGED Viewed

@@ -60,12 +60,28 @@ response = open('http://example.com/')
 # Use the response...
 ```
+### Use with the Capybara Poltergeist driver
+If you would like to have your HTTP requests automatically recorded when using the Poltergeist driver in Capybara, do the following:
+```ruby
+require 'scraped_page_archive/capybara'
+visit('http://example.com/')
+# Use the response...
+```
+It should be possible to adapt this to work with other Capybara drivers
+fairly easily.
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
 To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+Note that this does not install Capybara or any of its drivers, so if you want
+to work on the Capybara integration you will need to install them yourself.
 ## Contributing
 Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.

data/lib/scraped_page_archive.rb CHANGED Viewed

@@ -11,29 +11,21 @@ VCR.configure do |config|
   config.allow_http_connections_when_no_cassette = true
 end
-module ScrapedPageArchive
-  extend self
+class ScrapedPageArchive
+  class Error < StandardError; end
   attr_writer :github_repo_url
+  def self.record(*args, &block)
+    new.record(*args, &block)
+  end
   def record(&block)
     if github_repo_url.nil?
       warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
         "See https://github.com/everypolitician/scraped_page_archive#usage for details."
       return block.call
     end
-    VCR::Archive::Persister.storage_location = git.dir.path
-    if git.branches[branch_name] || git.branches["origin/#{branch_name}"]
-      git.checkout(branch_name)
-    else
-      git.chdir do
-        # FIXME: It's not currently possible to create an orphan branch with ruby-git
-        # @see https://github.com/schacon/ruby-git/pull/140
-        system("git checkout --orphan #{branch_name}")
-        system("git rm --quiet -rf .")
-      end
-      git.commit("Initial commit", allow_empty: true)
-    end
     ret = VCR.use_cassette('', &block)
     # NOTE: This is a workaround for a ruby-git bug.
@@ -47,13 +39,38 @@ module ScrapedPageArchive
       interaction = git.chdir { YAML.load_file(f) }
       message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
       git.add([f, f.sub(/\.yml$/, '.html')])
-      git.commit(message) rescue binding.pry
+      git.commit(message)
     end
     # FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
     git.push('origin', branch_name)
     ret
   end
+  def open_from_archive(url, *args)
+    git.chdir do
+      filename = filename_from_url(url.to_s)
+      meta = YAML.load_file(filename + '.yml') if File.exist?(filename + '.yml')
+      response_body = File.read(filename + '.html') if File.exist?(filename + '.html')
+      unless meta && response_body
+        fail Error, "No archived copy of #{url} found."
+      end
+      response_from(meta, response_body)
+    end
+  end
+  def filename_from_url(url)
+    File.join(URI.parse(url).host, Digest::SHA1.hexdigest(url))
+  end
+  def response_from(meta, response_body)
+    StringIO.new(response_body).tap do |response|
+      OpenURI::Meta.init(response)
+      meta['response']['headers'].each { |k, v| response.meta_add_field(k, v) }
+      response.status = meta['response']['status'].values.map(&:to_s)
+      response.base_uri = URI.parse(meta['request']['uri'])
+    end
+  end
   # TODO: This should be configurable.
   def branch_name
     @branch_name ||= 'scraped-pages-archive'
@@ -63,6 +80,18 @@ module ScrapedPageArchive
     @git ||= Git.clone(git_url, tmpdir).tap do |g|
       g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
       g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
+      VCR::Archive::Persister.storage_location = g.dir.path
+      if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
+        g.checkout(branch_name)
+      else
+        g.chdir do
+          # FIXME: It's not currently possible to create an orphan branch with ruby-git
+          # @see https://github.com/schacon/ruby-git/pull/140
+          system("git checkout --orphan #{branch_name}")
+          system("git rm --quiet -rf .")
+        end
+        g.commit("Initial commit", allow_empty: true)
+      end
     end
   end
@@ -79,7 +108,7 @@ module ScrapedPageArchive
   end
   def github_repo_url
-    @github_repo_url ||= (git_remote_get_url_origin || ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'])
+    @github_repo_url ||= (ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] || git_remote_get_url_origin)
   end
   def git_remote_get_url_origin

data/lib/scraped_page_archive/capybara.rb ADDED Viewed

@@ -0,0 +1,65 @@
+# Monkey patch the Capybara Poltergeist driver to record HTTP requests automatically.
+require 'capybara/poltergeist'
+require 'scraped_page_archive'
+module Capybara::Poltergeist
+  class Browser
+    alias __command command
+    def sha_url(url)
+      Digest::SHA1.hexdigest url
+    end
+    def base_dir_for_url(url)
+      dir = File.join(VCR::Archive::Persister.storage_location, URI(url).host)
+      FileUtils.mkdir_p(dir)
+      dir
+    end
+    def get_paths(url)
+      base_path = File.join(base_dir_for_url(url), sha_url(url))
+      ['.html', '.yml'].map { |x| base_path + x }
+    end
+    def get_details(url)
+      status_code = page.status_code
+      {
+        'request' => {
+          'method' => 'get', # assume this as no way to access it
+          'uri' => url
+        },
+        'response' => {
+          'status' => {
+            'message' => status_code == 200 ? 'OK' : 'NOT OK',
+            'code' => status_code
+          },
+          'date' => [ page.response_headers['Date'] ]
+        }
+      }
+    end
+    def save_request(html, details, url)
+      html_path, yaml_path = get_paths(url)
+      File.open(html_path,"w") do |f|
+        f.write(html)
+      end
+      File.open(yaml_path,"w") do |f|
+        f.write(YAML.dump(details))
+      end
+    end
+    def command(name, *args)
+      result = __command(name, *args)
+      # We skip these commands because they are called a lot and don't cause the page
+      # to change, and wrapping them in `record` slows things down quite a bit.
+      return result if ['tag_name', 'visible', 'property', 'find', 'body', 'set_js_errors', 'current_url', 'status_code', 'response_headers'].include?(name)
+      current_url = page.current_url.to_s
+      ScrapedPageArchive.record do
+        save_request(page.html, get_details(current_url), current_url)
+      end
+      result
+    end
+  end
+end

data/lib/scraped_page_archive/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
-module ScrapedPageArchive
-  VERSION = '0.3.1'.freeze
+class ScrapedPageArchive
+  VERSION = '0.4.0'.freeze
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraped_page_archive
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.4.0
 platform: ruby
 authors:
 - Chris Mytton
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-07-29 00:00:00.000000000 Z
+date: 2016-08-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: vcr-archive
@@ -111,6 +111,7 @@ files:
 - bin/console
 - bin/setup
 - lib/scraped_page_archive.rb
+- lib/scraped_page_archive/capybara.rb
 - lib/scraped_page_archive/open-uri.rb
 - lib/scraped_page_archive/version.rb
 - scraped_page_archive.gemspec