scraped_page_archive 0.3.1 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5e7f709fa26f4b28c6c7248f9a98b0ee28ad0a2e
-  data.tar.gz: 20547cc3ef320e8e06d59a97d4b7af65514b5beb
+  metadata.gz: 85882d28526d19f24b3b6d6dc4761dfa4114ef15
+  data.tar.gz: ceb803c8e19b29ff312d81008601b7a702f719de
 SHA512:
-  metadata.gz: 9c8457c7070d41f5d5897b7e1ac0ef6f90aa03c61b12b8ff6ec3cf7c2c3e323e63edf75e140417140af82fe2f5f926b32af265d7e88b63d23377d67f2469c3fa
-  data.tar.gz: 4525301d13f21a8f883bd3f2962b873d6cc0021b0e11446d0791036b2da65021c0aa7fa8cba4296b0f3316fab696e74580aa45872b02f9f5ffe2872a0e23887d
+  metadata.gz: 95da4b3e80ee2ad5704a1c432d2d32e84806324e38150c49a4c2389d95f4215f0529e491892d943863879abf1d52710dcc01e194c32c8fbc6d16a898e56cd563
+  data.tar.gz: 2cbe81f2f74c8245b4d5d1b1fd2900224b5d0261da02e1f8a7251f6ee74e12f749cece5d7ebce0cb3df55ac2dffee3c6fcbe0b0180aca8794c49fddc3c1cfbdf
data/CHANGELOG.md CHANGED
@@ -5,6 +5,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## [Unreleased]
 
+## [0.4.0] - 2016-08-04
+
+### Features
+
+- Added support for Capybara Poltergeist driver
+- You can now use the `ScrapedPageArchive#open_from_archive` method to retrieve a page from the archive.
+
 ## [0.3.1] - 2016-07-29
 
 ### Fixes
@@ -35,3 +42,4 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 [0.2.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.1.0...v0.2.0
 [0.3.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.2.0...v0.3.0
 [0.3.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.0...v0.3.1
+[0.4.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.1...v0.4.0
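
The new `open_from_archive` method mentioned in the changelog retrieves a previously archived page instead of hitting the network. A minimal usage sketch based on the code added below, assuming an archive repo already exists (the repo URL is illustrative):

```ruby
require 'scraped_page_archive'

archive = ScrapedPageArchive.new
# Hypothetical archive repo; normally this is inferred from the git remote
# or the MORPH_SCRAPER_CACHE_GITHUB_REPO_URL environment variable.
archive.github_repo_url = 'https://github.com/example/my-scraper-archive.git'

response = archive.open_from_archive('http://example.com/')
response.read      # archived HTML body
response.status    # => ["200", "OK"]
response.base_uri  # => #<URI::HTTP http://example.com/>
```

If no archived copy of the URL exists, the method raises `ScrapedPageArchive::Error`.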
data/README.md CHANGED
@@ -60,12 +60,28 @@ response = open('http://example.com/')
 # Use the response...
 ```
 
+### Use with the Capybara Poltergeist driver
+
+If you would like your HTTP requests to be recorded automatically when using the Poltergeist driver with Capybara, do the following:
+
+```ruby
+require 'scraped_page_archive/capybara'
+visit('http://example.com/')
+# Use the response...
+```
+
+It should be possible to adapt this to work with other Capybara drivers
+fairly easily.
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
 
 To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
 
+Note that this does not install Capybara or any drivers, so if you want
+to work on the Capybara integration you will need to install those yourself.
+
 ## Contributing
 
 Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.
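
The README snippet above assumes a Capybara DSL context for `visit`. A fuller standalone sketch may help; the session setup here is an assumption, not part of the gem's documentation:

```ruby
require 'capybara'
require 'scraped_page_archive/capybara'  # patches the Poltergeist driver

# Requiring capybara/poltergeist (done by the line above) registers the
# :poltergeist driver with Capybara, so a session can use it directly.
session = Capybara::Session.new(:poltergeist)
session.visit('http://example.com/')
session.html  # each driver command also records the page to the archive
```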
data/lib/scraped_page_archive.rb CHANGED
@@ -11,29 +11,21 @@ VCR.configure do |config|
   config.allow_http_connections_when_no_cassette = true
 end
 
-module ScrapedPageArchive
-  extend self
+class ScrapedPageArchive
+  class Error < StandardError; end
 
   attr_writer :github_repo_url
 
+  def self.record(*args, &block)
+    new.record(*args, &block)
+  end
+
   def record(&block)
     if github_repo_url.nil?
       warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
            "See https://github.com/everypolitician/scraped_page_archive#usage for details."
       return block.call
     end
-    VCR::Archive::Persister.storage_location = git.dir.path
-    if git.branches[branch_name] || git.branches["origin/#{branch_name}"]
-      git.checkout(branch_name)
-    else
-      git.chdir do
-        # FIXME: It's not currently possible to create an orphan branch with ruby-git
-        # @see https://github.com/schacon/ruby-git/pull/140
-        system("git checkout --orphan #{branch_name}")
-        system("git rm --quiet -rf .")
-      end
-      git.commit("Initial commit", allow_empty: true)
-    end
     ret = VCR.use_cassette('', &block)
 
     # NOTE: This is a workaround for a ruby-git bug.
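
The module-to-class change in this hunk keeps the old entry point working: `ScrapedPageArchive.record` is now a thin wrapper that instantiates the class. A sketch of both call styles (the scraping block is illustrative):

```ruby
require 'open-uri'
require 'scraped_page_archive'

# Class-level shortcut, as used by the open-uri and Capybara integrations.
html = ScrapedPageArchive.record { open('http://example.com/').read }

# Equivalent instance-level call; the block's return value is passed through.
archive = ScrapedPageArchive.new
html = archive.record { open('http://example.com/').read }
```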
@@ -47,13 +39,38 @@ module ScrapedPageArchive
       interaction = git.chdir { YAML.load_file(f) }
       message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
       git.add([f, f.sub(/\.yml$/, '.html')])
-      git.commit(message) rescue binding.pry
+      git.commit(message)
     end
     # FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
     git.push('origin', branch_name)
     ret
   end
 
+  def open_from_archive(url, *args)
+    git.chdir do
+      filename = filename_from_url(url.to_s)
+      meta = YAML.load_file(filename + '.yml') if File.exist?(filename + '.yml')
+      response_body = File.read(filename + '.html') if File.exist?(filename + '.html')
+      unless meta && response_body
+        fail Error, "No archived copy of #{url} found."
+      end
+      response_from(meta, response_body)
+    end
+  end
+
+  def filename_from_url(url)
+    File.join(URI.parse(url).host, Digest::SHA1.hexdigest(url))
+  end
+
+  def response_from(meta, response_body)
+    StringIO.new(response_body).tap do |response|
+      OpenURI::Meta.init(response)
+      meta['response']['headers'].each { |k, v| response.meta_add_field(k, v) }
+      response.status = meta['response']['status'].values.map(&:to_s)
+      response.base_uri = URI.parse(meta['request']['uri'])
+    end
+  end
+
   # TODO: This should be configurable.
   def branch_name
     @branch_name ||= 'scraped-pages-archive'
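
The `response_from` helper added in this hunk rebuilds the kind of object `open-uri` returns: a `StringIO` extended with `OpenURI::Meta` from the Ruby standard library. A self-contained sketch of that stdlib pattern (values are illustrative):

```ruby
require 'open-uri'
require 'stringio'

io = StringIO.new('<html>hello</html>')
OpenURI::Meta.init(io)  # mixes in meta, status, base_uri, content_type, ...
io.meta_add_field('content-type', 'text/html')
io.status = ['200', 'OK']
io.base_uri = URI.parse('http://example.com/')

io.content_type  # => "text/html"
io.read          # => "<html>hello</html>"
```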
@@ -63,6 +80,18 @@ module ScrapedPageArchive
     @git ||= Git.clone(git_url, tmpdir).tap do |g|
       g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
       g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
+      VCR::Archive::Persister.storage_location = g.dir.path
+      if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
+        g.checkout(branch_name)
+      else
+        g.chdir do
+          # FIXME: It's not currently possible to create an orphan branch with ruby-git
+          # @see https://github.com/schacon/ruby-git/pull/140
+          system("git checkout --orphan #{branch_name}")
+          system("git rm --quiet -rf .")
+        end
+        g.commit("Initial commit", allow_empty: true)
+      end
     end
   end
 
@@ -79,7 +108,7 @@ module ScrapedPageArchive
   end
 
   def github_repo_url
-    @github_repo_url ||= (git_remote_get_url_origin || ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'])
+    @github_repo_url ||= (ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] || git_remote_get_url_origin)
   end
 
   def git_remote_get_url_origin
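
The `github_repo_url` change reverses the lookup order, so the `MORPH_SCRAPER_CACHE_GITHUB_REPO_URL` environment variable now takes precedence over the repository's own `origin` remote. A hedged example of relying on that (the URL is illustrative, not a real repo):

```ruby
# Override whatever `git config remote.origin.url` would return.
ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] =
  'https://github.com/example/my-scraper-archive.git'

require 'scraped_page_archive/open-uri'
response = open('http://example.com/')  # recorded into the archive repo
```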
data/lib/scraped_page_archive/capybara.rb ADDED
@@ -0,0 +1,65 @@
+# Monkey-patch the Capybara Poltergeist driver to record HTTP requests automatically.
+require 'capybara/poltergeist'
+require 'scraped_page_archive'
+
+module Capybara::Poltergeist
+  class Browser
+    alias __command command
+
+    def sha_url(url)
+      Digest::SHA1.hexdigest url
+    end
+
+    def base_dir_for_url(url)
+      dir = File.join(VCR::Archive::Persister.storage_location, URI(url).host)
+      FileUtils.mkdir_p(dir)
+      dir
+    end
+
+    def get_paths(url)
+      base_path = File.join(base_dir_for_url(url), sha_url(url))
+
+      ['.html', '.yml'].map { |x| base_path + x }
+    end
+
+    def get_details(url)
+      status_code = page.status_code
+      {
+        'request' => {
+          'method' => 'get', # assumed, as there is no way to access the real method
+          'uri' => url
+        },
+        'response' => {
+          'status' => {
+            'message' => status_code == 200 ? 'OK' : 'NOT OK',
+            'code' => status_code
+          },
+          'date' => [page.response_headers['Date']]
+        }
+      }
+    end
+
+    def save_request(html, details, url)
+      html_path, yaml_path = get_paths(url)
+
+      File.open(html_path, "w") do |f|
+        f.write(html)
+      end
+      File.open(yaml_path, "w") do |f|
+        f.write(YAML.dump(details))
+      end
+    end
+
+    def command(name, *args)
+      result = __command(name, *args)
+      # We skip these methods because they are called a lot, don't cause the page
+      # to change, and recording around them slows things down quite a bit.
+      return result if ['tag_name', 'visible', 'property', 'find', 'body', 'set_js_errors', 'current_url', 'status_code', 'response_headers'].include?(name)
+      current_url = page.current_url.to_s
+      ScrapedPageArchive.record do
+        save_request(page.html, get_details(current_url), current_url)
+      end
+      result
+    end
+  end
+end
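
For reference, `sha_url` and `get_paths` above store each page under a directory named for the URL's host, with a SHA1 digest of the full URL as the filename; this mirrors `filename_from_url` in the main library. A small sketch of the naming scheme (the URL is illustrative):

```ruby
require 'digest'
require 'uri'

url  = 'http://example.com/members?page=2'
dir  = URI.parse(url).host          # => "example.com"
name = Digest::SHA1.hexdigest(url)  # 40-char hex digest of the full URL
File.join(dir, name + '.html')      # archived response body
File.join(dir, name + '.yml')       # request/response metadata
```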
data/lib/scraped_page_archive/version.rb CHANGED
@@ -1,3 +1,3 @@
-module ScrapedPageArchive
-  VERSION = '0.3.1'.freeze
+class ScrapedPageArchive
+  VERSION = '0.4.0'.freeze
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraped_page_archive
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.4.0
 platform: ruby
 authors:
 - Chris Mytton
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-07-29 00:00:00.000000000 Z
+date: 2016-08-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: vcr-archive
@@ -111,6 +111,7 @@ files:
 - bin/console
 - bin/setup
 - lib/scraped_page_archive.rb
+- lib/scraped_page_archive/capybara.rb
 - lib/scraped_page_archive/open-uri.rb
 - lib/scraped_page_archive/version.rb
 - scraped_page_archive.gemspec