RubyGems - scraped_page_archive - Versions diffs - 0.4.1 → 0.5.0 - Mend

scraped_page_archive 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/.rubocop.yml +6 -0
data/.rubocop_todo.yml +36 -0
data/.travis.yml +8 -0
data/CHANGELOG.md +12 -0
data/README.md +73 -25
data/Rakefile +3 -0
data/bin/console +2 -6
data/lib/scraped_page_archive.rb +18 -72
data/lib/scraped_page_archive/capybara.rb +14 -12
data/lib/scraped_page_archive/git_storage.rb +87 -0
data/lib/scraped_page_archive/open-uri.rb +5 -1
data/lib/scraped_page_archive/version.rb +1 -1
data/scraped_page_archive.gemspec +2 -1
metadata +20 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 619f8211f74d6b0ae46ac97bc8eb54eae3f0ce40
-  data.tar.gz: cb1b6aac4fbaa149322276d4217c5e54dc8ab1f1
+  metadata.gz: 50b3da0577359ded1637fa4be35716026007b93b
+  data.tar.gz: ff190d0566994bf15b0ec4c28539a4c078969c5d
 SHA512:
-  metadata.gz: bb0d5ffa75160dfec99247518a870d65e78c431232ee5ef086b2bb06cf304de51f38199abbc3e78c21b8859e020c86f2899876da0b75a7cc239a46762e6c0642
-  data.tar.gz: fec75270ba3b8f0dc23035a114a970106a3f5b4d278f3f31dc24b92e74bb4e602cc5ecf5929d94292d90667e83ea0b88e89cab210d848ddbd230a3f2540d7641
+  metadata.gz: 0a5923d6d2b9946188bfcdd614189c677df278f0c70bbbfaa5c1e168f066889aa83e445aa273dbc93e756d593c33dc58c6ad32aabc1838f04410b9b03b57b9a6
+  data.tar.gz: 422ae03a74df61120ca9c79bdaced1617935af0b272c86f0c50366ca01e6d1539a9549d3282dbee997287b668f6603001c94d2d70cbe06dfd1c1d7bce8ca2bdd

data/.gitignore CHANGED

@@ -7,3 +7,4 @@
 /pkg/
 /spec/reports/
 /tmp/
+.rubocop-https---raw-githubusercontent-com-everypolitician-everypolitician-data-master--rubocop-base-yml

data/.rubocop.yml ADDED

@@ -0,0 +1,6 @@
+AllCops:
+  TargetRubyVersion: 2.0
+inherit_from:
+  - https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml
+  - .rubocop_todo.yml

data/.rubocop_todo.yml ADDED

@@ -0,0 +1,36 @@
+# This configuration was generated by
+# `rubocop --auto-gen-config`
+# on 2016-09-26 16:56:34 +0100 using RuboCop version 0.42.0.
+# The point is for the user to remove these configuration records
+# one by one as the offenses are removed from the code base.
+# Note that changes in the inspected code, or installation of new
+# versions of RuboCop, may require this file to be generated again.
+# Offense count: 3
+Metrics/AbcSize:
+  Max: 45
+# Offense count: 16
+# Configuration parameters: AllowHeredoc, AllowURI, URISchemes.
+# URISchemes: http, https
+Metrics/LineLength:
+  Max: 132
+# Offense count: 3
+# Configuration parameters: CountComments.
+Metrics/MethodLength:
+  Max: 18
+# Offense count: 2
+# Configuration parameters: EnforcedStyle, SupportedStyles.
+# SupportedStyles: nested, compact
+Style/ClassAndModuleChildren:
+  Exclude:
+    - 'lib/scraped_page_archive/capybara.rb'
+    - 'test/test_helper.rb'
+# Offense count: 1
+# Configuration parameters: ExpectMatchingDefinition, Regex, IgnoreExecutableScripts.
+Style/FileName:
+  Exclude:
+    - 'lib/scraped_page_archive/open-uri.rb'

data/.travis.yml CHANGED

@@ -6,4 +6,12 @@ rvm:
   - 2.2.5
   - 2.3.1
 before_install: gem install bundler -v 1.12.5
+script:
+  - bundle exec rake test
+  - bundle exec rake rubocop
 cache: bundler
+deploy:
+  provider: rubygems
+  api_key: "$RUBYGEMS_API_KEY"
+  on:
+    tags: true

data/CHANGELOG.md CHANGED

@@ -5,6 +5,16 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 ## [Unreleased]
+## [0.5.0] - 2016-11-03
+### Changes
+- The git storage logic has been pulled into its own class. This means that you need to pass a `ScrapedPageArchive::GitStorage` instance to the `ScrapedPageArchive` constructor if you're using the class directly. See the ["Running on other platforms" section in README.md](README.md#running-on-other-platforms) for more details.
+### Fixes
+- Avoid recloning the whole repo for each request by caching the `ScrapedPageArchive` instance in the `open-uri` and `capybara` adapters.
 ## [0.4.1] - 2016-08-15
 ### Fixes
@@ -49,3 +59,5 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 [0.3.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.2.0...v0.3.0
 [0.3.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.0...v0.3.1
 [0.4.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.1...v0.4.0
+[0.4.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.4.0...v0.4.1
+[0.5.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.4.1...v0.5.0

data/README.md CHANGED

@@ -21,46 +21,62 @@ Or install it yourself as:
 ## Usage
-First require the library:
+### Running locally
+#### Use with open-uri
+If you’re running a scraper locally, and the library can auto-detect
+what repo it’s in, and find your credentials, all you need to do for an
+`open-uri` based scraper is add a `require` line:
 ```ruby
-require 'scraped_page_archive'
+require 'scraped_page_archive/open-uri'
+response = open('http://example.com/')
+# Use the response...
 ```
-Then configure the github url to clone. This will need to have a GitHub token embedded in it, you can [generate a new one here](https://github.com/settings/tokens). It will need to have the `repo` permission checked.
+As your scraper fetches any page it will also commit a copy of the
+response (and the headers), into a `scraped-pages-archive` branch.
+### Running on other platforms
-If you're using the excellent [morph.io](https://morph.io) then you can set the `MORPH_SCRAPER_CACHE_GITHUB_REPO_URL` environment variable to your git url:
+If you are not running your app locally, or it can’t auto-detect the
+information it needs to be able to do the archiving, then you need to
+provide some extra configuration — specifically the url to your repo and
+a GitHub access token.
-| Name                                  | Value                                                           |
-|---------------------------------------|-----------------------------------------------------------------|
-| `MORPH_SCRAPER_CACHE_GITHUB_REPO_URL` | `https://githubtokenhere@github.com/tmtmtmtm/estonia-riigikogu` |
+[Generate a GitHub access token here](https://github.com/settings/tokens):
+it will need to have the `repo` permission checked. Then combine it with
+the details of your repo to produce a setting in the form:
-You can also set this to any value (including another environment variable of your choosing) with the following:
 ```ruby
-ScrapedPageArchive.github_repo_url = 'https://githubtokenhere@github.com/tmtmtmtm/estonia-riigikogu'
+REPO = 'https://YOUR_GITHUB_TOKEN@github.com/everypolitician-scrapers/kenya-mzalendo'
+storage = ScrapedPageArchive::GitStorage.new(REPO)
+archive = ScrapedPageArchive.new(storage)
+archive.record { open('http://example.com/') }
 ```
-Then you can record http requests by performing them in a block passed to `ScrapedPageArchive.record`:
+(Though, obviously, you’ll want your own scraper details there rather than
+`everypolitician-scrapers/kenya-mzalendo`!)
-```ruby
-ScrapedPageArchive.record do
-  response = open('http://example.com/')
-  # Use the response...
-end
-```
+IMPORTANT: Remember not to share your GitHub access token. Don’t include
+it in your code, especially if it lives in a public repo. Normal usage
+would be to set this from an environment variable.
-### Use with open-uri
+#### Use with Morph
-If you would like to have your http requests automatically recorded when using open-uri do the following:
+If you’re using the excellent [morph.io](https://morph.io), you can set
+your repo URL configuration in the "Secret environment variables"
+section of the scraper’s Settings page. We automatically check if
+`MORPH_SCRAPER_CACHE_GITHUB_REPO_URL` is set — there’s no need to
+pass the URL explicitly to `ScrapedPageArchive::GitStorage.new` in this
+case.
-```ruby
-require 'scraped_page_archive/open-uri'
-response = open('http://example.com/')
-# Use the response...
-```
-### Use with the Capybara Poltergeist driver
+### More complex scenarios
+#### Use with the Capybara Poltergeist driver
 If you would like to have your http requests automatically recorded when using the Poltergeist driver in Capybara do the following:
@@ -73,6 +89,18 @@ visit('http://example.com/')
 It should be possible to adapt this to work with other Capybara drivers
 fairly easily.
+#### Use with `ScrapedPageArchive.record`
+You can have complete control and record http requests by performing them in a block passed to `ScrapedPageArchive.record`:
+```ruby
+require 'scraped_page_archive'
+ScrapedPageArchive.record do
+  response = open('http://example.com/')
+  # Use the response...
+end
+```
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -82,10 +110,30 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 Note that this does not install Capybara or any drivers so if you want
 to work on that you will need to do that.
+### Releases
+After you've added a new feature or fixed a bug you should release the gem to rubygems.org.
+#### Before releasing a new version
+- [ ] Is your new feature/bugfix documented in [`CHANGELOG.md`](CHANGELOG.md)?
+- [ ] Have you added a section for the new version in [`CHANGELOG.md`](CHANGELOG.md)?
+- [ ] Have you updated `ScrapedPageArchive::VERSION` according to [SemVer](http://semver.org/)?
+- [ ] Are all of the changes that you want included in the release on the `master` branch?
+#### Releasing a new version
+If you wanted to release version `0.42.0`, for example, you would need to run the following commands:
+    git tag -a -m "scraped_page_archive v0.42.0" v0.42.0
+    git push origin --tags
+Then Travis CI will notice that you've pushed a new tag and will release the new version of the gem.
 ## Contributing
 Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.
 ## License
-The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT)

data/Rakefile CHANGED

@@ -1,5 +1,6 @@
 require 'bundler/gem_tasks'
 require 'rake/testtask'
+require 'rubocop/rake_task'
 Rake::TestTask.new(:test) do |t|
   t.libs << 'test'
@@ -8,3 +9,5 @@ Rake::TestTask.new(:test) do |t|
 end
 task default: :test
+RuboCop::RakeTask.new

data/bin/console CHANGED

@@ -6,9 +6,5 @@ require 'scraped_page_archive'
 # You can add fixtures and/or initialization code here to make experimenting
 # with your gem easier. You can also use a different console, if you like.
-# (If you use this, don't forget to add pry to your Gemfile!)
-# require "pry"
-# Pry.start
-require 'irb'
-IRB.start
+require 'pry'
+Pry.start

data/lib/scraped_page_archive.rb CHANGED

@@ -1,6 +1,6 @@
 require 'scraped_page_archive/version'
+require 'scraped_page_archive/git_storage'
 require 'vcr'
-require 'git'
 require 'vcr/archive'
 VCR.configure do |config|
@@ -14,40 +14,32 @@ end
 class ScrapedPageArchive
   class Error < StandardError; end
-  attr_writer :github_repo_url
   def self.record(*args, &block)
-    new.record(*args, &block)
+    new(GitStorage.new).record(*args, &block)
   end
-  def record(&block)
-    if github_repo_url.nil?
-      warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
-        "See https://github.com/everypolitician/scraped_page_archive#usage for details."
-      return block.call
-    end
-    ret = VCR.use_cassette('', &block)
+  attr_reader :storage
-    # NOTE: This is a workaround for a ruby-git bug.
-    # @see https://github.com/schacon/ruby-git/issues/23
-    git.status.changed.each { git.diff.entries }
+  def initialize(storage)
+    @storage = storage
+  end
-    files = (git.status.changed.keys + git.status.untracked.keys)
-    return ret unless files.any?
-    # For each interaction, commit the yml and html along with the correct commit message.
-    files.find_all { |f| f.end_with?('.yml') }.each do |f|
-      interaction = git.chdir { YAML.load_file(f) }
-      message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
-      git.add([f, f.sub(/\.yml$/, '.html')])
-      git.commit(message)
+  def record(&block)
+    if storage.github_repo_url.nil?
+      warn "The 'scraped_page_archive' gem wants to store the scraped pages in a git repo," \
+        'but it cannot determine which git repo it should use.  See ' \
+        'https://github.com/everypolitician/scraped_page_archive#usage for details of how ' \
+        "to specify the repo.\n\n"
+      return yield
     end
-    # FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
-    git.push('origin', branch_name)
+    VCR::Archive::Persister.storage_location = storage.path
+    ret = VCR.use_cassette('', &block)
+    storage.save
     ret
   end
-  def open_from_archive(url, *args)
-    git.chdir do
+  def open_from_archive(url)
+    storage.chdir do
       filename = filename_from_url(url.to_s)
       meta = YAML.load_file(filename + '.yml') if File.exist?(filename + '.yml')
       response_body = File.read(filename + '.html') if File.exist?(filename + '.html')
@@ -70,50 +62,4 @@ class ScrapedPageArchive
       response.base_uri = URI.parse(meta['request']['uri'])
     end
   end
-  # TODO: This should be configurable.
-  def branch_name
-    @branch_name ||= 'scraped-pages-archive'
-  end
-  def git
-    @git ||= Git.clone(git_url, tmpdir).tap do |g|
-      g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
-      g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
-      VCR::Archive::Persister.storage_location = g.dir.path
-      if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
-        g.checkout(branch_name)
-      else
-        g.chdir do
-          # FIXME: It's not currently possible to create an orphan branch with ruby-git
-          # @see https://github.com/schacon/ruby-git/pull/140
-          system("git checkout --orphan #{branch_name}")
-          system("git rm --quiet -rf .")
-        end
-        g.commit("Initial commit", allow_empty: true)
-      end
-    end
-  end
-  def tmpdir
-    @tmpdir ||= Dir.mktmpdir
-  end
-  def git_url
-    @git_url ||= begin
-      url = URI.parse(github_repo_url)
-      url.password = ENV['SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN']
-      url.to_s
-    end
-  end
-  def github_repo_url
-    @github_repo_url ||= (ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] || git_remote_get_url_origin)
-  end
-  def git_remote_get_url_origin
-    remote_url = `git config remote.origin.url`.chomp
-    return nil unless $?.success?
-    remote_url
-  end
 end

data/lib/scraped_page_archive/capybara.rb CHANGED

@@ -23,29 +23,28 @@ module Capybara::Poltergeist
     end
     def get_details(url)
-      status_code = page.status_code
       {
-        'request' => {
+        'request'  => {
           'method' => 'get', # assume this as no way to access it
-          'uri' => url
+          'uri'    => url,
         },
         'response' => {
           'status' => {
             'message' => status_code == 200 ? 'OK' : 'NOT OK',
-            'code' => status_code
+            'code'    => status_code,
           },
-          'date' => [ page.response_headers['Date'] ]
-        }
+          'date'   => [response_headers['Date']],
+        },
       }
     end
     def save_request(html, details, url)
       html_path, yaml_path = get_paths(url)
-      File.open(html_path,"w") do |f|
+      File.open(html_path, 'w') do |f|
         f.write(html)
       end
-      File.open(yaml_path,"w") do |f|
+      File.open(yaml_path, 'w') do |f|
         f.write(YAML.dump(details))
       end
     end
@@ -54,12 +53,15 @@ module Capybara::Poltergeist
       result = __command(name, *args)
       # we skip these methods because they are called a lot, don't cause the page
       # to change and having record round them slows things down quite a bit.
-      return result if ['tag_name', 'visible', 'property', 'find', 'body', 'set_js_errors', 'current_url', 'status_code', 'response_headers'].include?(name)
-      current_url = page.current_url.to_s
-      ScrapedPageArchive.record do
-        save_request(page.html, get_details(current_url), current_url)
+      return result if %w(tag_name visible property find body set_js_errors current_url status_code response_headers).include?(name)
+      scraped_page_archive.record do
+        save_request(body, get_details(current_url), current_url)
       end
       result
     end
+    def scraped_page_archive
+      @scraped_page_archive ||= ScrapedPageArchive.new(ScrapedPageArchive::GitStorage.new)
+    end
   end
 end

data/lib/scraped_page_archive/git_storage.rb ADDED

@@ -0,0 +1,87 @@
+require 'git'
+require 'English'
+class ScrapedPageArchive
+  class GitStorage
+    attr_reader :github_repo_url
+    def initialize(github_repo_url = nil)
+      @github_repo_url = (
+        github_repo_url ||
+        ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] ||
+        git_remote_get_url_origin
+      )
+    end
+    def path
+      git.dir.path
+    end
+    def chdir(&block)
+      git.chdir(&block)
+    end
+    # FIXME: This should be refactored so it doesn't have as much knowledge about
+    # the locations of files on the filesystem.
+    def save
+      # NOTE: This is a workaround for a ruby-git bug.
+      # @see https://github.com/schacon/ruby-git/issues/23
+      git.status.changed.each { git.diff.entries }
+      files = (git.status.changed.keys + git.status.untracked.keys)
+      return unless files.any?
+      # For each interaction, commit the yml and html along with the correct commit message.
+      files.select { |f| f.end_with?('.yml') }.each do |f|
+        interaction = git.chdir { YAML.load_file(f) }
+        message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
+        git.add([f, f.sub(/\.yml$/, '.html')])
+        git.commit(message)
+      end
+      # FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
+      git.push('origin', branch_name)
+    end
+    private
+    # TODO: This should be configurable.
+    def branch_name
+      @branch_name ||= 'scraped-pages-archive'
+    end
+    def git
+      @git ||= Git.clone(git_url, tmpdir).tap do |g|
+        g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
+        g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
+        if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
+          g.checkout(branch_name)
+        else
+          g.chdir do
+            # FIXME: It's not currently possible to create an orphan branch with ruby-git
+            # @see https://github.com/schacon/ruby-git/pull/140
+            system("git checkout --orphan #{branch_name}")
+            system('git rm --quiet -rf .')
+          end
+          g.commit('Initial commit', allow_empty: true)
+        end
+      end
+    end
+    def tmpdir
+      @tmpdir ||= Dir.mktmpdir
+    end
+    def git_url
+      @git_url ||= begin
+        url = URI.parse(github_repo_url)
+        url.password = ENV['SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN']
+        url.to_s
+      end
+    end
+    def git_remote_get_url_origin
+      remote_url = `git config remote.origin.url`.chomp
+      return nil unless $CHILD_STATUS.success?
+      remote_url
+    end
+  end
+end

data/lib/scraped_page_archive/open-uri.rb CHANGED

@@ -6,7 +6,11 @@ module OpenURI
   class << self
     alias __open_uri open_uri
     def open_uri(*args, &block)
-      ScrapedPageArchive.record { __open_uri(*args, &block) }
+      scraped_page_archive.record { __open_uri(*args, &block) }
+    end
+    def scraped_page_archive
+      @scraped_page_archive ||= ScrapedPageArchive.new(ScrapedPageArchive::GitStorage.new)
     end
   end
 end

data/lib/scraped_page_archive/version.rb CHANGED

@@ -1,3 +1,3 @@
 class ScrapedPageArchive
-  VERSION = '0.4.1'.freeze
+  VERSION = '0.5.0'.freeze
 end

data/scraped_page_archive.gemspec CHANGED

@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email         = ['chrismytton@gmail.com']
   spec.summary       = 'Archives a copy of scraped web pages into a git branch'
-  spec.homepage      = "https://github.com/everypolitician/scraped_page_archive"
+  spec.homepage      = 'https://github.com/everypolitician/scraped_page_archive'
   spec.license       = 'MIT'
   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
@@ -25,4 +25,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'rake', '~> 10.0'
   spec.add_development_dependency 'minitest', '~> 5.0'
   spec.add_development_dependency 'pry', '~> 0.10.4'
+  spec.add_development_dependency 'rubocop', '~> 0.42'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraped_page_archive
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.5.0
 platform: ruby
 authors:
 - Chris Mytton
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-08-15 00:00:00.000000000 Z
+date: 2016-11-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: vcr-archive
@@ -94,6 +94,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 0.10.4
+- !ruby/object:Gem::Dependency
+  name: rubocop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.42'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.42'
 description:
 email:
 - chrismytton@gmail.com
@@ -102,6 +116,8 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".rubocop.yml"
+- ".rubocop_todo.yml"
 - ".travis.yml"
 - CHANGELOG.md
 - Gemfile
@@ -112,6 +128,7 @@ files:
 - bin/setup
 - lib/scraped_page_archive.rb
 - lib/scraped_page_archive/capybara.rb
+- lib/scraped_page_archive/git_storage.rb
 - lib/scraped_page_archive/open-uri.rb
 - lib/scraped_page_archive/version.rb
 - scraped_page_archive.gemspec
@@ -135,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.1
+rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: Archives a copy of scraped web pages into a git branch