scraped_page_archive 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +75 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/scraped_page_archive/open-uri.rb +12 -0
- data/lib/scraped_page_archive/version.rb +3 -0
- data/lib/scraped_page_archive.rb +87 -0
- data/scraped_page_archive.gemspec +28 -0
- metadata +141 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: dad81b1b381a56c2e975f9b25216b68865e30a05
|
4
|
+
data.tar.gz: 558e3bb764ff15b5af3581f46b2a9b65448f01a0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ac28d6306e7cdbb3aa5763f947bb78a22f065900cf90de24dbdb6bb802c9ce6644017d176c5ce6111517e62221bbcab75221aa863b23b5321c868dd2746e897d
|
7
|
+
data.tar.gz: 3fa19422cbc4c41217b167e54a3deed64254bf4ab9a0ea215501dbf06bed28b72a0d50aa1314ba9edc66be09d98eb174b60d718dbcb139c635d07409f15dbd91
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Change Log
|
2
|
+
|
3
|
+
All notable changes to this project will be documented in this file.
|
4
|
+
This project adheres to [Semantic Versioning](http://semver.org/).
|
5
|
+
|
6
|
+
## [Unreleased]
|
7
|
+
|
8
|
+
## 0.1.0 - 2016-07-28
|
9
|
+
|
10
|
+
### Features
|
11
|
+
|
12
|
+
- Record http interactions
|
13
|
+
- Save the interaction to a git repository
|
14
|
+
|
15
|
+
[Unreleased]: https://github.com/everypolitician/scraped_page_archive/compare/v0.1.0...HEAD
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 Chris Mytton
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# ScrapedPageArchive
|
2
|
+
|
3
|
+
Add this gem to your Ruby scraper and it will automatically capture http requests
|
4
|
+
and cache the response in a branch within your git repository.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
gem 'scraped_page_archive'
|
12
|
+
```
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install scraped_page_archive
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
First require the library:
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
require 'scraped_page_archive'
|
28
|
+
```
|
29
|
+
|
30
|
+
Then configure the GitHub URL to clone. The URL needs to have a GitHub token embedded in it; you can [generate a new one here](https://github.com/settings/tokens). The token needs to have the `repo` permission checked.
|
31
|
+
|
32
|
+
If you're using the excellent [morph.io](https://morph.io) then you can set the `MORPH_SCRAPER_CACHE_GITHUB_REPO_URL` environment variable to your git url:
|
33
|
+
|
34
|
+
| Name | Value |
|
35
|
+
|---------------------------------------|-----------------------------------------------------------------|
|
36
|
+
| `MORPH_SCRAPER_CACHE_GITHUB_REPO_URL` | `https://githubtokenhere@github.com/tmtmtmtm/estonia-riigikogu` |
|
37
|
+
|
38
|
+
You can also set this to any value (including another environment variable of your choosing) with the following:
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
ScrapedPageArchive.github_repo_url = 'https://githubtokenhere@github.com/tmtmtmtm/estonia-riigikogu'
|
42
|
+
```
|
43
|
+
|
44
|
+
Then you can record http requests by performing them in a block passed to `ScrapedPageArchive.record`:
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
ScrapedPageArchive.record do
|
48
|
+
response = open('http://example.com/')
|
49
|
+
# Use the response...
|
50
|
+
end
|
51
|
+
```
|
52
|
+
|
53
|
+
### Use with open-uri
|
54
|
+
|
55
|
+
If you would like to have your HTTP requests recorded automatically when using open-uri, do the following:
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
require 'scraped_page_archive/open-uri'
|
59
|
+
response = open('http://example.com/')
|
60
|
+
# Use the response...
|
61
|
+
```
|
62
|
+
|
63
|
+
## Development
|
64
|
+
|
65
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
66
|
+
|
67
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
68
|
+
|
69
|
+
## Contributing
|
70
|
+
|
71
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.
|
72
|
+
|
73
|
+
## License
|
74
|
+
|
75
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: loads the gem through Bundler and opens an interactive
# IRB session for experimenting with it.

require 'bundler/setup'
require 'scraped_page_archive'

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier.
#
# To use Pry instead of IRB, add pry to the Gemfile and swap the two lines at
# the bottom of this file for:
#
#   require 'pry'
#   Pry.start

require 'irb'
IRB.start
|
data/bin/setup
ADDED
data/lib/scraped_page_archive/open-uri.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Monkey patch open-uri to record http requests automatically.
|
2
|
+
require 'open-uri'
|
3
|
+
require 'scraped_page_archive'
|
4
|
+
|
5
|
+
# Wraps OpenURI.open_uri so that every request made through it is executed
# inside ScrapedPageArchive.record.
module OpenURI
  class << self
    # Keep the unpatched implementation reachable under a private-looking name.
    alias_method :__open_uri, :open_uri

    # Same signature as the original OpenURI.open_uri; arguments and block are
    # forwarded untouched, and the wrapped call's result is returned.
    def open_uri(*arguments, &handler)
      ScrapedPageArchive.record do
        __open_uri(*arguments, &handler)
      end
    end
  end
end
|
data/lib/scraped_page_archive/version.rb
ADDED
data/lib/scraped_page_archive.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'scraped_page_archive/version'
|
2
|
+
require 'vcr'
|
3
|
+
require 'git'
|
4
|
+
require 'vcr/archive'
|
5
|
+
|
6
|
+
# Global VCR setup: intercept HTTP through WebMock, register the vcr-archive
# serializer and persister under the :vcr_archive key, and make them (together
# with the :all record mode) the default options for every cassette.
VCR.configure do |vcr|
  vcr.hook_into :webmock

  vcr.cassette_serializers[:vcr_archive] = VCR::Archive::Serializer
  vcr.cassette_persisters[:vcr_archive] = VCR::Archive::Persister

  vcr.default_cassette_options = {
    serialize_with: :vcr_archive,
    persist_with: :vcr_archive,
    record: :all
  }
end
|
12
|
+
|
13
|
+
# Records the HTTP interactions performed inside a block and archives them as
# commits on a dedicated branch of a git repository, which is then pushed to
# the `origin` remote.
module ScrapedPageArchive
  extend self

  # Allows the repository URL to be set explicitly instead of being detected
  # from the current checkout or the environment (see #github_repo_url).
  attr_writer :github_repo_url

  # Runs the block while recording its HTTP interactions, then commits and
  # pushes whatever was recorded.
  #
  # When no repository URL can be determined, a warning is printed and the
  # block is simply executed without any recording.
  #
  # @yield the code performing the HTTP requests to archive
  # @return whatever the block returns
  def record(&block)
    if github_repo_url.nil?
      warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
           "See https://github.com/everypolitician/scraped_page_archive#usage for details."
      return block.call
    end

    VCR::Archive::Persister.storage_location = git.dir.path
    checkout_archive_branch
    ret = VCR.use_cassette('', &block)
    commit_and_push_interactions
    ret
  end

  # Name of the branch the archive is committed to.
  # TODO: This should be configurable.
  #
  # @return [String]
  def branch_name
    @branch_name ||= 'scraped-pages-archive'
  end

  # Clone of the repository at #git_url, created on first use in #tmpdir and
  # memoized afterwards.
  def git
    @git ||= Git.clone(git_url, tmpdir)
  end

  # Temporary directory the repository is cloned into (memoized).
  #
  # @return [String]
  def tmpdir
    @tmpdir ||= Dir.mktmpdir
  end

  # URL used for cloning: #github_repo_url with the token from the
  # SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN environment variable embedded in it.
  #
  # Without a token the URL is returned untouched, so remotes that URI.parse
  # cannot handle (e.g. `git@github.com:owner/repo.git`) keep working. With a
  # token, it becomes the password when the URL already names a user, and the
  # user otherwise (URI rejects a password on a URL that has no user).
  #
  # @return [String]
  # @raise [URI::InvalidURIError] if a token is set and the URL cannot be parsed
  def git_url
    @git_url ||= begin
      token = ENV['SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN']
      if token.nil? || token.empty?
        github_repo_url
      else
        url = URI.parse(github_repo_url)
        if url.user
          url.password = token
        else
          url.user = token
        end
        url.to_s
      end
    end
  end

  # Repository URL: an explicitly assigned value, else the `origin` remote of
  # the current directory, else MORPH_SCRAPER_CACHE_GITHUB_REPO_URL.
  #
  # @return [String, nil] nil when none of the sources yields a URL
  def github_repo_url
    @github_repo_url ||= (git_remote_get_url_origin || ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'])
  end

  # Output of `git remote get-url origin`, run in the current directory.
  #
  # @return [String, nil] nil when the command prints nothing
  def git_remote_get_url_origin
    @git_remote_get_url_origin ||= begin
      remote_url = `git remote get-url origin`.chomp
      remote_url.empty? ? nil : remote_url
    end
  end

  private

  # Switches the clone to the archive branch, creating it as an orphan branch
  # with an empty initial commit when it exists neither locally nor on origin.
  def checkout_archive_branch
    if git.branches[branch_name] || git.branches["origin/#{branch_name}"]
      git.checkout(branch_name)
    else
      git.chdir do
        # FIXME: It's not currently possible to create an orphan branch with ruby-git
        # @see https://github.com/schacon/ruby-git/pull/140
        # The argv form of `system` keeps the branch name away from the shell.
        system('git', 'checkout', '--orphan', branch_name)
        system('git', 'rm', '--quiet', '-rf', '.')
      end
      git.commit('Initial commit', allow_empty: true)
    end
  end

  # Commits every changed or untracked `.yml` file together with its `.html`
  # sibling, one commit per file, then pushes the branch. Does nothing (and
  # does not push) when the working tree has no changed or untracked files.
  def commit_and_push_interactions
    # NOTE: This is a workaround for a ruby-git bug.
    # @see https://github.com/schacon/ruby-git/issues/23
    git.status.changed.each { git.diff.entries }

    files = git.status.changed.keys + git.status.untracked.keys
    return if files.empty?

    files.select { |f| f.end_with?('.yml') }.each do |f|
      interaction = git.chdir { YAML.load_file(f) }
      status = interaction['response']['status'].values_at('code', 'message').join(' ')
      git.add([f, f.sub(/\.yml\z/, '.html')])
      # Failures propagate to the caller; the previous `rescue binding.pry`
      # was a debugging hook that cannot work without pry being loaded.
      git.commit("#{status} #{interaction['request']['uri']}")
    end

    # FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
    git.push('origin', branch_name)
  end
end
|
data/scraped_page_archive.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for scraped_page_archive. The lib directory is put on the
# load path first so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'scraped_page_archive/version'

Gem::Specification.new do |gem|
  gem.name     = 'scraped_page_archive'
  gem.version  = ScrapedPageArchive::VERSION
  gem.authors  = ['Chris Mytton']
  gem.email    = ['chrismytton@gmail.com']

  gem.summary  = 'Archives a copy of scraped web pages into a git branch'
  gem.homepage = "https://github.com/everypolitician/scraped_page_archive"
  gem.license  = 'MIT'

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject do |path|
    path.match(%r{^(test|spec|features)/})
  end
  gem.bindir        = 'exe'
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ['lib']

  gem.add_runtime_dependency 'vcr-archive', '~> 0.3.0'
  gem.add_runtime_dependency 'git', '~> 1.3.0'

  gem.add_development_dependency 'bundler', '~> 1.12'
  gem.add_development_dependency 'rake', '~> 10.0'
  gem.add_development_dependency 'minitest', '~> 5.0'
  gem.add_development_dependency 'pry', '~> 0.10.4'
end
|
metadata
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scraped_page_archive
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Chris Mytton
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-07-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: vcr-archive
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.3.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.3.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: git
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.3.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.3.0
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.12'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.12'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: minitest
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '5.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '5.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: pry
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.10.4
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.10.4
|
97
|
+
description:
|
98
|
+
email:
|
99
|
+
- chrismytton@gmail.com
|
100
|
+
executables: []
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- ".gitignore"
|
105
|
+
- ".travis.yml"
|
106
|
+
- CHANGELOG.md
|
107
|
+
- Gemfile
|
108
|
+
- LICENSE.txt
|
109
|
+
- README.md
|
110
|
+
- Rakefile
|
111
|
+
- bin/console
|
112
|
+
- bin/setup
|
113
|
+
- lib/scraped_page_archive.rb
|
114
|
+
- lib/scraped_page_archive/open-uri.rb
|
115
|
+
- lib/scraped_page_archive/version.rb
|
116
|
+
- scraped_page_archive.gemspec
|
117
|
+
homepage: https://github.com/everypolitician/scraped_page_archive
|
118
|
+
licenses:
|
119
|
+
- MIT
|
120
|
+
metadata: {}
|
121
|
+
post_install_message:
|
122
|
+
rdoc_options: []
|
123
|
+
require_paths:
|
124
|
+
- lib
|
125
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
|
+
requirements:
|
132
|
+
- - ">="
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
version: '0'
|
135
|
+
requirements: []
|
136
|
+
rubyforge_project:
|
137
|
+
rubygems_version: 2.5.1
|
138
|
+
signing_key:
|
139
|
+
specification_version: 4
|
140
|
+
summary: Archives a copy of scraped web pages into a git branch
|
141
|
+
test_files: []
|