scraped_page_archive 0.4.1 → 0.5.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +6 -0
- data/.rubocop_todo.yml +36 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +12 -0
- data/README.md +73 -25
- data/Rakefile +3 -0
- data/bin/console +2 -6
- data/lib/scraped_page_archive.rb +18 -72
- data/lib/scraped_page_archive/capybara.rb +14 -12
- data/lib/scraped_page_archive/git_storage.rb +87 -0
- data/lib/scraped_page_archive/open-uri.rb +5 -1
- data/lib/scraped_page_archive/version.rb +1 -1
- data/scraped_page_archive.gemspec +2 -1
- metadata +20 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 50b3da0577359ded1637fa4be35716026007b93b
+  data.tar.gz: ff190d0566994bf15b0ec4c28539a4c078969c5d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0a5923d6d2b9946188bfcdd614189c677df278f0c70bbbfaa5c1e168f066889aa83e445aa273dbc93e756d593c33dc58c6ad32aabc1838f04410b9b03b57b9a6
+  data.tar.gz: 422ae03a74df61120ca9c79bdaced1617935af0b272c86f0c50366ca01e6d1539a9549d3282dbee997287b668f6603001c94d2d70cbe06dfd1c1d7bce8ca2bdd
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,36 @@
+# This configuration was generated by
+# `rubocop --auto-gen-config`
+# on 2016-09-26 16:56:34 +0100 using RuboCop version 0.42.0.
+# The point is for the user to remove these configuration records
+# one by one as the offenses are removed from the code base.
+# Note that changes in the inspected code, or installation of new
+# versions of RuboCop, may require this file to be generated again.
+
+# Offense count: 3
+Metrics/AbcSize:
+  Max: 45
+
+# Offense count: 16
+# Configuration parameters: AllowHeredoc, AllowURI, URISchemes.
+# URISchemes: http, https
+Metrics/LineLength:
+  Max: 132
+
+# Offense count: 3
+# Configuration parameters: CountComments.
+Metrics/MethodLength:
+  Max: 18
+
+# Offense count: 2
+# Configuration parameters: EnforcedStyle, SupportedStyles.
+# SupportedStyles: nested, compact
+Style/ClassAndModuleChildren:
+  Exclude:
+    - 'lib/scraped_page_archive/capybara.rb'
+    - 'test/test_helper.rb'
+
+# Offense count: 1
+# Configuration parameters: ExpectMatchingDefinition, Regex, IgnoreExecutableScripts.
+Style/FileName:
+  Exclude:
+    - 'lib/scraped_page_archive/open-uri.rb'
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -5,6 +5,16 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## [Unreleased]
 
+## [0.5.0] - 2016-11-03
+
+### Changes
+
+- The git storage logic has been pulled into its own class. This means that you need to pass a `ScrapedPageArchive::GitStorage` instance to the `ScrapedPageArchive` constructor if you're using the class directly. See the ["Running on other platforms" section in README.md](README.md#running-on-other-platforms) for more details.
+
+### Fixes
+
+- Avoid recloning the whole repo for each request by caching the `ScrapedPageArchive` instance in the `open-uri` and `capybara` adapters.
+
 ## [0.4.1] - 2016-08-15
 
 ### Fixes
@@ -49,3 +59,5 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 [0.3.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.2.0...v0.3.0
 [0.3.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.0...v0.3.1
 [0.4.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.1...v0.4.0
+[0.4.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.4.0...v0.4.1
+[0.5.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.4.1...v0.5.0
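For scrapers that construct the class directly, the 0.5.0 change amounts to one extra line; a minimal sketch of the migration (the repo URL is a placeholder):

```ruby
require 'scraped_page_archive'

# 0.4.x constructed the archive bare; from 0.5.0 you build the storage
# first and hand it to the constructor.
storage = ScrapedPageArchive::GitStorage.new('https://github.com/example/some-scraper')
archive = ScrapedPageArchive.new(storage)
```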
data/README.md
CHANGED
@@ -21,46 +21,62 @@ Or install it yourself as:
 
 ## Usage
 
-
+### Running locally
+
+#### Use with open-uri
+
+If you’re running a scraper locally, and the library can auto-detect
+what repo it’s in, and find your credentials, all you need to do for an
+`open-uri` based scraper is add a `require` line:
 
 ```ruby
-require 'scraped_page_archive'
+require 'scraped_page_archive/open-uri'
+response = open('http://example.com/')
+# Use the response...
 ```
 
-
+As your scraper fetches any page it will also commit a copy of the
+response (and the headers) into a `scraped-pages-archive` branch.
+
+### Running on other platforms
 
-If you
+If you are not running your app locally, or it can’t auto-detect the
+information it needs to be able to do the archiving, then you need to
+provide some extra configuration — specifically the URL of your repo and
+a GitHub access token.
 
-
-
-
+[Generate a GitHub access token here](https://github.com/settings/tokens):
+it will need to have the `repo` permission checked. Then combine it with
+the details of your repo to produce a setting in the form:
 
-You can also set this to any value (including another environment variable of your choosing) with the following:
 
 ```ruby
-
+REPO = 'https://YOUR_GITHUB_TOKEN@github.com/everypolitician-scrapers/kenya-mzalendo'
+storage = ScrapedPageArchive::GitStorage.new(REPO)
+archive = ScrapedPageArchive.new(storage)
+archive.record { open('http://example.com/') }
 ```
 
-
+(Though, obviously, you’ll want your own scraper details there rather than
+`everypolitician-scrapers/kenya-mzalendo`!)
 
-
-
-
-# Use the response...
-end
-```
+IMPORTANT: Remember not to share your GitHub access token. Don’t include
+it in your code, especially if it lives in a public repo. Normal usage
+would be to set this from an environment variable.
 
-
+#### Use with Morph
 
-If you
+If you’re using the excellent [morph.io](https://morph.io), you can set
+your repo URL configuration in the "Secret environment variables"
+section of the scraper’s Settings page. We automatically check if
+`MORPH_SCRAPER_CACHE_GITHUB_REPO_URL` is set — there’s no need to
+explicitly set it using `ScrapedPageArchive.github_repo_url` in this
+case.
 
-```ruby
-require 'scraped_page_archive/open-uri'
-response = open('http://example.com/')
-# Use the response...
-```
 
-###
+### More complex scenarios
+
+#### Use with the Capybara Poltergeist driver
 
 If you would like to have your http requests automatically recorded when using the Poltergeist driver in Capybara do the following:
 
@@ -73,6 +89,18 @@ visit('http://example.com/')
 It should be possible to adapt this to work with other Capybara drivers
 fairly easily.
 
+#### Use with `ScrapedPageArchive.record`
+
+You can have complete control and record http requests by performing them in a block passed to `ScrapedPageArchive.record`:
+
+```ruby
+require 'scraped_page_archive'
+ScrapedPageArchive.record do
+  response = open('http://example.com/')
+  # Use the response...
+end
+```
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -82,10 +110,30 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 Note that this does not install Capybara or any drivers so if you want
 to work on that you will need to do that.
 
+### Releases
+
+After you've added a new feature or fixed a bug you should release the gem to rubygems.org.
+
+#### Before releasing a new version
+
+- [ ] Is your new feature/bugfix documented in [`CHANGELOG.md`](CHANGELOG.md)?
+- [ ] Have you added a section for the new version in [`CHANGELOG.md`](CHANGELOG.md)?
+- [ ] Have you updated `ScrapedPageArchive::VERSION` according to [SemVer](http://semver.org/)?
+- [ ] Are all of the changes that you want included in the release on the `master` branch?
+
+#### Releasing a new version
+
+If you wanted to release version `0.42.0`, for example, you would need to run the following commands:
+
+    git tag -a -m "scraped_page_archive v0.42.0" v0.42.0
+    git push origin --tags
+
+Then Travis CI will notice that you've pushed a new tag and will release the new version of the gem.
+
 ## Contributing
 
 Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.
 
 ## License
 
-The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT)
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT)
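On the token handling in the README above: `GitStorage` (shown later in this diff) also reads `SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN` from the environment and sets it as the URL password, so the URL in your code can stay credential-free. A sketch under that assumption (the `git` username is a placeholder, present only because Ruby's `URI` requires a user component before a password can be set):

```ruby
require 'scraped_page_archive'
require 'open-uri'

# Token is supplied outside the code, e.g.
#   export SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN=...
storage = ScrapedPageArchive::GitStorage.new('https://git@github.com/everypolitician-scrapers/kenya-mzalendo')
archive = ScrapedPageArchive.new(storage)
archive.record { open('http://example.com/') }
```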
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -6,9 +6,5 @@ require 'scraped_page_archive'
 # You can add fixtures and/or initialization code here to make experimenting
 # with your gem easier. You can also use a different console, if you like.
 
-
-
-# Pry.start
-
-require 'irb'
-IRB.start
+require 'pry'
+Pry.start
data/lib/scraped_page_archive.rb
CHANGED
@@ -1,6 +1,6 @@
 require 'scraped_page_archive/version'
+require 'scraped_page_archive/git_storage'
 require 'vcr'
-require 'git'
 require 'vcr/archive'
 
 VCR.configure do |config|
@@ -14,40 +14,32 @@ end
 class ScrapedPageArchive
   class Error < StandardError; end
 
-  attr_writer :github_repo_url
-
   def self.record(*args, &block)
-    new.record(*args, &block)
+    new(GitStorage.new).record(*args, &block)
   end
 
-
-    if github_repo_url.nil?
-      warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
-           "See https://github.com/everypolitician/scraped_page_archive#usage for details."
-      return block.call
-    end
-    ret = VCR.use_cassette('', &block)
+  attr_reader :storage
 
-
-
-
+  def initialize(storage)
+    @storage = storage
+  end
 
-
-
-
-
-
-
-
-    git.commit(message)
+  def record(&block)
+    if storage.github_repo_url.nil?
+      warn "The 'scraped_page_archive' gem wants to store the scraped pages in a git repo, " \
+           'but it cannot determine which git repo it should use. See ' \
+           'https://github.com/everypolitician/scraped_page_archive#usage for details of how ' \
+           "to specify the repo.\n\n"
+      return yield
     end
-
-
+    VCR::Archive::Persister.storage_location = storage.path
+    ret = VCR.use_cassette('', &block)
+    storage.save
     ret
   end
 
-  def open_from_archive(url
-
+  def open_from_archive(url)
+    storage.chdir do
      filename = filename_from_url(url.to_s)
      meta = YAML.load_file(filename + '.yml') if File.exist?(filename + '.yml')
      response_body = File.read(filename + '.html') if File.exist?(filename + '.html')
@@ -70,50 +62,4 @@ class ScrapedPageArchive
       response.base_uri = URI.parse(meta['request']['uri'])
     end
   end
-
-  # TODO: This should be configurable.
-  def branch_name
-    @branch_name ||= 'scraped-pages-archive'
-  end
-
-  def git
-    @git ||= Git.clone(git_url, tmpdir).tap do |g|
-      g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
-      g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
-      VCR::Archive::Persister.storage_location = g.dir.path
-      if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
-        g.checkout(branch_name)
-      else
-        g.chdir do
-          # FIXME: It's not currently possible to create an orphan branch with ruby-git
-          # @see https://github.com/schacon/ruby-git/pull/140
-          system("git checkout --orphan #{branch_name}")
-          system("git rm --quiet -rf .")
-        end
-        g.commit("Initial commit", allow_empty: true)
-      end
-    end
-  end
-
-  def tmpdir
-    @tmpdir ||= Dir.mktmpdir
-  end
-
-  def git_url
-    @git_url ||= begin
-      url = URI.parse(github_repo_url)
-      url.password = ENV['SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN']
-      url.to_s
-    end
-  end
-
-  def github_repo_url
-    @github_repo_url ||= (ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] || git_remote_get_url_origin)
-  end
-
-  def git_remote_get_url_origin
-    remote_url = `git config remote.origin.url`.chomp
-    return nil unless $?.success?
-    remote_url
-  end
 end
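To make the reworked `record` concrete, a short sketch of both paths through it (the repo URL is a placeholder):

```ruby
require 'scraped_page_archive'
require 'open-uri'

# With a resolvable repo URL, the block runs inside a VCR cassette and
# storage.save then commits and pushes the recorded .yml/.html pairs.
storage = ScrapedPageArchive::GitStorage.new('https://git@github.com/example/some-scraper')
page = ScrapedPageArchive.new(storage).record { open('http://example.com/').read }

# With no URL available at all (no argument, no
# MORPH_SCRAPER_CACHE_GITHUB_REPO_URL, no origin remote), record warns
# and simply yields, so scraping continues without archiving.
```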
data/lib/scraped_page_archive/capybara.rb
CHANGED
@@ -23,29 +23,28 @@ module Capybara::Poltergeist
     end
 
     def get_details(url)
-      status_code = page.status_code
       {
-        'request'
+        'request' => {
           'method' => 'get', # assume this as no way to access it
-          'uri'
+          'uri' => url,
         },
         'response' => {
           'status' => {
             'message' => status_code == 200 ? 'OK' : 'NOT OK',
-            'code'
+            'code' => status_code,
           },
-          'date'
-        }
+          'date' => [response_headers['Date']],
+        },
       }
     end
 
     def save_request(html, details, url)
       html_path, yaml_path = get_paths(url)
 
-      File.open(html_path,
+      File.open(html_path, 'w') do |f|
         f.write(html)
       end
-      File.open(yaml_path,
+      File.open(yaml_path, 'w') do |f|
         f.write(YAML.dump(details))
       end
     end
@@ -54,12 +53,15 @@ module Capybara::Poltergeist
       result = __command(name, *args)
       # we skip these methods because they are called a lot, don't cause the page
       # to change and having record round them slows things down quite a bit.
-      return result if
-
-
-      save_request(page.html, get_details(current_url), current_url)
+      return result if %w(tag_name visible property find body set_js_errors current_url status_code response_headers).include?(name)
+      scraped_page_archive.record do
+        save_request(body, get_details(current_url), current_url)
       end
       result
     end
+
+    def scraped_page_archive
+      @scraped_page_archive ||= ScrapedPageArchive.new(ScrapedPageArchive::GitStorage.new)
+    end
   end
 end
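For context, the README wiring for this adapter is a require plus ordinary Capybara usage; a rough sketch, assuming Poltergeist and PhantomJS are set up:

```ruby
require 'capybara/poltergeist'
require 'scraped_page_archive/capybara'

Capybara.default_driver = :poltergeist
include Capybara::DSL

# Any command not on the skip list above triggers a record, saving the
# page body and response details through the shared memoized archive.
visit('http://example.com/')
```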
data/lib/scraped_page_archive/git_storage.rb
ADDED
@@ -0,0 +1,87 @@
+require 'git'
+require 'English'
+
+class ScrapedPageArchive
+  class GitStorage
+    attr_reader :github_repo_url
+
+    def initialize(github_repo_url = nil)
+      @github_repo_url = (
+        github_repo_url ||
+        ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] ||
+        git_remote_get_url_origin
+      )
+    end
+
+    def path
+      git.dir.path
+    end
+
+    def chdir(&block)
+      git.chdir(&block)
+    end
+
+    # FIXME: This should be refactored so it doesn't have as much knowledge about
+    # the locations of files on the filesystem.
+    def save
+      # NOTE: This is a workaround for a ruby-git bug.
+      # @see https://github.com/schacon/ruby-git/issues/23
+      git.status.changed.each { git.diff.entries }
+
+      files = (git.status.changed.keys + git.status.untracked.keys)
+      return unless files.any?
+      # For each interaction, commit the yml and html along with the correct commit message.
+      files.select { |f| f.end_with?('.yml') }.each do |f|
+        interaction = git.chdir { YAML.load_file(f) }
+        message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
+        git.add([f, f.sub(/\.yml$/, '.html')])
+        git.commit(message)
+      end
+      # FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
+      git.push('origin', branch_name)
+    end
+
+    private
+
+    # TODO: This should be configurable.
+    def branch_name
+      @branch_name ||= 'scraped-pages-archive'
+    end
+
+    def git
+      @git ||= Git.clone(git_url, tmpdir).tap do |g|
+        g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
+        g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
+        if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
+          g.checkout(branch_name)
+        else
+          g.chdir do
+            # FIXME: It's not currently possible to create an orphan branch with ruby-git
+            # @see https://github.com/schacon/ruby-git/pull/140
+            system("git checkout --orphan #{branch_name}")
+            system('git rm --quiet -rf .')
+          end
+          g.commit('Initial commit', allow_empty: true)
+        end
+      end
+    end
+
+    def tmpdir
+      @tmpdir ||= Dir.mktmpdir
+    end
+
+    def git_url
+      @git_url ||= begin
+        url = URI.parse(github_repo_url)
+        url.password = ENV['SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN']
+        url.to_s
+      end
+    end
+
+    def git_remote_get_url_origin
+      remote_url = `git config remote.origin.url`.chomp
+      return nil unless $CHILD_STATUS.success?
+      remote_url
+    end
+  end
+end
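The class also works standalone; a rough sketch of the lifecycle, with a placeholder URL and file names (the `.yml` keys mirror what the adapters write):

```ruby
require 'yaml'
require 'scraped_page_archive/git_storage'

storage = ScrapedPageArchive::GitStorage.new('https://git@github.com/example/some-scraper')

# The first call that touches `git` clones the repo into a tmpdir and
# checks out (or creates) the scraped-pages-archive branch.
storage.chdir do
  File.write('example.com.html', '<html>...</html>')
  File.write('example.com.yml', YAML.dump(
    'request'  => { 'uri' => 'http://example.com/' },
    'response' => { 'status' => { 'code' => 200, 'message' => 'OK' } }
  ))
end

# Commits each changed/untracked .yml/.html pair with a
# "<code> <message> <uri>" message, then pushes the branch to origin.
storage.save
```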
data/lib/scraped_page_archive/open-uri.rb
CHANGED
@@ -6,7 +6,11 @@ module OpenURI
   class << self
     alias __open_uri open_uri
     def open_uri(*args, &block)
-
+      scraped_page_archive.record { __open_uri(*args, &block) }
+    end
+
+    def scraped_page_archive
+      @scraped_page_archive ||= ScrapedPageArchive.new(ScrapedPageArchive::GitStorage.new)
     end
   end
 end
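The memoized `scraped_page_archive` here is the caching fix from the 0.5.0 changelog; a sketch of its effect:

```ruby
require 'scraped_page_archive/open-uri'

# Both fetches are recorded through the same memoized ScrapedPageArchive
# instance, so the archive repo is cloned once per process rather than
# once per request.
first  = open('http://example.com/')
second = open('http://example.com/robots.txt')
```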
data/scraped_page_archive.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email = ['chrismytton@gmail.com']
 
   spec.summary = 'Archives a copy of scraped web pages into a git branch'
-  spec.homepage =
+  spec.homepage = 'https://github.com/everypolitician/scraped_page_archive'
   spec.license = 'MIT'
 
   spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
@@ -25,4 +25,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'rake', '~> 10.0'
   spec.add_development_dependency 'minitest', '~> 5.0'
   spec.add_development_dependency 'pry', '~> 0.10.4'
+  spec.add_development_dependency 'rubocop', '~> 0.42'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraped_page_archive
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.5.0
 platform: ruby
 authors:
 - Chris Mytton
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-
+date: 2016-11-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: vcr-archive
@@ -94,6 +94,20 @@ dependencies:
   - - "~>"
   - !ruby/object:Gem::Version
     version: 0.10.4
+- !ruby/object:Gem::Dependency
+  name: rubocop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+    - !ruby/object:Gem::Version
+      version: '0.42'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+    - !ruby/object:Gem::Version
+      version: '0.42'
 description:
 email:
 - chrismytton@gmail.com
@@ -102,6 +116,8 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".rubocop.yml"
+- ".rubocop_todo.yml"
 - ".travis.yml"
 - CHANGELOG.md
 - Gemfile
@@ -112,6 +128,7 @@ files:
 - bin/setup
 - lib/scraped_page_archive.rb
 - lib/scraped_page_archive/capybara.rb
+- lib/scraped_page_archive/git_storage.rb
 - lib/scraped_page_archive/open-uri.rb
 - lib/scraped_page_archive/version.rb
 - scraped_page_archive.gemspec
@@ -135,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5
+rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: Archives a copy of scraped web pages into a git branch