scraped_page_archive 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +6 -0
- data/.rubocop_todo.yml +36 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +12 -0
- data/README.md +73 -25
- data/Rakefile +3 -0
- data/bin/console +2 -6
- data/lib/scraped_page_archive.rb +18 -72
- data/lib/scraped_page_archive/capybara.rb +14 -12
- data/lib/scraped_page_archive/git_storage.rb +87 -0
- data/lib/scraped_page_archive/open-uri.rb +5 -1
- data/lib/scraped_page_archive/version.rb +1 -1
- data/scraped_page_archive.gemspec +2 -1
- metadata +20 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50b3da0577359ded1637fa4be35716026007b93b
|
4
|
+
data.tar.gz: ff190d0566994bf15b0ec4c28539a4c078969c5d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a5923d6d2b9946188bfcdd614189c677df278f0c70bbbfaa5c1e168f066889aa83e445aa273dbc93e756d593c33dc58c6ad32aabc1838f04410b9b03b57b9a6
|
7
|
+
data.tar.gz: 422ae03a74df61120ca9c79bdaced1617935af0b272c86f0c50366ca01e6d1539a9549d3282dbee997287b668f6603001c94d2d70cbe06dfd1c1d7bce8ca2bdd
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2016-09-26 16:56:34 +0100 using RuboCop version 0.42.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 3
|
10
|
+
Metrics/AbcSize:
|
11
|
+
Max: 45
|
12
|
+
|
13
|
+
# Offense count: 16
|
14
|
+
# Configuration parameters: AllowHeredoc, AllowURI, URISchemes.
|
15
|
+
# URISchemes: http, https
|
16
|
+
Metrics/LineLength:
|
17
|
+
Max: 132
|
18
|
+
|
19
|
+
# Offense count: 3
|
20
|
+
# Configuration parameters: CountComments.
|
21
|
+
Metrics/MethodLength:
|
22
|
+
Max: 18
|
23
|
+
|
24
|
+
# Offense count: 2
|
25
|
+
# Configuration parameters: EnforcedStyle, SupportedStyles.
|
26
|
+
# SupportedStyles: nested, compact
|
27
|
+
Style/ClassAndModuleChildren:
|
28
|
+
Exclude:
|
29
|
+
- 'lib/scraped_page_archive/capybara.rb'
|
30
|
+
- 'test/test_helper.rb'
|
31
|
+
|
32
|
+
# Offense count: 1
|
33
|
+
# Configuration parameters: ExpectMatchingDefinition, Regex, IgnoreExecutableScripts.
|
34
|
+
Style/FileName:
|
35
|
+
Exclude:
|
36
|
+
- 'lib/scraped_page_archive/open-uri.rb'
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -5,6 +5,16 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
5
5
|
|
6
6
|
## [Unreleased]
|
7
7
|
|
8
|
+
## [0.5.0] - 2016-11-03
|
9
|
+
|
10
|
+
### Changes
|
11
|
+
|
12
|
+
- The git storage logic has been pulled into its own class. This means that you need to pass a `ScrapedPageArchive::GitStorage` instance to the `ScrapedPageArchive` constructor if you're using the class directly. See the ["Running on other platforms" section in README.md](README.md#running-on-other-platforms) for more details.
|
13
|
+
|
14
|
+
### Fixes
|
15
|
+
|
16
|
+
- Avoid recloning the whole repo for each request by caching the `ScrapedPageArchive` instance in the `open-uri` and `capybara` adapters.
|
17
|
+
|
8
18
|
## [0.4.1] - 2016-08-15
|
9
19
|
|
10
20
|
### Fixes
|
@@ -49,3 +59,5 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
49
59
|
[0.3.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.2.0...v0.3.0
|
50
60
|
[0.3.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.0...v0.3.1
|
51
61
|
[0.4.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.3.1...v0.4.0
|
62
|
+
[0.4.1]: https://github.com/everypolitician/scraped_page_archive/compare/v0.4.0...v0.4.1
|
63
|
+
[0.5.0]: https://github.com/everypolitician/scraped_page_archive/compare/v0.4.1...v0.5.0
|
data/README.md
CHANGED
@@ -21,46 +21,62 @@ Or install it yourself as:
|
|
21
21
|
|
22
22
|
## Usage
|
23
23
|
|
24
|
-
|
24
|
+
### Running locally
|
25
|
+
|
26
|
+
#### Use with open-uri
|
27
|
+
|
28
|
+
If you’re running a scraper locally, and the library can auto-detect
|
29
|
+
what repo it’s in, and find your credentials, all you need to do for an
|
30
|
+
`open-uri` based scraper is add a `require` line:
|
25
31
|
|
26
32
|
```ruby
|
27
|
-
require 'scraped_page_archive'
|
33
|
+
require 'scraped_page_archive/open-uri'
|
34
|
+
response = open('http://example.com/')
|
35
|
+
# Use the response...
|
28
36
|
```
|
29
37
|
|
30
|
-
|
38
|
+
As your scraper fetches any page it will also commit a copy of the
|
39
|
+
response (and the headers), into a `scraped-pages-archive` branch.
|
40
|
+
|
41
|
+
### Running on other platforms
|
31
42
|
|
32
|
-
If you
|
43
|
+
If you are not running your app locally, or it can’t auto-detect the
|
44
|
+
information it needs to be able to do the archiving, then you need to
|
45
|
+
provide some extra configuration — specifically the URL to your repo and
|
46
|
+
a GitHub access token.
|
33
47
|
|
34
|
-
|
35
|
-
|
36
|
-
|
48
|
+
[Generate a GitHub access token here](https://github.com/settings/tokens):
|
49
|
+
it will need to have the `repo` permission checked. Then combine it with
|
50
|
+
the details of your repo to produce a setting in the form:
|
37
51
|
|
38
|
-
You can also set this to any value (including another environment variable of your choosing) with the following:
|
39
52
|
|
40
53
|
```ruby
|
41
|
-
|
54
|
+
REPO = 'https://YOUR_GITHUB_TOKEN@github.com/everypolitician-scrapers/kenya-mzalendo'
|
55
|
+
storage = ScrapedPageArchive::GitStorage.new(REPO)
|
56
|
+
archive = ScrapedPageArchive.new(storage)
|
57
|
+
archive.record { open('http://example.com/') }
|
42
58
|
```
|
43
59
|
|
44
|
-
|
60
|
+
(Though, obviously, you’ll want your own scraper details there rather than
|
61
|
+
`everypolitician-scrapers/kenya-mzalendo`!)
|
45
62
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
# Use the response...
|
50
|
-
end
|
51
|
-
```
|
63
|
+
IMPORTANT: Remember not to share your GitHub access token. Don’t include
|
64
|
+
it in your code, especially if it lives in a public repo. Normal usage
|
65
|
+
would be to set this from an environment variable.
|
52
66
|
|
53
|
-
|
67
|
+
#### Use with Morph
|
54
68
|
|
55
|
-
If you
|
69
|
+
If you’re using the excellent [morph.io](https://morph.io), you can set
|
70
|
+
your repo URL configuration in the "Secret environment variables"
|
71
|
+
section of the scraper’s Settings page. We automatically check if
|
72
|
+
`MORPH_SCRAPER_CACHE_GITHUB_REPO_URL` is set — there’s no need to
|
73
|
+
explicitly pass it to `ScrapedPageArchive::GitStorage.new` in this
|
74
|
+
case.
|
56
75
|
|
57
|
-
```ruby
|
58
|
-
require 'scraped_page_archive/open-uri'
|
59
|
-
response = open('http://example.com/')
|
60
|
-
# Use the response...
|
61
|
-
```
|
62
76
|
|
63
|
-
###
|
77
|
+
### More complex scenarios
|
78
|
+
|
79
|
+
#### Use with the Capybara Poltergeist driver
|
64
80
|
|
65
81
|
If you would like to have your http requests automatically recorded when using the Poltergeist driver in Capybara do the following:
|
66
82
|
|
@@ -73,6 +89,18 @@ visit('http://example.com/')
|
|
73
89
|
It should be possible to adapt this to work with other Capybara drivers
|
74
90
|
fairly easily.
|
75
91
|
|
92
|
+
#### Use with `ScrapedPageArchive.record`
|
93
|
+
|
94
|
+
You can have complete control and record http requests by performing them in a block passed to `ScrapedPageArchive.record`:
|
95
|
+
|
96
|
+
```ruby
|
97
|
+
require 'scraped_page_archive'
|
98
|
+
ScrapedPageArchive.record do
|
99
|
+
response = open('http://example.com/')
|
100
|
+
# Use the response...
|
101
|
+
end
|
102
|
+
```
|
103
|
+
|
76
104
|
## Development
|
77
105
|
|
78
106
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -82,10 +110,30 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
82
110
|
Note that this does not install Capybara or any drivers so if you want
|
83
111
|
to work on that you will need to do that.
|
84
112
|
|
113
|
+
### Releases
|
114
|
+
|
115
|
+
After you've added a new feature or fixed a bug you should release the gem to rubygems.org.
|
116
|
+
|
117
|
+
#### Before releasing a new version
|
118
|
+
|
119
|
+
- [ ] Is your new feature/bugfix documented in [`CHANGELOG.md`](CHANGELOG.md)?
|
120
|
+
- [ ] Have you added a section for the new version in [`CHANGELOG.md`](CHANGELOG.md)?
|
121
|
+
- [ ] Have you updated `ScrapedPageArchive::VERSION` according to [SemVer](http://semver.org/)?
|
122
|
+
- [ ] Are all of the changes that you want included in the release on the `master` branch?
|
123
|
+
|
124
|
+
#### Releasing a new version
|
125
|
+
|
126
|
+
If you wanted to release version `0.42.0`, for example, you would need to run the following commands:
|
127
|
+
|
128
|
+
git tag -a -m "scraped_page_archive v0.42.0" v0.42.0
|
129
|
+
git push origin --tags
|
130
|
+
|
131
|
+
Then Travis CI will notice that you've pushed a new tag and will release the new version of the gem.
|
132
|
+
|
85
133
|
## Contributing
|
86
134
|
|
87
135
|
Bug reports and pull requests are welcome on GitHub at https://github.com/everypolitician/scraped_page_archive.
|
88
136
|
|
89
137
|
## License
|
90
138
|
|
91
|
-
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT)
|
139
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT)
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -6,9 +6,5 @@ require 'scraped_page_archive'
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
# Pry.start
|
12
|
-
|
13
|
-
require 'irb'
|
14
|
-
IRB.start
|
9
|
+
require 'pry'
|
10
|
+
Pry.start
|
data/lib/scraped_page_archive.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'scraped_page_archive/version'
|
2
|
+
require 'scraped_page_archive/git_storage'
|
2
3
|
require 'vcr'
|
3
|
-
require 'git'
|
4
4
|
require 'vcr/archive'
|
5
5
|
|
6
6
|
VCR.configure do |config|
|
@@ -14,40 +14,32 @@ end
|
|
14
14
|
class ScrapedPageArchive
|
15
15
|
class Error < StandardError; end
|
16
16
|
|
17
|
-
attr_writer :github_repo_url
|
18
|
-
|
19
17
|
def self.record(*args, &block)
|
20
|
-
new.record(*args, &block)
|
18
|
+
new(GitStorage.new).record(*args, &block)
|
21
19
|
end
|
22
20
|
|
23
|
-
|
24
|
-
if github_repo_url.nil?
|
25
|
-
warn "Could not determine git repo for 'scraped_page_archive' to use.\n\n" \
|
26
|
-
"See https://github.com/everypolitician/scraped_page_archive#usage for details."
|
27
|
-
return block.call
|
28
|
-
end
|
29
|
-
ret = VCR.use_cassette('', &block)
|
21
|
+
attr_reader :storage
|
30
22
|
|
31
|
-
|
32
|
-
|
33
|
-
|
23
|
+
def initialize(storage)
|
24
|
+
@storage = storage
|
25
|
+
end
|
34
26
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
git.commit(message)
|
27
|
+
def record(&block)
|
28
|
+
if storage.github_repo_url.nil?
|
29
|
+
warn "The 'scraped_page_archive' gem wants to store the scraped pages in a git repo," \
|
30
|
+
'but it cannot determine which git repo it should use. See ' \
|
31
|
+
'https://github.com/everypolitician/scraped_page_archive#usage for details of how ' \
|
32
|
+
"to specify the repo.\n\n"
|
33
|
+
return yield
|
43
34
|
end
|
44
|
-
|
45
|
-
|
35
|
+
VCR::Archive::Persister.storage_location = storage.path
|
36
|
+
ret = VCR.use_cassette('', &block)
|
37
|
+
storage.save
|
46
38
|
ret
|
47
39
|
end
|
48
40
|
|
49
|
-
def open_from_archive(url
|
50
|
-
|
41
|
+
def open_from_archive(url)
|
42
|
+
storage.chdir do
|
51
43
|
filename = filename_from_url(url.to_s)
|
52
44
|
meta = YAML.load_file(filename + '.yml') if File.exist?(filename + '.yml')
|
53
45
|
response_body = File.read(filename + '.html') if File.exist?(filename + '.html')
|
@@ -70,50 +62,4 @@ class ScrapedPageArchive
|
|
70
62
|
response.base_uri = URI.parse(meta['request']['uri'])
|
71
63
|
end
|
72
64
|
end
|
73
|
-
|
74
|
-
# TODO: This should be configurable.
|
75
|
-
def branch_name
|
76
|
-
@branch_name ||= 'scraped-pages-archive'
|
77
|
-
end
|
78
|
-
|
79
|
-
def git
|
80
|
-
@git ||= Git.clone(git_url, tmpdir).tap do |g|
|
81
|
-
g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
|
82
|
-
g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
|
83
|
-
VCR::Archive::Persister.storage_location = g.dir.path
|
84
|
-
if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
|
85
|
-
g.checkout(branch_name)
|
86
|
-
else
|
87
|
-
g.chdir do
|
88
|
-
# FIXME: It's not currently possible to create an orphan branch with ruby-git
|
89
|
-
# @see https://github.com/schacon/ruby-git/pull/140
|
90
|
-
system("git checkout --orphan #{branch_name}")
|
91
|
-
system("git rm --quiet -rf .")
|
92
|
-
end
|
93
|
-
g.commit("Initial commit", allow_empty: true)
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
def tmpdir
|
99
|
-
@tmpdir ||= Dir.mktmpdir
|
100
|
-
end
|
101
|
-
|
102
|
-
def git_url
|
103
|
-
@git_url ||= begin
|
104
|
-
url = URI.parse(github_repo_url)
|
105
|
-
url.password = ENV['SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN']
|
106
|
-
url.to_s
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
def github_repo_url
|
111
|
-
@github_repo_url ||= (ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] || git_remote_get_url_origin)
|
112
|
-
end
|
113
|
-
|
114
|
-
def git_remote_get_url_origin
|
115
|
-
remote_url = `git config remote.origin.url`.chomp
|
116
|
-
return nil unless $?.success?
|
117
|
-
remote_url
|
118
|
-
end
|
119
65
|
end
|
@@ -23,29 +23,28 @@ module Capybara::Poltergeist
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def get_details(url)
|
26
|
-
status_code = page.status_code
|
27
26
|
{
|
28
|
-
'request'
|
27
|
+
'request' => {
|
29
28
|
'method' => 'get', # assume this as no way to access it
|
30
|
-
'uri'
|
29
|
+
'uri' => url,
|
31
30
|
},
|
32
31
|
'response' => {
|
33
32
|
'status' => {
|
34
33
|
'message' => status_code == 200 ? 'OK' : 'NOT OK',
|
35
|
-
'code'
|
34
|
+
'code' => status_code,
|
36
35
|
},
|
37
|
-
'date'
|
38
|
-
}
|
36
|
+
'date' => [response_headers['Date']],
|
37
|
+
},
|
39
38
|
}
|
40
39
|
end
|
41
40
|
|
42
41
|
def save_request(html, details, url)
|
43
42
|
html_path, yaml_path = get_paths(url)
|
44
43
|
|
45
|
-
File.open(html_path,
|
44
|
+
File.open(html_path, 'w') do |f|
|
46
45
|
f.write(html)
|
47
46
|
end
|
48
|
-
File.open(yaml_path,
|
47
|
+
File.open(yaml_path, 'w') do |f|
|
49
48
|
f.write(YAML.dump(details))
|
50
49
|
end
|
51
50
|
end
|
@@ -54,12 +53,15 @@ module Capybara::Poltergeist
|
|
54
53
|
result = __command(name, *args)
|
55
54
|
# we skip these methods because they are called a lot, don't cause the page
|
56
55
|
# to change and having record round them slows things down quite a bit.
|
57
|
-
return result if
|
58
|
-
|
59
|
-
|
60
|
-
save_request(page.html, get_details(current_url), current_url)
|
56
|
+
return result if %w(tag_name visible property find body set_js_errors current_url status_code response_headers).include?(name)
|
57
|
+
scraped_page_archive.record do
|
58
|
+
save_request(body, get_details(current_url), current_url)
|
61
59
|
end
|
62
60
|
result
|
63
61
|
end
|
62
|
+
|
63
|
+
def scraped_page_archive
|
64
|
+
@scraped_page_archive ||= ScrapedPageArchive.new(ScrapedPageArchive::GitStorage.new)
|
65
|
+
end
|
64
66
|
end
|
65
67
|
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'git'
|
2
|
+
require 'English'
|
3
|
+
|
4
|
+
class ScrapedPageArchive
|
5
|
+
class GitStorage
|
6
|
+
attr_reader :github_repo_url
|
7
|
+
|
8
|
+
def initialize(github_repo_url = nil)
|
9
|
+
@github_repo_url = (
|
10
|
+
github_repo_url ||
|
11
|
+
ENV['MORPH_SCRAPER_CACHE_GITHUB_REPO_URL'] ||
|
12
|
+
git_remote_get_url_origin
|
13
|
+
)
|
14
|
+
end
|
15
|
+
|
16
|
+
def path
|
17
|
+
git.dir.path
|
18
|
+
end
|
19
|
+
|
20
|
+
def chdir(&block)
|
21
|
+
git.chdir(&block)
|
22
|
+
end
|
23
|
+
|
24
|
+
# FIXME: This should be refactored so it doesn't have as much knowledge about
|
25
|
+
# the locations of files on the filesystem.
|
26
|
+
def save
|
27
|
+
# NOTE: This is a workaround for a ruby-git bug.
|
28
|
+
# @see https://github.com/schacon/ruby-git/issues/23
|
29
|
+
git.status.changed.each { git.diff.entries }
|
30
|
+
|
31
|
+
files = (git.status.changed.keys + git.status.untracked.keys)
|
32
|
+
return unless files.any?
|
33
|
+
# For each interaction, commit the yml and html along with the correct commit message.
|
34
|
+
files.select { |f| f.end_with?('.yml') }.each do |f|
|
35
|
+
interaction = git.chdir { YAML.load_file(f) }
|
36
|
+
message = "#{interaction['response']['status'].values_at('code', 'message').join(' ')} #{interaction['request']['uri']}"
|
37
|
+
git.add([f, f.sub(/\.yml$/, '.html')])
|
38
|
+
git.commit(message)
|
39
|
+
end
|
40
|
+
# FIXME: Auto-pushing should be optional if the user wants to manually do it at the end.
|
41
|
+
git.push('origin', branch_name)
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
# TODO: This should be configurable.
|
47
|
+
def branch_name
|
48
|
+
@branch_name ||= 'scraped-pages-archive'
|
49
|
+
end
|
50
|
+
|
51
|
+
def git
|
52
|
+
@git ||= Git.clone(git_url, tmpdir).tap do |g|
|
53
|
+
g.config('user.name', "scraped_page_archive gem #{ScrapedPageArchive::VERSION}")
|
54
|
+
g.config('user.email', "scraped_page_archive-#{ScrapedPageArchive::VERSION}@scrapers.everypolitician.org")
|
55
|
+
if g.branches[branch_name] || g.branches["origin/#{branch_name}"]
|
56
|
+
g.checkout(branch_name)
|
57
|
+
else
|
58
|
+
g.chdir do
|
59
|
+
# FIXME: It's not currently possible to create an orphan branch with ruby-git
|
60
|
+
# @see https://github.com/schacon/ruby-git/pull/140
|
61
|
+
system("git checkout --orphan #{branch_name}")
|
62
|
+
system('git rm --quiet -rf .')
|
63
|
+
end
|
64
|
+
g.commit('Initial commit', allow_empty: true)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def tmpdir
|
70
|
+
@tmpdir ||= Dir.mktmpdir
|
71
|
+
end
|
72
|
+
|
73
|
+
def git_url
|
74
|
+
@git_url ||= begin
|
75
|
+
url = URI.parse(github_repo_url)
|
76
|
+
url.password = ENV['SCRAPED_PAGE_ARCHIVE_GITHUB_TOKEN']
|
77
|
+
url.to_s
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
def git_remote_get_url_origin
|
82
|
+
remote_url = `git config remote.origin.url`.chomp
|
83
|
+
return nil unless $CHILD_STATUS.success?
|
84
|
+
remote_url
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
@@ -6,7 +6,11 @@ module OpenURI
|
|
6
6
|
class << self
|
7
7
|
alias __open_uri open_uri
|
8
8
|
def open_uri(*args, &block)
|
9
|
-
|
9
|
+
scraped_page_archive.record { __open_uri(*args, &block) }
|
10
|
+
end
|
11
|
+
|
12
|
+
def scraped_page_archive
|
13
|
+
@scraped_page_archive ||= ScrapedPageArchive.new(ScrapedPageArchive::GitStorage.new)
|
10
14
|
end
|
11
15
|
end
|
12
16
|
end
|
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['chrismytton@gmail.com']
|
11
11
|
|
12
12
|
spec.summary = 'Archives a copy of scraped web pages into a git branch'
|
13
|
-
spec.homepage =
|
13
|
+
spec.homepage = 'https://github.com/everypolitician/scraped_page_archive'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
@@ -25,4 +25,5 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.add_development_dependency 'rake', '~> 10.0'
|
26
26
|
spec.add_development_dependency 'minitest', '~> 5.0'
|
27
27
|
spec.add_development_dependency 'pry', '~> 0.10.4'
|
28
|
+
spec.add_development_dependency 'rubocop', '~> 0.42'
|
28
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraped_page_archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Mytton
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: vcr-archive
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: 0.10.4
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubocop
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.42'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.42'
|
97
111
|
description:
|
98
112
|
email:
|
99
113
|
- chrismytton@gmail.com
|
@@ -102,6 +116,8 @@ extensions: []
|
|
102
116
|
extra_rdoc_files: []
|
103
117
|
files:
|
104
118
|
- ".gitignore"
|
119
|
+
- ".rubocop.yml"
|
120
|
+
- ".rubocop_todo.yml"
|
105
121
|
- ".travis.yml"
|
106
122
|
- CHANGELOG.md
|
107
123
|
- Gemfile
|
@@ -112,6 +128,7 @@ files:
|
|
112
128
|
- bin/setup
|
113
129
|
- lib/scraped_page_archive.rb
|
114
130
|
- lib/scraped_page_archive/capybara.rb
|
131
|
+
- lib/scraped_page_archive/git_storage.rb
|
115
132
|
- lib/scraped_page_archive/open-uri.rb
|
116
133
|
- lib/scraped_page_archive/version.rb
|
117
134
|
- scraped_page_archive.gemspec
|
@@ -135,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
135
152
|
version: '0'
|
136
153
|
requirements: []
|
137
154
|
rubyforge_project:
|
138
|
-
rubygems_version: 2.5
|
155
|
+
rubygems_version: 2.4.5
|
139
156
|
signing_key:
|
140
157
|
specification_version: 4
|
141
158
|
summary: Archives a copy of scraped web pages into a git branch
|