gov_scooper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5b86b619b7752b3318c28f910474c06fc0f15d8a
4
+ data.tar.gz: a0f45a04ca8a897e634fe956a426ce7dc1a1f49c
5
+ SHA512:
6
+ metadata.gz: 4cfee6dc74f3563ec7443025855fb548738f1bd379f889cfb396fef171606a10a77ceb7b77f4fcb7145d8250904506e0bed1cd2685e192d96bafee6a5203d831
7
+ data.tar.gz: 10662831c4924da06a8ecf1c4d0feedd1059918cac3bad3209f4acc77f93cc297e0c2662f3fa766f99532d78f0448bdea0fb639154e2d1a3a2c3151f9b91f504
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.3
5
+ before_install: gem install bundler -v 1.13.6
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in gov_scooper.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 Jack Reed
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # GovScooper
2
+
3
+ Scoopin' up all of the metadata from Data.gov. GovScooper is a paginated harvester of the Data.gov CKAN API. It also enables you to save that metadata in a pairtree.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'gov_scooper'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install gov_scooper
20
+
21
+ ## Usage
22
+
23
+ ```ruby
24
+ metadata_enumerable = DataGov::API.new.search
25
+ metadata_enumerable.count #=> 10
26
+ metadata_enumerable.map { |md| DataGov::Dataset.new(md).save_ckan_metadata }
27
+ # metadata is now saved in a pairtree directory structure based off of id
28
+
29
+ # Download the resources
30
+ DataGov::Dataset.from_id('8a311c18-3060-438e-a8c7-9e37bcde6529').resources.map { |r| r.download }
31
+ ```
32
+
33
+ ## Development
34
+
35
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
36
+
37
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
38
+
39
+ ## Contributing
40
+
41
+ Bug reports and pull requests are welcome on GitHub at https://github.com/mejackreed/GovScooper.
42
+
43
+
44
+ ## License
45
+
46
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ Dir.glob('lib/tasks/*.rake').each { |r| load r}
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "gov_scooper"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gov_scooper/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'gov_scooper'
8
+ spec.version = GovScooper::VERSION
9
+ spec.authors = ['Jack Reed']
10
+ spec.email = ['phillipjreed@gmail.com']
11
+
12
+ spec.summary = 'Scooper of data.gov metadata'
13
+ spec.description = 'Scooper of data.gov metadata'
14
+ spec.homepage = 'https://github.com/mejackreed/gov_scooper'
15
+ spec.license = 'MIT'
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
18
+ f.match(%r{^(test|spec|features)/})
19
+ end
20
+ spec.bindir = 'exe'
21
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
22
+ spec.require_paths = ['lib']
23
+
24
+ spec.add_dependency 'faraday'
25
+ spec.add_dependency 'faraday_middleware'
26
+ spec.add_dependency 'pairtree'
27
+ spec.add_dependency 'mime-types'
28
+ spec.add_dependency 'ruby-progressbar'
29
+ spec.add_dependency 'open_uri_redirections'
30
+
31
+ spec.add_development_dependency 'bundler', '~> 1.13'
32
+ spec.add_development_dependency 'rake', '~> 10.0'
33
+ spec.add_development_dependency 'rspec', '~> 3.0'
34
+ end
@@ -0,0 +1,17 @@
1
module DataGov
  ##
  # Entry point for the Data.gov catalog endpoints used by this gem. Every
  # request is delegated to a lazily built DataGov::Client.
  class API
    ##
    # The client that performs the HTTP requests; built on first use and reused.
    #
    # @return [DataGov::Client]
    def client
      @client ||= DataGov::Client.new
    end

    ##
    # Searches the catalog's package_search action, page by page.
    #
    # @param params [Hash] options handed through to Client#paginated_get
    # @return [Enumerator] yields each entry of the response's 'results' list
    def search(params = {})
      endpoint = '/api/3/action/package_search'
      client.paginated_get(endpoint, 'results', params)
    end

    ##
    # Fetches a single harvest object.
    #
    # @param id [String] harvest object identifier, interpolated into the path
    # @return whatever Client#get returns for that path
    def harvest_object(id)
      path = "/harvest/object/#{id}"
      client.get(path)
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'faraday'
2
+ require 'faraday_middleware'
3
+
4
module DataGov
  ##
  # Small HTTP client for https://catalog.data.gov that returns parsed JSON
  # bodies and can walk paginated list endpoints.
  class Client
    ##
    # Issues a GET request and prints "<status> <url>" to stdout.
    #
    # @param path [String] request path relative to the catalog host
    # @param params [Hash] query-string parameters
    # @return [Hash] a parsed JSON hash
    def get(path, params = {})
      response = connection.get(path) do |req|
        req.params = params
      end
      puts "#{response.status} #{response.env.url}"
      response.body
    end

    ##
    # Lazily walks a paginated endpoint, yielding one element at a time.
    #
    # Recognised keys in +options+ (all other keys are sent as query params):
    # * +:rows+  - page size requested from the server (default 5)
    # * +:start+ - zero-based offset of the first record (default 0)
    # * +:max+   - maximum number of elements to yield (default 10)
    #
    # @param path [String] request path
    # @param accessor [String] key under 'result' that holds the page's list
    # @param options [Hash] pagination settings plus extra query params
    # @return [Enumerator] yields at most +:max+ elements
    def paginated_get(path, accessor, options = {})
      Enumerator.new do |yielder|
        params = options.dup
        rows = params.delete(:rows) { 5 }
        # CKAN's `start` is an offset, so 0 (not 1) addresses the first record.
        start = params.delete(:start) { 0 }
        max = params.delete(:max) { 10 }
        total = 0

        while total < max
          data = get(path, { rows: rows, start: start }.merge(params))
          result = data['result']
          page = result[accessor]
          # An empty page means there is nothing left; without this check a
          # non-zero start offset could spin forever.
          break if page.empty?

          # Never hand out more than the caller asked for, even mid-page.
          batch = page.first(max - total)
          batch.each { |element| yielder.yield element }
          total += batch.length

          # Advance by what was actually returned: the server may cap `rows`.
          start += page.length
          puts "total: #{total}, max: #{max}, count: #{result['count']}"
          break if start >= result['count']
        end
      end
    end

    private

    ##
    # Memoized Faraday connection with JSON response parsing and timeouts.
    def connection
      @connection ||= begin
        conn = Faraday.new(url: 'https://catalog.data.gov') do |faraday|
          faraday.response :json
          # The adapter goes last so every middleware wraps the request.
          faraday.adapter Faraday.default_adapter
        end
        conn.options.timeout = 60
        conn.options.open_timeout = 10
        conn
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
require 'json'
require 'pairtree'
3
+
4
module DataGov
  ##
  # One dataset's CKAN metadata together with its on-disk pairtree storage.
  class Dataset
    attr_accessor :id, :ckan_metadata

    ##
    # Builds an instance by reading a previously saved 'ckan.json'.
    #
    # @param id [String] dataset id used to locate the pairtree object
    # @return [DataGov::Dataset]
    def self.from_id(id)
      # Start from empty metadata; the id must be set before the pairtree
      # object can be located.
      instance = new({})
      instance.id = id
      instance.ckan_metadata = JSON.parse(instance.pairtree.read('ckan.json'))
      instance
    end

    ##
    # @param ckan_metadata [Hash] parsed CKAN package record; its 'id' entry
    #   becomes the dataset id
    def initialize(ckan_metadata)
      @ckan_metadata = ckan_metadata || {}
      @id = @ckan_metadata['id']
    end

    ##
    # Writes the metadata as pretty-printed JSON to 'ckan.json' in the
    # dataset's pairtree object.
    def save_ckan_metadata
      pairtree.open('ckan.json', 'w') do |io|
        io.write(JSON.pretty_generate(ckan_metadata))
      end
    end

    ##
    # @return [Array<DataGov::Resource>] one wrapper per entry under the
    #   metadata's 'resources' key (empty when the key is absent); memoized
    def resources
      @resources ||= (ckan_metadata['resources'] || []).map do |resource|
        DataGov::Resource.new(resource, self)
      end
    end

    ##
    # Downloads every resource of this dataset.
    #
    # @return [Array] the result of each Resource#download call
    def download_resources
      puts "Downloading resources for #{id}"
      resources.map(&:download)
    end

    ##
    # Pairtree object for this dataset, created on demand. The object name is
    # the dataset id with dashes removed.
    def pairtree
      @pairtree ||= Pairtree.at(pairtree_location, create: true)
                            .mk(id.delete('-'))
    end

    ##
    # @return [String] pairtree root directory taken from DATA_DIR
    # @raise [KeyError] when DATA_DIR is not set
    def pairtree_location
      ENV.fetch('DATA_DIR')
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'mime/types'
2
+ require 'open-uri'
3
+ require 'open_uri_redirections'
4
+ require 'ruby-progressbar'
5
+
6
module DataGov
  ##
  # A downloadable file that belongs to a dataset.
  class Resource
    # Used when the resolved MIME type has no preferred extension, so file
    # names never end in a bare dot.
    FALLBACK_EXTENSION = 'bin'.freeze

    attr_reader :metadata, :dataset

    ##
    # @param metadata [Hash] resource record; 'id', 'url' and 'mimetype' are read
    # @param dataset [DataGov::Dataset] owner whose pairtree receives the file
    def initialize(metadata, dataset)
      @metadata = metadata
      @dataset = dataset
    end

    ##
    # Downloads the resource into the dataset's pairtree unless a file with
    # the same name is already there. Failures are reported on stdout and
    # swallowed so that one bad resource does not stop a batch.
    #
    # @return [Object, nil] result of the pairtree write, or nil when the
    #   download was skipped or failed
    def download
      if dataset.pairtree.exists?(file_name)
        puts "#{file_name} already exists, skipping download"
        return
      end
      pbar = ProgressBar.create(title: file_name, total: nil)
      begin
        uri = URI.parse(metadata['url'])
        # Only URI classes that mix in OpenURI::OpenRead respond to #open;
        # going through the parsed URI avoids Kernel#open, which would treat a
        # "|cmd" string as a command to run.
        raise ArgumentError, "unsupported URL: #{metadata['url']}" unless uri.respond_to?(:open)

        # The block form closes the remote IO once the copy is done.
        uri.open(download_options(pbar)) do |remote|
          dataset.pairtree.open(file_name, 'w') { |io| IO.copy_stream(remote, io) }
        end
      rescue StandardError => e
        puts e
      end
    end

    ##
    # @return [String] "<resource id>.<extension>"
    def file_name
      "#{metadata['id']}.#{extension}"
    end

    ##
    # @return [String] preferred extension of the MIME type, or the fallback
    def extension
      mimetype.preferred_extension || FALLBACK_EXTENSION
    end

    ##
    # @return [MIME::Type] type registered for metadata['mimetype'], or
    #   text/plain when none is found
    def mimetype
      MIME::Types[metadata['mimetype']].first || MIME::Types['text/plain'].first
    end

    private

    ##
    # open-uri options that follow safe redirects and drive the progress bar.
    #
    # @param pbar [ProgressBar] bar to update while the body is received
    # @return [Hash]
    def download_options(pbar)
      {
        allow_redirections: :safe,
        content_length_proc: lambda do |content_length|
          pbar.total = content_length if content_length && 0 < content_length
        end,
        progress_proc: lambda do |transferred|
          if pbar.total
            # open-uri reports the cumulative size received so far, so assign
            # it rather than adding it; clamp so the bar never passes total.
            pbar.progress = [transferred, pbar.total].min
          else
            pbar.increment
          end
        end
      }
    end
  end
end
data/lib/data_gov.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'data_gov/API'
2
+ require 'data_gov/client'
3
+ require 'data_gov/dataset'
4
+ require 'data_gov/resource'
@@ -0,0 +1,3 @@
1
module GovScooper
  # Gem version string; frozen so it cannot be mutated at runtime.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,6 @@
1
+ require 'gov_scooper/version'
2
+ require 'data_gov'
3
+
4
+ module GovScooper
5
+ # Your code goes here...
6
+ end
@@ -0,0 +1,76 @@
1
+ require 'json'
2
+ require 'gov_scooper'
3
+ require 'fileutils'
4
+ # require 'pry'
5
+
6
namespace :gov_scooper do
  # Reads DATA_DIR, failing with a descriptive message when it is unset.
  # (ENV.fetch without a block raises KeyError before any nil check can run.)
  data_dir = lambda do
    ENV.fetch('DATA_DIR') do
      raise 'Please provide environment variable DATA_DIR'\
            ' for opengeometdata directory location'
    end
  end

  # Path of the id => relative-directory index inside a data directory.
  layers_index = ->(root) { File.join(root, 'pairtree_root', 'layers.json') }

  desc 'Sample OGM data and move it to a new directory'
  task :sample do
    # GS_SAMPLE is optional: sample one layer when it is not set.
    number = Integer(ENV.fetch('GS_SAMPLE', '1'))
    ogm_loc = data_dir.call
    sample_output = ENV.fetch('GS_OUTPUT') do
      raise 'Please provide output directory GS_OUTPUT'
    end

    layers = JSON.parse(File.read(layers_index.call(ogm_loc)))
    puts "#{layers.length} layers found"
    random_layers = layers.to_a.sample(number).to_h
    puts "Sampling #{random_layers.length} layers"
    random_layers.each_value do |value|
      output = File.join(sample_output, 'pairtree_root', value)
      FileUtils.mkdir_p output
      Dir[File.join(ogm_loc, 'pairtree_root', value, '*')].each do |file_name|
        next if File.directory? file_name
        puts "Copying #{file_name}"
        FileUtils.cp file_name, output
      end
    end

    begin
      # The nested tasks read DATA_DIR, so point it at the sample directory
      # while they run.
      ENV['DATA_DIR'] = sample_output
      Rake::Task['gov_scooper:create_layers_json'].invoke
      Rake::Task['gov_scooper:download_data'].invoke
    ensure
      # Restore the caller's setting even when a nested task fails.
      ENV['DATA_DIR'] = ogm_loc
    end
  end

  desc 'Download data for layers in a given directory - Be careful with this'
  task :download_data do
    ogm_loc = data_dir.call

    layers = JSON.parse(File.read(layers_index.call(ogm_loc)))
    puts "#{layers.length} layers found"
    resource_count = 0
    # layers.json maps dataset id => relative directory; only the id is needed.
    layers.each_key do |dataset_id|
      dataset = DataGov::Dataset.from_id(dataset_id)
      puts "Downloading from dataset #{dataset.id}"
      resources = dataset.resources
      resource_count += resources.length
      resources.each(&:download)
    end
    puts "#{resource_count} total resources"
  end

  desc 'Create layers.json'
  task :create_layers_json do
    ogm_loc = data_dir.call
    layers = Dir[File.join(ogm_loc, 'pairtree_root', '**', 'ckan.json')]
    # Start from an empty hash so a tree without any ckan.json still yields
    # valid JSON ({}) rather than null.
    index = layers.each_with_object({}) do |f, acc|
      d = DataGov::Dataset.new(JSON.parse(File.read(f)))
      acc[d.id] = f.sub(%r{.*pairtree_root/}, '').sub('ckan.json', '')
    end
    puts "layers.json created for #{layers.count} files"
    File.open(layers_index.call(ogm_loc), 'w') do |io|
      io.write(JSON.pretty_generate(index))
    end
  end
end
metadata ADDED
@@ -0,0 +1,188 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gov_scooper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jack Reed
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-01-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: faraday
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faraday_middleware
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pairtree
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mime-types
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ruby-progressbar
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: open_uri_redirections
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: bundler
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.13'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.13'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rake
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '10.0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '10.0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rspec
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '3.0'
139
+ description: Scooper of data.gov metadata
140
+ email:
141
+ - phillipjreed@gmail.com
142
+ executables: []
143
+ extensions: []
144
+ extra_rdoc_files: []
145
+ files:
146
+ - ".gitignore"
147
+ - ".rspec"
148
+ - ".travis.yml"
149
+ - Gemfile
150
+ - LICENSE.txt
151
+ - README.md
152
+ - Rakefile
153
+ - bin/console
154
+ - bin/setup
155
+ - gov_scooper.gemspec
156
+ - lib/data_gov.rb
157
+ - lib/data_gov/API.rb
158
+ - lib/data_gov/client.rb
159
+ - lib/data_gov/dataset.rb
160
+ - lib/data_gov/resource.rb
161
+ - lib/gov_scooper.rb
162
+ - lib/gov_scooper/version.rb
163
+ - lib/tasks/gov_scooper.rake
164
+ homepage: https://github.com/mejackreed/gov_scooper
165
+ licenses:
166
+ - MIT
167
+ metadata: {}
168
+ post_install_message:
169
+ rdoc_options: []
170
+ require_paths:
171
+ - lib
172
+ required_ruby_version: !ruby/object:Gem::Requirement
173
+ requirements:
174
+ - - ">="
175
+ - !ruby/object:Gem::Version
176
+ version: '0'
177
+ required_rubygems_version: !ruby/object:Gem::Requirement
178
+ requirements:
179
+ - - ">="
180
+ - !ruby/object:Gem::Version
181
+ version: '0'
182
+ requirements: []
183
+ rubyforge_project:
184
+ rubygems_version: 2.5.2
185
+ signing_key:
186
+ specification_version: 4
187
+ summary: Scooper of data.gov metadata
188
+ test_files: []