gis_scraper 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7418ca46fc872f0665cf06bb5becfbe72acd1ad2
4
+ data.tar.gz: e3d7888a862011049f123da0941726b1231b8565
5
+ SHA512:
6
+ metadata.gz: d8e75fe1ac4c5a7f1a8712fb8fa1b9268b64dc79be61ed8d13f409ceb21d836b0f4f624e4119f3fb1aec4c31f9c4c302708a98f620ca9f434fac45e89215cf80
7
+ data.tar.gz: e35a53941f8e4788e655cff6806af3cf901748106f2f9620b6ce7951050b23ec1e3601249afb719390d95b97e013b411844230943081a1f5336de3d79a1f1a09
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ **/.DS_Store
2
+
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --require spec_helper
2
+ --format documentation
3
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - 2.0.0
5
+ - 2.1.6
6
+ - 2.2.3
7
+ - rbx-2.9
8
+
9
+ script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,62 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ gis_scraper (0.1.0.pre)
5
+ mechanize (~> 2.7)
6
+ parallel (~> 1.6)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ diff-lcs (1.2.5)
12
+ domain_name (0.5.25)
13
+ unf (>= 0.0.5, < 1.0.0)
14
+ http-cookie (1.0.2)
15
+ domain_name (~> 0.5)
16
+ mechanize (2.7.3)
17
+ domain_name (~> 0.5, >= 0.5.1)
18
+ http-cookie (~> 1.0)
19
+ mime-types (~> 2.0)
20
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
21
+ net-http-persistent (~> 2.5, >= 2.5.2)
22
+ nokogiri (~> 1.4)
23
+ ntlm-http (~> 0.1, >= 0.1.1)
24
+ webrobots (>= 0.0.9, < 0.2)
25
+ mime-types (2.99)
26
+ mini_portile2 (2.0.0)
27
+ net-http-digest_auth (1.4)
28
+ net-http-persistent (2.9.4)
29
+ nokogiri (1.6.7.1)
30
+ mini_portile2 (~> 2.0.0.rc2)
31
+ ntlm-http (0.1.1)
32
+ parallel (1.6.1)
33
+ rake (10.4.2)
34
+ rspec (3.3.0)
35
+ rspec-core (~> 3.3.0)
36
+ rspec-expectations (~> 3.3.0)
37
+ rspec-mocks (~> 3.3.0)
38
+ rspec-core (3.3.2)
39
+ rspec-support (~> 3.3.0)
40
+ rspec-expectations (3.3.1)
41
+ diff-lcs (>= 1.2.0, < 2.0)
42
+ rspec-support (~> 3.3.0)
43
+ rspec-mocks (3.3.2)
44
+ diff-lcs (>= 1.2.0, < 2.0)
45
+ rspec-support (~> 3.3.0)
46
+ rspec-support (3.3.0)
47
+ unf (0.1.4)
48
+ unf_ext
49
+ unf_ext (0.0.7.1)
50
+ webrobots (0.1.1)
51
+
52
+ PLATFORMS
53
+ ruby
54
+
55
+ DEPENDENCIES
56
+ bundler (~> 1.10)
57
+ gis_scraper!
58
+ rake (~> 10.0)
59
+ rspec (~> 3.0)
60
+
61
+ BUNDLED WITH
62
+ 1.10.2
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Bruce Steedman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,22 @@
1
+ # gis_scraper Ruby Gem
2
+ [![Gem Version](https://badge.fury.io/rb/gis_scraper.svg)](http://badge.fury.io/rb/gis_scraper)
3
+ [![Build status](https://secure.travis-ci.org/MatzFan/gis_scraper.svg)](http://travis-ci.org/MatzFan/gis_scraper)
4
+
5
+ Utility to recursively scrape ArcGIS MapServer data using the REST API.
6
+
7
+ ArcGIS MapServer REST queries are limited to 1,000 objects in some cases. This tool makes repeated calls until all data for a given layer is extracted. It then merges the resulting JSON files into a single file. This allows GIS clients like QGIS to add a layer from a single file.
8
+
9
+ **Usage**
10
+
11
+ The executable is called 'gisget' and takes one required argument: a MapServer/Layer URL (ending in an integer representing the layer number). An optional output path may also be specified. If it is omitted, the output will be saved in the current directory. Example:
12
+
13
+ ```
14
+ gisget http://gps.digimap.gg/arcgis/rest/services/StatesOfJersey/JerseyMappingOL/MapServer/0 ~/Desktop
15
+ ```
16
+
17
+ If the layer is of type 'Feature Layer', a single file of JSON data will be saved (named the same as the layer). If the layer is of type 'Group Layer', the sub-group structure is traversed recursively: a directory is created for each sub-group layer, and a JSON data file for each constituent feature layer is written into it.
18
+
19
+ **Specification and Tests**
20
+
21
+ rspec spec
22
+
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
# Expose the RSpec suite as `rake spec`.
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` runs the specs.
task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'gis_scraper'
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require 'irb'
14
+ IRB.start
data/bin/gisget ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point for the gis_scraper gem.
#
# Usage: gisget <MapServer layer URL> [output path]
#
# Builds a Layer from the command-line arguments, writes its data to disk and
# reports the elapsed time.

# Without this require the Layer constant is never defined when the script is
# run on its own.
require 'gis_scraper'

# Layer.new takes one required and one optional argument; fail with a usage
# message instead of a bare ArgumentError backtrace.
unless (1..2).cover?(ARGV.size)
  abort 'Usage: gisget <MapServer layer URL> [output path]'
end

start = Time.now
Layer.new(*ARGV).write
puts "Finished in #{Time.now - start} seconds"
data/bin/setup ADDED
@@ -0,0 +1,5 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ require 'gis_scraper/version'
6
+
7
# Gem specification for gis_scraper.
Gem::Specification.new do |s|
  s.name          = 'gis_scraper'
  s.version       = GisScraper::VERSION
  s.authors       = ['Bruce Steedman']
  s.email         = ['bruce.steedman@gmail.com']

  s.summary       = 'Scrapes ArcGIS data from MapServer REST API'
  s.description   = 'Scrapes ArcGIS data from MapServer REST API'
  s.license       = 'MIT'

  # Package everything tracked by git except the specs.
  s.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(spec)/}) }

  # The gisget executable lives in bin/, not exe/. With bindir set to 'exe'
  # the grep over s.files matched nothing and the gem shipped without any
  # executable. bin/ also holds development-only scripts (console, setup), so
  # the installable executable is listed explicitly rather than globbed.
  s.bindir        = 'bin'
  s.executables   = ['gisget']
  s.require_paths = ['lib']

  s.add_development_dependency 'bundler', '~> 1.10'
  s.add_development_dependency 'rake', '~> 10.0'
  s.add_development_dependency 'rspec', '~> 3.0'

  s.add_runtime_dependency 'mechanize', '~> 2.7'
  s.add_runtime_dependency 'parallel', '~> 1.6'
end
@@ -0,0 +1,82 @@
1
# Mechanize pluggable parser that keeps the parsed JSON form of a response
# body alongside the regular Mechanize::File data.
class JSONParser < Mechanize::File
  # The response body parsed with JSON.parse.
  attr_reader :json

  # Takes the same arguments as Mechanize::File#initialize and forwards them
  # unchanged before parsing the body.
  def initialize(uri = nil, response = nil, body = nil, code = nil)
    super
    @json = JSON.parse(body)
  end
end
9
+
10
# Downloads all features of a single MapServer layer through the layer's
# HTML query form.
#
# The features are fetched in chunks of at most maxRecordCount records, one
# query per chunk, using the configured number of threads, and are merged
# into a single JSON document.
class FeatureScraper
  # The layer name as reported by the server.
  attr_reader :name

  # Fetches the layer description and its query form, and works out how many
  # chunked queries are needed.
  #
  # @param url [String] layer URL ending in the layer id
  # @raise [ArgumentError] if the layer description has no OID field
  def initialize(url)
    @url = url
    @agent = new_agent
    @layer = fetch_layer # hash
    @name = @layer['name']
    @pk = primary_key
    @max = @layer['maxRecordCount'].to_i # maxRecordCount - usually 1000
    @form = fetch_form(@agent)
    @loops = loops
    @threads = GisScraper.config[:threads]
  end

  # Returns the response to the first chunk query with its 'features' entry
  # replaced by the features of every chunk.
  #
  # @return [String] JSON document
  def json_data
    data(0).merge('features' => features(@threads)).to_json
  end

  private

  # Builds an agent that parses 'text/plain' responses with JSONParser.
  def new_agent
    agent = Mechanize.new
    agent.pluggable_parser['text/plain'] = JSONParser
    agent
  end

  # The layer description as a hash.
  def fetch_layer
    @agent.get(@url + '?f=pjson').json
  end

  # Name of the field of type esriFieldTypeOID, used to page through records.
  def primary_key
    oid_field = @layer['fields'].find { |f| f['type'] == 'esriFieldTypeOID' }
    raise ArgumentError, "No esriFieldTypeOID field found for #{@url}" unless oid_field

    oid_field['name']
  end

  # The first form on the layer's query page, fetched with the given agent.
  def fetch_form(agent)
    agent.get(@url + '/query').forms.first
  end

  # Number of records matched by the count-only query.
  def count
    set_query_params(@form)
    @form.submit(@form.buttons[1]).json['count'].to_i
  end

  # Fills in the query form. Without a loop number the form is set up as a
  # count-only query; with one it requests the records of that chunk.
  # The field, radio button and option positions are those of the server's
  # query page - NOTE(review): confirm against the target ArcGIS version.
  def set_query_params(form, loop_num = nil)
    form.fields[0].value = where_text(loop_num)
    count_only = form.radiobuttons[4] # count only true
    loop_num ? count_only.uncheck : count_only.check
    form.fields[6].value = '*'
    form.field_with(name: 'f').options[1].select # for JSON
  end

  # Response hash for chunk n, queried through the given form.
  def data(n, form = @form)
    set_query_params(form, n)
    form.submit(form.buttons[1]).json
  end

  # Features of every chunk, in chunk order.
  #
  # Each worker thread queries through its own form: set_query_params mutates
  # the form before it is submitted, so sharing @form between threads would
  # let one thread overwrite another's where clause between the two steps.
  def features(t)
    Parallel.map(0...@loops, in_threads: t) { |n| data(n, thread_form)['features'] }.flatten
  end

  # Form (with its own agent) belonging to the current thread, fetched on
  # first use. The key includes object_id so scrapers do not share forms.
  def thread_form
    Thread.current[:"gis_scraper_form_#{object_id}"] ||= fetch_form(new_agent)
  end

  # Number of chunk queries needed to cover every record.
  def loops
    (count.to_f / @max).ceil
  end

  # Where clause selecting chunk n, or every record with a positive key when
  # n is nil.
  def where_text(n)
    n ? "#{@pk} > #{n * @max} AND #{@pk} <= #{(n + 1) * @max}" : "#{@pk} > 0"
  end
end
@@ -0,0 +1,105 @@
1
+ require 'fileutils'
2
+
3
# A single MapServer layer, identified by a URL ending in the layer id.
#
# Writing a queryable layer saves its feature data as one JSON file; writing
# any other layer creates a directory for each of its sub-layers and writes
# each sub-layer into it recursively.
class Layer
  # Mechanize pluggable parser that exposes the response body as parsed JSON.
  class JSONParser < Mechanize::File
    attr_reader :json

    def initialize(uri = nil, response = nil, body = nil, code = nil)
      super(uri, response, body, code)
      @json = JSON.parse(body)
    end
  end

  # Raised when the server reports a layer type not listed in TYPES.
  class UnknownLayerType < StandardError; end

  attr_reader :type, :id, :name

  TYPES = ['Group Layer',
           'Feature Layer',
           'Annotation Layer',
           'Annotation SubLayer'].freeze
  QUERYABLE = ['Feature Layer', 'Annotation Layer'].freeze

  # Validates the URL and fetches the layer description from the server.
  #
  # @param url [String] layer URL of the form '.../MapServer/<integer id>'
  # @param path [String] directory to write output into
  # @raise [ArgumentError] if the URL does not have the expected form
  # @raise [UnknownLayerType] if the reported layer type is not in TYPES
  def initialize(url, path = '.')
    @url = url
    @path = File.expand_path(path)
    @ms_url = ms_url # map server url ending '../MapServer'
    @id = layer_id
    @agent = Mechanize.new
    @agent.pluggable_parser['text/plain'] = JSONParser
    validate_url
    @page_json = page_json
    @type = validate_type(@page_json['type'])
    @name = replace_forwardslashes_with_underscores(@page_json['name'])
  end

  # Writes the layer's JSON file if it is queryable, otherwise processes its
  # sub-layers.
  def write
    QUERYABLE.include?(@type) ? write_json_files : process_sub_layers
  end

  private

  # The builder methods below deliberately do not share names with the public
  # readers: a private method called id, type or name would replace the
  # reader and make it private.

  # The URL with its last path segment removed.
  def ms_url
    @url.split('/')[0..-2].join('/')
  end

  # The last path segment of the URL.
  def layer_id
    @url.split('/').last
  end

  def validate_url
    raise ArgumentError, 'URL must end with layer id' if @id.to_i.to_s != @id
    raise ArgumentError, 'Bad MapServer URL' unless @ms_url.end_with?('MapServer')
  end

  # The layer description as a hash.
  def page_json
    @agent.get(@url + '?f=pjson').json
  end

  # Returns the type unchanged when it is a known one.
  def validate_type(type)
    raise UnknownLayerType, type unless TYPES.include?(type)

    type
  end

  # Array of hashes read for their 'id' and 'name' keys; empty when the layer
  # description has no 'subLayers' entry.
  def sub_layer_id_names
    @page_json['subLayers'] || []
  end

  def json_data(url)
    FeatureScraper.new(url).json_data
  end

  # Saves the layer's data as '<name>.json' in the output directory, creating
  # the directory first so a not-yet-existing output path does not fail.
  def write_json_files
    FileUtils.mkdir_p @path
    File.write "#{@path}/#{@name}.json", json_data("#{@ms_url}/#{@id}")
  end

  # Writes every sub-layer into a directory named after it. The name gets the
  # same slash replacement as the layer's own name, so a '/' in a sub-layer
  # name cannot be mistaken for a path separator.
  def process_sub_layers
    sub_layer_id_names.each do |sub|
      dir = "#{@path}/#{replace_forwardslashes_with_underscores(sub['name'])}"
      recurse sub_layer(sub['id'], dir), dir
    end
  end

  # mkdir_p rather than mkdir so that re-running into an existing output tree
  # does not raise.
  def recurse(layer, dir)
    FileUtils.mkdir_p dir
    layer.write
  end

  def sub_layer(id, path)
    Layer.new "#{@ms_url}/#{id}", path
  end

  def replace_forwardslashes_with_underscores(string)
    string.gsub(%r{/}, '_')
  end
end
@@ -0,0 +1,3 @@
1
module GisScraper
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = '0.1.0.pre'.freeze
end
@@ -0,0 +1,37 @@
1
require 'json'
require 'yaml'

require 'mechanize'
require 'parallel'

# The project files subclass Mechanize::File at load time, so they must be
# required after mechanize.
require 'gis_scraper/version'
require 'gis_scraper/feature_scraper'
require 'gis_scraper/layer'
8
+
9
+ # stackoverflow.com/questions/6233124/where-to-place-access-config-file-in-gem
10
# Gem-wide configuration, held in a module-level hash.
module GisScraper
  @config = { threads: 8 } # threads used for scraping
  @valid_keys = @config.keys

  # Copies recognised settings from opts into the configuration.
  #
  # Keys may be Strings or Symbols. Unknown keys, and keys that cannot be
  # converted to a Symbol, are ignored.
  #
  # @param opts [Hash] settings to apply
  # @return [Hash] opts
  def self.configure(opts = {})
    opts.each do |key, value|
      next unless key.respond_to?(:to_sym)

      setting = key.to_sym
      @config[setting] = value if @valid_keys.include?(setting)
    end
  end

  # Applies the settings found in a YAML file.
  #
  # A message is printed and the configuration is left untouched when the
  # file is missing, is not valid YAML, or does not contain a mapping (an
  # empty file, for example).
  #
  # @param path_to_yaml_file [String]
  # @return [Hash, nil] the loaded settings, or nil if nothing was applied
  def self.configure_with(path_to_yaml_file)
    begin
      # File.read rather than IO.read: IO.read treats a path starting with
      # '|' as a command to run.
      # NOTE(review): depending on the YAML library version, YAML.load may
      # build arbitrary objects - only use with trusted files.
      config = YAML.load(File.read(path_to_yaml_file))
    rescue Errno::ENOENT
      puts "YAML configuration file couldn't be found. Using defaults"
      return
    rescue Psych::SyntaxError
      puts 'YAML configuration file contains invalid syntax. Using defaults'
      return
    end

    unless config.is_a?(Hash)
      puts "YAML configuration file doesn't contain a mapping of settings. Using defaults"
      return
    end

    configure(config)
  end

  # @return [Hash] the current configuration
  def self.config
    @config
  end
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gis_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.pre
5
+ platform: ruby
6
+ authors:
7
+ - Bruce Steedman
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-12-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.10'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mechanize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.7'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: parallel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.6'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.6'
83
+ description: Scrapes ArcGIS data from MapServer REST API
84
+ email:
85
+ - bruce.steedman@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - LICENSE.txt
96
+ - README.md
97
+ - Rakefile
98
+ - bin/console
99
+ - bin/gisget
100
+ - bin/setup
101
+ - gis_scraper.gemspec
102
+ - lib/gis_scraper.rb
103
+ - lib/gis_scraper/feature_scraper.rb
104
+ - lib/gis_scraper/layer.rb
105
+ - lib/gis_scraper/version.rb
106
+ homepage:
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">"
122
+ - !ruby/object:Gem::Version
123
+ version: 1.3.1
124
+ requirements: []
125
+ rubyforge_project:
126
+ rubygems_version: 2.4.8
127
+ signing_key:
128
+ specification_version: 4
129
+ summary: Scrapes ArcGIS data from MapServer REST API
130
+ test_files: []