gis_scraper 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7418ca46fc872f0665cf06bb5becfbe72acd1ad2
4
+ data.tar.gz: e3d7888a862011049f123da0941726b1231b8565
5
+ SHA512:
6
+ metadata.gz: d8e75fe1ac4c5a7f1a8712fb8fa1b9268b64dc79be61ed8d13f409ceb21d836b0f4f624e4119f3fb1aec4c31f9c4c302708a98f620ca9f434fac45e89215cf80
7
+ data.tar.gz: e35a53941f8e4788e655cff6806af3cf901748106f2f9620b6ce7951050b23ec1e3601249afb719390d95b97e013b411844230943081a1f5336de3d79a1f1a09
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ **/.DS_Store
2
+
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --require spec_helper
2
+ --format documentation
3
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - 2.0.0
5
+ - 2.1.6
6
+ - 2.2.3
7
+ - rbx-2.9
8
+
9
+ script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,62 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ gis_scraper (0.1.0.pre)
5
+ mechanize (~> 2.7)
6
+ parallel (~> 1.6)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ diff-lcs (1.2.5)
12
+ domain_name (0.5.25)
13
+ unf (>= 0.0.5, < 1.0.0)
14
+ http-cookie (1.0.2)
15
+ domain_name (~> 0.5)
16
+ mechanize (2.7.3)
17
+ domain_name (~> 0.5, >= 0.5.1)
18
+ http-cookie (~> 1.0)
19
+ mime-types (~> 2.0)
20
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
21
+ net-http-persistent (~> 2.5, >= 2.5.2)
22
+ nokogiri (~> 1.4)
23
+ ntlm-http (~> 0.1, >= 0.1.1)
24
+ webrobots (>= 0.0.9, < 0.2)
25
+ mime-types (2.99)
26
+ mini_portile2 (2.0.0)
27
+ net-http-digest_auth (1.4)
28
+ net-http-persistent (2.9.4)
29
+ nokogiri (1.6.7.1)
30
+ mini_portile2 (~> 2.0.0.rc2)
31
+ ntlm-http (0.1.1)
32
+ parallel (1.6.1)
33
+ rake (10.4.2)
34
+ rspec (3.3.0)
35
+ rspec-core (~> 3.3.0)
36
+ rspec-expectations (~> 3.3.0)
37
+ rspec-mocks (~> 3.3.0)
38
+ rspec-core (3.3.2)
39
+ rspec-support (~> 3.3.0)
40
+ rspec-expectations (3.3.1)
41
+ diff-lcs (>= 1.2.0, < 2.0)
42
+ rspec-support (~> 3.3.0)
43
+ rspec-mocks (3.3.2)
44
+ diff-lcs (>= 1.2.0, < 2.0)
45
+ rspec-support (~> 3.3.0)
46
+ rspec-support (3.3.0)
47
+ unf (0.1.4)
48
+ unf_ext
49
+ unf_ext (0.0.7.1)
50
+ webrobots (0.1.1)
51
+
52
+ PLATFORMS
53
+ ruby
54
+
55
+ DEPENDENCIES
56
+ bundler (~> 1.10)
57
+ gis_scraper!
58
+ rake (~> 10.0)
59
+ rspec (~> 3.0)
60
+
61
+ BUNDLED WITH
62
+ 1.10.2
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Bruce Steedman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,22 @@
1
+ # gis_scraper Ruby Gem
2
+ [![Gem Version](https://badge.fury.io/rb/gis_scraper.svg)](http://badge.fury.io/rb/gis_scraper)
3
+ [![Build status](https://secure.travis-ci.org/MatzFan/gis_scraper.svg)](http://travis-ci.org/MatzFan/gis_scraper)
4
+
5
+ Utility to recursively scrape ArcGIS MapServer data using REST API.
6
+
7
+ ArcGIS MapServer REST queries are limited to 1,000 objects in some cases. This tool makes repeated calls until all data for a given layer is extracted. It then merges the resulting JSON files into a single file. This allows GIS clients like QGIS to add a layer from a single file.
8
+
9
+ **Usage**
10
+
11
+ The executable is called `gisget`. It takes one required argument: a MapServer layer URL (ending in an integer representing the layer number). An optional output directory may also be specified; if omitted, output is saved in the current directory. Example:
12
+
13
+ ```
14
+ gisget http://gps.digimap.gg/arcgis/rest/services/StatesOfJersey/JerseyMappingOL/MapServer/0 ~/Desktop
15
+ ```
16
+
17
+ If the layer is of type 'Feature Layer', a single file of JSON data is saved, named after the layer. If the layer is of type 'Group Layer', its sub-layers are traversed recursively: a directory is created for each sub-layer, and the JSON data file for each constituent feature layer is written into it.
18
+
19
+ **Specification and Tests**
20
+
21
+ rspec spec
22
+
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
# Defines the `spec` task.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the `spec` task.
task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'gis_scraper'
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require 'irb'
14
+ IRB.start
data/bin/gisget ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point: builds a Layer from the command-line arguments
# (layer URL and optional output path), writes its data and reports the
# elapsed time.
require 'gis_scraper' # defines Layer; without it the constant is unknown

# Layer.new accepts one or two arguments; fail with a readable message
# instead of a stack trace for anything else.
unless (1..2).cover?(ARGV.size)
  abort 'Usage: gisget <MapServer layer URL> [output directory]'
end

start = Time.now
Layer.new(*ARGV).write
puts "Finished in #{Time.now - start} seconds"
data/bin/setup ADDED
@@ -0,0 +1,5 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ require 'gis_scraper/version'
6
+
7
# Package definition for the gis_scraper gem.
Gem::Specification.new do |s|
  s.name = 'gis_scraper'
  s.version = GisScraper::VERSION
  s.authors = ['Bruce Steedman']
  s.email = ['bruce.steedman@gmail.com']

  s.summary = %q{Scrapes ArcGIS data from MapServer REST API}
  s.description = %q{Scrapes ArcGIS data from MapServer REST API}
  s.license = "MIT"

  # Everything tracked by git except the specs.
  s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(spec)/}) }

  # The gisget script lives in bin/, alongside the development helpers
  # bin/console and bin/setup, so it is named explicitly rather than
  # globbing the whole directory.
  s.bindir = 'bin'
  s.executables = ['gisget']
  s.require_paths = ['lib']

  s.add_development_dependency 'bundler', '~> 1.10'
  s.add_development_dependency 'rake', '~> 10.0'
  s.add_development_dependency 'rspec', '~> 3.0'

  s.add_runtime_dependency 'mechanize', '~> 2.7'
  s.add_runtime_dependency 'parallel', '~> 1.6'
end
@@ -0,0 +1,82 @@
1
# Mechanize pluggable parser that additionally exposes the response body
# decoded as JSON.
class JSONParser < Mechanize::File
  # The result of JSON.parse on the response body.
  attr_reader :json

  # Takes the same arguments as Mechanize::File#initialize and forwards them
  # unchanged; the body is then parsed as JSON.
  def initialize(uri = nil, response = nil, body = nil, code = nil)
    super
    @json = JSON.parse(body)
  end
end
9
+
10
# Downloads the features of one ArcGIS MapServer layer through the layer's
# query form and merges the pages into a single JSON document.
#
# Records are requested in blocks of `maxRecordCount` object ids. Only ids in
# the range 1..(loops * maxRecordCount) are ever requested, so a layer whose
# object ids have gaps may come back incomplete.
class FeatureScraper

  # @return [String] the value of the layer's 'name' attribute
  attr_reader :name

  # Fetches the layer description and works out how many paged queries are
  # needed. Performs network requests.
  #
  # @param url [String] URL of the layer to scrape
  def initialize(url)
    @url = url
    @agent = Mechanize.new
    @agent.pluggable_parser['text/plain'] = JSONParser
    @forms = {} # one query form per thread, see #thread_form
    @forms_lock = Mutex.new
    @layer = layer # hash
    @name = layer_name
    @pk = pk
    @max = max # maxRecordCount - usually 1000
    @form = thread_form
    @loops = loops
    @threads = GisScraper.config[:threads]
  end

  # @return [String] JSON text: the first page's response with its 'features'
  #   entry replaced by the features collected from every page
  def json_data
    data(0).merge({'features' => features(@threads)}).to_json
  end

  private

  # Layer description hash parsed from the '?f=pjson' response.
  def layer
    @agent.get(@url + '?f=pjson').json
  end

  # Named layer_name (not name) so that it does not replace the public
  # `name` reader with a private method.
  def layer_name
    @layer['name']
  end

  # Name of the first field whose type is 'esriFieldTypeOID'.
  def pk
    @layer['fields'].find { |f| f['type'] == 'esriFieldTypeOID' }['name']
  end

  def max
    @layer['maxRecordCount'].to_i
  end

  # Loads the layer's query page with a dedicated agent and returns its first
  # form. A separate agent per form keeps forms independent of each other.
  def form
    agent = Mechanize.new
    agent.pluggable_parser['text/plain'] = JSONParser
    agent.get(@url + '/query').forms.first
  end

  # Returns the query form owned by the calling thread, loading it on first
  # use. Each worker thread mutates and submits only its own form, so one
  # thread can never submit a WHERE clause set by another.
  def thread_form
    owner = Thread.current
    cached = @forms_lock.synchronize { @forms[owner] }
    return cached if cached

    fresh = form # network request, deliberately made outside the lock
    @forms_lock.synchronize { @forms[owner] = fresh }
  end

  # Number of records reported by a count-only query.
  def count
    set_query_params
    query = thread_form
    query.submit(query.buttons[1]).json['count'].to_i
  end

  # Prepares the calling thread's form. With no loop_num the query is a
  # count-only query; otherwise it requests the records of that page.
  def set_query_params(loop_num = nil)
    query = thread_form
    query.fields[0].value = where_text(loop_num)
    loop_num ? query.radiobuttons[4].uncheck : query.radiobuttons[4].check # count only true
    query.fields[6].value = '*' # NOTE(review): presumably the output-fields input - confirm
    query.field_with(name: 'f').options[1].select # for JSON
  end

  # Parsed JSON response for page n.
  def data(n)
    set_query_params(n)
    query = thread_form
    query.submit(query.buttons[1]).json
  end

  # Features of every page, fetched using t threads, as one flat array.
  def features(t)
    Parallel.map(0...@loops, in_threads: t) { |n| data(n)['features'] }.flatten
  end

  # Number of pages needed to cover `count` records at @max records a page.
  def loops
    (count.to_f/@max).ceil
  end

  # WHERE clause selecting the object ids of page n, or every id above zero
  # when n is nil. Page 0 is a real page: 0 is truthy in Ruby.
  def where_text(n)
    n ? "#{@pk} > #{n * @max} AND #{@pk} <= #{(n + 1) * @max}" : "#{@pk} > 0"
  end

end
@@ -0,0 +1,105 @@
1
+ require 'fileutils'
2
+
3
# One layer of an ArcGIS MapServer. A queryable layer is written to a single
# JSON file; any other layer has each of its sub-layers written into a
# directory of its own, recursively.
class Layer

  # Mechanize pluggable parser that also exposes the body decoded as JSON.
  class JSONParser < Mechanize::File
    attr_reader :json

    def initialize(uri=nil, response=nil, body=nil, code=nil)
      super(uri, response, body, code)
      @json = JSON.parse(body)
    end
  end

  # Raised when the server reports a layer type that is not in TYPES.
  class UnknownLayerType < StandardError; end

  # type: the validated layer type; id: the last URL segment (a String);
  # name: the layer name with '/' replaced by '_'.
  attr_reader :type, :id, :name

  TYPES = ['Group Layer',
           'Feature Layer',
           'Annotation Layer',
           'Annotation SubLayer'].freeze
  QUERYABLE = ['Feature Layer', 'Annotation Layer'].freeze

  # Validates the URL and fetches the layer description from the server.
  #
  # @param url [String] layer URL of the form '.../MapServer/<integer id>'
  # @param path [String] directory to write output beneath (default '.')
  # @raise [ArgumentError] if the URL does not have the expected form
  # @raise [UnknownLayerType] if the reported layer type is not in TYPES
  def initialize(url, path = '.')
    @url, @path = url, File.expand_path(path)
    @ms_url = ms_url # map server url ending '../MapServer'
    @id = layer_id
    @agent = Mechanize.new
    @agent.pluggable_parser['text/plain'] = JSONParser
    validate_url
    @page_json = page_json
    @type = layer_type
    @name = layer_name
  end

  # Writes this layer's JSON file if its type is queryable, otherwise
  # processes each of its sub-layers.
  def write
    QUERYABLE.include?(@type) ? write_json_files : process_sub_layers
  end

  private

  # The helpers below are named layer_id / layer_type / layer_name so that
  # they do not replace the public id / type / name readers with private
  # methods.

  def ms_url
    @url.split('/')[0..-2].join('/')
  end

  def layer_id
    @url.split('/').last
  end

  def validate_url
    raise ArgumentError, 'URL must end with layer id' if @id.to_i.to_s != @id
    raise ArgumentError, 'Bad MapServer URL' unless @ms_url.end_with?('MapServer')
  end

  # Layer description hash parsed from the '?f=pjson' response.
  def page_json
    @agent.get(@url + '?f=pjson').json
  end

  def layer_type
    validate_type @page_json['type']
  end

  def layer_name
    replace_forwardslashes_with_underscores @page_json['name']
  end

  # Returns type unchanged, or raises UnknownLayerType if it is not in TYPES.
  def validate_type(type)
    raise UnknownLayerType, type unless TYPES.include?(type)
    type
  end

  # Array of hashes carrying the 'id' and 'name' of each sub-layer; empty
  # when the description has no 'subLayers' entry.
  def sub_layer_id_names
    @page_json['subLayers'] || []
  end

  def json_data(url)
    FeatureScraper.new(url).json_data
  end

  def write_json_files
    File.write "#{@path}/#{@name}.json", json_data("#{@ms_url}/#{@id}")
  end

  # Creates one directory per sub-layer beneath @path and writes the
  # sub-layer into it.
  def process_sub_layers
    sub_layer_id_names.each do |hash|
      # Sanitised the same way as file names, so a '/' in a sub-layer name
      # cannot turn into an extra directory level.
      dir_name = replace_forwardslashes_with_underscores(hash['name'])
      path = "#{@path}/#{dir_name}"
      recurse sub_layer(hash['id'], path), path
    end
  end

  def recurse(layer, dir)
    FileUtils.mkdir_p dir # unlike mkdir, does not raise if dir already exists
    layer.write
  end

  def sub_layer(id, path)
    Layer.new "#{@ms_url}/#{id}", path
  end

  def replace_forwardslashes_with_underscores(string)
    string.tr('/', '_')
  end

end
@@ -0,0 +1,3 @@
1
module GisScraper
  # Version string of the gem; frozen so it cannot be modified in place.
  VERSION = '0.1.0.pre'.freeze
end
@@ -0,0 +1,37 @@
1
require 'json'
require 'yaml'

require 'mechanize'
require 'parallel'

# Project files are loaded last: they reference Mechanize::File and JSON
# while their classes are being defined.
require 'gis_scraper/version'
require 'gis_scraper/feature_scraper'
require 'gis_scraper/layer'
8
+
9
+ # stackoverflow.com/questions/6233124/where-to-place-access-config-file-in-gem
10
# Gem-wide configuration. Settings live in a module-level hash; only keys
# already present in the defaults can be changed.
module GisScraper

  @config = {threads: 8} # threads used for scraping
  @valid_keys = @config.keys

  # Merges recognised settings from opts into the configuration.
  #
  # @param opts [Hash] settings keyed by Symbol or String; unknown keys and
  #   keys that cannot be converted to a Symbol are ignored
  # @return [Object] opts, unchanged
  def self.configure(opts = {})
    return opts unless opts.respond_to?(:each_pair) # nil, false, scalars

    opts.each_pair do |k, v|
      next unless k.respond_to?(:to_sym)
      key = k.to_sym
      @config[key] = v if @valid_keys.include?(key)
    end
  end

  # Loads settings from a YAML file and applies them with .configure. If the
  # file is missing, malformed or does not hold a mapping, a message is
  # printed and the current settings are kept.
  #
  # @param path_to_yaml_file [String] path of the YAML file
  # @return [Object, nil] nil when the file could not be used
  def self.configure_with(path_to_yaml_file)
    begin
      # File.read rather than IO.read: IO.read runs a path that begins with
      # "|" as a shell command.
      # NOTE(review): YAML.load may build arbitrary objects on older Psych
      # versions - only point this at trusted files.
      config = YAML.load(File.read(path_to_yaml_file))
    rescue Errno::ENOENT
      puts "YAML configuration file couldn't be found. Using defaults"
      return
    rescue Psych::SyntaxError
      puts 'YAML configuration file contains invalid syntax. Using defaults'
      return
    end

    # An empty file loads as false and a bare scalar as a String or number.
    unless config.is_a?(Hash)
      puts 'YAML configuration file does not contain a mapping. Using defaults'
      return
    end

    configure(config)
  end

  # @return [Hash] the live configuration hash
  def self.config
    @config
  end

end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gis_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.pre
5
+ platform: ruby
6
+ authors:
7
+ - Bruce Steedman
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-12-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.10'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mechanize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.7'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: parallel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.6'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.6'
83
+ description: Scrapes ArcGIS data from MapServer REST API
84
+ email:
85
+ - bruce.steedman@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - LICENSE.txt
96
+ - README.md
97
+ - Rakefile
98
+ - bin/console
99
+ - bin/gisget
100
+ - bin/setup
101
+ - gis_scraper.gemspec
102
+ - lib/gis_scraper.rb
103
+ - lib/gis_scraper/feature_scraper.rb
104
+ - lib/gis_scraper/layer.rb
105
+ - lib/gis_scraper/version.rb
106
+ homepage:
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">"
122
+ - !ruby/object:Gem::Version
123
+ version: 1.3.1
124
+ requirements: []
125
+ rubyforge_project:
126
+ rubygems_version: 2.4.8
127
+ signing_key:
128
+ specification_version: 4
129
+ summary: Scrapes ArcGIS data from MapServer REST API
130
+ test_files: []