gis_scraper 0.0.0 → 0.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -5
- data/.travis.yml +9 -0
- data/Gemfile +0 -17
- data/Gemfile.lock +62 -0
- data/LICENSE.txt +1 -1
- data/README.md +8 -110
- data/Rakefile +1 -3
- data/bin/console +8 -3
- data/bin/gisget +5 -0
- data/gis_scraper.gemspec +19 -23
- data/lib/gis_scraper/feature_scraper.rb +43 -33
- data/lib/gis_scraper/layer.rb +105 -0
- data/lib/gis_scraper/version.rb +1 -3
- data/lib/gis_scraper.rb +10 -13
- metadata +58 -31
- data/.gitlab-ci.yml +0 -30
- data/.rubocop.yml +0 -17
- data/.ruby-version +0 -1
- data/CHANGELOG.md +0 -5
- data/Guardfile +0 -14
- data/lib/gis_scraper/layer_writer.rb +0 -175
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7418ca46fc872f0665cf06bb5becfbe72acd1ad2
|
4
|
+
data.tar.gz: e3d7888a862011049f123da0941726b1231b8565
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8e75fe1ac4c5a7f1a8712fb8fa1b9268b64dc79be61ed8d13f409ceb21d836b0f4f624e4119f3fb1aec4c31f9c4c302708a98f620ca9f434fac45e89215cf80
|
7
|
+
data.tar.gz: e35a53941f8e4788e655cff6806af3cf901748106f2f9620b6ce7951050b23ec1e3601249afb719390d95b97e013b411844230943081a1f5336de3d79a1f1a09
|
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -1,20 +1,3 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
source 'https://rubygems.org'
|
4
2
|
|
5
|
-
group :test, :development do
|
6
|
-
gem 'rubocop-rake', '~> 0.6', require: false
|
7
|
-
gem 'rubocop-rspec', '~> 3.0', require: false
|
8
|
-
end
|
9
|
-
|
10
|
-
group :development do
|
11
|
-
gem 'bundler', '~> 2.1'
|
12
|
-
gem 'guard', '~> 2.18'
|
13
|
-
gem 'guard-rspec', '~> 4.7'
|
14
|
-
gem 'libnotify', '~> 0.9' # guard notifications
|
15
|
-
gem 'rake', '~> 13.0'
|
16
|
-
gem 'rspec', '~> 3.13'
|
17
|
-
gem 'rubocop', '~> 1.64'
|
18
|
-
end
|
19
|
-
|
20
3
|
gemspec
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
gis_scraper (0.1.0.pre)
|
5
|
+
mechanize (~> 2.7)
|
6
|
+
parallel (~> 1.6)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
diff-lcs (1.2.5)
|
12
|
+
domain_name (0.5.25)
|
13
|
+
unf (>= 0.0.5, < 1.0.0)
|
14
|
+
http-cookie (1.0.2)
|
15
|
+
domain_name (~> 0.5)
|
16
|
+
mechanize (2.7.3)
|
17
|
+
domain_name (~> 0.5, >= 0.5.1)
|
18
|
+
http-cookie (~> 1.0)
|
19
|
+
mime-types (~> 2.0)
|
20
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
21
|
+
net-http-persistent (~> 2.5, >= 2.5.2)
|
22
|
+
nokogiri (~> 1.4)
|
23
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
24
|
+
webrobots (>= 0.0.9, < 0.2)
|
25
|
+
mime-types (2.99)
|
26
|
+
mini_portile2 (2.0.0)
|
27
|
+
net-http-digest_auth (1.4)
|
28
|
+
net-http-persistent (2.9.4)
|
29
|
+
nokogiri (1.6.7.1)
|
30
|
+
mini_portile2 (~> 2.0.0.rc2)
|
31
|
+
ntlm-http (0.1.1)
|
32
|
+
parallel (1.6.1)
|
33
|
+
rake (10.4.2)
|
34
|
+
rspec (3.3.0)
|
35
|
+
rspec-core (~> 3.3.0)
|
36
|
+
rspec-expectations (~> 3.3.0)
|
37
|
+
rspec-mocks (~> 3.3.0)
|
38
|
+
rspec-core (3.3.2)
|
39
|
+
rspec-support (~> 3.3.0)
|
40
|
+
rspec-expectations (3.3.1)
|
41
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
42
|
+
rspec-support (~> 3.3.0)
|
43
|
+
rspec-mocks (3.3.2)
|
44
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
45
|
+
rspec-support (~> 3.3.0)
|
46
|
+
rspec-support (3.3.0)
|
47
|
+
unf (0.1.4)
|
48
|
+
unf_ext
|
49
|
+
unf_ext (0.0.7.1)
|
50
|
+
webrobots (0.1.1)
|
51
|
+
|
52
|
+
PLATFORMS
|
53
|
+
ruby
|
54
|
+
|
55
|
+
DEPENDENCIES
|
56
|
+
bundler (~> 1.10)
|
57
|
+
gis_scraper!
|
58
|
+
rake (~> 10.0)
|
59
|
+
rspec (~> 3.0)
|
60
|
+
|
61
|
+
BUNDLED WITH
|
62
|
+
1.10.2
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,124 +1,22 @@
|
|
1
1
|
# gis_scraper Ruby Gem
|
2
2
|
[Gem Version](http://badge.fury.io/rb/gis_scraper)
|
3
|
-
[Build Status](http://travis-ci.org/MatzFan/gis_scraper)
|
4
4
|
|
5
|
-
Utility to recursively scrape ArcGIS MapServer data using
|
5
|
+
Utility to recursively scrape ArcGIS MapServer data using the REST API.
|
6
6
|
|
7
|
-
ArcGIS MapServer REST queries are limited to 1,000 objects
|
7
|
+
ArcGIS MapServer REST queries are limited to 1,000 objects in some cases. This tool makes repeated calls until all data for a given layer is extracted. It then merges the resulting JSON files into a single file. This allows GIS clients like QGIS to add a layer from a single file.
|
8
8
|
|
9
|
-
|
9
|
+
**Usage**
|
10
10
|
|
11
|
-
|
11
|
+
The executable is called 'gisget' and takes one required argument - a MapServer/Layer URL (ending in an integer representing the layer number). An optional file output path may also be specified. If omitted, the file will be saved in the current directory. Example:
|
12
12
|
|
13
|
-
A Postgres database with the PostGIS extension enabled for database export.
|
14
|
-
|
15
|
-
For data import to a database [GDAL](http://gdal.org) must be installed and specifically the [ogr2ogr](http://www.gdal.org/ogr2ogr.html) executable must be available in your path.
|
16
|
-
|
17
|
-
## Known Limitations
|
18
|
-
|
19
|
-
*NIX systems only - Linux/Mac OS X. ArcGIS MapServer data is readable directly by ArcGIS Windows clients 😉
|
20
|
-
|
21
|
-
The following esri geometry types are so far supported:
|
22
|
-
|
23
|
-
- esriGeometryPoint, esriGeometryMultipoint, esriGeometryLine, esriGeometryPolyline, esriGeometryPolygon
|
24
|
-
|
25
|
-
Annotation layers are ignored, as are layers with no esri geometryType.
|
26
|
-
|
27
|
-
Currently the JSON data for a whole layer is held in memory before being output. For large layers - e.g. >100,000 objects - this can be multiple GB of memory. If this causes a problem for you, please add a comment to [issue #4](https://gitlab.com/matzfan/gis_scraper/issues/4).
|
28
|
-
|
29
|
-
## Installation
|
30
|
-
|
31
|
-
Add this line to your application's Gemfile:
|
32
|
-
|
33
|
-
```ruby
|
34
|
-
gem 'gis_scraper'
|
35
13
|
```
|
36
|
-
|
37
|
-
And then execute:
|
38
|
-
|
39
|
-
$ bundle
|
40
|
-
|
41
|
-
Or install it yourself as:
|
42
|
-
|
43
|
-
$ gem install gis_scraper
|
44
|
-
|
45
|
-
## Configuration
|
46
|
-
|
47
|
-
Configuration options may be set via a hash or specified in a Yaml file. The following options are available:
|
48
|
-
|
49
|
-
- ```:threads``` Scraping is multi-threaded. The number of threads to use may be set with this option (default: 8)
|
50
|
-
- ```:output_path``` For JSON output, the path used to write files to (default: '~/Desktop')
|
51
|
-
|
52
|
-
The following options are used to connect to a database:
|
53
|
-
|
54
|
-
- ```:host``` (default: 'localhost')
|
55
|
-
- ```:port``` (default: 5432)
|
56
|
-
- ```:dbname``` (default: 'postgres')
|
57
|
-
- ```:user``` (default: 'postgres')
|
58
|
-
- ```:password``` (default: nil)
|
59
|
-
|
60
|
-
These additional options are available when using output to a database and are applied to the ```ogr2ogr``` command:
|
61
|
-
|
62
|
-
- ```:srs``` Used to override the source spatial reference system. Currently only EPSG string format is valid - e.g. 'EPSG:3109' (default: no override)
|
63
|
-
|
64
|
-
**To set via a hash**
|
65
|
-
|
66
|
-
```Ruby
|
67
|
-
GisScraper.configure(:threads => 16) # default is 8
|
68
|
-
```
|
69
|
-
|
70
|
-
**Using a Yaml configuration file**
|
71
|
-
|
72
|
-
```Ruby
|
73
|
-
GisScraper.configure_with 'path-to-Yaml-file'
|
74
|
-
```
|
75
|
-
|
76
|
-
```Ruby
|
77
|
-
GisScraper.config # returns the hash of configuration values
|
78
|
-
```
|
79
|
-
|
80
|
-
## Usage
|
81
|
-
|
82
|
-
A LayerWriter object must be instantiated with one required arg - a Service/Layer URL (ending in an integer representing the layer number). Example:
|
83
|
-
|
84
|
-
```Ruby
|
85
|
-
writer = LayerWriter.new(url: 'https://gps.digimap.gg/arcgis/rest/services/JerseyUtilities/JerseyUtilities/MapServer/0')
|
86
|
-
```
|
87
|
-
An optional second argument for the output path for JSON files may be specified. If so, this overrides the configuration option. Example:
|
88
|
-
```Ruby
|
89
|
-
writer = LayerWriter.new(url: 'https://gps.digimap.gg/arcgis/rest/services/JerseyUtilities/JerseyUtilities/MapServer/0', path: '~/Desktop')
|
90
|
-
```
|
91
|
-
The `gis_scraper` gem uses the `arcrest` gem ([README](https://gitlab.com/matzfan/arcrest)) to retrieve data from ArcGIS servers via their REST API. A hash of arcrest options may be passed using the :arcrest_opts key:
|
92
|
-
```Ruby
|
93
|
-
writer = LayerWriter.new(url: 'https://gps.digimap.gg/arcgis/rest/services/JerseyUtilities/JerseyUtilities/MapServer/0', arcrest_opts: headers: { referer: 'https://some_referrer' })
|
94
|
-
```
|
95
|
-
|
96
|
-
**JSON output**
|
97
|
-
|
98
|
-
```Ruby
|
99
|
-
writer.output_json
|
14
|
+
gisget http://gps.digimap.gg/arcgis/rest/services/StatesOfJersey/JerseyMappingOL/MapServer/0 ~/Desktop
|
100
15
|
```
|
101
16
|
|
102
17
|
If the layer is type 'Feature Layer', a single file of JSON data will be saved (named the same as the layer). If the layer is type 'Group Layer', the sub-group structure is traversed recursively thus: Directories for each sub-group layer are created and JSON data files for each constituent feature layer written to them.
|
103
18
|
|
104
|
-
**
|
105
|
-
|
106
|
-
Valid database config options must be set. The following command will convert JSON files, create tables for each layer (& sub-layers, if any) and import the data. Table names are lowercased, prefixed with '_' and have spaces replaced with underscores. If a table with the same name exists, '_' is appended to the name.
|
107
|
-
|
108
|
-
```Ruby
|
109
|
-
writer.output_to_db
|
110
|
-
```
|
111
|
-
|
112
|
-
## Specification and Tests
|
113
|
-
|
114
|
-
For the full specification clone this repo and run:
|
115
|
-
|
116
|
-
`bundle exec rake spec`
|
117
|
-
|
118
|
-
## Contributing
|
119
|
-
|
120
|
-
Bug reports, pull requests (and feature requests) are welcome on GitLab at https://gitlab.com/matzfan/gis_scraper.
|
19
|
+
**Specification and Tests**
|
121
20
|
|
122
|
-
|
21
|
+
rspec spec
|
123
22
|
|
124
|
-
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses)
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -1,9 +1,14 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
# !/usr/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby
|
4
2
|
|
5
3
|
require 'bundler/setup'
|
6
4
|
require 'gis_scraper'
|
7
5
|
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
8
13
|
require 'irb'
|
9
14
|
IRB.start
|
data/bin/gisget
ADDED
data/gis_scraper.gemspec
CHANGED
@@ -1,32 +1,28 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
lib = File.expand_path('lib', __dir__)
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
4
|
|
6
5
|
require 'gis_scraper/version'
|
7
6
|
|
8
|
-
Gem::Specification.new do |
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
Gem::Specification.new do |s|
|
8
|
+
s.name = 'gis_scraper'
|
9
|
+
s.version = GisScraper::VERSION
|
10
|
+
s.authors = ['Bruce Steedman']
|
11
|
+
s.email = ['bruce.steedman@gmail.com']
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
spec.license = 'MIT'
|
17
|
-
spec.required_ruby_version = '>= 3.3.3'
|
13
|
+
s.summary = %q{Scrapes ArcGIS data from MapServer REST API}
|
14
|
+
s.description = %q{Scrapes ArcGIS data from MapServer REST API}
|
15
|
+
s.license = "MIT"
|
18
16
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
17
|
+
s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(spec)/}) }
|
18
|
+
s.bindir = 'exe'
|
19
|
+
s.executables = s.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
+
s.require_paths = ['lib']
|
23
21
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
spec.require_paths = ['lib']
|
22
|
+
s.add_development_dependency 'bundler', '~> 1.10'
|
23
|
+
s.add_development_dependency 'rake', '~> 10.0'
|
24
|
+
s.add_development_dependency 'rspec', '~> 3.0'
|
28
25
|
|
29
|
-
|
30
|
-
|
31
|
-
spec.add_runtime_dependency 'pg', '~> 1.5'
|
26
|
+
s.add_runtime_dependency 'mechanize', '~> 2.7'
|
27
|
+
s.add_runtime_dependency 'parallel', '~> 1.6'
|
32
28
|
end
|
@@ -1,72 +1,82 @@
|
|
1
|
-
|
1
|
+
class JSONParser < Mechanize::File
|
2
|
+
attr_reader :json
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
4
|
+
def initialize(uri=nil, response=nil, body=nil, code=nil)
|
5
|
+
super(uri, response, body, code)
|
6
|
+
@json = JSON.parse(body)
|
7
|
+
end
|
8
|
+
end
|
6
9
|
|
7
|
-
|
10
|
+
class FeatureScraper
|
8
11
|
|
9
12
|
attr_reader :name
|
10
13
|
|
11
|
-
def initialize(url
|
14
|
+
def initialize(url)
|
12
15
|
@url = url
|
13
|
-
@
|
14
|
-
@
|
15
|
-
@
|
16
|
-
@name =
|
16
|
+
@agent = Mechanize.new
|
17
|
+
@agent.pluggable_parser['text/plain'] = JSONParser
|
18
|
+
@layer = layer # hash
|
19
|
+
@name = name
|
17
20
|
@pk = pk
|
18
|
-
@
|
21
|
+
@max = max # maxRecordCount - usually 1000
|
22
|
+
@form = form
|
19
23
|
@loops = loops
|
20
24
|
@threads = GisScraper.config[:threads]
|
21
25
|
end
|
22
26
|
|
23
27
|
def json_data
|
24
|
-
|
28
|
+
data(0).merge({'features' => features(@threads)}).to_json
|
25
29
|
end
|
26
30
|
|
27
31
|
private
|
28
32
|
|
29
|
-
def query_layer
|
30
|
-
@layer.query(where: '1=1')
|
31
|
-
end
|
32
|
-
|
33
33
|
def layer
|
34
|
-
|
34
|
+
@agent.get(@url + '?f=pjson').json
|
35
35
|
end
|
36
36
|
|
37
|
-
def
|
38
|
-
@layer
|
37
|
+
def name
|
38
|
+
@layer['name']
|
39
39
|
end
|
40
40
|
|
41
|
-
def
|
42
|
-
@layer.
|
41
|
+
def pk
|
42
|
+
@layer['fields'].select { |f| f['type'] == 'esriFieldTypeOID' }[0]['name']
|
43
43
|
end
|
44
44
|
|
45
|
-
def
|
46
|
-
@
|
45
|
+
def max
|
46
|
+
@layer['maxRecordCount'].to_i
|
47
47
|
end
|
48
48
|
|
49
|
-
def
|
50
|
-
@
|
49
|
+
def form
|
50
|
+
@agent.get(@url + '/query').forms.first
|
51
51
|
end
|
52
52
|
|
53
53
|
def count
|
54
|
-
|
54
|
+
set_query_params
|
55
|
+
@form.submit(@form.buttons[1]).json['count'].to_i
|
55
56
|
end
|
56
57
|
|
57
|
-
def
|
58
|
-
@
|
58
|
+
def set_query_params(loop_num = nil)
|
59
|
+
@form.fields[0].value = where_text(loop_num)
|
60
|
+
loop_num ? @form.radiobuttons[4].uncheck : @form.radiobuttons[4].check # count only true
|
61
|
+
@form.fields[6].value = '*'
|
62
|
+
@form.field_with(name: 'f').options[1].select # for JSON
|
59
63
|
end
|
60
64
|
|
61
|
-
def
|
62
|
-
|
65
|
+
def data(n)
|
66
|
+
set_query_params(n)
|
67
|
+
@form.submit(@form.buttons[1]).json
|
68
|
+
end
|
69
|
+
|
70
|
+
def features(t)
|
71
|
+
Parallel.map(0...@loops, in_threads: t) { |n| data(n)['features'] }.flatten
|
63
72
|
end
|
64
73
|
|
65
74
|
def loops
|
66
|
-
(count.to_f
|
75
|
+
(count.to_f/@max).ceil
|
67
76
|
end
|
68
77
|
|
69
|
-
def where_text(
|
70
|
-
|
78
|
+
def where_text(n)
|
79
|
+
n ? "#{pk} > #{n * @max} AND #{pk} <= #{(n + 1) * @max}" : "#{pk} > 0"
|
71
80
|
end
|
81
|
+
|
72
82
|
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
class Layer
|
4
|
+
|
5
|
+
class JSONParser < Mechanize::File
|
6
|
+
attr_reader :json
|
7
|
+
|
8
|
+
def initialize(uri=nil, response=nil, body=nil, code=nil)
|
9
|
+
super(uri, response, body, code)
|
10
|
+
@json = JSON.parse(body)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
class UnknownLayerType < StandardError; end
|
15
|
+
|
16
|
+
attr_reader :type, :id, :name
|
17
|
+
|
18
|
+
TYPES = ['Group Layer',
|
19
|
+
'Feature Layer',
|
20
|
+
'Annotation Layer',
|
21
|
+
'Annotation SubLayer']
|
22
|
+
QUERYABLE = ['Feature Layer', 'Annotation Layer']
|
23
|
+
|
24
|
+
def initialize(url, path = '.')
|
25
|
+
@url, @path = url, File.expand_path(path)
|
26
|
+
@ms_url = ms_url # map server url ending '../MapServer'
|
27
|
+
@id = id
|
28
|
+
@agent = Mechanize.new
|
29
|
+
@agent.pluggable_parser['text/plain'] = JSONParser
|
30
|
+
validate_url
|
31
|
+
@page_json = page_json
|
32
|
+
@type = type
|
33
|
+
@name = name
|
34
|
+
end
|
35
|
+
|
36
|
+
def write
|
37
|
+
QUERYABLE.any? { |l| @type == l } ? write_json_files : process_sub_layers
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
def ms_url
|
43
|
+
@url.split('/')[0..-2].join('/')
|
44
|
+
end
|
45
|
+
|
46
|
+
def id
|
47
|
+
@url.split('/').last
|
48
|
+
end
|
49
|
+
|
50
|
+
def validate_url
|
51
|
+
raise ArgumentError, 'URL must end with layer id' if @id.to_i.to_s != @id
|
52
|
+
raise ArgumentError, 'Bad MapServer URL' if @ms_url[-9..-1] != 'MapServer'
|
53
|
+
end
|
54
|
+
|
55
|
+
def page_json
|
56
|
+
@agent.get(@url + '?f=pjson').json
|
57
|
+
end
|
58
|
+
|
59
|
+
def type
|
60
|
+
validate_type @page_json['type']
|
61
|
+
end
|
62
|
+
|
63
|
+
def name
|
64
|
+
replace_forwardslashes_with_underscores @page_json['name']
|
65
|
+
end
|
66
|
+
|
67
|
+
def validate_type(type)
|
68
|
+
raise UnknownLayerType, type unless (TYPES.any? { |t| t == type })
|
69
|
+
type
|
70
|
+
end
|
71
|
+
|
72
|
+
def sub_layer_id_names
|
73
|
+
@page_json['subLayers'] || []
|
74
|
+
end
|
75
|
+
|
76
|
+
def json_data(url)
|
77
|
+
FeatureScraper.new(url).json_data
|
78
|
+
end
|
79
|
+
|
80
|
+
def write_json_files
|
81
|
+
File.write "#{@path}/#{@name}.json", json_data("#{@ms_url}/#{@id}")
|
82
|
+
end
|
83
|
+
|
84
|
+
def process_sub_layers
|
85
|
+
sub_layer_id_names.each do |hash|
|
86
|
+
name, id = hash['name'], hash['id']
|
87
|
+
path = "#{@path}/#{name}"
|
88
|
+
recurse sub_layer(id, path), path
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def recurse(layer, dir)
|
93
|
+
FileUtils.mkdir dir
|
94
|
+
layer.write
|
95
|
+
end
|
96
|
+
|
97
|
+
def sub_layer(id, path)
|
98
|
+
Layer.new "#{@ms_url}/#{id}", path
|
99
|
+
end
|
100
|
+
|
101
|
+
def replace_forwardslashes_with_underscores(string)
|
102
|
+
string.gsub /\//, '_'
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
data/lib/gis_scraper/version.rb
CHANGED
data/lib/gis_scraper.rb
CHANGED
@@ -1,28 +1,24 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'arcrest'
|
4
|
-
require 'parallel'
|
5
|
-
require 'pg'
|
6
|
-
require 'yaml'
|
7
|
-
|
8
1
|
require 'gis_scraper/version'
|
9
2
|
require 'gis_scraper/feature_scraper'
|
10
|
-
require 'gis_scraper/
|
3
|
+
require 'gis_scraper/layer'
|
4
|
+
|
5
|
+
require 'yaml'
|
6
|
+
require 'mechanize'
|
7
|
+
require 'parallel'
|
11
8
|
|
12
9
|
# stackoverflow.com/questions/6233124/where-to-place-access-config-file-in-gem
|
13
10
|
module GisScraper
|
14
|
-
|
15
|
-
|
16
|
-
srs: nil }
|
11
|
+
|
12
|
+
@config = {threads: 8} # threads used for scraping
|
17
13
|
@valid_keys = @config.keys
|
18
14
|
|
19
15
|
def self.configure(opts = {})
|
20
|
-
opts.each { |k,
|
16
|
+
opts.each { |k,v| @config[k.to_sym] = v if @valid_keys.include? k.to_sym }
|
21
17
|
end
|
22
18
|
|
23
19
|
def self.configure_with(path_to_yaml_file)
|
24
20
|
begin
|
25
|
-
config = YAML.
|
21
|
+
config = YAML::load(IO.read(path_to_yaml_file))
|
26
22
|
rescue Errno::ENOENT
|
27
23
|
puts "YAML configuration file couldn't be found. Using defaults"
|
28
24
|
return
|
@@ -37,4 +33,5 @@ module GisScraper
|
|
37
33
|
def self.config
|
38
34
|
@config
|
39
35
|
end
|
36
|
+
|
40
37
|
end
|
metadata
CHANGED
@@ -1,87 +1,113 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gis_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.1.0.pre
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
8
|
-
autorequire:
|
7
|
+
- Bruce Steedman
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-12-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.
|
20
|
-
type: :
|
19
|
+
version: '1.10'
|
20
|
+
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '1.
|
26
|
+
version: '1.10'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: mechanize
|
29
57
|
requirement: !ruby/object:Gem::Requirement
|
30
58
|
requirements:
|
31
59
|
- - "~>"
|
32
60
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
61
|
+
version: '2.7'
|
34
62
|
type: :runtime
|
35
63
|
prerelease: false
|
36
64
|
version_requirements: !ruby/object:Gem::Requirement
|
37
65
|
requirements:
|
38
66
|
- - "~>"
|
39
67
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
68
|
+
version: '2.7'
|
41
69
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
70
|
+
name: parallel
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|
44
72
|
requirements:
|
45
73
|
- - "~>"
|
46
74
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1.
|
75
|
+
version: '1.6'
|
48
76
|
type: :runtime
|
49
77
|
prerelease: false
|
50
78
|
version_requirements: !ruby/object:Gem::Requirement
|
51
79
|
requirements:
|
52
80
|
- - "~>"
|
53
81
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.
|
55
|
-
description:
|
82
|
+
version: '1.6'
|
83
|
+
description: Scrapes ArcGIS data from MapServer REST API
|
56
84
|
email:
|
85
|
+
- bruce.steedman@gmail.com
|
57
86
|
executables: []
|
58
87
|
extensions: []
|
59
88
|
extra_rdoc_files: []
|
60
89
|
files:
|
61
90
|
- ".gitignore"
|
62
|
-
- ".gitlab-ci.yml"
|
63
91
|
- ".rspec"
|
64
|
-
- ".
|
65
|
-
- ".ruby-version"
|
66
|
-
- CHANGELOG.md
|
92
|
+
- ".travis.yml"
|
67
93
|
- Gemfile
|
68
|
-
-
|
94
|
+
- Gemfile.lock
|
69
95
|
- LICENSE.txt
|
70
96
|
- README.md
|
71
97
|
- Rakefile
|
72
98
|
- bin/console
|
99
|
+
- bin/gisget
|
73
100
|
- bin/setup
|
74
101
|
- gis_scraper.gemspec
|
75
102
|
- lib/gis_scraper.rb
|
76
103
|
- lib/gis_scraper/feature_scraper.rb
|
77
|
-
- lib/gis_scraper/
|
104
|
+
- lib/gis_scraper/layer.rb
|
78
105
|
- lib/gis_scraper/version.rb
|
79
|
-
homepage:
|
106
|
+
homepage:
|
80
107
|
licenses:
|
81
108
|
- MIT
|
82
|
-
metadata:
|
83
|
-
|
84
|
-
post_install_message:
|
109
|
+
metadata: {}
|
110
|
+
post_install_message:
|
85
111
|
rdoc_options: []
|
86
112
|
require_paths:
|
87
113
|
- lib
|
@@ -89,15 +115,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
89
115
|
requirements:
|
90
116
|
- - ">="
|
91
117
|
- !ruby/object:Gem::Version
|
92
|
-
version:
|
118
|
+
version: '0'
|
93
119
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
120
|
requirements:
|
95
|
-
- - "
|
121
|
+
- - ">"
|
96
122
|
- !ruby/object:Gem::Version
|
97
|
-
version:
|
123
|
+
version: 1.3.1
|
98
124
|
requirements: []
|
99
|
-
|
100
|
-
|
125
|
+
rubyforge_project:
|
126
|
+
rubygems_version: 2.4.8
|
127
|
+
signing_key:
|
101
128
|
specification_version: 4
|
102
|
-
summary:
|
129
|
+
summary: Scrapes ArcGIS data from MapServer REST API
|
103
130
|
test_files: []
|
data/.gitlab-ci.yml
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
---
|
2
|
-
before_script:
|
3
|
-
- apt-get update -qq
|
4
|
-
- apt-get install -y postgresql postgresql-client libpq-dev gdal-bin
|
5
|
-
- ogr2ogr --version
|
6
|
-
- psql -U postgres -h $PG_HOST -d postgres -c "SELECT PostGIS_Lib_version();"
|
7
|
-
- pg_lsclusters
|
8
|
-
- ruby -v
|
9
|
-
- which ruby
|
10
|
-
- gem install bundler
|
11
|
-
- bundle install --jobs $(nproc) "${FLAGS[@]}"
|
12
|
-
- export POSTGRES_HOST=mdillon__postgis
|
13
|
-
|
14
|
-
.job_template: &job_definition
|
15
|
-
image: ruby:3.3.3
|
16
|
-
|
17
|
-
services:
|
18
|
-
# - postgres:latest # must use host "postgres" to connect
|
19
|
-
- mdillon/postgis:11
|
20
|
-
|
21
|
-
variables:
|
22
|
-
PG_HOST: mdillon__postgis
|
23
|
-
POSTGRES_USER: postgres
|
24
|
-
|
25
|
-
test:
|
26
|
-
<<: *job_definition
|
27
|
-
timeout: 10m
|
28
|
-
script:
|
29
|
-
- bundle exec rake spec
|
30
|
-
- bundle exec rubocop
|
data/.rubocop.yml
DELETED
data/.ruby-version
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
3.3.3
|
data/CHANGELOG.md
DELETED
data/Guardfile
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
guard :rspec, cmd: 'bundle exec rspec' do
|
4
|
-
require 'guard/rspec/dsl'
|
5
|
-
dsl = Guard::RSpec::Dsl.new self
|
6
|
-
|
7
|
-
rspec = dsl.rspec
|
8
|
-
watch(rspec.spec_helper) { rspec.spec_dir }
|
9
|
-
watch(rspec.spec_files)
|
10
|
-
|
11
|
-
# Ruby files
|
12
|
-
ruby = dsl.ruby
|
13
|
-
dsl.watch_spec_files_for(ruby.lib_files)
|
14
|
-
end
|
@@ -1,175 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'fileutils'
|
4
|
-
require 'shellwords'
|
5
|
-
require 'tmpdir'
|
6
|
-
|
7
|
-
# tool to write ArcGIS layer(s) to json or database output
|
8
|
-
# rubocop:disable Metrics/ClassLength
|
9
|
-
class LayerWriter
|
10
|
-
attr_reader :type
|
11
|
-
|
12
|
-
GDAL = /GDAL (\d+\.\d+\.\d+)/
|
13
|
-
V1_11_4 = Gem::Version.new('1.11.4') # https://trac.osgeo.org/gdal/ticket/6529
|
14
|
-
TABLES = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"
|
15
|
-
TYPES = ['Group ', 'Feature ', 'Annotation ', 'Annotation Sub'].freeze
|
16
|
-
CONN = %i[host port dbname user password].freeze
|
17
|
-
GEOM_TYPES = { 'esriGeometryPoint' => 'POINT',
|
18
|
-
'esriGeometryMultipoint' => 'MULTIPOINT',
|
19
|
-
'esriGeometryLine' => 'LINESTRING',
|
20
|
-
'esriGeometryPolyline' => 'MULTILINESTRING',
|
21
|
-
'esriGeometryPolygon' => 'MULTIPOLYGON' }.freeze
|
22
|
-
OGR = 'ogr2ogr -overwrite -f "PostgreSQL" PG:'
|
23
|
-
|
24
|
-
def initialize(url:, path: nil, arcrest_opts: {})
|
25
|
-
@url = url
|
26
|
-
@output_path = output_path(path) || config_path
|
27
|
-
@arcrest_opts = arcrest_opts
|
28
|
-
@conn = conn
|
29
|
-
@id = id
|
30
|
-
@service_url = service_url
|
31
|
-
@layer = layer
|
32
|
-
@page_json = @layer.json
|
33
|
-
@type = layer_type
|
34
|
-
@name = name
|
35
|
-
end
|
36
|
-
|
37
|
-
def output_json
|
38
|
-
output(:json)
|
39
|
-
end
|
40
|
-
|
41
|
-
def output_to_db
|
42
|
-
raise 'ogr2ogr executable missing, is GDAL installed and in your PATH?' unless (v_string = ogr2ogr?)
|
43
|
-
raise 'ogr2ogr version must be > 1.11.4' unless Gem::Version.new(v_string.match(GDAL)[1]) > V1_11_4
|
44
|
-
|
45
|
-
output(:db)
|
46
|
-
end
|
47
|
-
|
48
|
-
private
|
49
|
-
|
50
|
-
def conn
|
51
|
-
CONN.zip(CONN.map { |key| GisScraper.config[key] }).to_h
|
52
|
-
end
|
53
|
-
|
54
|
-
def output(format)
|
55
|
-
@type == 'Feature Layer' ? _method(format) : do_sub_layers(format) # recurses sub-layers
|
56
|
-
end
|
57
|
-
|
58
|
-
def _method(format)
|
59
|
-
format == :db ? write_to_db : write_json
|
60
|
-
end
|
61
|
-
|
62
|
-
def output_path(path)
|
63
|
-
File.expand_path(path) if path
|
64
|
-
end
|
65
|
-
|
66
|
-
def connection
|
67
|
-
PG.connect @conn
|
68
|
-
end
|
69
|
-
|
70
|
-
def ogr2ogr?
|
71
|
-
`ogr2ogr --version`
|
72
|
-
rescue Errno::ENOENT
|
73
|
-
nil
|
74
|
-
end
|
75
|
-
|
76
|
-
def config_path
|
77
|
-
File.expand_path GisScraper.config[:output_path]
|
78
|
-
end
|
79
|
-
|
80
|
-
def service_url
|
81
|
-
@url.split('/')[0..-2].join('/')
|
82
|
-
end
|
83
|
-
|
84
|
-
def id
|
85
|
-
@url.split('/').last
|
86
|
-
end
|
87
|
-
|
88
|
-
def layer
|
89
|
-
ArcREST::Layer.new(@url, @arcrest_opts)
|
90
|
-
end
|
91
|
-
|
92
|
-
def layer_type
|
93
|
-
validate_layer @page_json['type']
|
94
|
-
end
|
95
|
-
|
96
|
-
def validate_layer(type)
|
97
|
-
raise "Bad Layer type: #{type}" unless TYPES.any? { |t| type == "#{t}Layer" }
|
98
|
-
|
99
|
-
type
|
100
|
-
end
|
101
|
-
|
102
|
-
def name
|
103
|
-
@page_json['name'].tr('/', '_') # make Postgres-safe
|
104
|
-
end
|
105
|
-
|
106
|
-
def sub_layer_ids
|
107
|
-
@page_json['subLayers'].map { |hash| hash['id'] } || []
|
108
|
-
end
|
109
|
-
|
110
|
-
def json_data
|
111
|
-
FeatureScraper.new(url: "#{@service_url}/#{@id}", arcrest_opts: @arcrest_opts).json_data
|
112
|
-
end
|
113
|
-
|
114
|
-
def write_json
|
115
|
-
File.write json_path, json_data
|
116
|
-
end
|
117
|
-
|
118
|
-
def json_path
|
119
|
-
"#{@output_path}/#{@name}.json"
|
120
|
-
end
|
121
|
-
|
122
|
-
def write_to_db
|
123
|
-
@output_path = Dir.mktmpdir('gis_scraper') # prefix for identification
|
124
|
-
write_json
|
125
|
-
`#{OGR}"#{conn_str}" "#{json_path}" -nln #{table} #{srs} -nlt #{pg_geom}`
|
126
|
-
ensure
|
127
|
-
FileUtils.remove_entry @output_path
|
128
|
-
end
|
129
|
-
|
130
|
-
def pg_geom
|
131
|
-
GEOM_TYPES[geo] || raise("Unknown geom: '#{geo}' for layer #{@name}")
|
132
|
-
end
|
133
|
-
|
134
|
-
def geo
|
135
|
-
@page_json['geometryType']
|
136
|
-
end
|
137
|
-
|
138
|
-
def srs
|
139
|
-
return '' unless GisScraper.config[:srs]
|
140
|
-
|
141
|
-
"-a_srs #{GisScraper.config[:srs]}" || ''
|
142
|
-
end
|
143
|
-
|
144
|
-
def tables
|
145
|
-
connection.exec(TABLES).map { |tup| tup['table_name'] } # list of current db table names
|
146
|
-
end
|
147
|
-
|
148
|
-
def table
|
149
|
-
table_name << table_suffix
|
150
|
-
end
|
151
|
-
|
152
|
-
def table_name
|
153
|
-
Shellwords.escape(@name.downcase.tr(' ', '_')).prepend('_')
|
154
|
-
end
|
155
|
-
|
156
|
-
def table_suffix
|
157
|
-
tables.any? { |t| t == table_name } ? '_' : ''
|
158
|
-
end
|
159
|
-
|
160
|
-
def conn_str
|
161
|
-
host, port, db, user, pwd = *@conn.values
|
162
|
-
"dbname='#{db}' host='#{host}' port='#{port}' user='#{user}' password='#{pwd}'"
|
163
|
-
end
|
164
|
-
|
165
|
-
def do_sub_layers(format)
|
166
|
-
FileUtils.mkdir File.join(@output_path, @name) if format == :json
|
167
|
-
path = @output_path << "/#{@name}"
|
168
|
-
sub_layer_ids.each { |n| sub_layer(n, path).send(:output, format) }
|
169
|
-
end
|
170
|
-
|
171
|
-
def sub_layer(id, path)
|
172
|
-
self.class.new(url: "#{@service_url}/#{id}", path: path, arcrest_opts: @arcrest_opts) # recurse
|
173
|
-
end
|
174
|
-
end
|
175
|
-
# rubocop:enable Metrics/ClassLength
|