gis_scraper 0.1.9.pre → 0.1.10.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b4673aa6d74aa17a2b9cc7edd3629955edcf2893
4
- data.tar.gz: 05f54e0c85a5f5cf53ea10c6ee4ce1206e4f7bc0
3
+ metadata.gz: 6c95a3f2a8c7e50cf4df5e6f0f363d660e19c5a8
4
+ data.tar.gz: 5b6e929eec3af5ad0b767d34239de3d00d8e4d67
5
5
  SHA512:
6
- metadata.gz: aa65d5317c40e78c07a4007129c57079c42d7bd2389a2a98486b8970f566d8f097edb12ac1cef9811d11f9902c14ae5ecddcd7c3106be48cd2d1f22b2871d442
7
- data.tar.gz: 5f9eb70359db47600488932df54bc76be6129b902e94e2ab6c9a07ed03537e07b693139ca34f63c26fd03dccc1c6220c57e1d737baaffca638c90a82ae5377d3
6
+ metadata.gz: d13788b449d9c87a009816b48ef89038a7b686307ed68a915b02059580d6189ca946e3889485ee26f50d2e2cb1b296504cb32353d8f4461c7435e5e58fe86a27
7
+ data.tar.gz: cb6ec2b8c1b747a32e799dc7a905d7824b3e7f5b2a6382230ad7d4017ef2a13773a7ee4c63af900edacbf02454d0d18c45989f9a1894dcd0310cf1100807e540
data/.travis.yml CHANGED
@@ -1,26 +1,21 @@
1
1
  language: ruby
2
-
3
- addons:
4
- postgresql: "9.4"
2
+ rvm:
3
+ - 2.1
4
+ - 2.2
5
5
 
6
6
  services:
7
7
  - postgresql
8
8
 
9
- before_script:
10
- - psql -c 'create database travis_ci_test;' -U postgres
11
- - psql -U postgres -c 'create extension postgis;'
9
+ addons:
10
+ postgresql: "9.4"
12
11
 
13
12
  before_install:
14
13
  - gem update bundler
15
- # http://askubuntu.com/questions/206593/how-to-install-rgdal-on-ubuntu-12-10
16
- - sudo apt-get update -qq
17
- - sudo apt-get install -y aptitude
18
- - sudo aptitude install -y libgdal-dev libproj-dev
19
14
 
20
- rvm:
21
- - 2.0.0
22
- - 2.1.6
23
- - 2.2.3
24
- - rbx-2.9
15
+ before_script:
16
+ - sudo rm /etc/apt/sources.list.d/ubuntugis-stable-source.list # https://github.com/travis-ci/travis-ci/issues/2401
17
+ - sudo apt-get update -qq
18
+ - sudo apt-get install gdal-bin -y
19
+ - psql -c "create extension postgis" -U $USER -d $USER # default db name is same as user
25
20
 
26
21
  script: bundle exec rspec spec
data/README.md CHANGED
@@ -79,22 +79,22 @@ GisScraper.config # returns the hash of configuration values
79
79
 
80
80
  ## Usage
81
81
 
82
- A Layer object must be instantiated with one required arg - a MapServer/Layer URL (ending in an integer representing the layer number). Example:
82
+ A LayerWriter object must be instantiated with one required arg - a Service/Layer URL (ending in an integer representing the layer number). Example:
83
83
 
84
84
  ```
85
- layer = Layer.new('http://gps.digimap.gg/arcgis/rest/services/StatesOfJersey/JerseyMappingOL/MapServer/0')
85
+ writer = LayerWriter.new('http://gps.digimap.gg/arcgis/rest/services/StatesOfJersey/JerseyMappingOL/MapServer/0')
86
86
  ```
87
87
 
88
88
  An optional second argument for the output path for JSON files may be specified. If so, this overrides the configuration option. Example:
89
89
 
90
90
  ```
91
- layer = Layer.new('http://gps.digimap.gg/arcgis/rest/services/StatesOfJersey/JerseyMappingOL/MapServer/0', '~/Desktop')
91
+ writer = LayerWriter.new('http://gps.digimap.gg/arcgis/rest/services/StatesOfJersey/JerseyMappingOL/MapServer/0', '~/Desktop')
92
92
  ```
93
93
 
94
94
  **JSON output**
95
95
 
96
96
  ```
97
- layer.output_json
97
+ writer.output_json
98
98
  ```
99
99
 
100
100
  If the layer is type 'Feature Layer', a single file of JSON data will be saved (named the same as the layer). If the layer is type 'Group Layer', the sub-group structure is traversed recursively thus: Directories for each sub-group layer are created and JSON data files for each constituent feature layer written to them.
@@ -104,7 +104,7 @@ If the layer is type 'Feature Layer', a single file of JSON data will be saved (
104
104
  Valid database config options must be set. The following command will convert JSON files, create tables for each layer (& sub-layers, if any) and import the data. Table names are lowercased, prefixed with '_' and have spaces replaced with underscores. If a table with the same name exists, '_' is appended to the name.
105
105
 
106
106
  ```
107
- layer.output_to_db
107
+ writer.output_to_db
108
108
  ```
109
109
 
110
110
  ## Specification and Tests
data/Rakefile CHANGED
@@ -3,4 +3,4 @@ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
- task :default => :spec
6
+ task default: :spec
data/gis_scraper.gemspec CHANGED
@@ -10,21 +10,21 @@ Gem::Specification.new do |s|
10
10
  s.authors = ['Bruce Steedman']
11
11
  s.email = ['bruce.steedman@gmail.com']
12
12
 
13
- s.summary = %q{Scrapes ArcGIS data from MapServer REST API}
14
- s.description = %q{Scrapes ArcGIS data from MapServer REST API}
13
+ s.summary = 'Scrapes ArcGIS data from MapServer REST API'
14
+ s.description = 'Scrapes ArcGIS data from MapServer REST API'
15
15
  s.required_ruby_version = '>= 2.0'
16
- s.license = "MIT"
16
+ s.license = 'MIT'
17
17
 
18
- s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(spec)/}) }
18
+ s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(spec)/}) }
19
19
  s.bindir = 'exe'
20
20
  s.executables = s.files.grep(%r{^exe/}) { |f| File.basename(f) }
21
21
  s.require_paths = ['lib']
22
22
 
23
23
  s.add_development_dependency 'bundler', '~> 1.11'
24
24
  s.add_development_dependency 'rake', '~> 10.0'
25
- s.add_development_dependency 'rspec', '~> 3.0'
25
+ s.add_development_dependency 'rspec', '~> 3.4'
26
26
 
27
- s.add_runtime_dependency 'mechanize', '~> 2.7'
28
- s.add_runtime_dependency 'parallel', '~> 1.6'
27
+ s.add_runtime_dependency 'arcrest', '~> 0.0.2'
28
+ s.add_runtime_dependency 'parallel', '~> 1.9'
29
29
  s.add_development_dependency 'pg', '~> 0.18'
30
30
  end
data/lib/gis_scraper.rb CHANGED
@@ -1,27 +1,27 @@
1
1
  require 'yaml'
2
- require 'mechanize'
2
+ require 'arcrest'
3
+ # require 'mechanize'
3
4
  require 'parallel'
4
5
  require 'pg'
5
6
 
6
7
  require 'gis_scraper/version'
7
8
  require 'gis_scraper/feature_scraper'
8
- require 'gis_scraper/layer'
9
+ require 'gis_scraper/layer_writer'
9
10
 
10
11
  # stackoverflow.com/questions/6233124/where-to-place-access-config-file-in-gem
11
12
  module GisScraper
12
-
13
- @config = {threads: 8, output_path: '~/Desktop',
14
- host: 'localhost', port: 5432, dbname: 'postgres', user: 'postgres', password: nil,
15
- srs: nil}
13
+ @config = { threads: 8, output_path: '~/Desktop', host: 'localhost',
14
+ port: 5432, dbname: 'postgres', user: 'postgres', password: nil,
15
+ srs: nil }
16
16
  @valid_keys = @config.keys
17
17
 
18
18
  def self.configure(opts = {})
19
- opts.each { |k,v| @config[k.to_sym] = v if @valid_keys.include? k.to_sym }
19
+ opts.each { |k, v| @config[k.to_sym] = v if @valid_keys.include? k.to_sym }
20
20
  end
21
21
 
22
22
  def self.configure_with(path_to_yaml_file)
23
23
  begin
24
- config = YAML::load(IO.read(path_to_yaml_file))
24
+ config = YAML.load(IO.read(path_to_yaml_file))
25
25
  rescue Errno::ENOENT
26
26
  puts "YAML configuration file couldn't be found. Using defaults"
27
27
  return
@@ -37,13 +37,13 @@ module GisScraper
37
37
  @config
38
38
  end
39
39
 
40
- class JSONParser < Mechanize::File # shared by FeatureScraper & Layer
41
- attr_reader :json
42
-
43
- def initialize(uri=nil, response=nil, body=nil, code=nil)
44
- super(uri, response, body, code)
45
- @json = JSON.parse(body)
46
- end
47
- end
40
+ # shared by FeatureScraper & Layer
41
+ # class JSONParser < Mechanize::File
42
+ # attr_reader :json
48
43
 
44
+ # def initialize(uri = nil, response = nil, body = nil, code = nil)
45
+ # super(uri, response, body, code)
46
+ # @json = JSON.parse(body)
47
+ # end
48
+ # end
49
49
  end
@@ -1,71 +1,89 @@
1
+ # scrapes feature layers
1
2
  class FeatureScraper
3
+ API_CALL_LIMIT = 1000
4
+ STRING = 'esriFieldTypeString'.freeze
5
+ VARCHAR_MAX_SIZE = 10_485_760 # max size for PostgreSQL VARCHAR
2
6
 
3
7
  attr_reader :name
4
8
 
5
9
  def initialize(url)
6
10
  @url = url
7
- @agent = Mechanize.new
8
- @agent.pluggable_parser['text/plain'] = GisScraper::JSONParser
9
- @layer = layer # hash of json
10
- @name, @pk, @max = name, pk, max # maxRecordCount - usually 1000
11
- @form = form
11
+ @layer = layer
12
+ @json = json
13
+ @name = name
14
+ @pk = pk
15
+ @max = max # maxRecordCount - usually 1000
12
16
  @loops = loops
13
17
  @threads = GisScraper.config[:threads]
14
18
  end
15
19
 
16
20
  def json_data
17
- data(0).merge({'features' => features(@threads)}).to_json
21
+ query_without_features.merge('features' => all_features(@threads)).to_json
18
22
  end
19
23
 
20
24
  private
21
25
 
26
+ def query_without_features # check_field_length not needed ogr2ogr >= 1.11.5
27
+ check_field_length @layer.query(where: '1=0')
28
+ end
29
+
22
30
  def layer
23
- @agent.get(@url + '?f=pjson').json
31
+ ArcREST::Layer.new(@url)
32
+ end
33
+
34
+ def json
35
+ @layer.json
24
36
  end
25
37
 
26
38
  def name
27
- @layer['name']
39
+ @layer.name
40
+ end
41
+
42
+ def renderer
43
+ @layer.drawing_info['renderer']
28
44
  end
29
45
 
30
46
  def pk
31
- @layer['fields'].select { |f| f['type'] == 'esriFieldTypeOID' }[0]['name']
47
+ @json['fields'].select { |f| f['type'] == 'esriFieldTypeOID' }[0]['name']
32
48
  end
33
49
 
34
50
  def max
35
- @layer['maxRecordCount'].to_i
51
+ @layer.max_record_count || API_CALL_LIMIT
36
52
  end
37
53
 
38
- def form
39
- @agent.get(@url + '/query').forms.first
54
+ def count
55
+ @layer.count
40
56
  end
41
57
 
42
- def count
43
- set_query_params
44
- @form.submit(@form.buttons[1]).json['count'].to_i
58
+ def features(n)
59
+ @layer.features(where: where_text(n))
60
+ end
61
+
62
+ def check_field_length(hash) # https://trac.osgeo.org/gdal/ticket/6529
63
+ hash.merge check_fields(hash['fields'])
45
64
  end
46
65
 
47
- def set_query_params(loop_num = nil)
48
- @form.fields[0].value = where_text(loop_num)
49
- loop_num ? @form.radiobuttons[4].uncheck : @form.radiobuttons[4].check
50
- @form.fields[6].value = '*'
51
- @form.field_with(name: 'f').options[1].select # for JSON
66
+ def check_fields(fields)
67
+ { 'fields' => fields.map { |f| f['type'] == STRING ? esri_string(f) : f } }
52
68
  end
53
69
 
54
- def data(n)
55
- set_query_params(n)
56
- @form.submit(@form.buttons[1]).json
70
+ def esri_string(fields)
71
+ Hash[fields.map { |k, v| [k, k == 'length' ? truncate(v) : v] }] # nice :)
57
72
  end
58
73
 
59
- def features(t)
60
- Parallel.map(0...@loops, in_threads: t) { |n| data(n)['features'] }.flatten
74
+ def truncate(length)
75
+ length > VARCHAR_MAX_SIZE ? 0 : length
76
+ end
77
+
78
+ def all_features(threads)
79
+ Parallel.map(0...@loops, in_threads: threads) { |n| features(n) }.flatten
61
80
  end
62
81
 
63
82
  def loops
64
- (count.to_f/@max).ceil
83
+ (count.to_f / @max).ceil
65
84
  end
66
85
 
67
86
  def where_text(n)
68
87
  n ? "#{pk} > #{n * @max} AND #{pk} <= #{(n + 1) * @max}" : "#{pk} > 0"
69
88
  end
70
-
71
89
  end
@@ -1,8 +1,9 @@
1
1
  require 'fileutils'
2
2
  require 'tmpdir'
3
+ require 'shellwords'
3
4
 
4
- class Layer
5
-
5
+ # tool to write ArcGIS layer(s) to json or database output
6
+ class LayerWriter
6
7
  class UnknownLayerType < StandardError; end
7
8
  class NoDatabase < StandardError; end
8
9
  class OgrMissing < StandardError; end
@@ -10,31 +11,33 @@ class Layer
10
11
  attr_reader :type
11
12
 
12
13
  TABLES = "SELECT table_name FROM information_schema.tables\
13
- WHERE table_schema = 'public'"
14
+ WHERE table_schema = 'public'".freeze
14
15
 
15
- TYPE = %w(Group\ Layer Feature\ Layer Annotation\ Layer Annotation\ SubLayer)
16
+ TYPE = ['Group Layer', 'Feature Layer', 'Annotation Layer',
17
+ 'Annotation SubLayer'].freeze
16
18
 
17
- CONN = [:host, :port, :dbname, :user, :password] # PG connection options
19
+ CONN = [:host, :port, :dbname, :user, :password].freeze
18
20
 
19
- GEOM_TYPES = {'esriGeometryPoint' => 'POINT',
20
- 'esriGeometryMultipoint' => 'MULTIPOINT',
21
- 'esriGeometryLine' => 'LINESTRING',
22
- 'esriGeometryPolyline' => 'MULTILINESTRING',
23
- 'esriGeometryPolygon' => 'MULTIPOLYGON'}
21
+ GEOM_TYPES = { 'esriGeometryPoint' => 'POINT',
22
+ 'esriGeometryMultipoint' => 'MULTIPOINT',
23
+ 'esriGeometryLine' => 'LINESTRING',
24
+ 'esriGeometryPolyline' => 'MULTILINESTRING',
25
+ 'esriGeometryPolygon' => 'MULTIPOLYGON' }.freeze
24
26
 
25
- MSURL = 'MapServer'
26
- OGR = 'ogr2ogr -overwrite -f "PostgreSQL" PG:'
27
+ OGR = 'ogr2ogr -overwrite -f "PostgreSQL" PG:'.freeze
27
28
 
28
29
  def initialize(url, path = nil)
29
30
  @conn_hash = CONN.zip(CONN.map { |key| GisScraper.config[key] }).to_h
30
31
  @url = url
31
32
  @output_path = output_path(path) || config_path
32
- @id, @mapserver_url = id, mapserver_url # mapserver url ends '../MapServer'
33
- @agent = Mechanize.new
34
- @agent.pluggable_parser['text/plain'] = GisScraper::JSONParser
35
- validate_url
33
+ @id = id
34
+ @service_url = service_url
35
+ @layer = layer
36
36
  @page_json = page_json
37
- @type, @name, @sub_layer_ids, @geo = type, name, sub_layer_ids, geo
37
+ @type = type
38
+ @name = name
39
+ @sub_layer_ids = sub_layer_ids
40
+ @geo = geo
38
41
  end
39
42
 
40
43
  def output_json
@@ -42,8 +45,8 @@ class Layer
42
45
  end
43
46
 
44
47
  def output_to_db
45
- raise OgrMissing.new, 'ogr2ogr missing, is GDAL installed?' if !ogr2ogr?
46
- raise NoDatabase.new, "No db connection: #{@conn_hash.inspect}" if !conn
48
+ raise OgrMissing.new, 'ogr2ogr missing, is GDAL installed?' unless ogr2ogr?
49
+ raise NoDatabase.new, "No db connection: #{@conn_hash.inspect}" unless conn
47
50
  output(:db)
48
51
  end
49
52
 
@@ -75,7 +78,7 @@ class Layer
75
78
  File.expand_path GisScraper.config[:output_path]
76
79
  end
77
80
 
78
- def mapserver_url
81
+ def service_url
79
82
  @url.split('/')[0..-2].join('/')
80
83
  end
81
84
 
@@ -83,13 +86,12 @@ class Layer
83
86
  @url.split('/').last
84
87
  end
85
88
 
86
- def validate_url
87
- raise ArgumentError, 'URL must end with layer id' if @id.to_i.to_s != @id
88
- raise ArgumentError, 'Bad MapServer URL' if @mapserver_url[-9..-1] != MSURL
89
+ def layer
90
+ ArcREST::Layer.new @url
89
91
  end
90
92
 
91
93
  def page_json
92
- @agent.get(@url + '?f=pjson').json
94
+ @layer.json
93
95
  end
94
96
 
95
97
  def type
@@ -101,7 +103,7 @@ class Layer
101
103
  end
102
104
 
103
105
  def validate_type(type)
104
- raise UnknownLayerType, type unless (TYPE.any? { |t| t == type })
106
+ raise UnknownLayerType, type unless TYPE.any? { |t| t == type }
105
107
  type
106
108
  end
107
109
 
@@ -110,7 +112,7 @@ class Layer
110
112
  end
111
113
 
112
114
  def json_data
113
- FeatureScraper.new("#{@mapserver_url}/#{@id}").json_data
115
+ FeatureScraper.new("#{@service_url}/#{@id}").json_data
114
116
  end
115
117
 
116
118
  def write_json
@@ -125,14 +127,14 @@ class Layer
125
127
  @output_path = Dir.mktmpdir('gis_scraper') # prefix for identification
126
128
  begin
127
129
  write_json
128
- `#{OGR}"#{c_str}" "#{json_path}" -nln #{table} #{srs} -nlt #{geom}`
130
+ `#{OGR}"#{conn_str}" "#{json_path}" -nln #{table} #{srs} -nlt #{geom}`
129
131
  ensure
130
132
  FileUtils.remove_entry @output_path
131
133
  end
132
134
  end
133
135
 
134
136
  def geom
135
- GEOM_TYPES[@geo] || raise("Unknown geometry: '#{@geo}' for layer #{@name}")
137
+ GEOM_TYPES[@geo] || raise("Unknown geom: '#{@geo}' for layer #{@name}")
136
138
  end
137
139
 
138
140
  def geo
@@ -153,14 +155,14 @@ class Layer
153
155
  end
154
156
 
155
157
  def table_name
156
- Shellwords.escape(@name.downcase.gsub(' ', '_')).prepend('_')
158
+ Shellwords.escape(@name.downcase.tr(' ', '_')).prepend('_')
157
159
  end
158
160
 
159
161
  def table_suffix
160
162
  tables.any? { |t| t == table_name } ? '_' : ''
161
163
  end
162
164
 
163
- def c_str
165
+ def conn_str
164
166
  host, port, db, user, pwd = *@conn_hash.values
165
167
  "host=#{host} port=#{port} dbname=#{db} user=#{user} password=#{pwd}"
166
168
  end
@@ -172,11 +174,10 @@ class Layer
172
174
  end
173
175
 
174
176
  def sub_layer(id, path)
175
- Layer.new("#{@mapserver_url}/#{id}", path)
177
+ self.class.new("#{@service_url}/#{id}", path)
176
178
  end
177
179
 
178
180
  def replace_forwardslashes_with_underscores(string)
179
- string.gsub /\//, '_'
181
+ string.tr('/', '_')
180
182
  end
181
-
182
183
  end
@@ -1,3 +1,3 @@
1
1
  module GisScraper
2
- VERSION = '0.1.9.pre'
2
+ VERSION = '0.1.10.pre'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gis_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9.pre
4
+ version: 0.1.10.pre
5
5
  platform: ruby
6
6
  authors:
7
7
  - Bruce Steedman
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-01-15 00:00:00.000000000 Z
11
+ date: 2016-06-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -44,42 +44,42 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '3.0'
47
+ version: '3.4'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '3.0'
54
+ version: '3.4'
55
55
  - !ruby/object:Gem::Dependency
56
- name: mechanize
56
+ name: arcrest
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '2.7'
61
+ version: 0.0.2
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '2.7'
68
+ version: 0.0.2
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: parallel
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '1.6'
75
+ version: '1.9'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '1.6'
82
+ version: '1.9'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: pg
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -113,7 +113,7 @@ files:
113
113
  - gis_scraper.gemspec
114
114
  - lib/gis_scraper.rb
115
115
  - lib/gis_scraper/feature_scraper.rb
116
- - lib/gis_scraper/layer.rb
116
+ - lib/gis_scraper/layer_writer.rb
117
117
  - lib/gis_scraper/version.rb
118
118
  homepage:
119
119
  licenses:
@@ -135,7 +135,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
135
135
  version: 1.3.1
136
136
  requirements: []
137
137
  rubyforge_project:
138
- rubygems_version: 2.4.8
138
+ rubygems_version: 2.5.1
139
139
  signing_key:
140
140
  specification_version: 4
141
141
  summary: Scrapes ArcGIS data from MapServer REST API