geo_combine 0.2.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +4 -3
- data/Gemfile +2 -1
- data/README.md +107 -26
- data/geo_combine.gemspec +4 -2
- data/lib/geo_combine.rb +8 -1
- data/lib/geo_combine/bounding_box.rb +71 -0
- data/lib/geo_combine/ckan_metadata.rb +112 -0
- data/lib/geo_combine/exceptions.rb +2 -0
- data/lib/geo_combine/formatting.rb +6 -1
- data/lib/geo_combine/geo_blacklight_harvester.rb +204 -0
- data/lib/geo_combine/geoblacklight.rb +62 -13
- data/lib/geo_combine/ogp.rb +229 -0
- data/lib/geo_combine/railtie.rb +7 -0
- data/lib/geo_combine/version.rb +1 -1
- data/lib/tasks/geo_combine.rake +54 -20
- data/lib/xslt/fgdc2html.xsl +105 -157
- data/lib/xslt/iso2html.xsl +1107 -1070
- data/spec/features/iso2html_spec.rb +7 -1
- data/spec/fixtures/docs/ckan.json +456 -0
- data/spec/fixtures/docs/geoblacklight_pre_v1.json +37 -0
- data/spec/fixtures/docs/ogp_harvard_line.json +28 -0
- data/spec/fixtures/docs/ogp_harvard_raster.json +28 -0
- data/spec/fixtures/docs/ogp_tufts_vector.json +31 -0
- data/spec/fixtures/json_docs.rb +20 -0
- data/spec/lib/geo_combine/bounding_box_spec.rb +59 -0
- data/spec/lib/geo_combine/ckan_metadata_spec.rb +114 -0
- data/spec/lib/geo_combine/formatting_spec.rb +6 -0
- data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb +190 -0
- data/spec/lib/geo_combine/geoblacklight_spec.rb +38 -7
- data/spec/lib/geo_combine/ogp_spec.rb +163 -0
- data/spec/spec_helper.rb +1 -0
- metadata +65 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: '0439618d4248a21c1f4efe91e863d01ad9b97f0798b7f855b60c15eab7457bbf'
|
4
|
+
data.tar.gz: a01ec4dc01b9c6d3dd39e34c178f2271b83d8c51ecf16226e8595a36ddbf37ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c7c77f1cf1b60e7438f144d97a1af16126cdc1c9567ae2006cdcaa26c47fc0d5af8672e6368b5b139789b8f44f55cbadc1bd1f220b84b4a36c4bfc78dba3a65
|
7
|
+
data.tar.gz: d33c6c00ed6b39a6c2a8238ad9e14304bb59e450a031b8b8836635bb71d4a2deed3b8b0de68ca10c6e0ae9e9013f67acab42e765534475e2040af76f65a7a1ad
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -3,12 +3,13 @@
|
|
3
3
|
[![Build Status](https://travis-ci.org/OpenGeoMetadata/GeoCombine.svg?branch=master)](https://travis-ci.org/OpenGeoMetadata/GeoCombine) | [![Coverage Status](https://coveralls.io/repos/OpenGeoMetadata/GeoCombine/badge.svg?branch=master)](https://coveralls.io/r/OpenGeoMetadata/GeoCombine?branch=master)
|
4
4
|
|
5
5
|
|
6
|
-
|
7
|
-
|
6
|
+
A Ruby toolkit for managing geospatial metadata, including:
|
7
|
+
- tasks for cloning, updating, and indexing OpenGeoMetadata metadata
|
8
|
+
- library for converting metadata between standards
|
8
9
|
|
9
10
|
## Installation
|
10
11
|
|
11
|
-
Add this line to your application's Gemfile
|
12
|
+
Add this line to your application's `Gemfile`:
|
12
13
|
|
13
14
|
```ruby
|
14
15
|
gem 'geo_combine'
|
@@ -16,81 +17,161 @@ gem 'geo_combine'
|
|
16
17
|
|
17
18
|
And then execute:
|
18
19
|
|
19
|
-
$ bundle
|
20
|
+
$ bundle install
|
20
21
|
|
21
22
|
Or install it yourself as:
|
22
23
|
|
23
24
|
$ gem install geo_combine
|
24
25
|
|
25
26
|
## Usage
|
26
|
-
GeoCombine can be used as a set of rake tasks for cloning, updating, and indexing OpenGeoMetdata metdata. It can also be used as a Ruby library for converting metdata.
|
27
27
|
|
28
|
-
###
|
28
|
+
### Converting metadata
|
29
29
|
|
30
30
|
```ruby
|
31
31
|
# Create a new ISO19139 object
|
32
32
|
> iso_metadata = GeoCombine::Iso19139.new('./tmp/opengeometadata/edu.stanford.purl/bb/338/jh/0716/iso19139.xml')
|
33
33
|
|
34
|
-
# Convert
|
34
|
+
# Convert ISO to GeoBlacklight
|
35
35
|
> iso_metadata.to_geoblacklight
|
36
36
|
|
37
37
|
# Convert that to JSON
|
38
38
|
> iso_metadata.to_geoblacklight.to_json
|
39
39
|
|
40
|
-
# Convert ISO or FGDC to HTML
|
40
|
+
# Convert ISO (or FGDC) to HTML
|
41
41
|
> iso_metadata.to_html
|
42
42
|
```
|
43
43
|
|
44
|
-
|
44
|
+
### OpenGeoMetadata
|
45
|
+
|
46
|
+
#### Clone OpenGeoMetadata repositories locally
|
45
47
|
|
46
|
-
|
48
|
+
```sh
|
49
|
+
$ bundle exec rake geocombine:clone
|
50
|
+
```
|
47
51
|
|
48
|
-
|
52
|
+
Will clone all `edu.*`, `org.*`, and `uk.*` OpenGeoMetadata repositories into `./tmp/opengeometadata`. Location of the OpenGeoMetadata repositories can be configured using the `OGM_PATH` environment variable.
|
49
53
|
|
50
54
|
```sh
|
51
|
-
$ rake geocombine:clone
|
55
|
+
$ OGM_PATH='my/custom/location' bundle exec rake geocombine:clone
|
52
56
|
```
|
53
57
|
|
58
|
+
You can also specify a single repository:
|
59
|
+
|
54
60
|
```sh
|
55
|
-
$ bundle exec geocombine
|
61
|
+
$ bundle exec rake geocombine:clone[edu.stanford.purl]
|
56
62
|
```
|
57
63
|
|
58
|
-
|
64
|
+
#### Update local OpenGeoMetadata repositories
|
59
65
|
|
60
66
|
```sh
|
61
|
-
$
|
67
|
+
$ bundle exec rake geocombine:pull
|
62
68
|
```
|
63
69
|
|
64
|
-
|
70
|
+
Runs `git pull origin master` on all cloned repositories in `./tmp/opengeometadata` (or custom path with configured environment variable `OGM_PATH`).
|
71
|
+
|
72
|
+
You can also specify a single repository:
|
65
73
|
|
66
74
|
```sh
|
67
|
-
$ rake geocombine:pull
|
75
|
+
$ bundle exec rake geocombine:pull[edu.stanford.purl]
|
68
76
|
```
|
69
77
|
|
78
|
+
#### Index GeoBlacklight documents
|
79
|
+
|
80
|
+
To index into Solr, GeoCombine requires a Solr instance that is running the
|
81
|
+
[GeoBlacklight schema](https://github.com/geoblacklight/geoblacklight):
|
82
|
+
|
70
83
|
```sh
|
71
|
-
$ bundle exec geocombine
|
84
|
+
$ bundle exec rake geocombine:index
|
72
85
|
```
|
73
86
|
|
74
|
-
|
87
|
+
Indexes the `geoblacklight.json` files in cloned repositories to a Solr index running at http://127.0.0.1:8983/solr
|
75
88
|
|
76
|
-
|
89
|
+
##### Custom Solr location
|
90
|
+
|
91
|
+
Solr location can also be specified by an environment variable `SOLR_URL`.
|
77
92
|
|
78
93
|
```sh
|
79
|
-
$ rake geocombine:index
|
94
|
+
$ SOLR_URL=http://www.example.com:1234/solr/collection bundle exec rake geocombine:index
|
80
95
|
```
|
81
96
|
|
97
|
+
Depending on your Solr instance's performance characteristics, you may want to
|
98
|
+
change the [`commitWithin` parameter](https://lucene.apache.org/solr/guide/6_6/updatehandlers-in-solrconfig.html) (in milliseconds):
|
99
|
+
|
82
100
|
```sh
|
83
|
-
$ bundle exec geocombine
|
101
|
+
$ SOLR_COMMIT_WITHIN=100 bundle exec rake geocombine:index
|
84
102
|
```
|
85
103
|
|
86
|
-
|
104
|
+
### Harvesting and indexing documents from GeoBlacklight sites
|
87
105
|
|
88
|
-
|
106
|
+
GeoCombine provides a Harvester class and rake task to harvest and index content from GeoBlacklight sites (or any site that follows the Blacklight API format). Given that the configurations can change from consumer to consumer and site to site, the class provides a relatively simple configuration API. This can be configured in an initializer, a wrapping rake task, or any other ruby context where the rake task or class would be invoked.
|
89
107
|
|
90
|
-
|
108
|
+
```sh
|
109
|
+
bundle exec rake geocombine:geoblacklight_harvester:index[YOUR_CONFIGURED_SITE_KEY]
|
110
|
+
```
|
111
|
+
|
112
|
+
#### Harvester configuration
|
113
|
+
|
114
|
+
Only the sites themselves are required to be configured but there are various configuration options that can (optionally) be supplied to modify the harvester's behavior.
|
115
|
+
|
116
|
+
```ruby
|
117
|
+
GeoCombine::GeoBlacklightHarvester.configure do
|
118
|
+
{
|
119
|
+
commit_within: '10000',
|
120
|
+
crawl_delay: 1, # All sites
|
121
|
+
debug: true,
|
122
|
+
SITE1: {
|
123
|
+
crawl_delay: 2, # SITE1 only
|
124
|
+
host: 'https://geoblacklight.example.edu',
|
125
|
+
params: {
|
126
|
+
f: {
|
127
|
+
dct_provenance_s: ['Institution']
|
128
|
+
}
|
129
|
+
}
|
130
|
+
},
|
131
|
+
SITE2: {
|
132
|
+
host: 'https://geoportal.example.edu',
|
133
|
+
params: {
|
134
|
+
q: '*'
|
135
|
+
}
|
136
|
+
}
|
137
|
+
}
|
138
|
+
end
|
139
|
+
```
|
140
|
+
|
141
|
+
##### Crawl Delays (default: none)
|
142
|
+
|
143
|
+
Crawl delays can be configured (in seconds) either globally for all sites or on a per-site basis. This will cause a delay for that number of seconds between each search results page (note that Blacklight 7 necessitates a lot of requests per results page and this only causes the delay per page of results).
|
144
|
+
|
145
|
+
##### Solr's commitWithin (default: 5000 milliseconds)
|
146
|
+
|
147
|
+
Solr's commitWithin option can be configured (in milliseconds) by passing a value under the commit_within key.
|
148
|
+
|
149
|
+
##### Debugging (default: false)
|
150
|
+
|
151
|
+
The harvester and indexer will only `puts` content when errors happen. It is possible to see some progress information by setting the debug configuration option.
|
152
|
+
|
153
|
+
#### Transforming Documents
|
154
|
+
|
155
|
+
You may need to transform documents that are harvested for various purposes (removing fields, adding fields, omitting a document altogether, etc). You can configure some ruby code (a proc) that will take the document in, transform it, and return the transformed document. By default the indexer will remove the `score`, `timestamp`, and `_version_` fields from the documents harvested. If you provide your own transformer, you'll likely want to remove these fields in addition to the other transformations you provide.
|
156
|
+
|
157
|
+
```ruby
|
158
|
+
GeoCombine::GeoBlacklightIndexer.document_transformer = -> (document) do
|
159
|
+
# Removes "bogus_field" from the content we're harvesting
|
160
|
+
# in addition to some other solr fields we don't want
|
161
|
+
%w[_version_ score timestamp bogus_field].each do |field|
|
162
|
+
document.delete(field)
|
163
|
+
end
|
164
|
+
|
165
|
+
document
|
166
|
+
end
|
167
|
+
```
|
168
|
+
|
169
|
+
## Tests
|
170
|
+
|
171
|
+
To run the tests, use:
|
91
172
|
|
92
173
|
```sh
|
93
|
-
$
|
174
|
+
$ bundle exec rake spec
|
94
175
|
```
|
95
176
|
|
96
177
|
## Contributing
|
data/geo_combine.gemspec
CHANGED
@@ -18,14 +18,16 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
+
spec.add_dependency 'activesupport'
|
21
22
|
spec.add_dependency 'rsolr'
|
23
|
+
spec.add_dependency 'net-http-persistent', '~> 2.0' # pin since faraday (rsolr) doesn't work correctly with 3.x
|
22
24
|
spec.add_dependency 'nokogiri'
|
23
25
|
spec.add_dependency 'json-schema'
|
24
26
|
spec.add_dependency 'sanitize'
|
25
27
|
spec.add_dependency 'thor'
|
26
28
|
|
27
|
-
spec.add_development_dependency
|
28
|
-
spec.add_development_dependency
|
29
|
+
spec.add_development_dependency 'bundler'
|
30
|
+
spec.add_development_dependency 'rake'
|
29
31
|
spec.add_development_dependency 'rspec'
|
30
32
|
spec.add_development_dependency 'rspec-html-matchers'
|
31
33
|
end
|
data/lib/geo_combine.rb
CHANGED
@@ -20,7 +20,7 @@ module GeoCombine
|
|
20
20
|
##
|
21
21
|
# Creates a new GeoCombine::Metadata object, where the metadata parameter can
|
22
22
|
# be a File path or String of XML
|
23
|
-
# @param [String] metadata can be a File path
|
23
|
+
# @param [String] metadata can be a File path
|
24
24
|
# "./tmp/edu.stanford.purl/bb/338/jh/0716/iso19139.xml" or a String of XML
|
25
25
|
# metadata
|
26
26
|
def initialize metadata
|
@@ -58,12 +58,19 @@ require 'geo_combine/geometry_types'
|
|
58
58
|
|
59
59
|
# Require helper mixins
|
60
60
|
require 'geo_combine/formatting'
|
61
|
+
require 'geo_combine/bounding_box'
|
61
62
|
|
62
63
|
# Require additional classes
|
63
64
|
require 'geo_combine/fgdc'
|
64
65
|
require 'geo_combine/geoblacklight'
|
65
66
|
require 'geo_combine/iso19139'
|
66
67
|
require 'geo_combine/esri_open_data'
|
68
|
+
require 'geo_combine/ckan_metadata'
|
69
|
+
require 'geo_combine/ogp'
|
70
|
+
|
71
|
+
# Require harvesting/indexing files
|
72
|
+
require 'geo_combine/geo_blacklight_harvester'
|
67
73
|
|
68
74
|
# Require gem files
|
69
75
|
require 'geo_combine/version'
|
76
|
+
require 'geo_combine/railtie' if defined?(Rails)
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module GeoCombine
  ##
  # A rectangular geographic extent described by its west, south, east and
  # north edges. Every edge is stored as a Float.
  class BoundingBox
    attr_reader :west, :south, :east, :north

    ##
    # Each argument is coerced with #to_f, so nil and non-numeric strings
    # become 0.0 rather than raising.
    # @param [String, Integer, Float] west
    # @param [String, Integer, Float] south
    # @param [String, Integer, Float] east
    # @param [String, Integer, Float] north
    def initialize(west:, south:, east:, north:)
      @west = west.to_f
      @south = south.to_f
      @east = east.to_f
      @north = north.to_f
    end

    ##
    # Returns a bounding box in ENVELOPE syntax
    # (west, east, north, south order).
    # @return [String]
    def to_envelope
      "ENVELOPE(#{west}, #{east}, #{north}, #{south})"
    end

    ##
    # Checks that latitudes are within -90..90, longitudes are within
    # -180..180, west is not greater than east and south is not greater than
    # north.
    # @return [true] when every check passes
    # @raise [GeoCombine::Exceptions::InvalidGeometry] on the first failed check
    def valid?
      [south, north].each do |coord|
        next if (-90..90).cover?(coord)

        raise GeoCombine::Exceptions::InvalidGeometry,
              "#{coord} should be in range -90 90"
      end
      [east, west].each do |coord|
        next if (-180..180).cover?(coord)

        raise GeoCombine::Exceptions::InvalidGeometry,
              "#{coord} should be in range -180 180"
      end
      if west > east
        raise GeoCombine::Exceptions::InvalidGeometry,
              "east #{east} should be greater than or equal to west #{west}"
      end
      if south > north
        raise GeoCombine::Exceptions::InvalidGeometry,
              "north #{north} should be greater than or equal to south #{south}"
      end
      true
    end

    ##
    # Builds a bounding box from a string containing "ENVELOPE(w, e, n, s)".
    # @param [String, nil] envelope
    # @return [GeoCombine::BoundingBox, nil] nil when envelope is nil or does
    #   not contain an ENVELOPE(...) expression
    def self.from_envelope(envelope)
      return if envelope.nil?

      coordinates = envelope[/.*ENVELOPE\(([^\)]*)/, 1]
      # The regexp slice yields nil when nothing matched; bail out instead of
      # calling #split on nil.
      return if coordinates.nil?

      west, east, north, south = coordinates.split(',')
      new(west: west, south: south, east: east, north: north)
    end

    ##
    # Builds a bounding box from a delimited "w,s,e,n" style string.
    # @param [String] spatial w,s,e,n or w s e n
    # @param [String] delimiter "," or " "
    # @return [GeoCombine::BoundingBox, nil] nil when spatial is nil
    def self.from_string_delimiter(spatial, delimiter: ',')
      return if spatial.nil?

      west, south, east, north = spatial.split(delimiter)
      new(west: west, south: south, east: east, north: north)
    end
  end
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
module GeoCombine
  ##
  # Wraps a CKAN dataset record (parsed from a JSON string) and maps its
  # fields onto GeoBlacklight terms.
  class CkanMetadata
    MAX_STRING_LENGTH = 32765 # Solr limit

    attr_reader :metadata

    ##
    # @param [String] metadata JSON document describing a CKAN dataset
    def initialize(metadata)
      @metadata = JSON.parse(metadata)
    end

    ##
    # Creates and returns a Geoblacklight schema object from this metadata
    # @return [GeoCombine::Geoblacklight]
    def to_geoblacklight
      GeoCombine::Geoblacklight.new(geoblacklight_terms.to_json)
    end

    ##
    # Builds a Geoblacklight Schema type hash from CKAN metadata. Entries
    # whose value is nil are dropped from the result.
    # @return [Hash]
    def geoblacklight_terms
      notes = @metadata['notes']
      {
        dc_identifier_s: @metadata['id'],
        dc_title_s: @metadata['title'],
        dc_rights_s: 'Public',
        layer_geom_type_s: 'Not Specified',
        dct_provenance_s: organization['title'],
        dc_description_s: notes.respond_to?(:[]) ? notes[0..MAX_STRING_LENGTH] : nil,
        layer_slug_s: @metadata['name'],
        solr_geom: envelope,
        dc_subject_sm: subjects,
        dct_references_s: external_references.to_json.to_s,
        dc_format_s: downloadable? ? 'ZIP' : nil # TODO: we only allow direct ZIP file downloads
      }.reject { |_k, v| v.nil? }
    end

    ##
    # The record's organization hash, or a stand-in with an empty title.
    # The stand-in uses a String key because callers read it as
    # organization['title'].
    # @return [Hash]
    def organization
      @metadata['organization'] || { 'title' => '' }
    end

    ##
    # First usable envelope, trying the bbox-* extras, then the "spatial"
    # extra split on commas, then on spaces.
    # @return [String, nil]
    def envelope
      envelope_from_bbox ||
        envelope_from_spatial(',') ||
        envelope_from_spatial(' ')
    end

    ##
    # Envelope built from the bbox-west-long / bbox-south-lat /
    # bbox-east-long / bbox-north-lat extras.
    # @return [String, nil] nil when the resulting box is not valid
    def envelope_from_bbox
      bbox = GeoCombine::BoundingBox.new(
        west: extras('bbox-west-long'),
        south: extras('bbox-south-lat'),
        east: extras('bbox-east-long'),
        north: extras('bbox-north-lat')
      )
      envelope_for(bbox)
    end

    ##
    # Envelope built from the "spatial" extra, split on the given delimiter.
    # @param [String] delimiter
    # @return [String, nil] nil when the resulting box is not valid
    def envelope_from_spatial(delimiter)
      bbox = GeoCombine::BoundingBox.from_string_delimiter(
        extras('spatial'),
        delimiter: delimiter
      )
      envelope_for(bbox)
    end

    ##
    # The "tags" extra split on commas, with surrounding whitespace removed.
    # @return [Array<String>]
    def subjects
      extras('tags').split(',').map(&:strip)
    end

    ##
    # Value of the first entry in the record's "extras" list whose "key"
    # matches.
    # @param [String] key
    # @return [Object] '' when there is no extras list, no matching entry,
    #   or the matching entry has a nil value
    def extras(key)
      list = @metadata['extras']
      return '' unless list

      entry = list.find { |h| h['key'] == key }
      (entry && entry['value']) || ''
    end

    ##
    # Reference URLs keyed by schema.org URI. The download URL is only
    # included when the record is downloadable?; nil values are dropped.
    # @return [Hash]
    def external_references
      references = {
        'http://schema.org/url' => resource_urls('information').first
      }
      if downloadable?
        references['http://schema.org/downloadUrl'] = resource_urls('download').first
      end
      references.reject { |_k, v| v.nil? }
    end

    ##
    # Truthy when the first "download" resource URL contains ".zip"
    # (case-insensitive).
    def downloadable?
      resource_urls('download').first =~ /.*\.zip/im
    end

    ##
    # Resources whose resource_locator_function equals type.
    # @param [String] type
    # @return [Array<Hash>]
    def resources(type)
      all = @metadata['resources']
      return [] if all.nil?

      all.select { |resource| resource['resource_locator_function'] == type }
    end

    ##
    # URLs of the resources of the given type, in record order.
    # @param [String] type
    # @return [Array]
    def resource_urls(type)
      resources(type).map do |resource|
        resource['url'] if resource.respond_to?(:[])
      end
    end

    private

    # ENVELOPE string for bbox, or nil when bbox fails validation.
    def envelope_for(bbox)
      bbox.to_envelope if bbox.valid?
    rescue GeoCombine::Exceptions::InvalidGeometry
      nil
    end
  end
end
|