geoblacklight-schema 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/LICENSE +14 -0
- data/README.md +44 -0
- data/bin/fgdc2mods.rb +5 -0
- data/bin/mods2geoblacklight.rb +5 -0
- data/bin/xsltproc-saxon +14 -0
- data/conf/protwords.txt +21 -0
- data/conf/schema.xml +158 -0
- data/conf/solrconfig.xml +160 -0
- data/conf/stopwords_en.txt +34 -0
- data/conf/synonyms.txt +29 -0
- data/examples/Gemfile +4 -0
- data/examples/generate-example-doc.rb +42 -0
- data/examples/selected.json +5787 -0
- data/examples/upload-to-solr.rb +50 -0
- data/geoblacklight-schema.gemspec +23 -0
- data/lib/geoblacklight/gazetteer.csv +1011 -0
- data/lib/geoblacklight/gazetteer.rb +104 -0
- data/lib/xslt/arcgis_to_iso19110.xsl +364 -0
- data/lib/xslt/fgdc2mods.xsl +1007 -0
- data/lib/xslt/iso2mods.xsl +939 -0
- data/lib/xslt/mods2geoblacklight.xsl +268 -0
- data/lib/xslt/mods2ogp.xsl +195 -0
- data/tools/fgdc2html/Gemfile +2 -0
- data/tools/fgdc2html/fgdc2html.css +71 -0
- data/tools/fgdc2html/fgdc2html.js +6 -0
- data/tools/fgdc2html/fgdc2html.xsl +1034 -0
- data/tools/fgdc2html/render.rb +30 -0
- data/tools/iso2html/Gemfile +2 -0
- data/tools/iso2html/iso-html.xsl +1745 -0
- data/tools/iso2html/render.rb +24 -0
- data/tools/iso2html/utils/convert-enumerations.xsl +97 -0
- data/tools/iso2html/utils/convert-latlong.xsl +73 -0
- data/tools/iso2html/utils/decode-uri/base.css +408 -0
- data/tools/iso2html/utils/decode-uri/index.html +29 -0
- data/tools/iso2html/utils/elements-fgdc.xml +824 -0
- data/tools/iso2html/utils/elements-iso.xml +636 -0
- data/tools/iso2html/utils/printFormatted.xsl +267 -0
- data/tools/iso2html/utils/printTextLines.xsl +192 -0
- data/tools/iso2html/utils/replace-newlines.xsl +97 -0
- data/tools/iso2html/utils/replace-string.xsl +80 -0
- data/tools/iso2html/utils/strip-digits.xsl +60 -0
- data/tools/iso2html/utils/url-decode.xsl +87 -0
- data/tools/iso2html/utils/wrap-text.xsl +174 -0
- data/tools/ogp/0_download.rb +96 -0
- data/tools/ogp/1_validate.rb +225 -0
- data/tools/ogp/2_transform.rb +438 -0
- data/tools/ogp/3_stanford.rb +35 -0
- data/tools/ogp/4_select.rb +189 -0
- data/tools/ogp/5_ingest.rb +55 -0
- data/tools/ogp/Gemfile +2 -0
- data/tools/solr/Gemfile +3 -0
- data/tools/solr/purge.rb +33 -0
- data/tools/solr/upload.rb +35 -0
- data/vendor/.keep +0 -0
- metadata +131 -0
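The numbered scripts under data/tools/ogp form a harvest pipeline: download OGP Solr records, validate them, transform them into the GeoBlacklight schema, then select and ingest into Solr. A minimal sketch of driving it end to end, assuming the scripts are run from data/tools/ogp and chain through the intermediate files named in their usage comments (data/*.json → valid.json → transformed.json):

```ruby
# Hypothetical driver; each script reads the previous one's output files.
%w[0_download.rb 1_validate.rb 2_transform.rb 3_stanford.rb 4_select.rb 5_ingest.rb].each do |script|
  system('ruby', script) or abort "#{script} failed"
end
```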
data/tools/ogp/0_download.rb (new file):
@@ -0,0 +1,96 @@

```ruby
#!/usr/bin/env ruby

require 'open-uri'

class DownloadOgp
  URL = {
    'tufts'     => 'http://geodata.tufts.edu/solr/select',
    'mit'       => 'http://arrowsmith.mit.edu/solr/select',
    'berkeley'  => 'http://gis.lib.berkeley.edu:9081/solr4/select',
    'harvard'   => 'http://geodata.tufts.edu/solr/select', # Harvard uses Tufts solr index
    'ucla'      => 'http://vizsla.oit.ucla.edu:8080/solr/select',
    'columbia'  => 'http://culspatial.cul.columbia.edu/solr/select',
    'minnesota' => 'http://ec2-54-87-229-228.compute-1.amazonaws.com:8080/solr/collection1/select'
  }

  FIELDS = %w{
    Abstract
    Access
    Area
    Availability
    CenterX
    CenterY
    ContentDate
    DataType
    ExternalLayerId
    FgdcText
    GeoReferenced
    HalfHeight
    HalfWidth
    Institution
    LayerDisplayName
    LayerId
    Location
    MaxX
    MaxY
    MinX
    MinY
    Name
    PlaceKeywords
    Publisher
    SrsProjectionCode
    ThemeKeywords
    WorkspaceName
  }.join(',')

  def download(src, target, n, w = 50)
    start = 0
    target = src if target.nil?
    while start < n do
      fetch(src, target, start, w)
      start += w
    end
  end

  # fetch a set of Solr records from the src provider about the target institution
  #
  # @param [String] src The source provider of the Solr records
  # @param [String] target the target institution
  # @param [Integer] start
  # @param [Integer] rows
  def fetch(src, target, start, rows, datadir = 'data')
    fn = File.join(datadir, "#{src.downcase}_#{target.downcase}_#{sprintf('%05i', start)}_#{rows}.json")
    unless File.exist?(fn)
      raise "Unknown URL for #{src}" unless URL.include?(src.downcase)
      puts "Downloading #{target} #{start} to #{start + rows}"
      url = "#{URL[src.downcase]}?" + URI::encode_www_form(
        'q'      => '*:*',
        'fq'     => "Institution:#{target}",
        'start'  => start,
        'rows'   => rows,
        'wt'     => 'json',
        'indent' => 'on',
        'fl'     => FIELDS
      )
      puts " #{url}" if $DEBUG
      open(url) do |res|
        File.open(fn, 'wb') do |f|
          f.write(res.read)
        end
      end
    else
      puts "Using cache for #{target} #{start} to #{start + rows}"
    end
  end
end

# __MAIN__
ogp = DownloadOgp.new
ogp.download('Berkeley', 'Berkeley', 450)
ogp.download('Tufts', 'MassGIS', 600)
ogp.download('Tufts', 'Tufts', 2850)
ogp.download('Tufts', 'Harvard', 10000)
ogp.download('MIT', 'MIT', 9200)
ogp.download('UCLA', 'UCLA', 200)
ogp.download('Columbia', 'Columbia', 3600)
ogp.download('Minnesota', 'Minnesota', 2300)
```
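For reference, the paged request that `DownloadOgp#fetch` issues looks like the sketch below; the endpoint and parameters come straight from the script's `URL` and `FIELDS` constants, with the field list abbreviated here:

```ruby
require 'uri'

# Sketch of one page of the Solr harvest (field list abbreviated)
params = URI.encode_www_form(
  'q'      => '*:*',               # match every document
  'fq'     => 'Institution:Tufts', # restrict to the target institution
  'start'  => 0,                   # paging offset, advanced by w (default 50)
  'rows'   => 50,
  'wt'     => 'json',
  'indent' => 'on',
  'fl'     => 'LayerId,Name,Institution'
)
url = "http://geodata.tufts.edu/solr/select?#{params}"
# The response would be cached as data/tufts_tufts_00000_50.json
```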
data/tools/ogp/1_validate.rb (new file):
@@ -0,0 +1,225 @@

```ruby
#!/usr/bin/env ruby
#
# Usage: validate_ogp [output.json]
#
# Reads data/*.json as input and writes to valid.json (or the given output file)
#
require 'awesome_print'
require 'json'
require 'uri'
require 'date'

class ValidateOgp
  def initialize(fn)
    @wms_servers = {}
    @output = File.open(fn, 'wb')
    @output.write "[\n"
    yield self
    self.close
  end

  def validate_file(fn)
    stats = { :accepted => 0, :rejected => 0 }
    puts "Validating #{fn}"
    json = JSON::parse(File.read(fn))
    json['response']['docs'].each do |doc| # contains JSON Solr query results
      begin
        validate(doc)
        stats[:accepted] += 1
      rescue Exception => e
        puts e
        stats[:rejected] += 1
      end
    end
    stats
  end

  def validate(layer)
    id = layer['LayerId']

    %w{LayerId Name Institution Access MinX MinY MaxX MaxY LayerDisplayName}.each do |k|
      if layer[k].nil? || layer[k].to_s.empty?
        raise ArgumentError, "ERROR: #{id} missing #{k}"
      end
    end

    k = 'LayerId'
    if layer[k].is_a? Array
      layer[k] = layer[k].first
    end
    raise ArgumentError, "ERROR: #{k} is not a String: #{layer[k]}" unless layer[k].is_a? String

    %w{MinX MaxX}.each do |lon|
      raise ArgumentError, "ERROR: #{id}: Invalid longitude value: #{layer[lon]}" unless lon?(layer[lon])
    end

    %w{MinY MaxY}.each do |lat|
      raise ArgumentError, "ERROR: #{id} Invalid latitude value: #{layer[lat]}" unless lat?(layer[lat])
    end

    if layer['MinX'].to_s.to_f > layer['MaxX'].to_s.to_f
      raise ArgumentError, "ERROR: #{id} has MinX > MaxX"
    end

    if layer['MinY'].to_s.to_f > layer['MaxY'].to_s.to_f
      raise ArgumentError, "ERROR: #{id} has MinY > MaxY"
    end

    k = 'Institution'
    layer[k] = 'Columbia' if layer[k] == 'Columbia University'
    if ([layer[k]] & %w{Berkeley Harvard MIT MassGIS Stanford Tufts UCLA Minnesota Columbia}).empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'DataType'
    layer[k] = 'Paper Map' if layer[k] == 'Paper'
    if ([layer[k]] & %w{Line Paper\ Map Point Polygon Raster CD-ROM DVD-ROM}).empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'Access'
    if ([layer[k]] & %w{Public Restricted}).empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'WorkspaceName'
    if layer[k].nil?
      layer[k] = layer['Institution']
    end

    k = 'Availability'
    if layer[k].downcase == 'online' # cleanup
      layer[k] = 'Online'
    end
    if layer[k].downcase == 'offline'
      layer[k] = 'Offline'
    end
    if ([layer[k]] & %w{Online Offline}).empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'Location'
    layer[k] = validate_location(id, layer[k])
    if layer[k].nil?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'GeoReferenced'
    unless layer[k].nil? or layer[k] == true
      puts "WARNING: #{id} has boundingbox but claims it is not georeferenced"
      # layer[k] = true
    end

    k = 'Area'
    unless layer[k].to_i >= 0
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'ContentDate'
    if layer[k].nil? || layer[k].to_s.strip.empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end
    dt = Date.rfc3339(layer[k])
    if dt.year <= 1 or dt.year > 2100
      raise ArgumentError, "ERROR: #{id} has invalid #{k}: #{layer[k]}: #{dt}"
    end

    # k = 'FgdcText'
    # unless layer[k].nil? or layer[k].empty?
    #   layer[k] = ''
    # end

    @output.write JSON::pretty_generate(layer)
    @output.write "\n,\n"
  end

  def close
    @output.write "\n {} \n]\n"
    @output.close
    ap({:wms_servers => @wms_servers})
  end

  private

  def validate_location(id, location)
    begin
      begin
        x = JSON::parse(location)
      rescue JSON::ParserError => e
        x = JSON::parse("{ #{location} }") # wrap in dictionary
      end

      unless x['externalDownload'].nil?
        x['download'] = x['externalDownload']
        x.delete('externalDownload')
      end
      unless x['libRecord'].nil?
        x['url'] = x['libRecord']
        x.delete('libRecord')
      end
      if x['download'].nil? && x['wms'].nil? && (x['wcs'].nil? && x['wfs'].nil?) && x['url'].nil?
        puts "WARNING: #{id}: Missing Download or WMS or WCS/WFS or URL: #{x}"
        return {}.to_json
      end

      %w{download wms wcs wfs url}.each do |protocol|
        begin
          unless x[protocol].nil?
            if x[protocol].is_a? String
              if x[protocol].empty? || x[protocol] == "None available"
                x[protocol] = nil
                next
              else
                x[protocol] = [x[protocol]]
              end
            end

            unless x[protocol].is_a? Array
              raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
            end

            x[protocol].each do |url|
              uri = URI.parse(url)
              raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP) or uri.kind_of?(URI::HTTPS) or uri.kind_of?(URI::FTP)
            end

            # convert from Array to String
            x[protocol] = x[protocol].first if x[protocol].is_a? Array
          end
        rescue URI::InvalidURIError => e
          raise ArgumentError, "ERROR: #{id}: Invalid URL parsing: #{x}"
        end
      end

      @wms_servers[x['wms']] = true unless x['wms'].nil?

      return x.to_json
    end
    nil
  end

  def lon? lon
    lon >= -180 and lon <= 180
  end

  def lat? lat
    lat >= -90 and lat <= 90
  end
end


# __MAIN__
ValidateOgp.new(ARGV[0].nil?? 'valid.json' : ARGV[0]) do |ogp|
  stats = { :accepted => 0, :rejected => 0 }
  Dir.glob('data/*.json') do |fn|
    begin
      s = ogp.validate_file(fn)
      stats[:accepted] += s[:accepted]
      stats[:rejected] += s[:rejected]
    rescue Exception => e
      puts "ERROR: #{fn}: #{e}" # skip files that fail to validate, but say why
    end
  end
  ap({:statistics => stats})
end
```
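Note the output framing: `initialize` writes `[\n`, each accepted layer is followed by `\n,\n`, and `close` appends an empty `{}` before the closing `]` so the trailing comma still parses as JSON. Downstream consumers therefore need to drop that sentinel, e.g.:

```ruby
require 'json'

# valid.json ends with an empty {} sentinel; reject it before processing
layers = JSON.parse(File.read('valid.json')).reject(&:empty?)
puts "#{layers.size} validated layers"
```

(2_transform.rb below does exactly this with its `unless doc.empty?` guard.)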
data/tools/ogp/2_transform.rb (new file):
@@ -0,0 +1,438 @@

```ruby
#!/usr/bin/env ruby
#
# Usage: transform_ogp output.json
#
# Reads valid*.json in current directory
#

require 'awesome_print'
require 'json'
require 'uri'
require 'date'
require 'nokogiri'
require 'fileutils'
require 'open-uri'

# Transforms an OGP schema into GeoBlacklight. Requires input of a JSON array
# of OGP hashes.
class TransformOgp

  def initialize(fn)
    @output = File.open(fn, 'wb')
    @output.write "[\n"
    @fgdcdir = 'fgdc'
    yield self
    self.close
  end

  # @param [String|Array] s the URI to clean up
  # @return [String] a normalized URI
  def clean_uri(s)
    unless s.nil? or s.empty?
      return (s.is_a?(Array) ? URI(s.first.to_s.strip) : URI(s.to_s.strip)).to_s
    end
    ''
  end

  # @param [String] fn filename of JSON array of OGP hash objects
  # @return [Hash] stats about :accepted vs. :rejected records
  def transform_file(fn, layers_json)
    stats = { :accepted => 0, :rejected => 0 }
    puts "Parsing #{fn}"
    json = JSON::parse(File.open(fn, 'rb').read)
    json.each do |doc| # contains JSON Solr query results
      unless doc.empty?
        begin
          transform(doc, layers_json)
          stats[:accepted] += 1
        rescue ArgumentError => e
          puts e, e.backtrace
          stats[:rejected] += 1
        end
      end
    end
    stats
  end

  # Transforms a single OGP record into a GeoBlacklight record
  # @param [Hash] layer an OGP hash for a given layer
  def transform(layer, layers_json, skip_fgdc = false, skip_geoblacklight = false, skip_ogm_check = false)
    id = layer['LayerId'].to_s.strip
    puts "Transforming #{id}"

    # For URN style @see http://www.ietf.org/rfc/rfc2141.txt
    # For ARK @see https://wiki.ucop.edu/display/Curation/ARK
    layer['Institution'].strip!
    prefix = case layer['Institution']
    when 'Stanford'
      'http://purl.stanford.edu/'
    when 'Tufts'
      'urn:geodata.tufts.edu:'
    when 'MassGIS'
      'urn:massgis.state.ma.us:'
    when 'Berkeley'
      'http://ark.cdlib.org/ark:/'
    when 'MIT'
      'urn:arrowsmith.mit.edu:'
    when 'Harvard'
      'urn:hul.harvard.edu:'
    when 'Minnesota'
      'urn:umn.edu:'
    when 'UCLA'
      'urn:ucla.edu:'
    when 'Columbia'
      'urn:columbia.edu:'
    else
      raise ArgumentError, 'ERROR: Skipping urn:UNKNOWN:'
    end
    uuid = prefix + URI.encode(id)

    # Parse out the Location to get the WMS/WFS/WCS URLs, if available
    location = JSON::parse(layer['Location'])
    raise ArgumentError, "ERROR: #{id} has malformed location" unless location.is_a? Hash

    # Parse out the bounding box
    s = layer['MinY'].to_f
    w = layer['MinX'].to_f
    n = layer['MaxY'].to_f
    e = layer['MaxX'].to_f

    # Parse out the ContentDate date/time
    begin
      dt = DateTime.rfc3339(layer['ContentDate'])
    rescue => e2
      raise ArgumentError, "ERROR: #{id} has bad ContentDate: #{layer['ContentDate']}"
    end

    # pub_dt = DateTime.rfc3339('2000-01-01T00:00:00Z') # XXX fake data, get from MODS

    access = layer['Access']
    collection = nil

    # Parse out the PURL and other metadata for Stanford
    if layer['Institution'] == 'Stanford'
      purl = location['purl']
      if purl.is_a? Array
        purl = purl.first
      end
      if purl.nil? and uuid =~ /^http/
        purl = uuid
      end
    else
      purl = nil
      # Because OGP does not delimit keywords, we use a heuristic here
      %w{PlaceKeywords ThemeKeywords}.each do |k|
        # layer[k] = nil
        # unless layer[k] =~ /[;,]/ or layer[k].split.size < 4
        #   layer[k] = layer[k].split.join(';')
        # end
      end
    end

    slug = to_slug(id, layer)

    layer_geom_type = layer['DataType'].to_s.strip
    if (layer_geom_type.downcase == 'raster' || layer['LayerDisplayName'] =~ /\(Raster Image\)/) ||
       (layer['Institution'] == 'Harvard' && layer_geom_type.downcase == 'paper map')
      format = 'GeoTIFF'
      layer_geom_type = 'Raster'
    elsif %w{Point Line Polygon}.include?(layer_geom_type)
      format = 'Shapefile'
    elsif layer_geom_type.downcase == 'paper map'
      format = 'Paper'
      layer_geom_type = 'Paper Map'
    elsif layer_geom_type.downcase =~ /-rom$/
      format = layer_geom_type
      layer_geom_type = 'Mixed'
    else
      raise ArgumentError, "ERROR: Invalid layer_geom_type: #{layer_geom_type}"
    end

    # if layer['LayerDisplayName'] =~ /Scanned Map/
    #   layer_geom_type = 'Scanned Map'
    #   format = 'Paper'
    # end

    layer_id = layer['WorkspaceName'] + ':' + layer['Name']

    # @see https://github.com/OSGeo/Cat-Interop
    %w{wcs wfs wms}.each do |k|
      location[k] = location[k].first if location[k].is_a? Array
    end
    refs = {}
    refs['http://www.opengis.net/def/serviceType/ogc/wcs'] = "#{location['wcs']}" if location['wcs']
    refs['http://www.opengis.net/def/serviceType/ogc/wfs'] = "#{location['wfs']}" if location['wfs']
    refs['http://www.opengis.net/def/serviceType/ogc/wms'] = "#{location['wms']}" if location['wms']
    if layer['Institution'] == 'Harvard'
      refs['http://schema.org/DownloadAction'] = "#{location['download']}" if location['download']
      refs['http://schema.org/UserDownloads'] = "#{location['serviceStart']}" if location['serviceStart']
      refs['http://tilecache.org'] = "#{location['tilecache'].first}" if location['tilecache']
    else
      refs['http://schema.org/downloadUrl'] = "#{location['download']}" if location['download']
    end
    refs['http://schema.org/url'] = "#{location['url']}" if location['url']
    if purl
      refs["http://schema.org/thumbnailUrl"] = "http://stacks.stanford.edu/file/druid:#{id}/preview.jpg"
      refs["http://schema.org/url"] = "#{clean_uri(purl)}"
      refs["http://schema.org/downloadUrl"] = "http://stacks.stanford.edu/file/druid:#{id}/data.zip"
      refs["http://www.loc.gov/mods/v3"] = "#{purl}.mods"
      refs["http://www.isotc211.org/schemas/2005/gmd/"] = "http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/#{layer_id}/iso19139.xml"
      refs['http://www.w3.org/1999/xhtml'] = "http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/#{layer_id}/iso19139.html"
    else
      refs['http://www.opengis.net/cat/csw/csdgm'] = "http://opengeometadata.stanford.edu/metadata/org.opengeoportal/#{URI::encode(layer_id)}/fgdc.xml"
      begin
        _f = open(refs['http://www.opengis.net/cat/csw/csdgm'])
        refs['http://www.w3.org/1999/xhtml'] = "http://opengeometadata.stanford.edu/metadata/org.opengeoportal/#{URI::encode(layer_id)}/fgdc.html"
      rescue OpenURI::HTTPError => e2
        refs.delete('http://www.opengis.net/cat/csw/csdgm')
      rescue URI::InvalidURIError => e2
        raise ArgumentError, "ERROR: #{id} has bad LayerId: #{layer['layer_id']}"
      end unless skip_ogm_check
    end

    # If there's no homepage, use the HTML version of the metadata if available
    if refs['http://schema.org/url'].nil? && !refs['http://www.w3.org/1999/xhtml'].nil?
      refs['http://schema.org/url'] = refs['http://www.w3.org/1999/xhtml']
    end

    # Make the conversion from OGP to GeoBlacklight
    #
    # @see http://dublincore.org/documents/dcmi-terms/
    # @see http://wiki.dublincore.org/index.php/User_Guide/Creating_Metadata
    # @see http://www.ietf.org/rfc/rfc5013.txt
    new_layer = {
      :uuid              => uuid,

      # Dublin Core elements
      :dc_creator_sm     => string2array(layer['Originator']),
      :dc_description_s  => layer['Abstract'],
      :dc_format_s       => format,
      :dc_identifier_s   => uuid,
      :dc_language_s     => 'English', # 'en', # XXX: fake data
      :dc_publisher_s    => layer['Publisher'],
      :dc_rights_s       => access,
      :dc_subject_sm     => string2array(layer['ThemeKeywords'], true),
      :dc_title_s        => layer['LayerDisplayName'],
      :dc_type_s         => 'Dataset', # or 'Image' for non-georectified,
                                       # or 'PhysicalObject' for non-digitized maps
      # Dublin Core terms
      :dct_isPartOf_sm   => collection.nil?? nil : [collection],
      :dct_references_s  => refs.to_json.to_s,
      :dct_spatial_sm    => string2array(layer['PlaceKeywords'], true),
      :dct_temporal_sm   => [dt.year.to_s],
      # :dct_issued_s    => pub_dt.year.to_s,
      :dct_provenance_s  => layer['Institution'],

      #
      # xmlns:georss="http://www.georss.org/georss"
      # A bounding box is a rectangular region, often used to define the
      # extents of a map or a rough area of interest. A box contains two
      # space-separated latitude-longitude pairs, with each pair separated
      # by whitespace. The first pair is the lower corner, the second is
      # the upper corner.
      :georss_box_s      => "#{s} #{w} #{n} #{e}",
      :georss_polygon_s  => "#{n} #{w} #{n} #{e} #{s} #{e} #{s} #{w} #{n} #{w}",

      # Layer-specific schema
      :layer_slug_s      => slug,
      :layer_id_s        => layer_id,
      # :layer_srs_s     => 'EPSG:4326', # XXX: fake data
      :layer_geom_type_s => layer_geom_type,
      :layer_modified_dt => Time.now.utc.strftime('%FT%TZ'),

      # derived fields used only by solr, for which copyField is insufficient
      :solr_bbox         => "#{w} #{s} #{e} #{n}", # minX minY maxX maxY
      # :solr_ne_pt      => "#{n},#{e}",
      # :solr_sw_pt      => "#{s},#{w}",
      :solr_geom         => "ENVELOPE(#{w}, #{e}, #{n}, #{s})",
      :solr_year_i       => dt.year,
      # :solr_issued_dt  => pub_dt.strftime('%FT%TZ') # Solr requires 1995-12-31T23:59:59Z
      # :solr_wms_url    => location['wms'],
      # :solr_wfs_url    => location['wfs'],
      # :solr_wcs_url    => location['wcs']

      # :layer_year_i    => dt.year#, # XXX: migrate to copyField
      # :ogp_area_f      => layer['Area'],
      # :ogp_center_x_f  => layer['CenterX'],
      # :ogp_center_y_f  => layer['CenterY'],
      # :ogp_georeferenced_b => (layer['GeoReferenced'].to_s.downcase == 'true'),
      # :ogp_halfheight_f => layer['HalfHeight'],
      # :ogp_halfwidth_f => layer['HalfWidth'],
      # :ogp_layer_id_s  => layer['LayerId'],
      # :ogp_name_s      => layer['Name'],
      # :ogp_location_s  => layer['Location'],
      # :ogp_workspace_s => layer['WorkspaceName']
    }

    # Remove any fields that are blank
    new_layer.each do |k, v|
      new_layer.delete(k) if v.nil? or (v.respond_to?(:empty?) and v.empty?)
    end

    # Write the JSON record for the GeoBlacklight layer
    @output.write JSON::pretty_generate(new_layer)
    @output.write "\n,\n"

    # export into OGM
    ogm_dir = new_layer[:dct_provenance_s] + '/' + new_layer[:layer_id_s][-2,2].downcase.gsub(/[^a-z0-9]/, '_') + '/' + new_layer[:layer_id_s]
    unless skip_fgdc or layer['FgdcText'].nil? or layer['FgdcText'].empty?
      _fn = 'opengeometadata/org.opengeoportal/' + ogm_dir + '/fgdc.xml'
      unless File.size?(_fn)
        FileUtils.mkdir_p(File.dirname(_fn)) unless File.directory?(File.dirname(_fn))
        xml = Nokogiri::XML(layer['FgdcText'])
        xml.write_xml_to(File.open(_fn, 'wb'), encoding: 'UTF-8', indent: 2)
      end
    end

    unless skip_geoblacklight
      _fn = 'opengeometadata/org.opengeoportal/' + ogm_dir + '/geoblacklight.json'
      layers_json[new_layer[:layer_id_s]] = ogm_dir
      unless File.size?(_fn)
        FileUtils.mkdir_p(File.dirname(_fn)) unless File.directory?(File.dirname(_fn))
        _s = JSON::pretty_generate(new_layer)
        File.open(_fn, 'wb') {|f| f.write(_s) }
      end
    end
  end

  def close
    @output.write "\n {} \n]\n"
    @output.close
  end

  # @param [String] s has semi-colon/comma/gt delimited array
  # @return [Array] results as array
  def string2array(s, clean_only = false)
    return nil if s.nil?
    if clean_only
      if s.strip.size > 0 && !s.strip.index(' ') && s.strip.downcase != 'none'
        [s.strip]
      else
        nil
      end
    else
      if s.to_s =~ /[;,>]/
        s.split(/\s*[;,>]\s*/).uniq.collect {|i| i.strip}
      elsif s.is_a?(String) && s.size > 0
        [s.strip]
      else
        nil
      end
    end
  end

  @@slugs = {}
  def to_slug(id, layer)
    # strip out schema and usernames
    name = layer['Name'].sub('SDE_DATA.', '').sub('SDE.', '').sub('SDE2.', '').sub('GISPORTAL.GISOWNER01.', '').sub('GISDATA.', '').sub('MORIS.', '')
    unless name.size > 1
      # if the name is empty, use the first word of the title
      name = layer['LayerDisplayName'].split.first
    end
    slug = layer['Institution'] + '-' + name

    # slugs should only have a-z, A-Z, 0-9, and -
    slug.gsub!(/[^a-zA-Z0-9\-]/, '-')
    slug.gsub!(/[\-]+/, '-')

    # only lowercase
    slug.downcase!

    # ensure slugs are unique for this pass
    if @@slugs.include?(slug)
      slug += '-' + sprintf("%06d", Random.rand(999999))
    end
    @@slugs[slug] = true

    slug
  end

  # Ensure that the WMS/WFS/WCS location values are as expected
  def validate_location(id, location)
    begin
      x = JSON::parse(location)
      %w{download url wms wcs wfs}.each do |protocol|
        begin
          unless x[protocol].nil?
            if x[protocol].is_a? String
              x[protocol] = [x[protocol]]
            end

            unless x[protocol].is_a? Array
              raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
            end

            x[protocol].each do |url|
              uri = URI.parse(url) # was clean_uri.parse, which is not a URI parser
              raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP) or uri.kind_of?(URI::HTTPS)
            end
            x[protocol] = x[protocol].first
          end
        rescue Exception => e
          raise ArgumentError, "ERROR: #{id}: Invalid #{protocol}: #{x}"
        end
      end

      return x.to_json
    rescue JSON::ParserError => e
      raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}"
    end
    nil
  end

  def lon? lon
    lon >= -180 and lon <= 180
  end

  def lat? lat
    lat >= -90 and lat <= 90
  end
end


# __MAIN__
#
TransformOgp.new(ARGV[0].nil?? 'transformed.json' : ARGV[0]) do |ogp|
  stats = { :accepted => 0, :rejected => 0 }
  layers_json = {}

  Dir.glob('valid*.json') do |fn|
    s = ogp.transform_file(fn, layers_json)
    stats[:accepted] += s[:accepted]
    stats[:rejected] += s[:rejected]
  end

  File.open('opengeometadata/org.opengeoportal/layers.json', 'wb') do |f|
    f << JSON::pretty_generate(layers_json)
  end

  ap({:statistics => stats})
end

# example input data
__END__
[
  {
    "Abstract": "The boundaries of each supervisorial district in Sonoma County based on 2000 census. Redrawn in 2001 using Autobound.",
    "Access": "Public",
    "Area": 0.9463444815860053,
    "Availability": "Online",
    "CenterX": -122.942159,
    "CenterY": 38.4580755,
    "ContentDate": "2000-01-01T01:01:01Z",
    "DataType": "Polygon",
    "FgdcText": "...",
    "GeoReferenced": true,
    "HalfHeight": 0.39885650000000084,
    "HalfWidth": 0.593161000000002,
    "Institution": "Berkeley",
    "LayerDisplayName": "SCGISDB2_BASE_ADM_SUPERVISOR",
    "LayerId": "28722/bk0012h5s52",
    "Location": "{\"wms\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wms\"],\"tilecache\":[\"http://gis.lib.berkeley.edu:8080/geoserver/gwc/service/wms\"],\"download\":\"\",\"wfs\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wfs\"]}",
    "MaxX": -122.348998,
    "MaxY": 38.856932,
    "MinX": -123.53532,
    "MinY": 38.059219,
    "Name": "ADM_SUPERVISOR",
    "PlaceKeywords": "Sonoma County County of Sonoma Sonoma California Bay Area",
    "Publisher": "UC Berkeley Libraries",
    "ThemeKeywords": "Supervisorial districts 1st District 2nd District 3rd District 4th District 5th District",
    "WorkspaceName": "UCB"
  }
]
```
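For the Berkeley record above, the bounding-box fields that `transform` derives work out as follows (a worked sketch using the script's own formulas):

```ruby
# w/s/e/n are MinX/MinY/MaxX/MaxY from the example layer
w, s, e, n = -123.53532, 38.059219, -122.348998, 38.856932

solr_bbox      = "#{w} #{s} #{e} #{n}"              # "minX minY maxX maxY"
solr_geom      = "ENVELOPE(#{w}, #{e}, #{n}, #{s})" # Solr spatial envelope: W, E, N, S
georss_box     = "#{s} #{w} #{n} #{e}"              # lower corner pair, then upper
georss_polygon = "#{n} #{w} #{n} #{e} #{s} #{e} #{s} #{w} #{n} #{w}" # closed ring
```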