geoblacklight-schema 0.2.0

Files changed (57)
  1. checksums.yaml +7 -0
  2. data/.gitignore +3 -0
  3. data/LICENSE +14 -0
  4. data/README.md +44 -0
  5. data/bin/fgdc2mods.rb +5 -0
  6. data/bin/mods2geoblacklight.rb +5 -0
  7. data/bin/xsltproc-saxon +14 -0
  8. data/conf/protwords.txt +21 -0
  9. data/conf/schema.xml +158 -0
  10. data/conf/solrconfig.xml +160 -0
  11. data/conf/stopwords_en.txt +34 -0
  12. data/conf/synonyms.txt +29 -0
  13. data/examples/Gemfile +4 -0
  14. data/examples/generate-example-doc.rb +42 -0
  15. data/examples/selected.json +5787 -0
  16. data/examples/upload-to-solr.rb +50 -0
  17. data/geoblacklight-schema.gemspec +23 -0
  18. data/lib/geoblacklight/gazetteer.csv +1011 -0
  19. data/lib/geoblacklight/gazetteer.rb +104 -0
  20. data/lib/xslt/arcgis_to_iso19110.xsl +364 -0
  21. data/lib/xslt/fgdc2mods.xsl +1007 -0
  22. data/lib/xslt/iso2mods.xsl +939 -0
  23. data/lib/xslt/mods2geoblacklight.xsl +268 -0
  24. data/lib/xslt/mods2ogp.xsl +195 -0
  25. data/tools/fgdc2html/Gemfile +2 -0
  26. data/tools/fgdc2html/fgdc2html.css +71 -0
  27. data/tools/fgdc2html/fgdc2html.js +6 -0
  28. data/tools/fgdc2html/fgdc2html.xsl +1034 -0
  29. data/tools/fgdc2html/render.rb +30 -0
  30. data/tools/iso2html/Gemfile +2 -0
  31. data/tools/iso2html/iso-html.xsl +1745 -0
  32. data/tools/iso2html/render.rb +24 -0
  33. data/tools/iso2html/utils/convert-enumerations.xsl +97 -0
  34. data/tools/iso2html/utils/convert-latlong.xsl +73 -0
  35. data/tools/iso2html/utils/decode-uri/base.css +408 -0
  36. data/tools/iso2html/utils/decode-uri/index.html +29 -0
  37. data/tools/iso2html/utils/elements-fgdc.xml +824 -0
  38. data/tools/iso2html/utils/elements-iso.xml +636 -0
  39. data/tools/iso2html/utils/printFormatted.xsl +267 -0
  40. data/tools/iso2html/utils/printTextLines.xsl +192 -0
  41. data/tools/iso2html/utils/replace-newlines.xsl +97 -0
  42. data/tools/iso2html/utils/replace-string.xsl +80 -0
  43. data/tools/iso2html/utils/strip-digits.xsl +60 -0
  44. data/tools/iso2html/utils/url-decode.xsl +87 -0
  45. data/tools/iso2html/utils/wrap-text.xsl +174 -0
  46. data/tools/ogp/0_download.rb +96 -0
  47. data/tools/ogp/1_validate.rb +225 -0
  48. data/tools/ogp/2_transform.rb +438 -0
  49. data/tools/ogp/3_stanford.rb +35 -0
  50. data/tools/ogp/4_select.rb +189 -0
  51. data/tools/ogp/5_ingest.rb +55 -0
  52. data/tools/ogp/Gemfile +2 -0
  53. data/tools/solr/Gemfile +3 -0
  54. data/tools/solr/purge.rb +33 -0
  55. data/tools/solr/upload.rb +35 -0
  56. data/vendor/.keep +0 -0
  57. metadata +131 -0
data/tools/ogp/0_download.rb
@@ -0,0 +1,96 @@
+ #!/usr/bin/env ruby
+
+ require 'open-uri'
+
+ class DownloadOgp
+   URL = {
+     'tufts' => 'http://geodata.tufts.edu/solr/select',
+     'mit' => 'http://arrowsmith.mit.edu/solr/select',
+     'berkeley' => 'http://gis.lib.berkeley.edu:9081/solr4/select',
+     'harvard' => 'http://geodata.tufts.edu/solr/select', # Harvard uses Tufts solr index
+     'ucla' => 'http://vizsla.oit.ucla.edu:8080/solr/select',
+     'columbia' => 'http://culspatial.cul.columbia.edu/solr/select',
+     'minnesota' => 'http://ec2-54-87-229-228.compute-1.amazonaws.com:8080/solr/collection1/select'
+   }
+
+   FIELDS = %w{
+     Abstract
+     Access
+     Area
+     Availability
+     CenterX
+     CenterY
+     ContentDate
+     DataType
+     ExternalLayerId
+     FgdcText
+     GeoReferenced
+     HalfHeight
+     HalfWidth
+     Institution
+     LayerDisplayName
+     LayerId
+     Location
+     MaxX
+     MaxY
+     MinX
+     MinY
+     Name
+     PlaceKeywords
+     Publisher
+     SrsProjectionCode
+     ThemeKeywords
+     WorkspaceName
+   }.join(',')
+
+   def download(src, i, n, w=50)
+     start = 0
+     i = src if i.nil?
+     while start < n do
+       fetch(src, i, start, w)
+       start += w
+     end
+   end
+
+   # fetch a set of Solr records from the src provider about the target institution
+   #
+   # @param [String] src The source provider of the Solr records
+   # @param [String] target the target institution
+   # @param [Integer] start
+   # @param [Integer] rows
+   def fetch(src, target, start, rows, datadir = 'data')
+     fn = File.join(datadir, "#{src.downcase}_#{target.downcase}_#{sprintf('%05i', start)}_#{rows}.json")
+     unless File.exist?(fn)
+       raise "Unknown URL for #{src}" unless URL.include?(src.downcase)
+       puts "Downloading #{target} #{start} to #{start+rows}"
+       url = "#{URL[src.downcase]}?" + URI::encode_www_form(
+         'q' => '*:*',
+         'fq' => "Institution:#{target}",
+         'start' => start,
+         'rows' => rows,
+         'wt' => 'json',
+         'indent' => 'on',
+         'fl' => FIELDS
+       )
+       puts " #{url}" if $DEBUG
+       open(url) do |res|
+         File.open(fn, 'wb') do |f|
+           f.write(res.read)
+         end
+       end
+     else
+       puts "Using cache for #{target} #{start} to #{start+rows}"
+     end
+   end
+ end
+
+ # __MAIN__
+ ogp = DownloadOgp.new
+ ogp.download('Berkeley', 'Berkeley', 450)
+ ogp.download('Tufts', 'MassGIS', 600)
+ ogp.download('Tufts', 'Tufts', 2850)
+ ogp.download('Tufts', 'Harvard', 10000)
+ ogp.download('MIT', 'MIT', 9200)
+ ogp.download('UCLA', 'UCLA', 200)
+ ogp.download('Columbia', 'Columbia', 3600)
+ ogp.download('Minnesota', 'Minnesota', 2300)
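
Note on the paging above: download advances start by the window size w (default 50) until it reaches the row estimate n, and fetch caches each window as data/<src>_<target>_<start>_<rows>.json. As a minimal sketch (not part of the gem), the first window of ogp.download('Berkeley', 'Berkeley', 450) issues a request equivalent to:

    require 'uri'

    # start=0, rows=50; cached as data/berkeley_berkeley_00000_50.json
    url = 'http://gis.lib.berkeley.edu:9081/solr4/select?' + URI.encode_www_form(
      'q'      => '*:*',
      'fq'     => 'Institution:Berkeley',
      'start'  => 0,
      'rows'   => 50,
      'wt'     => 'json',
      'indent' => 'on',
      'fl'     => 'Abstract,Access,...'  # the FIELDS constant, elided here
    )
    # subsequent windows use start = 50, 100, ..., 400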
data/tools/ogp/1_validate.rb
@@ -0,0 +1,225 @@
+ #!/usr/bin/env ruby
+ #
+ # Usage: validate_ogp [output.json]
+ #
+ # Reads data/*.json as input and writes validated records to output.json
+ # (default: valid.json)
+ #
+ require 'awesome_print'
+ require 'json'
+ require 'uri'
+ require 'date'
+
+ class ValidateOgp
+   def initialize(fn)
+     @wms_servers = {}
+     @output = File.open(fn, 'wb')
+     @output.write "[\n"
+     yield self
+     self.close
+   end
+
+   def validate_file(fn)
+     stats = { :accepted => 0, :rejected => 0 }
+     puts "Validating #{fn}"
+     json = JSON::parse(File.read(fn))
+     json['response']['docs'].each do |doc| # contains JSON Solr query results
+       begin
+         validate(doc)
+         stats[:accepted] += 1
+       rescue Exception => e
+         puts e
+         stats[:rejected] += 1
+       end
+     end
+     stats
+   end
+
+   def validate(layer)
+     id = layer['LayerId']
+
+     %w{LayerId Name Institution Access MinX MinY MaxX MaxY LayerDisplayName}.each do |k|
+       if layer[k].nil? || layer[k].to_s.empty?
+         raise ArgumentError, "ERROR: #{id} missing #{k}"
+       end
+     end
+
+     k = 'LayerId'
+     if layer[k].is_a? Array
+       layer[k] = layer[k].first
+     end
+     raise ArgumentError, "ERROR: #{k} is not a String: #{layer[k]}" unless layer[k].is_a? String
+
+     %w{MinX MaxX}.each do |lon|
+       raise ArgumentError, "ERROR: #{id}: Invalid longitude value: #{layer[lon]}" unless lon?(layer[lon])
+     end
+
+     %w{MinY MaxY}.each do |lat|
+       raise ArgumentError, "ERROR: #{id}: Invalid latitude value: #{layer[lat]}" unless lat?(layer[lat])
+     end
+
+     if layer['MinX'].to_s.to_f > layer['MaxX'].to_s.to_f
+       raise ArgumentError, "ERROR: #{id} has MinX > MaxX"
+     end
+
+     if layer['MinY'].to_s.to_f > layer['MaxY'].to_s.to_f
+       raise ArgumentError, "ERROR: #{id} has MinY > MaxY"
+     end
+
+     k = 'Institution'
+     layer[k] = 'Columbia' if layer[k] == 'Columbia University'
+     if ([layer[k]] & %w{Berkeley Harvard MIT MassGIS Stanford Tufts UCLA Minnesota Columbia}).empty?
+       raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
+     end
+
+     k = 'DataType'
+     layer[k] = 'Paper Map' if layer[k] == 'Paper'
+     if ([layer[k]] & %w{Line Paper\ Map Point Polygon Raster CD-ROM DVD-ROM}).empty?
+       raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
+     end
+
+     k = 'Access'
+     if ([layer[k]] & %w{Public Restricted}).empty?
+       raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
+     end
+
+     k = 'WorkspaceName'
+     if layer[k].nil?
+       layer[k] = layer['Institution']
+     end
+
+     k = 'Availability'
+     if layer[k].downcase == 'online' # cleanup
+       layer[k] = 'Online'
+     end
+     if layer[k].downcase == 'offline'
+       layer[k] = 'Offline'
+     end
+     if ([layer[k]] & %w{Online Offline}).empty?
+       raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
+     end
+
+     k = 'Location'
+     layer[k] = validate_location(id, layer[k])
+     if layer[k].nil?
+       raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
+     end
+
+     k = 'GeoReferenced'
+     unless layer[k].nil? or layer[k] == true
+       puts "WARNING: #{id} has bounding box but claims it is not georeferenced"
+       # layer[k] = true
+     end
+
+     k = 'Area'
+     unless layer[k].to_i >= 0
+       raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
+     end
+
+     k = 'ContentDate'
+     if layer[k].nil? || layer[k].to_s.strip.empty?
+       raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
+     end
+     dt = Date.rfc3339(layer[k])
+     if dt.year <= 1 or dt.year > 2100
+       raise ArgumentError, "ERROR: #{id} has invalid #{k}: #{layer[k]}: #{dt}"
+     end
+
+     # k = 'FgdcText'
+     # unless layer[k].nil? or layer[k].empty?
+     #   layer[k] = ''
+     # end
+
+     @output.write JSON::pretty_generate(layer)
+     @output.write "\n,\n"
+   end
+
+   def close
+     @output.write "\n {} \n]\n" # trailing empty object keeps the array valid JSON after the last comma
+     @output.close
+     ap({ :wms_servers => @wms_servers })
+   end
+
+   private
+
+   def validate_location(id, location)
+     begin
+       begin
+         x = JSON::parse(location)
+       rescue JSON::ParserError => e
+         x = JSON::parse("{ #{location} }") # wrap bare key/value pairs in a JSON object
+       end
+
+       unless x['externalDownload'].nil?
+         x['download'] = x['externalDownload']
+         x.delete('externalDownload')
+       end
+       unless x['libRecord'].nil?
+         x['url'] = x['libRecord']
+         x.delete('libRecord')
+       end
+       if x['download'].nil? && x['wms'].nil? && (x['wcs'].nil? && x['wfs'].nil?) && x['url'].nil?
+         puts "WARNING: #{id}: Missing Download or WMS or WCS/WFS or URL: #{x}"
+         return {}.to_json
+       end
+
+       %w{download wms wcs wfs url}.each do |protocol|
+         begin
+           unless x[protocol].nil?
+             if x[protocol].is_a? String
+               if x[protocol].empty? || x[protocol] == "None available"
+                 x[protocol] = nil
+                 next
+               else
+                 x[protocol] = [x[protocol]]
+               end
+             end
+
+             unless x[protocol].is_a? Array
+               raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
+             end
+
+             x[protocol].each do |url|
+               uri = URI.parse(url)
+               raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP) or uri.kind_of?(URI::HTTPS) or uri.kind_of?(URI::FTP)
+             end
+
+             # convert from Array to String
+             x[protocol] = x[protocol].first if x[protocol].is_a? Array
+           end
+         rescue URI::InvalidURIError => e
+           raise ArgumentError, "ERROR: #{id}: Invalid URL parsing: #{x}"
+         end
+       end
+
+       @wms_servers[x['wms']] = true unless x['wms'].nil?
+
+       return x.to_json
+     end
+     nil
+   end
+
+   def lon?(lon)
+     lon >= -180 and lon <= 180
+   end
+
+   def lat?(lat)
+     lat >= -90 and lat <= 90
+   end
+ end
+
+ # __MAIN__
+ ValidateOgp.new(ARGV[0].nil? ? 'valid.json' : ARGV[0]) do |ogp|
+   stats = { :accepted => 0, :rejected => 0 }
+   Dir.glob('data/*.json') do |fn|
+     begin
+       s = ogp.validate_file(fn)
+       stats[:accepted] += s[:accepted]
+       stats[:rejected] += s[:rejected]
+     rescue Exception => e
+       puts "ERROR: skipping #{fn}: #{e}"
+     end
+   end
+   ap({ :statistics => stats })
+ end
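
Because validate streams each accepted record followed by a comma, close appends an empty {} sentinel so the output stays valid JSON despite the trailing comma; transform_file in 2_transform.rb skips it with "unless doc.empty?". A minimal sketch of a consumer doing the same (assuming the default valid.json output name):

    require 'json'

    # parse the streamed array and drop the trailing sentinel object
    layers = JSON.parse(File.read('valid.json')).reject(&:empty?)
    puts "#{layers.size} validated layers"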
data/tools/ogp/2_transform.rb
@@ -0,0 +1,438 @@
+ #!/usr/bin/env ruby
+ #
+ # Usage: transform_ogp output.json
+ #
+ # Reads valid*.json in the current directory
+ #
+
+ require 'awesome_print'
+ require 'json'
+ require 'uri'
+ require 'date'
+ require 'nokogiri'
+ require 'fileutils'
+ require 'open-uri'
+
+ # Transforms an OGP schema into GeoBlacklight. Requires input of a JSON array
+ # of OGP hashes.
+ class TransformOgp
+
+   def initialize(fn)
+     @output = File.open(fn, 'wb')
+     @output.write "[\n"
+     @fgdcdir = 'fgdc'
+     yield self
+     self.close
+   end
+
+   # @param [String|Array] s the URI to clean up
+   # @return [String] a normalized URI
+   def clean_uri(s)
+     unless s.nil? or s.empty?
+       return (s.is_a?(Array) ? URI(s.first.to_s.strip) : URI(s.to_s.strip)).to_s
+     end
+     ''
+   end
+
+   # @param [String] fn filename of JSON array of OGP hash objects
+   # @return [Hash] stats about :accepted vs. :rejected records
+   def transform_file(fn, layers_json)
+     stats = { :accepted => 0, :rejected => 0 }
+     puts "Parsing #{fn}"
+     json = JSON::parse(File.open(fn, 'rb').read)
+     json.each do |doc| # contains JSON Solr query results
+       unless doc.empty?
+         begin
+           transform(doc, layers_json)
+           stats[:accepted] += 1
+         rescue ArgumentError => e
+           puts e, e.backtrace
+           stats[:rejected] += 1
+         end
+       end
+     end
+     stats
+   end
+
+   # Transforms a single OGP record into a GeoBlacklight record
+   # @param [Hash] layer an OGP hash for a given layer
+   def transform(layer, layers_json, skip_fgdc = false, skip_geoblacklight = false, skip_ogm_check = false)
+     id = layer['LayerId'].to_s.strip
+     puts "Transforming #{id}"
+
+     # For URN style @see http://www.ietf.org/rfc/rfc2141.txt
+     # For ARK @see https://wiki.ucop.edu/display/Curation/ARK
+     layer['Institution'].strip!
+     prefix = case layer['Institution']
+     when 'Stanford'
+       'http://purl.stanford.edu/'
+     when 'Tufts'
+       'urn:geodata.tufts.edu:'
+     when 'MassGIS'
+       'urn:massgis.state.ma.us:'
+     when 'Berkeley'
+       'http://ark.cdlib.org/ark:/'
+     when 'MIT'
+       'urn:arrowsmith.mit.edu:'
+     when 'Harvard'
+       'urn:hul.harvard.edu:'
+     when 'Minnesota'
+       'urn:umn.edu:'
+     when 'UCLA'
+       'urn:ucla.edu:'
+     when 'Columbia'
+       'urn:columbia.edu:'
+     else
+       raise ArgumentError, 'ERROR: Skipping urn:UNKNOWN:'
+     end
+     uuid = prefix + URI.encode(id)
+
+     # Parse out the Location to get the WMS/WFS/WCS URLs, if available
+     location = JSON::parse(layer['Location'])
+     raise ArgumentError, "ERROR: #{id} has malformed location" unless location.is_a? Hash
+
+     # Parse out the bounding box
+     s = layer['MinY'].to_f
+     w = layer['MinX'].to_f
+     n = layer['MaxY'].to_f
+     e = layer['MaxX'].to_f
+
+     # Parse out the ContentDate date/time
+     begin
+       dt = DateTime.rfc3339(layer['ContentDate'])
+     rescue => e2
+       raise ArgumentError, "ERROR: #{id} has bad ContentDate: #{layer['ContentDate']}"
+     end
+
+     # pub_dt = DateTime.rfc3339('2000-01-01T00:00:00Z') # XXX fake data, get from MODS
+
+     access = layer['Access']
+     collection = nil
+
+     # Parse out the PURL and other metadata for Stanford
+     if layer['Institution'] == 'Stanford'
+       purl = location['purl']
+       if purl.is_a? Array
+         purl = purl.first
+       end
+       if purl.nil? and uuid =~ /^http/
+         purl = uuid
+       end
+     else
+       purl = nil
+       # Because OGP does not delimit keywords, we use a heuristic here
+       %w{PlaceKeywords ThemeKeywords}.each do |k|
+         # layer[k] = nil
+         # unless layer[k] =~ /[;,]/ or layer[k].split.size < 4
+         #   layer[k] = layer[k].split.join(';')
+         # end
+       end
+     end
+
+     slug = to_slug(id, layer)
+
+     layer_geom_type = layer['DataType'].to_s.strip
+     if (layer_geom_type.downcase == 'raster' || layer['LayerDisplayName'] =~ /\(Raster Image\)/) ||
+        (layer['Institution'] == 'Harvard' && layer_geom_type.downcase == 'paper map')
+       format = 'GeoTIFF'
+       layer_geom_type = 'Raster'
+     elsif %w{Point Line Polygon}.include?(layer_geom_type)
+       format = 'Shapefile'
+     elsif layer_geom_type.downcase == 'paper map'
+       format = 'Paper'
+       layer_geom_type = 'Paper Map'
+     elsif layer_geom_type.downcase =~ /-rom$/
+       format = layer_geom_type
+       layer_geom_type = 'Mixed'
+     else
+       raise ArgumentError, "ERROR: Invalid layer_geom_type: #{layer_geom_type}"
+     end
+
+     # if layer['LayerDisplayName'] =~ /Scanned Map/
+     #   layer_geom_type = 'Scanned Map'
+     #   format = 'Paper'
+     # end
+
+     layer_id = layer['WorkspaceName'] + ':' + layer['Name']
+
+     # @see https://github.com/OSGeo/Cat-Interop
+     %w{wcs wfs wms}.each do |k|
+       location[k] = location[k].first if location[k].is_a? Array
+     end
+     refs = {}
+     refs['http://www.opengis.net/def/serviceType/ogc/wcs'] = "#{location['wcs']}" if location['wcs']
+     refs['http://www.opengis.net/def/serviceType/ogc/wfs'] = "#{location['wfs']}" if location['wfs']
+     refs['http://www.opengis.net/def/serviceType/ogc/wms'] = "#{location['wms']}" if location['wms']
+     if layer['Institution'] == 'Harvard'
+       refs['http://schema.org/DownloadAction'] = "#{location['download']}" if location['download']
+       refs['http://schema.org/UserDownloads'] = "#{location['serviceStart']}" if location['serviceStart']
+       refs['http://tilecache.org'] = "#{location['tilecache'].first}" if location['tilecache']
+     else
+       refs['http://schema.org/downloadUrl'] = "#{location['download']}" if location['download']
+     end
+     refs['http://schema.org/url'] = "#{location['url']}" if location['url']
+     if purl
+       refs['http://schema.org/thumbnailUrl'] = "http://stacks.stanford.edu/file/druid:#{id}/preview.jpg"
+       refs['http://schema.org/url'] = "#{clean_uri(purl)}"
+       refs['http://schema.org/downloadUrl'] = "http://stacks.stanford.edu/file/druid:#{id}/data.zip"
+       refs['http://www.loc.gov/mods/v3'] = "#{purl}.mods"
+       refs['http://www.isotc211.org/schemas/2005/gmd/'] = "http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/#{layer_id}/iso19139.xml"
+       refs['http://www.w3.org/1999/xhtml'] = "http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/#{layer_id}/iso19139.html"
+     else
+       refs['http://www.opengis.net/cat/csw/csdgm'] = "http://opengeometadata.stanford.edu/metadata/org.opengeoportal/#{URI.encode(layer_id)}/fgdc.xml"
+       begin
+         _f = open(refs['http://www.opengis.net/cat/csw/csdgm'])
+         refs['http://www.w3.org/1999/xhtml'] = "http://opengeometadata.stanford.edu/metadata/org.opengeoportal/#{URI.encode(layer_id)}/fgdc.html"
+       rescue OpenURI::HTTPError => e2
+         refs.delete('http://www.opengis.net/cat/csw/csdgm')
+       rescue URI::InvalidURIError => e2
+         raise ArgumentError, "ERROR: #{id} has bad LayerId: #{layer_id}"
+       end unless skip_ogm_check
+     end
+
+     # If there's no homepage, use the HTML version of the metadata if available
+     if refs['http://schema.org/url'].nil? && !refs['http://www.w3.org/1999/xhtml'].nil?
+       refs['http://schema.org/url'] = refs['http://www.w3.org/1999/xhtml']
+     end
+
+     # Make the conversion from OGP to GeoBlacklight
+     #
+     # @see http://dublincore.org/documents/dcmi-terms/
+     # @see http://wiki.dublincore.org/index.php/User_Guide/Creating_Metadata
+     # @see http://www.ietf.org/rfc/rfc5013.txt
+     new_layer = {
+       :uuid => uuid,
+
+       # Dublin Core elements
+       :dc_creator_sm => string2array(layer['Originator']),
+       :dc_description_s => layer['Abstract'],
+       :dc_format_s => format,
+       :dc_identifier_s => uuid,
+       :dc_language_s => 'English', # 'en', # XXX: fake data
+       :dc_publisher_s => layer['Publisher'],
+       :dc_rights_s => access,
+       :dc_subject_sm => string2array(layer['ThemeKeywords'], true),
+       :dc_title_s => layer['LayerDisplayName'],
+       :dc_type_s => 'Dataset', # or 'Image' for non-georectified,
+                                # or 'PhysicalObject' for non-digitized maps
+       # Dublin Core terms
+       :dct_isPartOf_sm => collection.nil? ? nil : [collection],
+       :dct_references_s => refs.to_json.to_s,
+       :dct_spatial_sm => string2array(layer['PlaceKeywords'], true),
+       :dct_temporal_sm => [dt.year.to_s],
+       # :dct_issued_s => pub_dt.year.to_s,
+       :dct_provenance_s => layer['Institution'],
+
+       # xmlns:georss="http://www.georss.org/georss"
+       # A bounding box is a rectangular region, often used to define the
+       # extents of a map or a rough area of interest. A box contains two
+       # space-separated latitude-longitude pairs; the first pair is the
+       # lower corner, the second is the upper corner.
+       :georss_box_s => "#{s} #{w} #{n} #{e}",
+       :georss_polygon_s => "#{n} #{w} #{n} #{e} #{s} #{e} #{s} #{w} #{n} #{w}",
+
+       # Layer-specific schema
+       :layer_slug_s => slug,
+       :layer_id_s => layer_id,
+       # :layer_srs_s => 'EPSG:4326', # XXX: fake data
+       :layer_geom_type_s => layer_geom_type,
+       :layer_modified_dt => Time.now.utc.strftime('%FT%TZ'),
+
+       # derived fields used only by Solr, for which copyField is insufficient
+       :solr_bbox => "#{w} #{s} #{e} #{n}", # minX minY maxX maxY
+       # :solr_ne_pt => "#{n},#{e}",
+       # :solr_sw_pt => "#{s},#{w}",
+       :solr_geom => "ENVELOPE(#{w}, #{e}, #{n}, #{s})",
+       :solr_year_i => dt.year,
+       # :solr_issued_dt => pub_dt.strftime('%FT%TZ') # Solr requires 1995-12-31T23:59:59Z
+       # :solr_wms_url => location['wms'],
+       # :solr_wfs_url => location['wfs'],
+       # :solr_wcs_url => location['wcs']
+
+       # :layer_year_i => dt.year#, # XXX: migrate to copyField
+       # :ogp_area_f => layer['Area'],
+       # :ogp_center_x_f => layer['CenterX'],
+       # :ogp_center_y_f => layer['CenterY'],
+       # :ogp_georeferenced_b => (layer['GeoReferenced'].to_s.downcase == 'true'),
+       # :ogp_halfheight_f => layer['HalfHeight'],
+       # :ogp_halfwidth_f => layer['HalfWidth'],
+       # :ogp_layer_id_s => layer['LayerId'],
+       # :ogp_name_s => layer['Name'],
+       # :ogp_location_s => layer['Location'],
+       # :ogp_workspace_s => layer['WorkspaceName']
+     }
+
+     # Remove any fields that are blank
+     new_layer.each do |k, v|
+       new_layer.delete(k) if v.nil? or (v.respond_to?(:empty?) and v.empty?)
+     end
+
+     # Write the JSON record for the GeoBlacklight layer
+     @output.write JSON::pretty_generate(new_layer)
+     @output.write "\n,\n"
+
+     # export into OGM
+     ogm_dir = new_layer[:dct_provenance_s] + '/' + new_layer[:layer_id_s][-2, 2].downcase.gsub(/[^a-z0-9]/, '_') + '/' + new_layer[:layer_id_s]
+     unless skip_fgdc or layer['FgdcText'].nil? or layer['FgdcText'].empty?
+       _fn = 'opengeometadata/org.opengeoportal/' + ogm_dir + '/fgdc.xml'
+       unless File.size?(_fn)
+         FileUtils.mkdir_p(File.dirname(_fn)) unless File.directory?(File.dirname(_fn))
+         xml = Nokogiri::XML(layer['FgdcText'])
+         xml.write_xml_to(File.open(_fn, 'wb'), encoding: 'UTF-8', indent: 2)
+       end
+     end
+
+     unless skip_geoblacklight
+       _fn = 'opengeometadata/org.opengeoportal/' + ogm_dir + '/geoblacklight.json'
+       layers_json[new_layer[:layer_id_s]] = ogm_dir
+       unless File.size?(_fn)
+         FileUtils.mkdir_p(File.dirname(_fn)) unless File.directory?(File.dirname(_fn))
+         _s = JSON::pretty_generate(new_layer)
+         File.open(_fn, 'wb') {|f| f.write(_s) }
+       end
+     end
+   end
+
+   def close
+     @output.write "\n {} \n]\n"
+     @output.close
+   end
+
+   # @param [String] s a semicolon/comma/greater-than delimited list
+   # @return [Array] results as an array
+   def string2array(s, clean_only = false)
+     return nil if s.nil?
+     if clean_only
+       if s.strip.size > 0 && !s.strip.index(' ') && s.strip.downcase != 'none'
+         [s.strip]
+       else
+         nil
+       end
+     else
+       if s.to_s =~ /[;,>]/
+         s.split(/\s*[;,>]\s*/).uniq.collect {|i| i.strip}
+       elsif s.is_a?(String) && s.size > 0
+         [s.strip]
+       else
+         nil
+       end
+     end
+   end
+
+   @@slugs = {}
+   def to_slug(id, layer)
+     # strip out schema and usernames
+     name = layer['Name'].sub('SDE_DATA.', '').sub('SDE.', '').sub('SDE2.', '').sub('GISPORTAL.GISOWNER01.', '').sub('GISDATA.', '').sub('MORIS.', '')
+     unless name.size > 1
+       # use the first word of the title if the name is empty
+       name = layer['LayerDisplayName'].split.first
+     end
+     slug = layer['Institution'] + '-' + name
+
+     # slugs should only have a-z, A-Z, 0-9, and -
+     slug.gsub!(/[^a-zA-Z0-9\-]/, '-')
+     slug.gsub!(/[\-]+/, '-')
+
+     # only lowercase
+     slug.downcase!
+
+     # ensure slugs are unique for this pass
+     if @@slugs.include?(slug)
+       slug += '-' + sprintf('%06d', Random.rand(999999))
+     end
+     @@slugs[slug] = true
+
+     slug
+   end
+
+   # Ensure that the WMS/WFS/WCS location values are as expected
+   def validate_location(id, location)
+     begin
+       x = JSON::parse(location)
+       %w{download url wms wcs wfs}.each do |protocol|
+         begin
+           unless x[protocol].nil?
+             if x[protocol].is_a? String
+               x[protocol] = [x[protocol]]
+             end
+
+             unless x[protocol].is_a? Array
+               raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
+             end
+
+             x[protocol].each do |url|
+               uri = URI.parse(url)
+               raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP) or uri.kind_of?(URI::HTTPS)
+             end
+             x[protocol] = x[protocol].first
+           end
+         rescue Exception => e
+           raise ArgumentError, "ERROR: #{id}: Invalid #{protocol}: #{x}"
+         end
+       end
+
+       return x.to_json
+     rescue JSON::ParserError => e
+       raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}"
+     end
+     nil
+   end
+
+   def lon?(lon)
+     lon >= -180 and lon <= 180
+   end
+
+   def lat?(lat)
+     lat >= -90 and lat <= 90
+   end
+ end
+
+
+ # __MAIN__
+ #
+ TransformOgp.new(ARGV[0].nil? ? 'transformed.json' : ARGV[0]) do |ogp|
+   stats = { :accepted => 0, :rejected => 0 }
+   layers_json = {}
+
+   Dir.glob('valid*.json') do |fn|
+     s = ogp.transform_file(fn, layers_json)
+     stats[:accepted] += s[:accepted]
+     stats[:rejected] += s[:rejected]
+   end
+
+   File.open('opengeometadata/org.opengeoportal/layers.json', 'wb') do |f|
+     f << JSON::pretty_generate(layers_json)
+   end
+
+   ap({ :statistics => stats })
+ end
+
+ # example input data
+ __END__
+ [
+   {
+     "Abstract": "The boundaries of each supervisorial district in Sonoma County based on 2000 census. Redrawn in 2001 using Autobound.",
+     "Access": "Public",
+     "Area": 0.9463444815860053,
+     "Availability": "Online",
+     "CenterX": -122.942159,
+     "CenterY": 38.4580755,
+     "ContentDate": "2000-01-01T01:01:01Z",
+     "DataType": "Polygon",
+     "FgdcText": "...",
+     "GeoReferenced": true,
+     "HalfHeight": 0.39885650000000084,
+     "HalfWidth": 0.593161000000002,
+     "Institution": "Berkeley",
+     "LayerDisplayName": "SCGISDB2_BASE_ADM_SUPERVISOR",
+     "LayerId": "28722/bk0012h5s52",
+     "Location": "{\"wms\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wms\"],\"tilecache\":[\"http://gis.lib.berkeley.edu:8080/geoserver/gwc/service/wms\"],\"download\":\"\",\"wfs\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wfs\"]}",
+     "MaxX": -122.348998,
+     "MaxY": 38.856932,
+     "MinX": -123.53532,
+     "MinY": 38.059219,
+     "Name": "ADM_SUPERVISOR",
+     "PlaceKeywords": "Sonoma County County of Sonoma Sonoma California Bay Area",
+     "Publisher": "UC Berkeley Libraries",
+     "ThemeKeywords": "Supervisorial districts 1st District 2nd District 3rd District 4th District 5th District",
+     "WorkspaceName": "UCB"
+   }
+ ]
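
As a worked example of to_slug, the sample Berkeley record above (assuming no prior collision in @@slugs) resolves as follows:

    name = 'ADM_SUPERVISOR'            # no SDE-style prefix to strip
    slug = 'Berkeley-ADM_SUPERVISOR'   # Institution + '-' + name
    # non [a-zA-Z0-9-] characters become '-'  => 'Berkeley-ADM-SUPERVISOR'
    # downcased                                => 'berkeley-adm-supervisor'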