geoblacklight-schema 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +3 -0
  3. data/LICENSE +14 -0
  4. data/README.md +44 -0
  5. data/bin/fgdc2mods.rb +5 -0
  6. data/bin/mods2geoblacklight.rb +5 -0
  7. data/bin/xsltproc-saxon +14 -0
  8. data/conf/protwords.txt +21 -0
  9. data/conf/schema.xml +158 -0
  10. data/conf/solrconfig.xml +160 -0
  11. data/conf/stopwords_en.txt +34 -0
  12. data/conf/synonyms.txt +29 -0
  13. data/examples/Gemfile +4 -0
  14. data/examples/generate-example-doc.rb +42 -0
  15. data/examples/selected.json +5787 -0
  16. data/examples/upload-to-solr.rb +50 -0
  17. data/geoblacklight-schema.gemspec +23 -0
  18. data/lib/geoblacklight/gazetteer.csv +1011 -0
  19. data/lib/geoblacklight/gazetteer.rb +104 -0
  20. data/lib/xslt/arcgis_to_iso19110.xsl +364 -0
  21. data/lib/xslt/fgdc2mods.xsl +1007 -0
  22. data/lib/xslt/iso2mods.xsl +939 -0
  23. data/lib/xslt/mods2geoblacklight.xsl +268 -0
  24. data/lib/xslt/mods2ogp.xsl +195 -0
  25. data/tools/fgdc2html/Gemfile +2 -0
  26. data/tools/fgdc2html/fgdc2html.css +71 -0
  27. data/tools/fgdc2html/fgdc2html.js +6 -0
  28. data/tools/fgdc2html/fgdc2html.xsl +1034 -0
  29. data/tools/fgdc2html/render.rb +30 -0
  30. data/tools/iso2html/Gemfile +2 -0
  31. data/tools/iso2html/iso-html.xsl +1745 -0
  32. data/tools/iso2html/render.rb +24 -0
  33. data/tools/iso2html/utils/convert-enumerations.xsl +97 -0
  34. data/tools/iso2html/utils/convert-latlong.xsl +73 -0
  35. data/tools/iso2html/utils/decode-uri/base.css +408 -0
  36. data/tools/iso2html/utils/decode-uri/index.html +29 -0
  37. data/tools/iso2html/utils/elements-fgdc.xml +824 -0
  38. data/tools/iso2html/utils/elements-iso.xml +636 -0
  39. data/tools/iso2html/utils/printFormatted.xsl +267 -0
  40. data/tools/iso2html/utils/printTextLines.xsl +192 -0
  41. data/tools/iso2html/utils/replace-newlines.xsl +97 -0
  42. data/tools/iso2html/utils/replace-string.xsl +80 -0
  43. data/tools/iso2html/utils/strip-digits.xsl +60 -0
  44. data/tools/iso2html/utils/url-decode.xsl +87 -0
  45. data/tools/iso2html/utils/wrap-text.xsl +174 -0
  46. data/tools/ogp/0_download.rb +96 -0
  47. data/tools/ogp/1_validate.rb +225 -0
  48. data/tools/ogp/2_transform.rb +438 -0
  49. data/tools/ogp/3_stanford.rb +35 -0
  50. data/tools/ogp/4_select.rb +189 -0
  51. data/tools/ogp/5_ingest.rb +55 -0
  52. data/tools/ogp/Gemfile +2 -0
  53. data/tools/solr/Gemfile +3 -0
  54. data/tools/solr/purge.rb +33 -0
  55. data/tools/solr/upload.rb +35 -0
  56. data/vendor/.keep +0 -0
  57. metadata +131 -0
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'open-uri'
4
+
5
# Downloads Solr records from public OpenGeoPortal (OGP) instances,
# caching each raw JSON response as a file on disk.
class DownloadOgp
  # Solr `select` endpoints for each known OGP provider (keys are lowercase)
  URL = {
    'tufts' => 'http://geodata.tufts.edu/solr/select',
    'mit' => 'http://arrowsmith.mit.edu/solr/select',
    'berkeley' => 'http://gis.lib.berkeley.edu:9081/solr4/select',
    'harvard' => 'http://geodata.tufts.edu/solr/select', # Harvard uses Tufts solr index
    'ucla' => 'http://vizsla.oit.ucla.edu:8080/solr/select',
    'columbia' => 'http://culspatial.cul.columbia.edu/solr/select',
    'minnesota' => 'http://ec2-54-87-229-228.compute-1.amazonaws.com:8080/solr/collection1/select'
  }

  # OGP schema fields requested from Solr, as a comma-delimited `fl` value
  FIELDS = %w{
    Abstract
    Access
    Area
    Availability
    CenterX
    CenterY
    ContentDate
    DataType
    ExternalLayerId
    FgdcText
    GeoReferenced
    HalfHeight
    HalfWidth
    Institution
    LayerDisplayName
    LayerId
    Location
    MaxX
    MaxY
    MinX
    MinY
    Name
    PlaceKeywords
    Publisher
    SrsProjectionCode
    ThemeKeywords
    WorkspaceName
  }.join(',')

  # Downloads all records about an institution in windows of +w+ rows.
  #
  # @param [String] src the source provider (key into URL, case-insensitive)
  # @param [String] i the target institution; defaults to +src+ when nil
  # @param [Integer] n total number of records to fetch
  # @param [Integer] w window (page) size per Solr request
  def download(src, i, n, w=50)
    start = 0
    i = src if i.nil?
    while start < n do
      fetch(src, i, start, w)
      start += w
    end
  end

  # fetch a set of Solr records from the src provider about the target institution,
  # caching the response under +datadir+ and reusing any cached file.
  #
  # @param [String] src The source provider of the Solr records
  # @param [String] target the target institution
  # @param [Integer] start offset of the first row to request
  # @param [Integer] rows number of rows to request
  # @param [String] datadir directory holding cached responses
  # @raise [RuntimeError] when src is not a known provider
  def fetch(src, target, start, rows, datadir = 'data')
    fn = File.join(datadir, "#{src.downcase}_#{target.downcase}_#{sprintf('%05i', start)}_#{rows}.json")
    if File.exist?(fn)
      puts "Using cache for #{target} #{start} to #{start+rows}"
      return
    end
    raise "Unknown URL for #{src}" unless URL.include?(src.downcase)
    puts "Downloading #{target} #{start} to #{start+rows}"
    url = "#{URL[src.downcase]}?" + URI.encode_www_form(
      'q' => '*:*',
      'fq' => "Institution:#{target}",
      'start' => start,
      'rows' => rows,
      'wt' => 'json',
      'indent' => 'on',
      'fl' => FIELDS
    )
    puts " #{url}" if $DEBUG
    # Kernel#open on a URL was removed in Ruby 3.0; URI.open (open-uri) is the replacement
    URI.open(url) do |res|
      File.open(fn, 'wb') do |f|
        f.write(res.read)
      end
    end
  end
end
86
+
87
# __MAIN__ — harvest each provider/institution pair up to its known record count
ogp = DownloadOgp.new
[
  ['Berkeley',  'Berkeley',  450],
  ['Tufts',     'MassGIS',   600],
  ['Tufts',     'Tufts',     2850],
  ['Tufts',     'Harvard',   10000],
  ['MIT',       'MIT',       9200],
  ['UCLA',      'UCLA',      200],
  ['Columbia',  'Columbia',  3600],
  ['Minnesota', 'Minnesota', 2300]
].each do |src, target, total|
  ogp.download(src, target, total)
end
@@ -0,0 +1,225 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Usage: validate_ogp [output.json]
4
+ #
5
+ # Requires data/*.json as input and output to valid.json
6
+ #
7
+ require 'awesome_print'
8
+ require 'json'
9
+ require 'uri'
10
+ require 'date'
11
+
12
# Validates OGP layer records (as harvested by 0_download.rb), normalizing
# fields in place and writing acceptable layers out as a JSON array.
class ValidateOgp
  # @param [String] fn output filename; opened for writing and closed after the block
  def initialize(fn)
    @wms_servers = {}
    @output = File.open(fn, 'wb')
    @output.write "[\n"
    yield self
    self.close
  end

  # Validates every doc in a Solr query-results JSON file.
  # @param [String] fn input filename
  # @return [Hash] counts of :accepted vs. :rejected layers
  def validate_file(fn)
    stats = { :accepted => 0, :rejected => 0 }
    puts "Validating #{fn}"
    json = JSON::parse(File.read(fn))
    json['response']['docs'].each do |doc| # contains JSON Solr query results
      begin
        validate(doc)
        stats[:accepted] += 1
      rescue StandardError => e # was `rescue Exception`: must not trap signals/SystemExit
        puts e
        stats[:rejected] += 1
      end
    end
    stats
  end

  # Validates and normalizes a single OGP layer hash in place.
  # @param [Hash] layer an OGP layer record
  # @raise [ArgumentError] when a required field is missing or has an invalid value
  def validate(layer)
    id = layer['LayerId']

    # required, non-empty fields
    %w{LayerId Name Institution Access MinX MinY MaxX MaxY LayerDisplayName}.each do |k|
      if layer[k].nil? || layer[k].to_s.empty?
        raise ArgumentError, "ERROR: #{id} missing #{k}"
      end
    end

    k = 'LayerId'
    if layer[k].is_a? Array
      layer[k] = layer[k].first # normalize a multi-valued id to its first value
    end
    raise ArgumentError, "ERROR: #{k} is not a String: #{layer[k]}" unless layer[k].is_a? String

    %w{MinX MaxX}.each do |lon|
      raise ArgumentError, "ERROR: #{id}: Invalid longitude value: #{layer[lon]}" unless lon?(layer[lon])
    end

    %w{MinY MaxY}.each do |lat|
      raise ArgumentError, "ERROR: #{id} Invalid latitude value: #{layer[lat]}" unless lat?(layer[lat])
    end

    if layer['MinX'].to_s.to_f > layer['MaxX'].to_s.to_f
      raise ArgumentError, "ERROR: #{id} has MinX > MaxX"
    end

    if layer['MinY'].to_s.to_f > layer['MaxY'].to_s.to_f
      raise ArgumentError, "ERROR: #{id} has MinY > MaxY"
    end

    k = 'Institution'
    layer[k] = 'Columbia' if layer[k] == 'Columbia University' # cleanup
    if ([layer[k]] & %w{Berkeley Harvard MIT MassGIS Stanford Tufts UCLA Minnesota Columbia}).empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'DataType'
    layer[k] = 'Paper Map' if layer[k] == 'Paper' # cleanup
    if ([layer[k]] & %w{Line Paper\ Map Point Polygon Raster CD-ROM DVD-ROM}).empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'Access'
    if ([layer[k]] & %w{Public Restricted}).empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'WorkspaceName'
    if layer[k].nil?
      layer[k] = layer['Institution'] # default the workspace to the owning institution
    end

    k = 'Availability'
    # to_s guards a missing value (Availability is not in the required list above);
    # the membership check below then reports it as unsupported
    if layer[k].to_s.downcase == 'online' # cleanup
      layer[k] = 'Online'
    end
    if layer[k].to_s.downcase == 'offline'
      layer[k] = 'Offline'
    end
    if ([layer[k]] & %w{Online Offline}).empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'Location'
    layer[k] = validate_location(id, layer[k])
    if layer[k].nil?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'GeoReferenced'
    unless layer[k].nil? or layer[k] == true
      puts "WARNING: #{id} has boundingbox but claims it is not georeferenced"
      #layer[k] = true
    end

    k = 'Area'
    unless layer[k].to_i >= 0
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'ContentDate'
    if layer[k].nil? || layer[k].to_s.strip.empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end
    dt = Date.rfc3339(layer[k])
    if dt.year <= 1 or dt.year > 2100
      raise ArgumentError, "ERROR: #{id} has invalid #{k}: #{layer[k]}: #{dt}"
    end

    @output.write JSON::pretty_generate(layer)
    @output.write "\n,\n"
  end

  # Terminates the JSON array, closes the output file, and dumps the set
  # of WMS servers seen during validation.
  def close
    @output.write "\n {} \n]\n"
    @output.close
    ap({:wms_servers => @wms_servers})
  end

  private

  # Normalizes the Location field (a JSON object, or a bare fragment which is
  # wrapped in braces) and verifies each service endpoint is a valid URL.
  # @param [String] id the LayerId, used for error messages
  # @param [String] location the raw Location JSON string
  # @return [String, nil] the normalized Location re-serialized as JSON
  def validate_location(id, location)
    begin
      begin
        x = JSON::parse(location)
      rescue JSON::ParserError => e
        x = JSON::parse("{ #{location} }") # wrap in dictionary
      end

      unless x['externalDownload'].nil?
        x['download'] = x['externalDownload']
        x.delete('externalDownload')
      end
      unless x['libRecord'].nil?
        x['url'] = x['libRecord']
        x.delete('libRecord')
      end
      if x['download'].nil? && x['wms'].nil? && (x['wcs'].nil? && x['wfs'].nil?) && x['url'].nil?
        puts "WARNING: #{id}: Missing Download or WMS or WCS/WFS or URL: #{x}"
        return {}.to_json
      end

      %w{download wms wcs wfs url}.each do |protocol|
        begin
          unless x[protocol].nil?
            if x[protocol].is_a? String
              if x[protocol].empty? || x[protocol] == "None available"
                x[protocol] = nil
                next
              else
                x[protocol] = [x[protocol]]
              end
            end

            unless x[protocol].is_a? Array
              raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
            end

            x[protocol].each do |url|
              uri = URI.parse(url)
              raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP) or uri.kind_of?(URI::HTTPS) or uri.kind_of?(URI::FTP)
            end

            # convert from Array to String
            x[protocol] = x[protocol].first if x[protocol].is_a? Array
          end
        rescue URI::InvalidURIError => e
          raise ArgumentError, "ERROR: #{id}: Invalid URL parsing: #{x}"
        end
      end

      @wms_servers[x['wms']] = true unless x['wms'].nil?

      return x.to_json
    end
    nil
  end

  # @return [Boolean] whether lon is within [-180, 180]
  def lon? lon
    lon >= -180 and lon <= 180
  end

  # @return [Boolean] whether lat is within [-90, 90]
  def lat? lat
    lat >= -90 and lat <= 90
  end
end
210
+
211
+
212
# __MAIN__
#
# Validates every downloaded Solr page under data/ and reports totals.
ValidateOgp.new(ARGV[0].nil?? 'valid.json' : ARGV[0]) do |ogp|
  stats = { :accepted => 0, :rejected => 0 }
  Dir.glob('data/*.json') do |fn|
    begin
      s = ogp.validate_file(fn)
      # accumulate only when the file was actually processed; the original
      # swallowed errors with a bare `rescue Exception` and then crashed on
      # `s[:accepted]` with s still nil
      stats[:accepted] += s[:accepted]
      stats[:rejected] += s[:rejected]
    rescue StandardError => e
      puts "ERROR: skipping #{fn}: #{e}"
    end
  end
  ap({:statistics => stats})
end
@@ -0,0 +1,438 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Usage: transform_ogp output.json
4
+ #
5
+ # Reads valid*.json in current directory
6
+ #
7
+
8
+ require 'awesome_print'
9
+ require 'json'
10
+ require 'uri'
11
+ require 'date'
12
+ require 'nokogiri'
13
+ require 'fileutils'
14
+ require 'open-uri'
15
+
16
# Transforms an OGP schema into GeoBlacklight. Requires input of a JSON array
# of OGP hashes (as produced by 1_validate.rb).
class TransformOgp

  # @param [String] fn output filename; opened for writing and closed after the block
  def initialize(fn)
    @output = File.open(fn, 'wb')
    @output.write "[\n"
    @fgdcdir = 'fgdc'
    yield self
    self.close
  end

  # @param [String|Array] s the URI to clean up
  # @return [String] a normalized URI, or '' when s is nil/empty
  def clean_uri(s)
    unless s.nil? or s.empty?
      return (s.is_a?(Array) ? URI(s.first.to_s.strip) : URI(s.to_s.strip)).to_s
    end
    ''
  end

  # @param [String] fn filename of JSON array of OGP hash objects
  # @param [Hash] layers_json mapping of layer id to OGM directory, filled as a side effect
  # @return [Hash] stats about :accepted vs. :rejected records
  def transform_file(fn, layers_json)
    stats = { :accepted => 0, :rejected => 0 }
    puts "Parsing #{fn}"
    json = JSON::parse(File.open(fn, 'rb').read)
    json.each do |doc| # contains JSON Solr query results
      unless doc.empty?
        begin
          transform(doc, layers_json)
          stats[:accepted] += 1
        rescue ArgumentError => e
          puts e, e.backtrace
          stats[:rejected] += 1
        end
      end
    end
    stats
  end

  # Transforms a single OGP record into a GeoBlacklight record
  # @param [Hash] layer an OGP hash for a given layer
  # @param [Hash] layers_json mapping of layer id to OGM directory, filled as a side effect
  # @param [Boolean] skip_fgdc skip exporting FgdcText into fgdc.xml
  # @param [Boolean] skip_geoblacklight skip exporting geoblacklight.json
  # @param [Boolean] skip_ogm_check skip the HTTP probe for FGDC metadata on OpenGeoMetadata
  # @raise [ArgumentError] when the layer is malformed
  def transform(layer, layers_json, skip_fgdc = false, skip_geoblacklight = false, skip_ogm_check = false)
    id = layer['LayerId'].to_s.strip
    puts "Transforming #{id}"

    # For URN style @see http://www.ietf.org/rfc/rfc2141.txt
    # For ARK @see https://wiki.ucop.edu/display/Curation/ARK
    layer['Institution'].strip!
    prefix = case layer['Institution']
    when 'Stanford'
      'http://purl.stanford.edu/'
    when 'Tufts'
      'urn:geodata.tufts.edu:'
    when 'MassGIS'
      'urn:massgis.state.ma.us:'
    when 'Berkeley'
      'http://ark.cdlib.org/ark:/'
    when 'MIT'
      'urn:arrowsmith.mit.edu:'
    when 'Harvard'
      'urn:hul.harvard.edu:'
    when 'Minnesota'
      'urn:umn.edu:'
    when 'UCLA'
      'urn:ucla.edu:'
    when 'Columbia'
      'urn:columbia.edu:'
    else
      raise ArgumentError, 'ERROR: Skipping urn:UNKNOWN:'
    end
    # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape is the equivalent
    uuid = prefix + URI::DEFAULT_PARSER.escape(id)

    # Parse out the Location to get the WMS/WFS/WCS URLs, if available
    location = JSON::parse(layer['Location'])
    raise ArgumentError, "ERROR: #{id} has malformed location" unless location.is_a? Hash

    # Parse out the bounding box
    s = layer['MinY'].to_f
    w = layer['MinX'].to_f
    n = layer['MaxY'].to_f
    e = layer['MaxX'].to_f

    # Parse out the ContentDate date/time
    begin
      dt = DateTime.rfc3339(layer['ContentDate'])
    rescue => e2
      raise ArgumentError, "ERROR: #{id} has bad ContentDate: #{layer['ContentDate']}"
    end

    access = layer['Access']
    collection = nil

    # Parse out the PURL and other metadata for Stanford
    if layer['Institution'] == 'Stanford'
      purl = location['purl']
      purl = purl.first if purl.is_a? Array
      purl = uuid if purl.nil? and uuid =~ /^http/
    else
      purl = nil
      # Because OGP does not deliminate keywords, we use a heuristic here
      %w{PlaceKeywords ThemeKeywords}.each do |k|
        # layer[k] = nil
        # unless layer[k] =~ /[;,]/ or layer[k].split.size < 4
        #   layer[k] = layer[k].split.join(';')
        # end
      end
    end

    slug = to_slug(id, layer)

    # Determine file format and geometry type from the OGP DataType
    layer_geom_type = layer['DataType'].to_s.strip
    if (layer_geom_type.downcase == 'raster' || layer['LayerDisplayName'] =~ /\(Raster Image\)/) ||
       (layer['Institution'] == 'Harvard' && layer_geom_type.downcase == 'paper map')
      format = 'GeoTIFF'
      layer_geom_type = 'Raster'
    elsif %w{Point Line Polygon}.include?(layer_geom_type)
      format = 'Shapefile'
    elsif layer_geom_type.downcase == 'paper map'
      format = 'Paper'
      layer_geom_type = 'Paper Map'
    elsif layer_geom_type.downcase =~ /-rom$/
      format = layer_geom_type
      layer_geom_type = 'Mixed'
    else
      raise ArgumentError, "ERROR: Invalid layer_geom_type: #{layer_geom_type}"
    end

    layer_id = layer['WorkspaceName'] + ':' + layer['Name']

    # @see https://github.com/OSGeo/Cat-Interop
    %w{wcs wfs wms}.each do |k|
      location[k] = location[k].first if location[k].is_a? Array
    end
    refs = {}
    refs['http://www.opengis.net/def/serviceType/ogc/wcs'] = "#{location['wcs']}" if location['wcs']
    refs['http://www.opengis.net/def/serviceType/ogc/wfs'] = "#{location['wfs']}" if location['wfs']
    refs['http://www.opengis.net/def/serviceType/ogc/wms'] = "#{location['wms']}" if location['wms']
    if layer['Institution'] == 'Harvard'
      refs['http://schema.org/DownloadAction'] = "#{location['download']}" if location['download']
      refs['http://schema.org/UserDownloads'] = "#{location['serviceStart']}" if location['serviceStart']
      refs['http://tilecache.org'] = "#{location['tilecache'].first}" if location['tilecache']
    else
      refs['http://schema.org/downloadUrl'] = "#{location['download']}" if location['download']
    end
    refs['http://schema.org/url'] = "#{location['url']}" if location['url']
    if purl
      refs["http://schema.org/thumbnailUrl"] = "http://stacks.stanford.edu/file/druid:#{id}/preview.jpg"
      refs["http://schema.org/url"] = "#{clean_uri(purl)}"
      refs["http://schema.org/downloadUrl"] = "http://stacks.stanford.edu/file/druid:#{id}/data.zip"
      refs["http://www.loc.gov/mods/v3"] = "#{purl}.mods"
      refs["http://www.isotc211.org/schemas/2005/gmd/"] = "http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/#{layer_id}/iso19139.xml"
      refs['http://www.w3.org/1999/xhtml'] = "http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/#{layer_id}/iso19139.html"
    else
      refs['http://www.opengis.net/cat/csw/csdgm'] = "http://opengeometadata.stanford.edu/metadata/org.opengeoportal/#{URI::DEFAULT_PARSER.escape(layer_id)}/fgdc.xml"
      unless skip_ogm_check
        begin
          # Probe OpenGeoMetadata to confirm it actually hosts FGDC for this layer
          _f = URI.open(refs['http://www.opengis.net/cat/csw/csdgm'])
          refs['http://www.w3.org/1999/xhtml'] = "http://opengeometadata.stanford.edu/metadata/org.opengeoportal/#{URI::DEFAULT_PARSER.escape(layer_id)}/fgdc.html"
        rescue OpenURI::HTTPError => e2
          refs.delete('http://www.opengis.net/cat/csw/csdgm')
        rescue URI::InvalidURIError => e2
          raise ArgumentError, "ERROR: #{id} has bad LayerId: #{layer['layer_id']}"
        end
      end
    end

    # If there's no homepage, use the HTML version of the Metadata if available
    if refs['http://schema.org/url'].nil? && !refs['http://www.w3.org/1999/xhtml'].nil?
      refs['http://schema.org/url'] = refs['http://www.w3.org/1999/xhtml']
    end

    # Make the conversion from OGP to GeoBlacklight
    #
    # @see http://dublincore.org/documents/dcmi-terms/
    # @see http://wiki.dublincore.org/index.php/User_Guide/Creating_Metadata
    # @see http://www.ietf.org/rfc/rfc5013.txt
    new_layer = {
      :uuid => uuid,

      # Dublin Core elements
      :dc_creator_sm => string2array(layer['Originator']),
      :dc_description_s => layer['Abstract'],
      :dc_format_s => format,
      :dc_identifier_s => uuid,
      :dc_language_s => 'English', # 'en', # XXX: fake data
      :dc_publisher_s => layer['Publisher'],
      :dc_rights_s => access,
      :dc_subject_sm => string2array(layer['ThemeKeywords'], true),
      :dc_title_s => layer['LayerDisplayName'],
      :dc_type_s => 'Dataset', # or 'Image' for non-georectified,
                               # or 'PhysicalObject' for non-digitized maps
      # Dublin Core terms
      :dct_isPartOf_sm => collection.nil?? nil : [collection],
      :dct_references_s => refs.to_json.to_s,
      :dct_spatial_sm => string2array(layer['PlaceKeywords'], true),
      :dct_temporal_sm => [dt.year.to_s],
      :dct_provenance_s => layer['Institution'],

      # georss (xmlns:georss="http://www.georss.org/georss"):
      # box is lower corner then upper corner; polygon is a closed ring
      :georss_box_s => "#{s} #{w} #{n} #{e}",
      :georss_polygon_s => "#{n} #{w} #{n} #{e} #{s} #{e} #{s} #{w} #{n} #{w}",

      # Layer-specific schema
      :layer_slug_s => slug,
      :layer_id_s => layer_id,
      :layer_geom_type_s => layer_geom_type,
      :layer_modified_dt => Time.now.utc.strftime('%FT%TZ'),

      # derived fields used only by solr, for which copyField is insufficient
      :solr_bbox => "#{w} #{s} #{e} #{n}", # minX minY maxX maxY
      :solr_geom => "ENVELOPE(#{w}, #{e}, #{n}, #{s})",
      :solr_year_i => dt.year
    }

    # Remove any fields that are blank (delete_if avoids mutating during #each)
    new_layer.delete_if { |k2, v| v.nil? or (v.respond_to?(:empty?) and v.empty?) }

    # Write the JSON record for the GeoBlacklight layer
    @output.write JSON::pretty_generate(new_layer)
    @output.write "\n,\n"

    # export into OGM
    ogm_dir = new_layer[:dct_provenance_s] + '/' + new_layer[:layer_id_s][-2,2].downcase.gsub(/[^a-z0-9]/, '_') + '/' + new_layer[:layer_id_s]
    unless skip_fgdc or layer['FgdcText'].nil? or layer['FgdcText'].empty?
      _fn = 'opengeometadata/org.opengeoportal/' + ogm_dir + '/fgdc.xml'
      unless File.size?(_fn)
        FileUtils.mkdir_p(File.dirname(_fn)) unless File.directory?(File.dirname(_fn))
        xml = Nokogiri::XML(layer['FgdcText'])
        xml.write_xml_to(File.open(_fn, 'wb'), encoding: 'UTF-8', indent: 2)
      end
    end

    unless skip_geoblacklight
      _fn = 'opengeometadata/org.opengeoportal/' + ogm_dir + '/geoblacklight.json'
      layers_json[new_layer[:layer_id_s]] = ogm_dir
      unless File.size?(_fn)
        FileUtils.mkdir_p(File.dirname(_fn)) unless File.directory?(File.dirname(_fn))
        _s = JSON::pretty_generate(new_layer)
        File.open(_fn, 'wb') {|f| f.write(_s) }
      end
    end
  end

  # Terminates the JSON array and closes the output file.
  def close
    @output.write "\n {} \n]\n"
    @output.close
  end

  # @param [String] s has semi-colon/comma/gt delimited array
  # @param [Boolean] clean_only when true, accept only a single-word value
  # @return [Array<String>, nil] results as array
  def string2array(s, clean_only = false)
    return nil if s.nil?
    if clean_only
      if s.strip.size > 0 && !s.strip.index(' ') && s.strip.downcase != 'none'
        [s.strip]
      else
        nil
      end
    else
      if s.to_s =~ /[;,>]/
        s.split(/\s*[;,>]\s*/).uniq.collect {|i| i.strip}
      elsif s.is_a?(String) && s.size > 0
        [s.strip]
      else
        nil
      end
    end
  end

  @@slugs = {}
  # Builds a unique, lowercase, dash-delimited slug for a layer.
  # @param [String] id the LayerId (unused except for uniqueness context)
  # @param [Hash] layer the OGP layer record
  # @return [String] the slug
  def to_slug(id, layer)
    # strip out schema and usernames
    name = layer['Name'].sub('SDE_DATA.', '').sub('SDE.', '').sub('SDE2.', '').sub('GISPORTAL.GISOWNER01.', '').sub('GISDATA.', '').sub('MORIS.', '')
    unless name.size > 1
      # use first word of the title if the name is empty
      name = layer['LayerDisplayName'].split.first
    end
    slug = layer['Institution'] + '-' + name

    # slugs should only have a-z, A-Z, 0-9, and -
    slug.gsub!(/[^a-zA-Z0-9\-]/, '-')
    slug.gsub!(/[\-]+/, '-')

    # only lowercase
    slug.downcase!

    # ensure slugs are unique for this pass
    if @@slugs.include?(slug)
      slug += '-' + sprintf("%06d", Random.rand(999999))
    end
    @@slugs[slug] = true

    slug
  end

  # Ensure that the WMS/WFS/WCS location values are as expected.
  # NOTE: the original called `clean_uri.parse` / `clean_uri::HTTP` — but
  # clean_uri is a one-argument method, not a module — and its rescue
  # referenced an undefined `k`; both fixed to use URI and `protocol`.
  # @return [String, nil] the location re-serialized as JSON
  def validate_location(id, location)
    begin
      x = JSON::parse(location)
      %w{download url wms wcs wfs}.each do |protocol|
        begin
          unless x[protocol].nil?
            if x[protocol].is_a? String
              x[protocol] = [x[protocol]]
            end

            unless x[protocol].is_a? Array
              raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
            end

            x[protocol].each do |url|
              uri = URI.parse(url)
              raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP) or uri.kind_of?(URI::HTTPS)
            end
            x[protocol] = x[protocol].first
          end
        rescue StandardError => e # was `rescue Exception`
          raise ArgumentError, "ERROR: #{id}: Invalid #{protocol}: #{x}"
        end
      end

      return x.to_json
    rescue JSON::ParserError => e
      raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}"
    end
    nil
  end

  # @return [Boolean] whether lon is within [-180, 180]
  def lon? lon
    lon >= -180 and lon <= 180
  end

  # @return [Boolean] whether lat is within [-90, 90]
  def lat? lat
    lat >= -90 and lat <= 90
  end
end
387
+
388
+
389
# __MAIN__
#
# Transforms every valid*.json in the current directory and writes the
# OGM layer index; reports accepted/rejected totals.
TransformOgp.new(ARGV[0].nil?? 'transformed.json' : ARGV[0]) do |ogp|
  stats = { :accepted => 0, :rejected => 0 }
  layers_json = {}

  Dir.glob('valid*.json') do |fn|
    s = ogp.transform_file(fn, layers_json)
    stats[:accepted] += s[:accepted]
    stats[:rejected] += s[:rejected]
  end

  # Ensure the output directory exists before writing the layer index;
  # the original crashed here when opengeometadata/ was absent
  layers_fn = 'opengeometadata/org.opengeoportal/layers.json'
  FileUtils.mkdir_p(File.dirname(layers_fn))
  File.open(layers_fn, 'wb') do |f|
    f << JSON::pretty_generate(layers_json)
  end

  ap({:statistics => stats})
end
407
+
408
+ # example input data
409
+ __END__
410
+ [
411
+ {
412
+ "Abstract": "The boundaries of each supervisorial district in Sonoma County based on 2000 census. Redrawn in 2001 using Autobound.",
413
+ "Access": "Public",
414
+ "Area": 0.9463444815860053,
415
+ "Availability": "Online",
416
+ "CenterX": -122.942159,
417
+ "CenterY": 38.4580755,
418
+ "ContentDate": "2000-01-01T01:01:01Z",
419
+ "DataType": "Polygon",
420
+ "FgdcText": "...",
421
+ "GeoReferenced": true,
422
+ "HalfHeight": 0.39885650000000084,
423
+ "HalfWidth": 0.593161000000002,
424
+ "Institution": "Berkeley",
425
+ "LayerDisplayName": "SCGISDB2_BASE_ADM_SUPERVISOR",
426
+ "LayerId": "28722/bk0012h5s52",
427
+ "Location": "{\"wms\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wms\"],\"tilecache\":[\"http://gis.lib.berkeley.edu:8080/geoserver/gwc/service/wms\"],\"download\":\"\",\"wfs\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wfs\"]}",
428
+ "MaxX": -122.348998,
429
+ "MaxY": 38.856932,
430
+ "MinX": -123.53532,
431
+ "MinY": 38.059219,
432
+ "Name": "ADM_SUPERVISOR",
433
+ "PlaceKeywords": "Sonoma County County of Sonoma Sonoma California Bay Area",
434
+ "Publisher": "UC Berkeley Libraries",
435
+ "ThemeKeywords": "Supervisorial districts 1st District 2nd District 3rd District 4th District 5th District",
436
+ "WorkspaceName": "UCB"
437
+ }
438
+ ]