geohydra 0.3.1 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (194) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -1
  3. data/.travis.yml +1 -2
  4. data/Gemfile +1 -8
  5. data/Gemfile.lock +87 -102
  6. data/README.md +2 -2
  7. data/VERSION +1 -1
  8. data/bin/accession.rb +99 -89
  9. data/bin/assemble.rb +288 -247
  10. data/bin/assemble_data.rb +54 -51
  11. data/bin/assemble_placenames.rb +85 -85
  12. data/bin/build_stage_options.rb +24 -18
  13. data/bin/derive_wgs84.rb +65 -66
  14. data/bin/extract_thumbnail.rb +38 -37
  15. data/bin/geo2mods.rb +78 -0
  16. data/bin/geohydra +14 -5
  17. data/bin/ingest_arcgis.rb +80 -60
  18. data/bin/iso2geo.rb +64 -0
  19. data/bin/loader_postgis.rb +121 -227
  20. data/bin/run_task.rb +23 -0
  21. data/bin/sync_geoserver_metadata.rb +132 -127
  22. data/bin/xsltproc-saxon +6 -0
  23. data/geohydra.gemspec +6 -4
  24. data/lib/geohydra.rb +5 -0
  25. data/lib/geohydra/accession.rb +24 -13
  26. data/lib/geohydra/{arcgis_to_iso19139_fc.xsl → arcgis_to_iso19110.xsl} +0 -0
  27. data/lib/geohydra/gazetteer.csv +842 -36
  28. data/lib/geohydra/gazetteer.rb +48 -24
  29. data/lib/geohydra/mods2geoblacklight.xsl +248 -0
  30. data/lib/geohydra/mods2ogp.xsl +5 -8
  31. data/lib/geohydra/transform.rb +8 -2
  32. data/lib/geohydra/utils.rb +6 -0
  33. data/lib/geohydra/workflow/gisAssemblyWF.rb +109 -0
  34. data/lib/geohydra/workflow/gisAssemblyWF.xml +85 -0
  35. data/lib/geohydra/workflow/gisDeliveryWF.rb +33 -0
  36. data/lib/geohydra/workflow/gisDeliveryWF.xml +36 -0
  37. data/lib/geohydra/workflow/gisDiscoveryWF.rb +55 -0
  38. data/lib/geohydra/workflow/gisDiscoveryWF.xml +28 -0
  39. data/lib/geohydra/workflow/task.rb +82 -0
  40. data/ogp/README.md +350 -0
  41. data/ogp/download.rb +92 -0
  42. data/ogp/fgdc2mods.sh +9 -0
  43. data/ogp/fgdc2mods.xsl +884 -0
  44. data/ogp/ingest.rb +48 -0
  45. data/ogp/select.rb +20 -0
  46. data/ogp/transform.rb +354 -0
  47. data/ogp/validate.rb +182 -0
  48. data/{bin → scripts}/ingest_tufts.rb +0 -0
  49. data/scripts/iso2html/doit.sh +15 -0
  50. data/scripts/iso2html/main.css +66 -0
  51. data/scripts/iso2html/pacioos-iso-html.xsl +1749 -0
  52. data/scripts/iso2html/utils/replace-newlines.xsl +97 -0
  53. data/scripts/iso2html/utils/replace-string.xsl +80 -0
  54. data/scripts/iso2html/utils/strip-digits.xsl +60 -0
  55. data/{bin → scripts}/loader.rb +0 -0
  56. data/scripts/rename_shapefiles.rb +5 -0
  57. data/scripts/render_gazetteer.rb +36 -0
  58. data/{bin → scripts}/seed.rb +0 -0
  59. data/{bin → scripts}/solr_indexer.rb +0 -0
  60. data/scripts/status.csv +253 -0
  61. data/scripts/status.rb +32 -0
  62. data/{bin → scripts}/validate_data.rb +1 -1
  63. data/solr/kurma-app-dev/conf/lang/stopwords_en.txt +34 -0
  64. data/solr/kurma-app-dev/conf/protwords.txt +21 -0
  65. data/solr/kurma-app-dev/conf/schema.xml +156 -0
  66. data/solr/kurma-app-dev/conf/solrconfig.xml +161 -0
  67. data/solr/kurma-app-dev/conf/synonyms.txt +29 -0
  68. data/solr/kurma-app-dev/purge.sh +8 -0
  69. data/solr/kurma-app-test/conf/lang/stopwords_en.txt +34 -0
  70. data/solr/kurma-app-test/conf/protwords.txt +21 -0
  71. data/solr/kurma-app-test/conf/schema.xml +158 -0
  72. data/solr/kurma-app-test/conf/solrconfig.xml +161 -0
  73. data/solr/kurma-app-test/conf/synonyms.txt +29 -0
  74. data/solr/kurma-app-test/deploy.sh +15 -0
  75. data/solr/kurma-app-test/purge.sh +8 -0
  76. data/solr/ogp-dev/purge.sh +1 -2
  77. data/spec/fixtures/bw938nk9584/temp/{TRIPURA-iso19139-fc.xml → TRIPURA-iso19110.xml} +0 -0
  78. data/spec/fixtures/cc142xj8436/temp/{HARYANA-iso19139-fc.xml → HARYANA-iso19110.xml} +0 -0
  79. data/spec/fixtures/cg716wc7949/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  80. data/spec/fixtures/cm007pv9601/temp/{MEGHALAYA-iso19139-fc.xml → MEGHALAYA-iso19110.xml} +0 -0
  81. data/spec/fixtures/cp055nb0189/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  82. data/spec/fixtures/cs838pw3418/temp/{OIL_GAS_FIELDS-iso19139-fc.xml → OIL_GAS_FIELDS-iso19110.xml} +0 -0
  83. data/spec/fixtures/dd308sy5843/temp/{ORISSA-iso19139-fc.xml → ORISSA-iso19110.xml} +0 -0
  84. data/spec/fixtures/dd452vk1873/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  85. data/spec/fixtures/dg850pt1796/temp/{STATE1951-iso19139-fc.xml → STATE1951-iso19110.xml} +0 -0
  86. data/spec/fixtures/dn744tf5427/temp/{DISTRICT1991-iso19139-fc.xml → DISTRICT1991-iso19110.xml} +0 -0
  87. data/spec/fixtures/dq603nz8402/temp/{STATE2001-iso19139-fc.xml → STATE2001-iso19110.xml} +0 -0
  88. data/spec/fixtures/dv609zt4699/temp/{ASSAM-iso19139-fc.xml → ASSAM-iso19110.xml} +0 -0
  89. data/spec/fixtures/dz222hw0585/temp/{PUNJAB-iso19139-fc.xml → PUNJAB-iso19110.xml} +0 -0
  90. data/spec/fixtures/fd673qb9705/temp/{STATE1971-iso19139-fc.xml → STATE1971-iso19110.xml} +0 -0
  91. data/spec/fixtures/fg451wp8917/temp/{SIKKIM-iso19139-fc.xml → SIKKIM-iso19110.xml} +0 -0
  92. data/spec/fixtures/fh247yz0156/temp/{RAJASTHAN-iso19139-fc.xml → RAJASTHAN-iso19110.xml} +0 -0
  93. data/spec/fixtures/fs487vd1465/temp/{CHHATTISGARH-iso19139-fc.xml → CHHATTISGARH-iso19110.xml} +0 -0
  94. data/spec/fixtures/fs591bn3317/temp/{HIMACHAL_PRADESH-iso19139-fc.xml → HIMACHAL_PRADESH-iso19110.xml} +0 -0
  95. data/spec/fixtures/fw920bc5473/temp/{PLSS_TWN-iso19139-fc.xml → PLSS_TWN-iso19110.xml} +0 -0
  96. data/spec/fixtures/gj831wj3625/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  97. data/spec/fixtures/gp075nv3265/temp/{PONDICHERRY-iso19139-fc.xml → PONDICHERRY-iso19110.xml} +0 -0
  98. data/spec/fixtures/gv800hj8141/temp/{BIHAR-iso19139-fc.xml → BIHAR-iso19110.xml} +0 -0
  99. data/spec/fixtures/gw520gz6339/temp/{DADRA_NAGAR_HAVELI_PT-iso19139-fc.xml → DADRA_NAGAR_HAVELI_PT-iso19110.xml} +0 -0
  100. data/spec/fixtures/gy054hz1045/temp/{HARYANA-iso19139-fc.xml → HARYANA-iso19110.xml} +0 -0
  101. data/spec/fixtures/gz352mw6982/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  102. data/spec/fixtures/hb489vm9892/temp/{DISTRICT1981-iso19139-fc.xml → DISTRICT1981-iso19110.xml} +0 -0
  103. data/spec/fixtures/hw125dq0418/temp/{DELHI-iso19139-fc.xml → DELHI-iso19110.xml} +0 -0
  104. data/spec/fixtures/hw892mn4587/temp/{KERALA-iso19139-fc.xml → KERALA-iso19110.xml} +0 -0
  105. data/spec/fixtures/jb371hz3868/temp/{INCOME-iso19139-fc.xml → INCOME-iso19110.xml} +0 -0
  106. data/spec/fixtures/jc017yk9928/temp/{KARNATAKA-iso19139-fc.xml → KARNATAKA-iso19110.xml} +0 -0
  107. data/spec/fixtures/jf841ys4828/temp/{ANDHRA_PRADESH-iso19139-fc.xml → ANDHRA_PRADESH-iso19110.xml} +0 -0
  108. data/spec/fixtures/jh802mp2160/temp/{DELHI_PT-iso19139-fc.xml → DELHI_PT-iso19110.xml} +0 -0
  109. data/spec/fixtures/jj806fc3801/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  110. data/spec/fixtures/jq835yn7161/temp/{HIMACHAL_PRADESH-iso19139-fc.xml → HIMACHAL_PRADESH-iso19110.xml} +0 -0
  111. data/spec/fixtures/jr455pt6676/temp/{TAMILNADU-iso19139-fc.xml → TAMILNADU-iso19110.xml} +0 -0
  112. data/spec/fixtures/js637zp2537/temp/{BIHAR-iso19139-fc.xml → BIHAR-iso19110.xml} +0 -0
  113. data/spec/fixtures/jv502wg9611/temp/{GOA-iso19139-fc.xml → GOA-iso19110.xml} +0 -0
  114. data/spec/fixtures/jw462ck6560/temp/{JAMMU_KASHMIR-iso19139-fc.xml → JAMMU_KASHMIR-iso19110.xml} +0 -0
  115. data/spec/fixtures/kj800fb6273/temp/{STATE2011-iso19139-fc.xml → STATE2011-iso19110.xml} +0 -0
  116. data/spec/fixtures/km504zq3948/temp/{HIMACHAL_PRADESH-iso19139-fc.xml → HIMACHAL_PRADESH-iso19110.xml} +0 -0
  117. data/spec/fixtures/ks297fy1411/temp/{OFFSH_BLOCKS-iso19139-fc.xml → OFFSH_BLOCKS-iso19110.xml} +0 -0
  118. data/spec/fixtures/md358hy5049/temp/{MIZORAM-iso19139-fc.xml → MIZORAM-iso19110.xml} +0 -0
  119. data/spec/fixtures/mg745bq0193/temp/{MADHYA_PRADESH-iso19139-fc.xml → MADHYA_PRADESH-iso19110.xml} +0 -0
  120. data/spec/fixtures/mh187yx3536/temp/{WEST_BENGAL-iso19139-fc.xml → WEST_BENGAL-iso19110.xml} +0 -0
  121. data/spec/fixtures/mk488yn6694/temp/{GUJARAT-iso19139-fc.xml → GUJARAT-iso19110.xml} +0 -0
  122. data/spec/fixtures/my216kp3008/temp/{DELHI-iso19139-fc.xml → DELHI-iso19110.xml} +0 -0
  123. data/spec/fixtures/my504nz9827/temp/{JAMMU_KASHMIR-iso19139-fc.xml → JAMMU_KASHMIR-iso19110.xml} +0 -0
  124. data/spec/fixtures/ng819jm8700/temp/{MANIPUR-iso19139-fc.xml → MANIPUR-iso19110.xml} +0 -0
  125. data/spec/fixtures/np020jq2139/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  126. data/spec/fixtures/ns377mt1608/temp/{STATE1991-iso19139-fc.xml → STATE1991-iso19110.xml} +0 -0
  127. data/spec/fixtures/nw926np8508/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  128. data/spec/fixtures/ny358rm8559/temp/{TRIPURA-iso19139-fc.xml → TRIPURA-iso19110.xml} +0 -0
  129. data/spec/fixtures/nz176rm8192/temp/{DISTRICT2011-iso19139-fc.xml → DISTRICT2011-iso19110.xml} +0 -0
  130. data/spec/fixtures/nz252rq2252/temp/{UTTAR_PRADESH-iso19139-fc.xml → UTTAR_PRADESH-iso19110.xml} +0 -0
  131. data/spec/fixtures/pd902kb3348/temp/{MADHYA_PRADESH-iso19139-fc.xml → MADHYA_PRADESH-iso19110.xml} +0 -0
  132. data/spec/fixtures/pz792fz1776/temp/{MAHARASHTRA-iso19139-fc.xml → MAHARASHTRA-iso19110.xml} +0 -0
  133. data/spec/fixtures/qb767ss4042/temp/{UTTAR_PRADESH-iso19139-fc.xml → UTTAR_PRADESH-iso19110.xml} +0 -0
  134. data/spec/fixtures/qc091qw0570/temp/{GUJARAT-iso19139-fc.xml → GUJARAT-iso19110.xml} +0 -0
  135. data/spec/fixtures/qc652vr7204/temp/{ANDHRA_PRADESH_PT-iso19139-fc.xml → ANDHRA_PRADESH_PT-iso19110.xml} +0 -0
  136. data/spec/fixtures/qk786js7484/temp/{DISTRICT1961-iso19139-fc.xml → DISTRICT1961-iso19110.xml} +0 -0
  137. data/spec/fixtures/qn676pg6767/temp/{GOA-iso19139-fc.xml → GOA-iso19110.xml} +0 -0
  138. data/spec/fixtures/qr255jh4074/temp/{LOKSABHA_14-iso19139-fc.xml → LOKSABHA_14-iso19110.xml} +0 -0
  139. data/spec/fixtures/qr374kj4827/temp/{ASSAM-iso19139-fc.xml → ASSAM-iso19110.xml} +0 -0
  140. data/spec/fixtures/qy162js1748/temp/{CHHATTISGARH-iso19139-fc.xml → CHHATTISGARH-iso19110.xml} +0 -0
  141. data/spec/fixtures/rd446vf2633/temp/{NAGALAND-iso19139-fc.xml → NAGALAND-iso19110.xml} +0 -0
  142. data/spec/fixtures/rf389hf2983/temp/{CHHATTISGARH_PT-iso19139-fc.xml → CHHATTISGARH_PT-iso19110.xml} +0 -0
  143. data/spec/fixtures/rf859ff4582/temp/{JHARKHAND-iso19139-fc.xml → JHARKHAND-iso19110.xml} +0 -0
  144. data/spec/fixtures/rh343ds8931/temp/{BIHAR-iso19139-fc.xml → BIHAR-iso19110.xml} +0 -0
  145. data/spec/fixtures/rn815xk8157/temp/{SIKKIM-iso19139-fc.xml → SIKKIM-iso19110.xml} +0 -0
  146. data/spec/fixtures/rq653sz4470/temp/{CHHATTISGARH-iso19139-fc.xml → CHHATTISGARH-iso19110.xml} +0 -0
  147. data/spec/fixtures/rt625ws6022/temp/{GULF_FAIRWAYS-iso19139-fc.xml → GULF_FAIRWAYS-iso19110.xml} +0 -0
  148. data/spec/fixtures/sc330vf4259/temp/{JHARKHAND-iso19139-fc.xml → JHARKHAND-iso19110.xml} +0 -0
  149. data/spec/fixtures/sq479mx3086/temp/{OFFSH_PLATF-iso19139-fc.xml → OFFSH_PLATF-iso19110.xml} +0 -0
  150. data/spec/fixtures/sr686bm4098/temp/{DAMAN_DIU_PT-iso19139-fc.xml → DAMAN_DIU_PT-iso19110.xml} +0 -0
  151. data/spec/fixtures/sv303sh5583/temp/{ARUNACHAL_PRADESH-iso19139-fc.xml → ARUNACHAL_PRADESH-iso19110.xml} +0 -0
  152. data/spec/fixtures/sy319nh8520/temp/{GUJARAT-iso19139-fc.xml → GUJARAT-iso19110.xml} +0 -0
  153. data/spec/fixtures/td363vx2792/temp/{HIMACHAL_PRADESH_PT-iso19139-fc.xml → HIMACHAL_PRADESH_PT-iso19110.xml} +0 -0
  154. data/spec/fixtures/tf374bd2484/temp/{DISTRICT1951-iso19139-fc.xml → DISTRICT1951-iso19110.xml} +0 -0
  155. data/spec/fixtures/tj797mj7877/temp/{LOKSABHA_15-iso19139-fc.xml → LOKSABHA_15-iso19110.xml} +0 -0
  156. data/spec/fixtures/tv060wq5179/temp/{ASSAM-iso19139-fc.xml → ASSAM-iso19110.xml} +0 -0
  157. data/spec/fixtures/tv536bn1915/temp/{ARUNACHAL_PRADESH-iso19139-fc.xml → ARUNACHAL_PRADESH-iso19110.xml} +0 -0
  158. data/spec/fixtures/tz359cc2977/temp/{MANIPUR-iso19139-fc.xml → MANIPUR-iso19110.xml} +0 -0
  159. data/spec/fixtures/vb525my6511/temp/{UTTARAKHAND-iso19139-fc.xml → UTTARAKHAND-iso19110.xml} +0 -0
  160. data/spec/fixtures/vh802fs4240/temp/{PONDICHERRY-iso19139-fc.xml → PONDICHERRY-iso19110.xml} +0 -0
  161. data/spec/fixtures/vk120xn2474/temp/{PLSS_SEC-iso19139-fc.xml → PLSS_SEC-iso19110.xml} +0 -0
  162. data/spec/fixtures/vn439bc7316/temp/{KERALA-iso19139-fc.xml → KERALA-iso19110.xml} +0 -0
  163. data/spec/fixtures/vq745jk0695/temp/{MEGHALAYA-iso19139-fc.xml → MEGHALAYA-iso19110.xml} +0 -0
  164. data/spec/fixtures/vr593vj7147/temp/{ANDHRA_PRADESH-iso19139-fc.xml → ANDHRA_PRADESH-iso19110.xml} +0 -0
  165. data/spec/fixtures/vw911qb5271/temp/{DISTRICT2001-iso19139-fc.xml → DISTRICT2001-iso19110.xml} +0 -0
  166. data/spec/fixtures/wg680pz0365/temp/{ANDHRA_PRADESH-iso19139-fc.xml → ANDHRA_PRADESH-iso19110.xml} +0 -0
  167. data/spec/fixtures/wg761xn1926/temp/{HARYANA-iso19139-fc.xml → HARYANA-iso19110.xml} +0 -0
  168. data/spec/fixtures/wh870qw1934/temp/{PUNJAB-iso19139-fc.xml → PUNJAB-iso19110.xml} +0 -0
  169. data/spec/fixtures/wk775mm4673/temp/{MAHARASHTRA-iso19139-fc.xml → MAHARASHTRA-iso19110.xml} +0 -0
  170. data/spec/fixtures/ws171yz2165/temp/{ARUNACHAL_PRADESH_PT-iso19139-fc.xml → ARUNACHAL_PRADESH_PT-iso19110.xml} +0 -0
  171. data/spec/fixtures/wt473hz7153/temp/{CHANDIGARH_PT-iso19139-fc.xml → CHANDIGARH_PT-iso19110.xml} +0 -0
  172. data/spec/fixtures/ww217dj0457/temp/{CO2_PIPE-iso19139-fc.xml → CO2_PIPE-iso19110.xml} +0 -0
  173. data/spec/fixtures/wy875pk9849/temp/{STATE1961-iso19139-fc.xml → STATE1961-iso19110.xml} +0 -0
  174. data/spec/fixtures/xb018tk2042/temp/{STATE1981-iso19139-fc.xml → STATE1981-iso19110.xml} +0 -0
  175. data/spec/fixtures/xg539vw8586/temp/{ORISSA-iso19139-fc.xml → ORISSA-iso19110.xml} +0 -0
  176. data/spec/fixtures/xv475kp4644/temp/{ASSAM_PT-iso19139-fc.xml → ASSAM_PT-iso19110.xml} +0 -0
  177. data/spec/fixtures/xy096gc2959/temp/{GOA-iso19139-fc.xml → GOA-iso19110.xml} +0 -0
  178. data/spec/fixtures/xz518gz3362/temp/{UTTARAKHAND-iso19139-fc.xml → UTTARAKHAND-iso19110.xml} +0 -0
  179. data/spec/fixtures/yh986wy4737/temp/{NAGALAND-iso19139-fc.xml → NAGALAND-iso19110.xml} +0 -0
  180. data/spec/fixtures/yn187fq4474/temp/{KARNATAKA-iso19139-fc.xml → KARNATAKA-iso19110.xml} +0 -0
  181. data/spec/fixtures/yn236mw3250/temp/{TAMILNADU-iso19139-fc.xml → TAMILNADU-iso19110.xml} +0 -0
  182. data/spec/fixtures/yz596nz0112/temp/{WEST_BENGAL-iso19139-fc.xml → WEST_BENGAL-iso19110.xml} +0 -0
  183. data/spec/fixtures/zk596gy7380/temp/{DISTRICT1971-iso19139-fc.xml → DISTRICT1971-iso19110.xml} +0 -0
  184. data/spec/fixtures/zn452hh7431/temp/{RAJASTHAN-iso19139-fc.xml → RAJASTHAN-iso19110.xml} +0 -0
  185. data/spec/fixtures/zt093fw6519/temp/{MIZORAM-iso19139-fc.xml → MIZORAM-iso19110.xml} +0 -0
  186. data/spec/fixtures/zv925hd6723/temp/{OGWELLS-iso19139-fc.xml → OGWELLS-iso19110.xml} +0 -0
  187. data/spec/fixtures/zy658cr1728/temp/{ANDAMAAN_NICOBAR_PT-iso19139-fc.xml → ANDAMAAN_NICOBAR_PT-iso19110.xml} +0 -0
  188. data/spec/fixtures/zz943vx1492/temp/{BASINS-iso19139-fc.xml → BASINS-iso19110.xml} +0 -0
  189. data/spec/unit/gazetteer_spec.rb +100 -35
  190. data/spec/unit/task_spec.rb +68 -0
  191. data/spec/unit/transform_spec.rb +1 -1
  192. data/spec/unit/utils_spec.rb +17 -3
  193. data/workflow.rb +35 -0
  194. metadata +323 -316
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+ require 'rsolr'
5
+
6
# Loads JSON record files into a Solr collection through RSolr.
#
# The constructor connects, yields the instance to the caller's block and
# then issues a final commit via #close.
class IngestOgp
  # Number of added documents between intermediate commits.
  BATCH_SIZE = 100

  # @param [String] collection name of the Solr collection (core)
  # @param [String] url base URL of the Solr server
  # @yield [IngestOgp] the connected instance
  # @raise [ArgumentError] if collection is not a String
  def initialize(collection, url)
    raise ArgumentError, 'Collection not defined' unless collection.is_a? String
    # Drop trailing slashes so "http://host/solr/" does not become ".../solr//collection"
    @solr = RSolr.connect(:url => "#{url.sub(%r{/+\z}, '')}/#{collection}")
    yield self
    close
  end

  # Adds every non-empty Hash found in the JSON array stored in +fn+,
  # committing after each BATCH_SIZE documents and once more at the end.
  #
  # @param [String] fn filename of a JSON array of Solr documents
  def ingest(fn)
    puts "Ingesting #{fn}"
    json = JSON::parse(File.read(fn))
    n = 0
    json.each do |doc|
      # skips the empty-hash terminator and anything that is not a record
      next unless doc.is_a?(Hash) && !doc.empty?
      # these fields are stripped before the document is re-added
      doc.delete('_version_')
      doc.delete('timestamp')
      putc "."
      @solr.add doc
      n += 1
      if n % BATCH_SIZE == 0
        @solr.commit
        puts "\ncommit #{BATCH_SIZE} records, #{n} total\n"
      end
    end
    puts "\n#{n} records\n"
    @solr.commit
  end

  # Commits and drops the connection. Safe to call more than once.
  def close
    return if @solr.nil?
    @solr.commit
    #@solr.optimize
    @solr = nil
  end

end
41
+
42
+
43
+ # __MAIN__
44
# First argument: collection name. Optional second argument: Solr base URL.
solr_url = ARGV[1].nil? ? 'http://localhost:18080/solr' : ARGV[1]
IngestOgp.new(ARGV[0], solr_url) do |ogp|
  # Ingest every transformed*.json file found in the current directory
  Dir.glob("transformed*.json") { |fn| ogp.ingest(fn) }
end
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Usage: select.rb
4
+
5
+
6
+ require 'awesome_print'
7
+ require 'json'
8
+
9
+
10
+ # __MAIN__
11
# Randomly keep roughly one in a hundred records from transformed*.json,
# print the sample and save it to selected.json.
selected = []
Dir.glob('transformed*.json').each do |path|
  JSON::parse(File.read(path)).each do |record|
    selected << record if rand < 0.01
  end
end
ap({:selected => selected})
File.open('selected.json', 'wb') { |out| out << JSON.pretty_generate(selected) }
@@ -0,0 +1,354 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Usage: transform_ogp output.json
4
+ #
5
+ # Reads valid*.json in current directory
6
+ #
7
+
8
+ require 'awesome_print'
9
+ require 'json'
10
+ require 'uri'
11
+ require 'date'
12
+ require 'nokogiri'
13
+
14
# Transforms an OGP schema into GeoBlacklight. Requires input of a JSON array
# of OGP hashes.
#
# The constructor opens the output file, yields the instance, and then
# terminates the JSON array and closes the file (even if the block raises).
class TransformOgp

  # Identifier prefix per institution.
  # For URN style @see http://www.ietf.org/rfc/rfc2141.txt
  # For ARK @see https://wiki.ucop.edu/display/Curation/ARK
  ID_PREFIXES = {
    'Stanford' => 'http://purl.stanford.edu/',
    'Tufts'    => 'urn:geodata.tufts.edu:',
    'MassGIS'  => 'urn:massgis.state.ma.us:',
    'Berkeley' => 'http://ark.cdlib.org/ark:/',
    'MIT'      => 'urn:arrowsmith.mit.edu:',
    'Harvard'  => 'urn:hul.harvard.edu:'
  }.freeze

  # Schema and user name prefixes stripped from layer names when building slugs
  SCHEMA_PREFIXES = ['SDE_DATA.', 'SDE.', 'SDE2.', 'GISPORTAL.GISOWNER01.', 'GISDATA.', 'MORIS.'].freeze

  # Characters percent-encoded in identifiers: everything outside the URI
  # reserved and unreserved sets (the set the removed URI.encode escaped).
  UNSAFE_URI_CHARS = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]/

  # @param [String] fn filename of the JSON array to write
  # @yield [TransformOgp] the instance, ready to transform records
  def initialize(fn)
    @output = File.open(fn, 'wb')
    @output.write "[\n"
    @fgdcdir = 'fgdc'
    begin
      yield self
    ensure
      # always terminate the array and release the file handle
      self.close
    end
  end

  # @param [String|Array] s the URI to clean up
  # @return [String] a normalized URI
  def clean_uri(s)
    unless s.nil? or s.empty?
      return (s.is_a?(Array) ? URI(s.first) : URI(s)).to_s
    end
    ''
  end

  # @param [String] fn filename of JSON array of OGP hash objects
  # @return [Hash] stats about :accepted vs. :rejected records
  def transform_file(fn)
    stats = { :accepted => 0, :rejected => 0 }
    puts "Parsing #{fn}"
    json = JSON::parse(File.binread(fn))
    json.each do |doc| # contains JSON Solr query results
      next if doc.empty? # the array is terminated by an empty hash
      begin
        transform(doc)
        stats[:accepted] += 1
      rescue ArgumentError => e
        puts e
        stats[:rejected] += 1
      end
    end
    stats
  end

  # Transforms a single OGP record into a GeoBlacklight record and appends
  # it to the output file. The +layer+ hash itself is not modified.
  #
  # @param [Hash] layer an OGP hash for a given layer
  # @param [Boolean] skip_fgdc when false, also writes layer['FgdcText'] to
  #   an XML file named after the slug (uses Nokogiri)
  # @raise [ArgumentError] if the record is missing required fields or has
  #   a malformed Location or ContentDate
  def transform(layer, skip_fgdc = true)
    id = layer['LayerId'].to_s.strip
    puts "Tranforming #{id}"

    # These fields are dereferenced below; reject the record rather than crash
    %w{Name Institution WorkspaceName}.each do |k|
      raise ArgumentError, "ERROR: #{id} missing #{k}" if layer[k].to_s.empty?
    end

    uuid = ID_PREFIXES.fetch(layer['Institution'], '') + escape_id(id)

    # Parse out the Location to get the WMS/WFS/WCS URLs
    location = parse_location(id, layer['Location'])

    # Parse out the bounding box
    s = layer['MinY'].to_f
    w = layer['MinX'].to_f
    n = layer['MaxY'].to_f
    e = layer['MaxX'].to_f

    # Parse out the ContentDate date/time
    dt = parse_content_date(id, layer['ContentDate'])
    pub_dt = DateTime.rfc3339('2000-01-01T00:00:00Z') # XXX fake data, get from MODS

    access = layer['Access']
    collection = nil

    place_keywords = layer['PlaceKeywords']
    theme_keywords = layer['ThemeKeywords']

    # Parse out the PURL and other metadata for Stanford
    if layer['Institution'] == 'Stanford'
      purl = location['purl']
      purl = purl.first if purl.is_a? Array
      purl = uuid if purl.nil? && uuid =~ /^http/
    else
      purl = nil
      # Because OGP does not delimit keywords, we use a heuristic here
      place_keywords = delimit_keywords(place_keywords)
      theme_keywords = delimit_keywords(theme_keywords)
    end

    slug = to_slug(id, layer)

    layer_geom_type = layer['DataType'].to_s.downcase
    layer_geom_type = 'raster' if layer_geom_type == 'paper map'

    # @see https://github.com/OSGeo/Cat-Interop
    refs = {}
    refs['http://www.opengis.net/def/serviceType/ogc/wcs'] = location['wcs'].to_s if location['wcs']
    refs['http://www.opengis.net/def/serviceType/ogc/wfs'] = location['wfs'].to_s if location['wfs']
    refs['http://www.opengis.net/def/serviceType/ogc/wms'] = location['wms'].to_s if location['wms']
    if purl
      refs["http://schema.org/thumbnailUrl"] = "http://stacks.stanford.edu/file/druid:#{id}/preview.jpg"
      refs["http://schema.org/url"] = clean_uri(purl).to_s
      refs["http://schema.org/DownloadAction"] = "http://stacks.stanford.edu/file/druid:#{id}/data.zip"
      refs["http://www.isotc211.org/schemas/2005/gmd/"] = "#{purl}.iso19139"
      refs["http://www.loc.gov/mods/v3"] = "#{purl}.mods"
    end

    # Make the conversion from OGP to GeoBlacklight
    #
    # @see http://dublincore.org/documents/dcmi-terms/
    # @see http://wiki.dublincore.org/index.php/User_Guide/Creating_Metadata
    # @see http://www.ietf.org/rfc/rfc5013.txt
    new_layer = {
      :uuid               => uuid,

      # Dublin Core elements
      :dc_creator_sm      => string2array(layer['Originator']),
      :dc_description_s   => layer['Abstract'],
      :dc_format_s        => (
        (layer_geom_type == 'raster') ?
          'GeoTIFF' :    # 'image/tiff' :
          'Shapefile'    # 'application/x-esri-shapefile'
      ), # XXX: fake data
      :dc_identifier_s    => uuid,
      :dc_language_s      => 'English', # 'en', # XXX: fake data
      :dc_publisher_s     => layer['Publisher'],
      :dc_rights_s        => access,
      :dc_subject_sm      => string2array(theme_keywords),
      :dc_title_s         => layer['LayerDisplayName'],
      :dc_type_s          => 'Dataset', # or 'Image' for non-georectified,
                                        # or 'PhysicalObject' for non-digitized maps
      # Dublin Core terms
      :dct_isPartOf_sm    => collection.nil? ? nil : [collection],
      :dct_references_s   => refs.to_json,
      :dct_spatial_sm     => string2array(place_keywords),
      :dct_temporal_sm    => [dt.year.to_s],
      :dct_issued_s       => pub_dt.year.to_s,
      :dct_provenance_s   => layer['Institution'],

      #
      # xmlns:georss="http://www.georss.org/georss"
      # A bounding box is a rectangular region, often used to define the
      # extents of a map or a rough area of interest. A box contains two
      # space separated latitude-longitude pairs, with each pair separated
      # by whitespace. The first pair is the lower corner, the second is
      # the upper corner.
      :georss_box_s       => "#{s} #{w} #{n} #{e}",
      :georss_polygon_s   => "#{n} #{w} #{n} #{e} #{s} #{e} #{s} #{w} #{n} #{w}",

      # Layer-specific schema
      :layer_slug_s       => slug,
      :layer_id_s         => layer['WorkspaceName'] + ':' + layer['Name'],
      # :layer_srs_s        => 'EPSG:4326', # XXX: fake data
      :layer_geom_type_s  => layer_geom_type.capitalize,
      :layer_modified_dt  => Time.now.utc.strftime('%FT%TZ'),

      # derived fields used only by solr, for which copyField is insufficient
      :solr_bbox          => "#{w} #{s} #{e} #{n}", # minX minY maxX maxY
      :solr_ne_pt         => "#{n},#{e}",
      :solr_sw_pt         => "#{s},#{w}",
      :solr_geom          => "ENVELOPE(#{w}, #{e}, #{n}, #{s})",
      :solr_year_i        => dt.year,
      :solr_issued_dt     => pub_dt.strftime('%FT%TZ'), # Solr requires 1995-12-31T23:59:59Z
      :solr_wms_url       => location['wms'],
      :solr_wfs_url       => location['wfs'],
      :solr_wcs_url       => location['wcs']

      # :layer_year_i       => dt.year#, # XXX: migrate to copyField
      # :ogp_area_f         => layer['Area'],
      # :ogp_center_x_f     => layer['CenterX'],
      # :ogp_center_y_f     => layer['CenterY'],
      # :ogp_georeferenced_b => (layer['GeoReferenced'].to_s.downcase == 'true'),
      # :ogp_halfheight_f   => layer['HalfHeight'],
      # :ogp_halfwidth_f    => layer['HalfWidth'],
      # :ogp_layer_id_s     => layer['LayerId'],
      # :ogp_name_s         => layer['Name'],
      # :ogp_location_s     => layer['Location'],
      # :ogp_workspace_s    => layer['WorkspaceName']
    }

    # Remove any fields that are blank
    new_layer.reject! { |_k, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }

    # Write the JSON record for the GeoBlacklight layer
    @output.write JSON::pretty_generate(new_layer)
    @output.write "\n,\n"

    unless skip_fgdc or layer['FgdcText'].nil? or layer['FgdcText'].empty?
      xml = Nokogiri::XML(layer['FgdcText'])
      # block form so the file handle is closed once the XML is written
      File.open(File.join(@fgdcdir || 'fgdc', slug + '.xml'), 'wb') do |f|
        xml.write_xml_to(f, :encoding => 'UTF-8', :indent => 2)
      end
    end
  end

  # Terminates the JSON array with an empty hash (every record is followed
  # by a comma) and closes the output file. Safe to call more than once.
  def close
    return if @output.nil? || @output.closed?
    @output.write "\n {} \n]\n"
    @output.close
  end

  # @param [String] s has semi-colon/comma/gt delimited array
  # @return [Array] trimmed, non-empty, unique values, or nil when +s+ is
  #   not a String or holds no values
  def string2array(s)
    return nil unless s.is_a?(String)
    # strip before de-duplicating so "a" and " a" collapse into one entry
    values = s.split(/[;,>]/).collect { |i| i.strip }.reject { |i| i.empty? }.uniq
    values.empty? ? nil : values
  end

  @@slugs = {}

  # Builds a lowercase slug of the form institution-name that is unique
  # among all slugs generated so far.
  #
  # @param [String] id the layer identifier (unused)
  # @param [Hash] layer an OGP hash; reads Name, LayerDisplayName, Institution
  # @return [String] the slug
  def to_slug(id, layer)
    # strip out schema and usernames
    name = SCHEMA_PREFIXES.inject(layer['Name'].to_s) { |str, prefix| str.sub(prefix, '') }
    unless name.size > 1
      # use first word of title if the name is empty
      name = layer['LayerDisplayName'].to_s.split.first.to_s
    end
    slug = layer['Institution'].to_s + '-' + name

    # slugs should only have a-z, A-Z, 0-9, and -
    slug.gsub!(/[^a-zA-Z0-9\-]/, '-')
    slug.gsub!(/[\-]+/, '-')

    # only lowercase
    slug.downcase!

    # ensure slugs are unique for this pass; retry if the random suffix
    # happens to collide with an earlier slug
    candidate = slug
    while @@slugs.include?(candidate)
      candidate = slug + '-' + sprintf("%06d", Random.rand(999999))
    end
    @@slugs[candidate] = true

    candidate
  end

  # Ensure that the WMS/WFS/WCS location values are as expected
  #
  # @param [String] id the layer identifier, used in error messages
  # @param [String] location JSON object with wms/wcs/wfs entries
  # @return [String] the location as JSON with each service value as an Array
  # @raise [ArgumentError] on invalid JSON, missing services or non-HTTP URLs
  def validate_location(id, location)
    begin
      x = JSON::parse(location)
    rescue JSON::ParserError, TypeError
      raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}"
    end
    raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}" unless x.is_a? Hash

    if x['wms'].nil? or (x['wcs'].nil? and x['wfs'].nil?)
      raise ArgumentError, "ERROR: #{id}: Missing WMS or WCS/WFS: #{x}"
    end

    %w{wms wcs wfs}.each do |protocol|
      next if x[protocol].nil?
      begin
        x[protocol] = [x[protocol]] if x[protocol].is_a? String

        unless x[protocol].is_a? Array
          raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
        end

        x[protocol].each do |url|
          uri = URI.parse(url)
          # URI::HTTPS is a subclass of URI::HTTP
          raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP)
        end
      rescue ArgumentError
        raise # keep the specific message
      rescue StandardError
        raise ArgumentError, "ERROR: #{id}: Invalid #{protocol}: #{x}"
      end
    end

    x.to_json
  end

  # @return [Boolean] true when +lon+ is a number within -180..180
  def lon? lon
    lon.is_a?(Numeric) && lon >= -180 && lon <= 180
  end

  # @return [Boolean] true when +lat+ is a number within -90..90
  def lat? lat
    lat.is_a?(Numeric) && lat >= -90 && lat <= 90
  end

  private

  # Percent-encodes each byte of the characters in +id+ that are not valid
  # in a URI (replacement for URI.encode, removed in Ruby 3.0).
  def escape_id(id)
    id.gsub(UNSAFE_URI_CHARS) { |ch| ch.bytes.map { |b| sprintf('%%%02X', b) }.join }
  end

  # Parses the Location JSON and collapses wcs/wfs/wms arrays to their
  # first entry; raises ArgumentError when it is missing or not an object.
  def parse_location(id, raw)
    raise ArgumentError, "ERROR: #{id} no location" if raw.to_s.empty?
    location = begin
      JSON::parse(raw)
    rescue JSON::ParserError, TypeError
      nil
    end
    raise ArgumentError, "ERROR: #{id} has malformed location" unless location.is_a? Hash
    %w{wcs wfs wms}.each do |k|
      location[k] = location[k].first if location[k].is_a? Array
    end
    location
  end

  # Parses the RFC 3339 ContentDate; raises ArgumentError when missing or invalid.
  def parse_content_date(id, raw)
    raise ArgumentError, "ERROR: #{id} missing ContentDate" unless raw.is_a?(String) && !raw.empty?
    begin
      DateTime.rfc3339(raw)
    rescue ArgumentError
      raise ArgumentError, "ERROR: #{id} has invalid ContentDate: #{raw}"
    end
  end

  # Heuristic: a keyword string of four or more words without ; or , is
  # assumed to be whitespace delimited and is re-joined with semicolons.
  def delimit_keywords(keywords)
    return keywords unless keywords.is_a?(String)
    return keywords if keywords =~ /[;,]/ || keywords.split.size < 4
    keywords.split.join(';')
  end
end
310
+
311
+
312
+ # __MAIN__
313
+ #
314
# Optional first argument: output filename (defaults to transformed.json).
output_fn = ARGV[0].nil? ? 'transformed.json' : ARGV[0]
TransformOgp.new(output_fn) do |ogp|
  totals = { :accepted => 0, :rejected => 0 }
  # Transform every valid*.json file in the current directory and sum the counts
  Dir.glob('valid*.json') do |fn|
    counts = ogp.transform_file(fn)
    [:accepted, :rejected].each { |key| totals[key] += counts[key] }
  end
  ap({:statistics => totals})
end
323
+
324
+ # example input data
325
+ __END__
326
+ [
327
+ {
328
+ "Abstract": "The boundaries of each supervisorial district in Sonoma County based on 2000 census. Redrawn in 2001 using Autobound.",
329
+ "Access": "Public",
330
+ "Area": 0.9463444815860053,
331
+ "Availability": "Online",
332
+ "CenterX": -122.942159,
333
+ "CenterY": 38.4580755,
334
+ "ContentDate": "2000-01-01T01:01:01Z",
335
+ "DataType": "Polygon",
336
+ "FgdcText": "...",
337
+ "GeoReferenced": true,
338
+ "HalfHeight": 0.39885650000000084,
339
+ "HalfWidth": 0.593161000000002,
340
+ "Institution": "Berkeley",
341
+ "LayerDisplayName": "SCGISDB2_BASE_ADM_SUPERVISOR",
342
+ "LayerId": "28722/bk0012h5s52",
343
+ "Location": "{\"wms\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wms\"],\"tilecache\":[\"http://gis.lib.berkeley.edu:8080/geoserver/gwc/service/wms\"],\"download\":\"\",\"wfs\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wfs\"]}",
344
+ "MaxX": -122.348998,
345
+ "MaxY": 38.856932,
346
+ "MinX": -123.53532,
347
+ "MinY": 38.059219,
348
+ "Name": "ADM_SUPERVISOR",
349
+ "PlaceKeywords": "Sonoma County County of Sonoma Sonoma California Bay Area",
350
+ "Publisher": "UC Berkeley Libraries",
351
+ "ThemeKeywords": "Supervisorial districts 1st District 2nd District 3rd District 4th District 5th District",
352
+ "WorkspaceName": "UCB"
353
+ }
354
+ ]
@@ -0,0 +1,182 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Usage: validate_ogp [output.json]
4
+ #
5
+ # Reads data/*.json as input and writes the validated records to valid.json (or to output.json when given)
6
+ #
7
+ require 'awesome_print'
8
+ require 'json'
9
+ require 'uri'
10
+ require 'date'
11
+
12
+ class ValidateOgp
13
+ def initialize(fn)
14
+ @wms_servers = {}
15
+ @output = File.open(fn, 'wb')
16
+ @output.write "[\n"
17
+ yield self
18
+ self.close
19
+ end
20
+
21
+ def validate_file(fn)
22
+ stats = { :accepted => 0, :rejected => 0 }
23
+ puts "Validating #{fn}"
24
+ json = JSON::parse(File.read(fn))
25
+ json['response']['docs'].each do |doc| # contains JSON Solr query results
26
+ begin
27
+ validate(doc)
28
+ stats[:accepted] += 1
29
+ rescue ArgumentError => e
30
+ puts e
31
+ stats[:rejected] += 1
32
+ end
33
+ end
34
+ stats
35
+ end
36
+
37
+
38
+ def validate(layer)
39
+ id = layer['LayerId']
40
+
41
+ %w{LayerId Name Institution Access MinX MinY MaxX MaxY LayerDisplayName Location}.each do |k|
42
+ if layer[k].nil? or layer[k].to_s.empty?
43
+ raise ArgumentError, "ERROR: #{id} missing #{k}"
44
+ return
45
+ end
46
+ end
47
+
48
+ %w{MinX MaxX}.each do |lon|
49
+ raise ArgumentError, "ERROR: #{id}: Invalid longitude value: #{layer[lon]}" unless lon?(layer[lon])
50
+ end
51
+
52
+
53
+ %w{MinY MaxY}.each do |lat|
54
+ raise ArgumentError, "ERROR: #{id} Invalid latitude value: #{layer[lat]}" unless lat?(layer[lat])
55
+ end
56
+
57
+ k = 'Institution'
58
+ if ([layer[k]] & %w{Berkeley Harvard MIT MassGIS Stanford Tufts}).empty?
59
+ raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
60
+ return
61
+ end
62
+
63
+ k = 'DataType'
64
+ if ([layer[k]] & %w{Line Paper\ Map Point Polygon Raster LibraryRecord}).empty?
65
+ raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
66
+ return
67
+ end
68
+
69
+ k = 'Access'
70
+ if ([layer[k]] & %w{Public Restricted}).empty?
71
+ raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
72
+ end
73
+
74
+ k = 'Availability'
75
+ if layer[k].downcase == 'online' # cleanup
76
+ layer[k] = 'Online'
77
+ end
78
+ if ([layer[k]] & %w{Online}).empty?
79
+ raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
80
+ return
81
+ end
82
+
83
+ k = 'Location'
84
+ layer[k] = validate_location(id, layer[k])
85
+ if layer[k].nil? or layer[k].empty?
86
+ raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
87
+ end
88
+
89
+ k = 'GeoReferenced'
90
+ unless layer[k].nil? or layer[k] == true
91
+ puts "WARNING: #{id} has boundingbox but claims it is not georeferenced"
92
+ #layer[k] = true
93
+ end
94
+
95
+ k = 'Area'
96
+ unless layer[k] > 0
97
+ raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
98
+ end
99
+
100
+ k = 'ContentDate'
101
+ if layer[k].nil? or layer[k].empty?
102
+ raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
103
+ end
104
+ dt = Date.rfc3339(layer[k])
105
+ if dt.year < 1500 or dt.year > 2100
106
+ raise ArgumentError, "ERROR: #{id} has suspect #{k}: #{layer[k]}"
107
+ end
108
+
109
+ # k = 'FgdcText'
110
+ # unless layer[k].nil? or layer[k].empty?
111
+ # layer[k] = ''
112
+ # end
113
+
114
+ @output.write JSON::pretty_generate(layer)
115
+ @output.write "\n,\n"
116
+ end
117
+
118
+ def close
119
+ @output.write "\n {} \n]\n"
120
+ @output.close
121
+ ap({:wms_servers => @wms_servers})
122
+ end
123
+
124
+ private
125
+
126
+ def validate_location(id, location)
127
+ begin
128
+ x = JSON::parse(location)
129
+ if x['wms'].nil? or (x['wcs'].nil? and x['wfs'].nil?)
130
+ raise ArgumentError, "ERROR: #{id}: Missing WMS or WCS/WFS: #{x}"
131
+ end
132
+
133
+ %w{wms wcs wfs}.each do |protocol|
134
+ begin
135
+ unless x[protocol].nil?
136
+ if x[protocol].is_a? String
137
+ x[protocol] = [x[protocol]]
138
+ end
139
+
140
+ unless x[protocol].is_a? Array
141
+ raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
142
+ end
143
+
144
+ x[protocol].each do |url|
145
+ uri = URI.parse(url)
146
+ raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP) or uri.kind_of?(URI::HTTPS)
147
+ end
148
+ end
149
+ rescue Exception => e
150
+ raise ArgumentError, "ERROR: #{id}: Invalid #{k}: #{x}"
151
+ end
152
+ end
153
+
154
+ @wms_servers[x['wms'].first] = true
155
+
156
+ return x.to_json
157
+ rescue JSON::ParserError => e
158
+ raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}"
159
+ end
160
+ nil
161
+ end
162
+
163
+ def lon? lon
164
+ lon >= -180 and lon <= 180
165
+ end
166
+
167
+ def lat? lat
168
+ lat >= -90 and lat <= 90
169
+ end
170
+ end
171
+
172
+
173
+ # __MAIN__
174
# Optional first argument: output filename (defaults to valid.json).
output_fn = ARGV[0].nil? ? 'valid.json' : ARGV[0]
ValidateOgp.new(output_fn) do |ogp|
  totals = { :accepted => 0, :rejected => 0 }
  # Validate every JSON file under data/ and sum the per-file counts
  Dir.glob('data/*.json') do |fn|
    counts = ogp.validate_file(fn)
    [:accepted, :rejected].each { |key| totals[key] += counts[key] }
  end
  ap({:statistics => totals})
end