geohydra 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -1
  3. data/.travis.yml +1 -2
  4. data/Gemfile +1 -8
  5. data/Gemfile.lock +87 -102
  6. data/README.md +2 -2
  7. data/VERSION +1 -1
  8. data/bin/accession.rb +99 -89
  9. data/bin/assemble.rb +288 -247
  10. data/bin/assemble_data.rb +54 -51
  11. data/bin/assemble_placenames.rb +85 -85
  12. data/bin/build_stage_options.rb +24 -18
  13. data/bin/derive_wgs84.rb +65 -66
  14. data/bin/extract_thumbnail.rb +38 -37
  15. data/bin/geo2mods.rb +78 -0
  16. data/bin/geohydra +14 -5
  17. data/bin/ingest_arcgis.rb +80 -60
  18. data/bin/iso2geo.rb +64 -0
  19. data/bin/loader_postgis.rb +121 -227
  20. data/bin/run_task.rb +23 -0
  21. data/bin/sync_geoserver_metadata.rb +132 -127
  22. data/bin/xsltproc-saxon +6 -0
  23. data/geohydra.gemspec +6 -4
  24. data/lib/geohydra.rb +5 -0
  25. data/lib/geohydra/accession.rb +24 -13
  26. data/lib/geohydra/{arcgis_to_iso19139_fc.xsl → arcgis_to_iso19110.xsl} +0 -0
  27. data/lib/geohydra/gazetteer.csv +842 -36
  28. data/lib/geohydra/gazetteer.rb +48 -24
  29. data/lib/geohydra/mods2geoblacklight.xsl +248 -0
  30. data/lib/geohydra/mods2ogp.xsl +5 -8
  31. data/lib/geohydra/transform.rb +8 -2
  32. data/lib/geohydra/utils.rb +6 -0
  33. data/lib/geohydra/workflow/gisAssemblyWF.rb +109 -0
  34. data/lib/geohydra/workflow/gisAssemblyWF.xml +85 -0
  35. data/lib/geohydra/workflow/gisDeliveryWF.rb +33 -0
  36. data/lib/geohydra/workflow/gisDeliveryWF.xml +36 -0
  37. data/lib/geohydra/workflow/gisDiscoveryWF.rb +55 -0
  38. data/lib/geohydra/workflow/gisDiscoveryWF.xml +28 -0
  39. data/lib/geohydra/workflow/task.rb +82 -0
  40. data/ogp/README.md +350 -0
  41. data/ogp/download.rb +92 -0
  42. data/ogp/fgdc2mods.sh +9 -0
  43. data/ogp/fgdc2mods.xsl +884 -0
  44. data/ogp/ingest.rb +48 -0
  45. data/ogp/select.rb +20 -0
  46. data/ogp/transform.rb +354 -0
  47. data/ogp/validate.rb +182 -0
  48. data/{bin → scripts}/ingest_tufts.rb +0 -0
  49. data/scripts/iso2html/doit.sh +15 -0
  50. data/scripts/iso2html/main.css +66 -0
  51. data/scripts/iso2html/pacioos-iso-html.xsl +1749 -0
  52. data/scripts/iso2html/utils/replace-newlines.xsl +97 -0
  53. data/scripts/iso2html/utils/replace-string.xsl +80 -0
  54. data/scripts/iso2html/utils/strip-digits.xsl +60 -0
  55. data/{bin → scripts}/loader.rb +0 -0
  56. data/scripts/rename_shapefiles.rb +5 -0
  57. data/scripts/render_gazetteer.rb +36 -0
  58. data/{bin → scripts}/seed.rb +0 -0
  59. data/{bin → scripts}/solr_indexer.rb +0 -0
  60. data/scripts/status.csv +253 -0
  61. data/scripts/status.rb +32 -0
  62. data/{bin → scripts}/validate_data.rb +1 -1
  63. data/solr/kurma-app-dev/conf/lang/stopwords_en.txt +34 -0
  64. data/solr/kurma-app-dev/conf/protwords.txt +21 -0
  65. data/solr/kurma-app-dev/conf/schema.xml +156 -0
  66. data/solr/kurma-app-dev/conf/solrconfig.xml +161 -0
  67. data/solr/kurma-app-dev/conf/synonyms.txt +29 -0
  68. data/solr/kurma-app-dev/purge.sh +8 -0
  69. data/solr/kurma-app-test/conf/lang/stopwords_en.txt +34 -0
  70. data/solr/kurma-app-test/conf/protwords.txt +21 -0
  71. data/solr/kurma-app-test/conf/schema.xml +158 -0
  72. data/solr/kurma-app-test/conf/solrconfig.xml +161 -0
  73. data/solr/kurma-app-test/conf/synonyms.txt +29 -0
  74. data/solr/kurma-app-test/deploy.sh +15 -0
  75. data/solr/kurma-app-test/purge.sh +8 -0
  76. data/solr/ogp-dev/purge.sh +1 -2
  77. data/spec/fixtures/bw938nk9584/temp/{TRIPURA-iso19139-fc.xml → TRIPURA-iso19110.xml} +0 -0
  78. data/spec/fixtures/cc142xj8436/temp/{HARYANA-iso19139-fc.xml → HARYANA-iso19110.xml} +0 -0
  79. data/spec/fixtures/cg716wc7949/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  80. data/spec/fixtures/cm007pv9601/temp/{MEGHALAYA-iso19139-fc.xml → MEGHALAYA-iso19110.xml} +0 -0
  81. data/spec/fixtures/cp055nb0189/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  82. data/spec/fixtures/cs838pw3418/temp/{OIL_GAS_FIELDS-iso19139-fc.xml → OIL_GAS_FIELDS-iso19110.xml} +0 -0
  83. data/spec/fixtures/dd308sy5843/temp/{ORISSA-iso19139-fc.xml → ORISSA-iso19110.xml} +0 -0
  84. data/spec/fixtures/dd452vk1873/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  85. data/spec/fixtures/dg850pt1796/temp/{STATE1951-iso19139-fc.xml → STATE1951-iso19110.xml} +0 -0
  86. data/spec/fixtures/dn744tf5427/temp/{DISTRICT1991-iso19139-fc.xml → DISTRICT1991-iso19110.xml} +0 -0
  87. data/spec/fixtures/dq603nz8402/temp/{STATE2001-iso19139-fc.xml → STATE2001-iso19110.xml} +0 -0
  88. data/spec/fixtures/dv609zt4699/temp/{ASSAM-iso19139-fc.xml → ASSAM-iso19110.xml} +0 -0
  89. data/spec/fixtures/dz222hw0585/temp/{PUNJAB-iso19139-fc.xml → PUNJAB-iso19110.xml} +0 -0
  90. data/spec/fixtures/fd673qb9705/temp/{STATE1971-iso19139-fc.xml → STATE1971-iso19110.xml} +0 -0
  91. data/spec/fixtures/fg451wp8917/temp/{SIKKIM-iso19139-fc.xml → SIKKIM-iso19110.xml} +0 -0
  92. data/spec/fixtures/fh247yz0156/temp/{RAJASTHAN-iso19139-fc.xml → RAJASTHAN-iso19110.xml} +0 -0
  93. data/spec/fixtures/fs487vd1465/temp/{CHHATTISGARH-iso19139-fc.xml → CHHATTISGARH-iso19110.xml} +0 -0
  94. data/spec/fixtures/fs591bn3317/temp/{HIMACHAL_PRADESH-iso19139-fc.xml → HIMACHAL_PRADESH-iso19110.xml} +0 -0
  95. data/spec/fixtures/fw920bc5473/temp/{PLSS_TWN-iso19139-fc.xml → PLSS_TWN-iso19110.xml} +0 -0
  96. data/spec/fixtures/gj831wj3625/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  97. data/spec/fixtures/gp075nv3265/temp/{PONDICHERRY-iso19139-fc.xml → PONDICHERRY-iso19110.xml} +0 -0
  98. data/spec/fixtures/gv800hj8141/temp/{BIHAR-iso19139-fc.xml → BIHAR-iso19110.xml} +0 -0
  99. data/spec/fixtures/gw520gz6339/temp/{DADRA_NAGAR_HAVELI_PT-iso19139-fc.xml → DADRA_NAGAR_HAVELI_PT-iso19110.xml} +0 -0
  100. data/spec/fixtures/gy054hz1045/temp/{HARYANA-iso19139-fc.xml → HARYANA-iso19110.xml} +0 -0
  101. data/spec/fixtures/gz352mw6982/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  102. data/spec/fixtures/hb489vm9892/temp/{DISTRICT1981-iso19139-fc.xml → DISTRICT1981-iso19110.xml} +0 -0
  103. data/spec/fixtures/hw125dq0418/temp/{DELHI-iso19139-fc.xml → DELHI-iso19110.xml} +0 -0
  104. data/spec/fixtures/hw892mn4587/temp/{KERALA-iso19139-fc.xml → KERALA-iso19110.xml} +0 -0
  105. data/spec/fixtures/jb371hz3868/temp/{INCOME-iso19139-fc.xml → INCOME-iso19110.xml} +0 -0
  106. data/spec/fixtures/jc017yk9928/temp/{KARNATAKA-iso19139-fc.xml → KARNATAKA-iso19110.xml} +0 -0
  107. data/spec/fixtures/jf841ys4828/temp/{ANDHRA_PRADESH-iso19139-fc.xml → ANDHRA_PRADESH-iso19110.xml} +0 -0
  108. data/spec/fixtures/jh802mp2160/temp/{DELHI_PT-iso19139-fc.xml → DELHI_PT-iso19110.xml} +0 -0
  109. data/spec/fixtures/jj806fc3801/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  110. data/spec/fixtures/jq835yn7161/temp/{HIMACHAL_PRADESH-iso19139-fc.xml → HIMACHAL_PRADESH-iso19110.xml} +0 -0
  111. data/spec/fixtures/jr455pt6676/temp/{TAMILNADU-iso19139-fc.xml → TAMILNADU-iso19110.xml} +0 -0
  112. data/spec/fixtures/js637zp2537/temp/{BIHAR-iso19139-fc.xml → BIHAR-iso19110.xml} +0 -0
  113. data/spec/fixtures/jv502wg9611/temp/{GOA-iso19139-fc.xml → GOA-iso19110.xml} +0 -0
  114. data/spec/fixtures/jw462ck6560/temp/{JAMMU_KASHMIR-iso19139-fc.xml → JAMMU_KASHMIR-iso19110.xml} +0 -0
  115. data/spec/fixtures/kj800fb6273/temp/{STATE2011-iso19139-fc.xml → STATE2011-iso19110.xml} +0 -0
  116. data/spec/fixtures/km504zq3948/temp/{HIMACHAL_PRADESH-iso19139-fc.xml → HIMACHAL_PRADESH-iso19110.xml} +0 -0
  117. data/spec/fixtures/ks297fy1411/temp/{OFFSH_BLOCKS-iso19139-fc.xml → OFFSH_BLOCKS-iso19110.xml} +0 -0
  118. data/spec/fixtures/md358hy5049/temp/{MIZORAM-iso19139-fc.xml → MIZORAM-iso19110.xml} +0 -0
  119. data/spec/fixtures/mg745bq0193/temp/{MADHYA_PRADESH-iso19139-fc.xml → MADHYA_PRADESH-iso19110.xml} +0 -0
  120. data/spec/fixtures/mh187yx3536/temp/{WEST_BENGAL-iso19139-fc.xml → WEST_BENGAL-iso19110.xml} +0 -0
  121. data/spec/fixtures/mk488yn6694/temp/{GUJARAT-iso19139-fc.xml → GUJARAT-iso19110.xml} +0 -0
  122. data/spec/fixtures/my216kp3008/temp/{DELHI-iso19139-fc.xml → DELHI-iso19110.xml} +0 -0
  123. data/spec/fixtures/my504nz9827/temp/{JAMMU_KASHMIR-iso19139-fc.xml → JAMMU_KASHMIR-iso19110.xml} +0 -0
  124. data/spec/fixtures/ng819jm8700/temp/{MANIPUR-iso19139-fc.xml → MANIPUR-iso19110.xml} +0 -0
  125. data/spec/fixtures/np020jq2139/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  126. data/spec/fixtures/ns377mt1608/temp/{STATE1991-iso19139-fc.xml → STATE1991-iso19110.xml} +0 -0
  127. data/spec/fixtures/nw926np8508/temp/{metadata.iso19139-fc.xml → metadata.iso19110.xml} +0 -0
  128. data/spec/fixtures/ny358rm8559/temp/{TRIPURA-iso19139-fc.xml → TRIPURA-iso19110.xml} +0 -0
  129. data/spec/fixtures/nz176rm8192/temp/{DISTRICT2011-iso19139-fc.xml → DISTRICT2011-iso19110.xml} +0 -0
  130. data/spec/fixtures/nz252rq2252/temp/{UTTAR_PRADESH-iso19139-fc.xml → UTTAR_PRADESH-iso19110.xml} +0 -0
  131. data/spec/fixtures/pd902kb3348/temp/{MADHYA_PRADESH-iso19139-fc.xml → MADHYA_PRADESH-iso19110.xml} +0 -0
  132. data/spec/fixtures/pz792fz1776/temp/{MAHARASHTRA-iso19139-fc.xml → MAHARASHTRA-iso19110.xml} +0 -0
  133. data/spec/fixtures/qb767ss4042/temp/{UTTAR_PRADESH-iso19139-fc.xml → UTTAR_PRADESH-iso19110.xml} +0 -0
  134. data/spec/fixtures/qc091qw0570/temp/{GUJARAT-iso19139-fc.xml → GUJARAT-iso19110.xml} +0 -0
  135. data/spec/fixtures/qc652vr7204/temp/{ANDHRA_PRADESH_PT-iso19139-fc.xml → ANDHRA_PRADESH_PT-iso19110.xml} +0 -0
  136. data/spec/fixtures/qk786js7484/temp/{DISTRICT1961-iso19139-fc.xml → DISTRICT1961-iso19110.xml} +0 -0
  137. data/spec/fixtures/qn676pg6767/temp/{GOA-iso19139-fc.xml → GOA-iso19110.xml} +0 -0
  138. data/spec/fixtures/qr255jh4074/temp/{LOKSABHA_14-iso19139-fc.xml → LOKSABHA_14-iso19110.xml} +0 -0
  139. data/spec/fixtures/qr374kj4827/temp/{ASSAM-iso19139-fc.xml → ASSAM-iso19110.xml} +0 -0
  140. data/spec/fixtures/qy162js1748/temp/{CHHATTISGARH-iso19139-fc.xml → CHHATTISGARH-iso19110.xml} +0 -0
  141. data/spec/fixtures/rd446vf2633/temp/{NAGALAND-iso19139-fc.xml → NAGALAND-iso19110.xml} +0 -0
  142. data/spec/fixtures/rf389hf2983/temp/{CHHATTISGARH_PT-iso19139-fc.xml → CHHATTISGARH_PT-iso19110.xml} +0 -0
  143. data/spec/fixtures/rf859ff4582/temp/{JHARKHAND-iso19139-fc.xml → JHARKHAND-iso19110.xml} +0 -0
  144. data/spec/fixtures/rh343ds8931/temp/{BIHAR-iso19139-fc.xml → BIHAR-iso19110.xml} +0 -0
  145. data/spec/fixtures/rn815xk8157/temp/{SIKKIM-iso19139-fc.xml → SIKKIM-iso19110.xml} +0 -0
  146. data/spec/fixtures/rq653sz4470/temp/{CHHATTISGARH-iso19139-fc.xml → CHHATTISGARH-iso19110.xml} +0 -0
  147. data/spec/fixtures/rt625ws6022/temp/{GULF_FAIRWAYS-iso19139-fc.xml → GULF_FAIRWAYS-iso19110.xml} +0 -0
  148. data/spec/fixtures/sc330vf4259/temp/{JHARKHAND-iso19139-fc.xml → JHARKHAND-iso19110.xml} +0 -0
  149. data/spec/fixtures/sq479mx3086/temp/{OFFSH_PLATF-iso19139-fc.xml → OFFSH_PLATF-iso19110.xml} +0 -0
  150. data/spec/fixtures/sr686bm4098/temp/{DAMAN_DIU_PT-iso19139-fc.xml → DAMAN_DIU_PT-iso19110.xml} +0 -0
  151. data/spec/fixtures/sv303sh5583/temp/{ARUNACHAL_PRADESH-iso19139-fc.xml → ARUNACHAL_PRADESH-iso19110.xml} +0 -0
  152. data/spec/fixtures/sy319nh8520/temp/{GUJARAT-iso19139-fc.xml → GUJARAT-iso19110.xml} +0 -0
  153. data/spec/fixtures/td363vx2792/temp/{HIMACHAL_PRADESH_PT-iso19139-fc.xml → HIMACHAL_PRADESH_PT-iso19110.xml} +0 -0
  154. data/spec/fixtures/tf374bd2484/temp/{DISTRICT1951-iso19139-fc.xml → DISTRICT1951-iso19110.xml} +0 -0
  155. data/spec/fixtures/tj797mj7877/temp/{LOKSABHA_15-iso19139-fc.xml → LOKSABHA_15-iso19110.xml} +0 -0
  156. data/spec/fixtures/tv060wq5179/temp/{ASSAM-iso19139-fc.xml → ASSAM-iso19110.xml} +0 -0
  157. data/spec/fixtures/tv536bn1915/temp/{ARUNACHAL_PRADESH-iso19139-fc.xml → ARUNACHAL_PRADESH-iso19110.xml} +0 -0
  158. data/spec/fixtures/tz359cc2977/temp/{MANIPUR-iso19139-fc.xml → MANIPUR-iso19110.xml} +0 -0
  159. data/spec/fixtures/vb525my6511/temp/{UTTARAKHAND-iso19139-fc.xml → UTTARAKHAND-iso19110.xml} +0 -0
  160. data/spec/fixtures/vh802fs4240/temp/{PONDICHERRY-iso19139-fc.xml → PONDICHERRY-iso19110.xml} +0 -0
  161. data/spec/fixtures/vk120xn2474/temp/{PLSS_SEC-iso19139-fc.xml → PLSS_SEC-iso19110.xml} +0 -0
  162. data/spec/fixtures/vn439bc7316/temp/{KERALA-iso19139-fc.xml → KERALA-iso19110.xml} +0 -0
  163. data/spec/fixtures/vq745jk0695/temp/{MEGHALAYA-iso19139-fc.xml → MEGHALAYA-iso19110.xml} +0 -0
  164. data/spec/fixtures/vr593vj7147/temp/{ANDHRA_PRADESH-iso19139-fc.xml → ANDHRA_PRADESH-iso19110.xml} +0 -0
  165. data/spec/fixtures/vw911qb5271/temp/{DISTRICT2001-iso19139-fc.xml → DISTRICT2001-iso19110.xml} +0 -0
  166. data/spec/fixtures/wg680pz0365/temp/{ANDHRA_PRADESH-iso19139-fc.xml → ANDHRA_PRADESH-iso19110.xml} +0 -0
  167. data/spec/fixtures/wg761xn1926/temp/{HARYANA-iso19139-fc.xml → HARYANA-iso19110.xml} +0 -0
  168. data/spec/fixtures/wh870qw1934/temp/{PUNJAB-iso19139-fc.xml → PUNJAB-iso19110.xml} +0 -0
  169. data/spec/fixtures/wk775mm4673/temp/{MAHARASHTRA-iso19139-fc.xml → MAHARASHTRA-iso19110.xml} +0 -0
  170. data/spec/fixtures/ws171yz2165/temp/{ARUNACHAL_PRADESH_PT-iso19139-fc.xml → ARUNACHAL_PRADESH_PT-iso19110.xml} +0 -0
  171. data/spec/fixtures/wt473hz7153/temp/{CHANDIGARH_PT-iso19139-fc.xml → CHANDIGARH_PT-iso19110.xml} +0 -0
  172. data/spec/fixtures/ww217dj0457/temp/{CO2_PIPE-iso19139-fc.xml → CO2_PIPE-iso19110.xml} +0 -0
  173. data/spec/fixtures/wy875pk9849/temp/{STATE1961-iso19139-fc.xml → STATE1961-iso19110.xml} +0 -0
  174. data/spec/fixtures/xb018tk2042/temp/{STATE1981-iso19139-fc.xml → STATE1981-iso19110.xml} +0 -0
  175. data/spec/fixtures/xg539vw8586/temp/{ORISSA-iso19139-fc.xml → ORISSA-iso19110.xml} +0 -0
  176. data/spec/fixtures/xv475kp4644/temp/{ASSAM_PT-iso19139-fc.xml → ASSAM_PT-iso19110.xml} +0 -0
  177. data/spec/fixtures/xy096gc2959/temp/{GOA-iso19139-fc.xml → GOA-iso19110.xml} +0 -0
  178. data/spec/fixtures/xz518gz3362/temp/{UTTARAKHAND-iso19139-fc.xml → UTTARAKHAND-iso19110.xml} +0 -0
  179. data/spec/fixtures/yh986wy4737/temp/{NAGALAND-iso19139-fc.xml → NAGALAND-iso19110.xml} +0 -0
  180. data/spec/fixtures/yn187fq4474/temp/{KARNATAKA-iso19139-fc.xml → KARNATAKA-iso19110.xml} +0 -0
  181. data/spec/fixtures/yn236mw3250/temp/{TAMILNADU-iso19139-fc.xml → TAMILNADU-iso19110.xml} +0 -0
  182. data/spec/fixtures/yz596nz0112/temp/{WEST_BENGAL-iso19139-fc.xml → WEST_BENGAL-iso19110.xml} +0 -0
  183. data/spec/fixtures/zk596gy7380/temp/{DISTRICT1971-iso19139-fc.xml → DISTRICT1971-iso19110.xml} +0 -0
  184. data/spec/fixtures/zn452hh7431/temp/{RAJASTHAN-iso19139-fc.xml → RAJASTHAN-iso19110.xml} +0 -0
  185. data/spec/fixtures/zt093fw6519/temp/{MIZORAM-iso19139-fc.xml → MIZORAM-iso19110.xml} +0 -0
  186. data/spec/fixtures/zv925hd6723/temp/{OGWELLS-iso19139-fc.xml → OGWELLS-iso19110.xml} +0 -0
  187. data/spec/fixtures/zy658cr1728/temp/{ANDAMAAN_NICOBAR_PT-iso19139-fc.xml → ANDAMAAN_NICOBAR_PT-iso19110.xml} +0 -0
  188. data/spec/fixtures/zz943vx1492/temp/{BASINS-iso19139-fc.xml → BASINS-iso19110.xml} +0 -0
  189. data/spec/unit/gazetteer_spec.rb +100 -35
  190. data/spec/unit/task_spec.rb +68 -0
  191. data/spec/unit/transform_spec.rb +1 -1
  192. data/spec/unit/utils_spec.rb +17 -3
  193. data/workflow.rb +35 -0
  194. metadata +323 -316
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+ require 'rsolr'
5
+
6
# Loads JSON documents from files into a Solr collection through RSolr.
#
# The connection is opened in the constructor, handed to the caller's block,
# and committed/released when the block finishes (even if it raises).
class IngestOgp
  # Default number of added documents between intermediate commits
  BATCH_SIZE = 100

  # @param [String] collection name of the Solr collection to load into
  # @param [String] url base URL of the Solr server
  # @yield [IngestOgp] the connected instance
  # @raise [ArgumentError] if collection is not a String
  def initialize(collection, url)
    raise ArgumentError, 'Collection not defined' unless collection.is_a? String
    # Drop trailing slashes so the joined URL never contains "//"
    @solr = RSolr.connect(:url => "#{url.sub(%r{/+\z}, '')}/#{collection}")
    begin
      yield self
    ensure
      # Always finalize, so documents added before a failure are committed
      close
    end
  end

  # Adds every non-empty Hash found in the JSON array stored in +fn+.
  # The Solr-managed '_version_' and 'timestamp' fields are removed from each
  # document before it is added.
  #
  # @param [String] fn filename of a JSON array of Solr documents
  # @param [Integer] batch_size number of documents between commits
  def ingest(fn, batch_size = BATCH_SIZE)
    puts "Ingesting #{fn}"
    n = 0
    JSON.parse(File.read(fn)).each do |doc|
      # Skip placeholders such as the trailing {} and any non-Hash entries
      next unless doc.is_a?(Hash) && !doc.empty?
      doc.delete('_version_')
      doc.delete('timestamp')
      putc '.'
      @solr.add doc
      n += 1
      next unless (n % batch_size).zero?
      @solr.commit
      puts "\ncommit #{batch_size} records, #{n} total\n"
    end
    puts "\n#{n} records\n"
    @solr.commit
  end

  # Commits pending documents and drops the connection. Safe to call twice.
  def close
    return if @solr.nil?
    @solr.commit
    #@solr.optimize
    @solr = nil
  end
end
41
+
42
+
43
+ # __MAIN__
44
# Usage: ingest.rb collection [solr_url]
# Ingests every transformed*.json file in the current directory; the Solr URL
# falls back to the local development server when not given.
solr_url = ARGV[1] || 'http://localhost:18080/solr'
IngestOgp.new(ARGV[0], solr_url) do |ogp|
  Dir.glob("transformed*.json") { |fn| ogp.ingest(fn) }
end
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Usage: select.rb
4
+
5
+
6
+ require 'awesome_print'
7
+ require 'json'
8
+
9
+
10
+ # __MAIN__
11
# Draw a random sample (each record kept with probability 0.01) from every
# transformed*.json file in the current directory, print it, and save it as
# selected.json.
sample = Dir.glob('transformed*.json').each_with_object([]) do |path, picked|
  JSON::parse(File.read(path)).each do |record|
    picked << record if rand < 0.01
  end
end
ap({:selected => sample})
File.open('selected.json', 'wb') { |out| out << JSON.pretty_generate(sample) }
@@ -0,0 +1,354 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Usage: transform_ogp output.json
4
+ #
5
+ # Reads valid*.json in current directory
6
+ #
7
+
8
+ require 'awesome_print'
9
+ require 'json'
10
+ require 'uri'
11
+ require 'date'
12
+ require 'nokogiri'
13
+
14
+ # Transforms an OGP schema into GeoBlacklight. Requires input of a JSON array
15
+ # of OGP hashes.
16
# Converts OGP layer records into GeoBlacklight records and streams them to a
# JSON output file. The output is a JSON array in which every record is
# followed by a comma and the array is terminated by an empty {} placeholder.
class TransformOgp
  # Identifier prefix per institution, used to mint the layer's uuid.
  # For URN style @see http://www.ietf.org/rfc/rfc2141.txt
  # For ARK @see https://wiki.ucop.edu/display/Curation/ARK
  UUID_PREFIXES = {
    'Stanford' => 'http://purl.stanford.edu/',
    'Tufts'    => 'urn:geodata.tufts.edu:',
    'MassGIS'  => 'urn:massgis.state.ma.us:',
    'Berkeley' => 'http://ark.cdlib.org/ark:/',
    'MIT'      => 'urn:arrowsmith.mit.edu:',
    'Harvard'  => 'urn:hul.harvard.edu:'
  }.freeze

  # Schema and user name fragments removed (in this order) from layer names
  SCHEMA_PREFIXES = %w{SDE_DATA. SDE. SDE2. GISPORTAL.GISOWNER01. GISDATA. MORIS.}.freeze

  # @param [String] fn filename of the JSON output file
  # @yield [TransformOgp] the instance, ready to transform records
  def initialize(fn)
    @output = File.open(fn, 'wb')
    @output.write "[\n"
    @fgdcdir = 'fgdc'
    begin
      yield self
    ensure
      # Terminate the JSON array and close the file even if the block raises
      close
    end
  end

  # @param [String|Array] s the URI to clean up
  # @return [String] a normalized URI, or '' when s is nil or empty
  def clean_uri(s)
    return '' if s.nil? || s.empty?
    (s.is_a?(Array) ? URI(s.first) : URI(s)).to_s
  end

  # @param [String] fn filename of JSON array of OGP hash objects
  # @return [Hash] stats about :accepted vs. :rejected records
  def transform_file(fn)
    stats = { :accepted => 0, :rejected => 0 }
    puts "Parsing #{fn}"
    # File.read closes the handle; File.open(...).read left it open
    JSON.parse(File.read(fn)).each do |doc| # contains JSON Solr query results
      next if doc.empty?
      begin
        transform(doc)
        stats[:accepted] += 1
      rescue ArgumentError, URI::InvalidURIError => e
        puts e
        stats[:rejected] += 1
      end
    end
    stats
  end

  # Transforms a single OGP record into a GeoBlacklight record and appends it
  # to the output file. Keyword fields of non-Stanford layers are rewritten in
  # place on +layer+ when the delimiter heuristic applies.
  #
  # @param [Hash] layer an OGP hash for a given layer
  # @param [Boolean] skip_fgdc when false, also write the layer's FgdcText as
  #   an XML file in the FGDC directory
  # @raise [ArgumentError] if the record lacks a usable Location, ContentDate,
  #   Name or WorkspaceName
  def transform(layer, skip_fgdc = true)
    id = layer['LayerId'].to_s.strip
    puts "Transforming #{id}"

    # URI.encode was removed in Ruby 3.0; the parser's escape is equivalent
    uuid = UUID_PREFIXES.fetch(layer['Institution'], '') + URI::DEFAULT_PARSER.escape(id)

    # Parse out the Location to get the WMS/WFS/WCS URLs
    location = parse_location(id, layer['Location'])

    # Parse out the bounding box
    s = layer['MinY'].to_f
    w = layer['MinX'].to_f
    n = layer['MaxY'].to_f
    e = layer['MaxX'].to_f

    # Parse out the ContentDate date/time; to_s turns a missing date into an
    # ArgumentError (record rejected) rather than a TypeError
    dt = DateTime.rfc3339(layer['ContentDate'].to_s)
    pub_dt = DateTime.rfc3339('2000-01-01T00:00:00Z') # XXX fake data, get from MODS

    access = layer['Access']
    collection = nil

    # Parse out the PURL and other metadata for Stanford
    if layer['Institution'] == 'Stanford'
      purl = location['purl']
      purl = purl.first if purl.is_a? Array
      purl = uuid if purl.nil? && uuid =~ /^http/
    else
      purl = nil
      delimit_keywords(layer)
    end

    # Both fields are concatenated into the layer id below
    %w{Name WorkspaceName}.each do |k|
      raise ArgumentError, "ERROR: #{id} missing #{k}" if layer[k].nil?
    end

    slug = to_slug(id, layer)

    layer_geom_type = layer['DataType'].to_s.downcase
    layer_geom_type = 'raster' if layer_geom_type == 'paper map'

    refs = service_references(id, location, purl)

    # Make the conversion from OGP to GeoBlacklight
    #
    # @see http://dublincore.org/documents/dcmi-terms/
    # @see http://wiki.dublincore.org/index.php/User_Guide/Creating_Metadata
    # @see http://www.ietf.org/rfc/rfc5013.txt
    new_layer = {
      :uuid => uuid,

      # Dublin Core elements
      :dc_creator_sm => string2array(layer['Originator']),
      :dc_description_s => layer['Abstract'],
      :dc_format_s => (
        (layer_geom_type == 'raster') ?
          'GeoTIFF' : # 'image/tiff' :
          'Shapefile' # 'application/x-esri-shapefile'
      ), # XXX: fake data
      :dc_identifier_s => uuid,
      :dc_language_s => 'English', # 'en', # XXX: fake data
      :dc_publisher_s => layer['Publisher'],
      :dc_rights_s => access,
      :dc_subject_sm => string2array(layer['ThemeKeywords']),
      :dc_title_s => layer['LayerDisplayName'],
      :dc_type_s => 'Dataset', # or 'Image' for non-georectified,
                               # or 'PhysicalObject' for non-digitized maps
      # Dublin Core terms
      :dct_isPartOf_sm => collection.nil? ? nil : [collection],
      :dct_references_s => refs.to_json,
      :dct_spatial_sm => string2array(layer['PlaceKeywords']),
      :dct_temporal_sm => [dt.year.to_s],
      :dct_issued_s => pub_dt.year.to_s,
      :dct_provenance_s => layer['Institution'],

      #
      # xmlns:georss="http://www.georss.org/georss"
      # A bounding box is a rectangular region, often used to define the
      # extents of a map or a rough area of interest. A box contains two
      # space separated latitude-longitude pairs, with each pair separated by
      # whitespace. The first pair is the lower corner, the second is the
      # upper corner.
      :georss_box_s => "#{s} #{w} #{n} #{e}",
      :georss_polygon_s => "#{n} #{w} #{n} #{e} #{s} #{e} #{s} #{w} #{n} #{w}",

      # Layer-specific schema
      :layer_slug_s => slug,
      :layer_id_s => layer['WorkspaceName'] + ':' + layer['Name'],
      # :layer_srs_s => 'EPSG:4326', # XXX: fake data
      :layer_geom_type_s => layer_geom_type.capitalize,
      :layer_modified_dt => Time.now.utc.strftime('%FT%TZ'),

      # derived fields used only by solr, for which copyField is insufficient
      :solr_bbox => "#{w} #{s} #{e} #{n}", # minX minY maxX maxY
      :solr_ne_pt => "#{n},#{e}",
      :solr_sw_pt => "#{s},#{w}",
      :solr_geom => "ENVELOPE(#{w}, #{e}, #{n}, #{s})",
      :solr_year_i => dt.year,
      :solr_issued_dt => pub_dt.strftime('%FT%TZ'), # Solr requires 1995-12-31T23:59:59Z
      :solr_wms_url => location['wms'],
      :solr_wfs_url => location['wfs'],
      :solr_wcs_url => location['wcs']

      # :layer_year_i => dt.year#, # XXX: migrate to copyField
      # :ogp_area_f => layer['Area'],
      # :ogp_center_x_f => layer['CenterX'],
      # :ogp_center_y_f => layer['CenterY'],
      # :ogp_georeferenced_b => (layer['GeoReferenced'].to_s.downcase == 'true'),
      # :ogp_halfheight_f => layer['HalfHeight'],
      # :ogp_halfwidth_f => layer['HalfWidth'],
      # :ogp_layer_id_s => layer['LayerId'],
      # :ogp_name_s => layer['Name'],
      # :ogp_location_s => layer['Location'],
      # :ogp_workspace_s => layer['WorkspaceName']
    }

    # Remove any fields that are blank
    new_layer.delete_if { |_, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }

    # Write the JSON record for the GeoBlacklight layer
    @output.write JSON.pretty_generate(new_layer)
    @output.write "\n,\n"

    unless skip_fgdc || layer['FgdcText'].nil? || layer['FgdcText'].empty?
      xml = Nokogiri::XML(layer['FgdcText'])
      # Block form closes the file handle once the XML has been written
      File.open(File.join(@fgdcdir, "#{slug}.xml"), 'wb') do |f|
        xml.write_xml_to(f, :encoding => 'UTF-8', :indent => 2)
      end
    end
  end

  # Terminates the JSON array and closes the output file. Safe to call twice.
  def close
    return if @output.nil? || @output.closed?
    # The records are each followed by a comma, so end with an empty object
    @output.write "\n {} \n]\n"
    @output.close
  end

  # @param [String] s has semi-colon/comma/gt delimited array
  # @return [Array] unique, stripped, non-empty items; nil when there are none
  def string2array(s)
    return nil unless s.is_a?(String)
    # Strip before uniq so " a" and "a" collapse into a single entry
    items = s.split(/\s*[;,>]\s*/).collect { |i| i.strip }.reject { |i| i.empty? }.uniq
    items.empty? ? nil : items
  end

  # Slugs handed out so far, shared by all instances in this process
  @@slugs = {}

  # Builds a lowercase "institution-name" slug restricted to a-z, 0-9 and -,
  # unique among the slugs generated so far in this process.
  #
  # @param [String] id the layer id (currently unused)
  # @param [Hash] layer an OGP hash for a given layer
  # @return [String] the slug
  def to_slug(id, layer)
    # strip out schema and usernames
    name = SCHEMA_PREFIXES.inject(layer['Name'].to_s) { |n, prefix| n.sub(prefix, '') }
    # use first word of title if the name is (nearly) empty
    name = layer['LayerDisplayName'].to_s.split.first unless name.size > 1

    # slugs should only have a-z, 0-9, and single dashes
    base = "#{layer['Institution']}-#{name}".gsub(/[^a-zA-Z0-9\-]/, '-').squeeze('-').downcase

    # ensure slugs are unique for this pass; re-draw if the suffix is taken
    slug = base
    while @@slugs.include?(slug)
      slug = base + '-' + sprintf("%06d", Random.rand(999999))
    end
    @@slugs[slug] = true

    slug
  end

  # Ensure that the WMS/WFS/WCS location values are as expected
  #
  # @param [String] id the layer id, used in error messages
  # @param [String] location JSON object of service URLs
  # @return [String] the location as JSON with wms/wcs/wfs values as arrays
  # @raise [ArgumentError] if the JSON is invalid, WMS or both WCS and WFS are
  #   missing, or a URL is not HTTP(S)
  def validate_location(id, location)
    begin
      x = JSON.parse(location)
    rescue JSON::ParserError
      raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}"
    end
    raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}" unless x.is_a?(Hash)

    if x['wms'].nil? || (x['wcs'].nil? && x['wfs'].nil?)
      raise ArgumentError, "ERROR: #{id}: Missing WMS or WCS/WFS: #{x}"
    end

    %w{wms wcs wfs}.each do |protocol|
      next if x[protocol].nil?
      x[protocol] = [x[protocol]] if x[protocol].is_a? String
      unless x[protocol].is_a? Array
        raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
      end

      x[protocol].each do |url|
        begin
          uri = URI.parse(url)
        rescue StandardError
          raise ArgumentError, "ERROR: #{id}: Invalid #{protocol}: #{x}"
        end
        # URI::HTTPS is a subclass of URI::HTTP, so one check covers both
        raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP)
      end
    end

    x.to_json
  end

  # @return [Boolean] whether lon is a number within [-180, 180]
  def lon? lon
    lon.is_a?(Numeric) && lon >= -180 && lon <= 180
  end

  # @return [Boolean] whether lat is a number within [-90, 90]
  def lat? lat
    lat.is_a?(Numeric) && lat >= -90 && lat <= 90
  end

  private

  # Parses the Location JSON of a layer into a Hash of service URLs.
  def parse_location(id, raw)
    raise ArgumentError, "ERROR: #{id} no location" if raw.nil? || raw.empty?
    begin
      location = JSON.parse(raw)
    rescue JSON::ParserError
      raise ArgumentError, "ERROR: #{id} has malformed location"
    end
    raise ArgumentError, "ERROR: #{id} has malformed location" unless location.is_a? Hash
    location
  end

  # Because OGP does not delimit keywords, we use a heuristic here: keyword
  # strings without ; or , and with at least four words are split per word.
  def delimit_keywords(layer)
    %w{PlaceKeywords ThemeKeywords}.each do |k|
      keywords = layer[k]
      next unless keywords.is_a?(String)
      next if keywords =~ /[;,]/ || keywords.split.size < 4
      layer[k] = keywords.split.join(';')
    end
  end

  # Builds the dct_references map and reduces each wcs/wfs/wms entry of
  # +location+ to its first URL.
  # @see https://github.com/OSGeo/Cat-Interop
  def service_references(id, location, purl)
    refs = {}
    %w{wcs wfs wms}.each do |k|
      location[k] = location[k].first if location[k].is_a? Array
      refs["http://www.opengis.net/def/serviceType/ogc/#{k}"] = "#{location[k]}" if location[k]
    end
    if purl
      refs["http://schema.org/thumbnailUrl"] = "http://stacks.stanford.edu/file/druid:#{id}/preview.jpg"
      refs["http://schema.org/url"] = "#{clean_uri(purl)}"
      refs["http://schema.org/DownloadAction"] = "http://stacks.stanford.edu/file/druid:#{id}/data.zip"
      refs["http://www.isotc211.org/schemas/2005/gmd/"] = "#{purl}.iso19139"
      refs["http://www.loc.gov/mods/v3"] = "#{purl}.mods"
    end
    refs
  end
end
310
+
311
+
312
+ # __MAIN__
313
+ #
314
# Transform every valid*.json file in the current directory into the output
# file (default transformed.json) and report the combined statistics.
TransformOgp.new(ARGV[0] || 'transformed.json') do |ogp|
  totals = { :accepted => 0, :rejected => 0 }
  Dir.glob('valid*.json') do |fn|
    per_file = ogp.transform_file(fn)
    [:accepted, :rejected].each { |outcome| totals[outcome] += per_file[outcome] }
  end
  ap({:statistics => totals})
end
323
+
324
+ # example input data
325
+ __END__
326
+ [
327
+ {
328
+ "Abstract": "The boundaries of each supervisorial district in Sonoma County based on 2000 census. Redrawn in 2001 using Autobound.",
329
+ "Access": "Public",
330
+ "Area": 0.9463444815860053,
331
+ "Availability": "Online",
332
+ "CenterX": -122.942159,
333
+ "CenterY": 38.4580755,
334
+ "ContentDate": "2000-01-01T01:01:01Z",
335
+ "DataType": "Polygon",
336
+ "FgdcText": "...",
337
+ "GeoReferenced": true,
338
+ "HalfHeight": 0.39885650000000084,
339
+ "HalfWidth": 0.593161000000002,
340
+ "Institution": "Berkeley",
341
+ "LayerDisplayName": "SCGISDB2_BASE_ADM_SUPERVISOR",
342
+ "LayerId": "28722/bk0012h5s52",
343
+ "Location": "{\"wms\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wms\"],\"tilecache\":[\"http://gis.lib.berkeley.edu:8080/geoserver/gwc/service/wms\"],\"download\":\"\",\"wfs\":[\"http://gis.lib.berkeley.edu:8080/geoserver/wfs\"]}",
344
+ "MaxX": -122.348998,
345
+ "MaxY": 38.856932,
346
+ "MinX": -123.53532,
347
+ "MinY": 38.059219,
348
+ "Name": "ADM_SUPERVISOR",
349
+ "PlaceKeywords": "Sonoma County County of Sonoma Sonoma California Bay Area",
350
+ "Publisher": "UC Berkeley Libraries",
351
+ "ThemeKeywords": "Supervisorial districts 1st District 2nd District 3rd District 4th District 5th District",
352
+ "WorkspaceName": "UCB"
353
+ }
354
+ ]
@@ -0,0 +1,182 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Usage: validate_ogp [output.json]
4
+ #
5
+ # Requires data/*.json as input and output to valid.json
6
+ #
7
+ require 'awesome_print'
8
+ require 'json'
9
+ require 'uri'
10
+ require 'date'
11
+
12
# Checks OGP layer records for the fields and values the later transform step
# relies on, and streams the accepted records to a JSON output file. The
# output is a JSON array in which every record is followed by a comma and the
# array is terminated by an empty {} placeholder.
class ValidateOgp
  # Fields that must be present and non-empty in every record
  REQUIRED_FIELDS = %w{LayerId Name Institution Access MinX MinY MaxX MaxY LayerDisplayName Location}.freeze
  INSTITUTIONS = %w{Berkeley Harvard MIT MassGIS Stanford Tufts}.freeze
  DATA_TYPES = ['Line', 'Paper Map', 'Point', 'Polygon', 'Raster', 'LibraryRecord'].freeze
  ACCESS_LEVELS = %w{Public Restricted}.freeze
  AVAILABILITIES = %w{Online}.freeze

  # @param [String] fn filename of the JSON output file
  # @yield [ValidateOgp] the instance, ready to validate records
  def initialize(fn)
    @wms_servers = {}
    @output = File.open(fn, 'wb')
    @output.write "[\n"
    begin
      yield self
    ensure
      # Terminate the JSON array and close the file even if the block raises
      close
    end
  end

  # @param [String] fn filename of a JSON Solr query result
  # @return [Hash] stats about :accepted vs. :rejected records
  def validate_file(fn)
    stats = { :accepted => 0, :rejected => 0 }
    puts "Validating #{fn}"
    json = JSON.parse(File.read(fn))
    json['response']['docs'].each do |doc| # contains JSON Solr query results
      begin
        validate(doc)
        stats[:accepted] += 1
      rescue ArgumentError => e
        puts e
        stats[:rejected] += 1
      end
    end
    stats
  end

  # Validates one record and appends it to the output file. The record's
  # Availability and Location values are normalized in place.
  #
  # @param [Hash] layer an OGP hash for a given layer
  # @raise [ArgumentError] describing the first problem found
  def validate(layer)
    id = layer['LayerId']

    REQUIRED_FIELDS.each do |k|
      raise ArgumentError, "ERROR: #{id} missing #{k}" if layer[k].nil? || layer[k].to_s.empty?
    end

    %w{MinX MaxX}.each do |lon|
      raise ArgumentError, "ERROR: #{id}: Invalid longitude value: #{layer[lon]}" unless lon?(layer[lon])
    end

    %w{MinY MaxY}.each do |lat|
      raise ArgumentError, "ERROR: #{id} Invalid latitude value: #{layer[lat]}" unless lat?(layer[lat])
    end

    require_member(id, layer, 'Institution', INSTITUTIONS)
    require_member(id, layer, 'DataType', DATA_TYPES)
    require_member(id, layer, 'Access', ACCESS_LEVELS)

    k = 'Availability'
    # cleanup of the capitalization; to_s keeps a missing value from crashing
    layer[k] = 'Online' if layer[k].to_s.downcase == 'online'
    require_member(id, layer, k, AVAILABILITIES)

    k = 'Location'
    layer[k] = validate_location(id, layer[k])
    if layer[k].nil? || layer[k].empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'GeoReferenced'
    unless layer[k].nil? || layer[k] == true
      puts "WARNING: #{id} has boundingbox but claims it is not georeferenced"
      #layer[k] = true
    end

    k = 'Area'
    # A missing or non-numeric area rejects the record instead of crashing
    unless layer[k].is_a?(Numeric) && layer[k] > 0
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end

    k = 'ContentDate'
    if layer[k].to_s.empty?
      raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
    end
    dt = Date.rfc3339(layer[k].to_s)
    if dt.year < 1500 || dt.year > 2100
      raise ArgumentError, "ERROR: #{id} has suspect #{k}: #{layer[k]}"
    end

    # k = 'FgdcText'
    # unless layer[k].nil? or layer[k].empty?
    #   layer[k] = ''
    # end

    @output.write JSON.pretty_generate(layer)
    @output.write "\n,\n"
  end

  # Terminates the JSON array, closes the output file and prints the WMS
  # servers seen. Safe to call twice.
  def close
    return if @output.nil? || @output.closed?
    # The records are each followed by a comma, so end with an empty object
    @output.write "\n {} \n]\n"
    @output.close
    ap({:wms_servers => @wms_servers})
  end

  private

  # Rejects the record unless layer[k] is one of the allowed values.
  def require_member(id, layer, k, allowed)
    return if allowed.include?(layer[k])
    raise ArgumentError, "ERROR: #{id} has unsupported #{k}: #{layer[k]}"
  end

  # Ensure that the WMS/WFS/WCS location values are as expected, and record
  # the first WMS server of the location.
  #
  # @param [String] id the layer id, used in error messages
  # @param [String] location JSON object of service URLs
  # @return [String] the location as JSON with wms/wcs/wfs values as arrays
  # @raise [ArgumentError] if the JSON is invalid, WMS or both WCS and WFS are
  #   missing, or a URL is not HTTP(S)
  def validate_location(id, location)
    begin
      x = JSON.parse(location)
    rescue JSON::ParserError
      raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}"
    end
    raise ArgumentError, "ERROR: #{id}: Invalid JSON: #{location}" unless x.is_a?(Hash)

    if x['wms'].nil? || (x['wcs'].nil? && x['wfs'].nil?)
      raise ArgumentError, "ERROR: #{id}: Missing WMS or WCS/WFS: #{x}"
    end

    %w{wms wcs wfs}.each do |protocol|
      next if x[protocol].nil?
      x[protocol] = [x[protocol]] if x[protocol].is_a? String
      unless x[protocol].is_a? Array
        raise ArgumentError, "ERROR: #{id}: Unknown #{protocol} value: #{x}"
      end

      x[protocol].each do |url|
        begin
          uri = URI.parse(url)
        rescue StandardError
          raise ArgumentError, "ERROR: #{id}: Invalid #{protocol}: #{x}"
        end
        # URI::HTTPS is a subclass of URI::HTTP, so one check covers both
        raise ArgumentError, "ERROR: #{id}: Invalid URL: #{uri}" unless uri.kind_of?(URI::HTTP)
      end
    end

    @wms_servers[x['wms'].first] = true

    x.to_json
  end

  # @return [Boolean] whether lon is a number within [-180, 180]
  def lon? lon
    lon.is_a?(Numeric) && lon >= -180 && lon <= 180
  end

  # @return [Boolean] whether lat is a number within [-90, 90]
  def lat? lat
    lat.is_a?(Numeric) && lat >= -90 && lat <= 90
  end
end
171
+
172
+
173
+ # __MAIN__
174
# Validate every data/*.json file into the output file (default valid.json)
# and report the combined statistics.
ValidateOgp.new(ARGV[0] || 'valid.json') do |ogp|
  totals = { :accepted => 0, :rejected => 0 }
  Dir.glob('data/*.json') do |fn|
    per_file = ogp.validate_file(fn)
    [:accepted, :rejected].each { |outcome| totals[outcome] += per_file[outcome] }
  end
  ap({:statistics => totals})
end