geo_combine 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +5 -5
  2. data/.github/workflows/ruby.yml +53 -0
  3. data/.gitignore +2 -0
  4. data/.rubocop.yml +20 -0
  5. data/.rubocop_todo.yml +165 -0
  6. data/Gemfile +3 -1
  7. data/README.md +80 -1
  8. data/Rakefile +4 -2
  9. data/bin/geocombine +1 -0
  10. data/geo_combine.gemspec +5 -0
  11. data/lib/geo_combine/bounding_box.rb +7 -1
  12. data/lib/geo_combine/ckan_metadata.rb +10 -8
  13. data/lib/geo_combine/cli.rb +3 -1
  14. data/lib/geo_combine/esri_open_data.rb +2 -0
  15. data/lib/geo_combine/exceptions.rb +3 -0
  16. data/lib/geo_combine/fgdc.rb +2 -2
  17. data/lib/geo_combine/formats.rb +2 -0
  18. data/lib/geo_combine/formatting.rb +3 -1
  19. data/lib/geo_combine/geo_blacklight_harvester.rb +211 -0
  20. data/lib/geo_combine/geoblacklight.rb +20 -6
  21. data/lib/geo_combine/geometry_types.rb +2 -0
  22. data/lib/geo_combine/iso19139.rb +2 -1
  23. data/lib/geo_combine/ogp.rb +13 -11
  24. data/lib/geo_combine/railtie.rb +2 -0
  25. data/lib/geo_combine/subjects.rb +2 -0
  26. data/lib/geo_combine/version.rb +3 -1
  27. data/lib/geo_combine.rb +7 -3
  28. data/lib/tasks/geo_combine.rake +57 -26
  29. data/lib/xslt/fgdc2html.xsl +38 -9
  30. data/lib/xslt/iso2html.xsl +1107 -1070
  31. data/spec/features/fgdc2html_spec.rb +53 -1
  32. data/spec/features/iso2html_spec.rb +17 -2
  33. data/spec/fixtures/docs/princeton_fgdc.xml +374 -0
  34. data/spec/fixtures/docs/repos.json +3224 -0
  35. data/spec/fixtures/docs/simple_xml.xml +10 -0
  36. data/spec/fixtures/docs/simple_xslt.xsl +11 -0
  37. data/spec/fixtures/docs/stanford_iso.xml +652 -0
  38. data/spec/fixtures/docs/tufts_fgdc.xml +977 -0
  39. data/spec/fixtures/indexing/basic_geoblacklight.json +27 -0
  40. data/spec/fixtures/indexing/geoblacklight.json +33 -0
  41. data/spec/fixtures/indexing/layers.json +16119 -0
  42. data/spec/fixtures/indexing/test.txt +1 -0
  43. data/spec/fixtures/json_docs.rb +2 -0
  44. data/spec/fixtures/xml_docs.rb +9 -1659
  45. data/spec/helpers.rb +7 -7
  46. data/spec/lib/geo_combine/bounding_box_spec.rb +18 -0
  47. data/spec/lib/geo_combine/ckan_metadata_spec.rb +34 -11
  48. data/spec/lib/geo_combine/esri_open_data_spec.rb +23 -2
  49. data/spec/lib/geo_combine/fgdc_spec.rb +41 -10
  50. data/spec/lib/geo_combine/formatting_spec.rb +13 -5
  51. data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb +194 -0
  52. data/spec/lib/geo_combine/geoblacklight_spec.rb +41 -11
  53. data/spec/lib/geo_combine/iso19139_spec.rb +26 -14
  54. data/spec/lib/geo_combine/ogp_spec.rb +28 -8
  55. data/spec/lib/geo_combine_spec.rb +7 -4
  56. data/spec/lib/tasks/geo_combine_spec.rb +45 -0
  57. data/spec/spec_helper.rb +19 -84
  58. data/spec/support/fixtures.rb +9 -0
  59. metadata +103 -6
  60. data/.coveralls.yml +0 -1
  61. data/.travis.yml +0 -7
data/lib/geo_combine/geo_blacklight_harvester.rb ADDED
@@ -0,0 +1,211 @@
+ # frozen_string_literal: true
+
+ module GeoCombine
+   ##
+   # A class to harvest and index results from GeoBlacklight sites
+   # You can configure the sites to be harvested via a configure command.
+   # GeoCombine::GeoBlacklightHarvester.configure do
+   #   {
+   #     SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } }
+   #   }
+   # end
+   # The class configuration also allows for various other things to be configured:
+   #   - A debug parameter to print out details of what is being harvested and indexed
+   #   - crawl delays for each page of results (globally or on a per site basis)
+   #   - Solr's commitWithin parameter (defaults to 5000)
+   #   - A document transformer proc to modify a document before indexing (defaults to removing _version_, score, and timestamp)
+   # Example: GeoCombine::GeoBlacklightHarvester.new('SITE').index
+   class GeoBlacklightHarvester
+     require 'active_support/core_ext/object/to_query'
+
+     class << self
+       attr_writer :document_transformer
+
+       def configure(&block)
+         @config = yield block
+       end
+
+       def config
+         @config || {}
+       end
+
+       def document_transformer
+         @document_transformer || lambda do |document|
+           document.delete('_version_')
+           document.delete('score')
+           document.delete('timestamp')
+           document.delete('solr_bboxtype__minX')
+           document.delete('solr_bboxtype__minY')
+           document.delete('solr_bboxtype__maxX')
+           document.delete('solr_bboxtype__maxY')
+           document
+         end
+       end
+     end
+
+     attr_reader :site, :site_key
+
+     def initialize(site_key)
+       @site_key = site_key
+       @site = self.class.config[site_key]
+
+       raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
+     end
+
+     def index
+       puts "Fetching page 1 @ #{base_url}&page=1" if self.class.config[:debug]
+       response = JSON.parse(Net::HTTP.get(URI("#{base_url}&page=1")))
+       response_class = BlacklightResponseVersionFactory.call(response)
+
+       response_class.new(response: response, base_url: base_url).documents.each do |docs|
+         docs.map! do |document|
+           self.class.document_transformer&.call(document)
+         end.compact
+
+         puts "Adding #{docs.count} documents to solr" if self.class.config[:debug]
+         solr_connection.update params: { commitWithin: commit_within, overwrite: true },
+                                data: docs.to_json,
+                                headers: { 'Content-Type' => 'application/json' }
+
+         sleep(crawl_delay.to_i) if crawl_delay
+       end
+     end
+
+     ##
+     # A "factory" class to determine the blacklight response version to use
+     class BlacklightResponseVersionFactory
+       def self.call(json)
+         keys = json.keys
+         if keys.include?('response')
+           LegacyBlacklightResponse
+         elsif keys.any? && %w[links data].all? { |param| keys.include?(param) }
+           ModernBlacklightResponse
+         else
+           raise NotImplementedError,
+                 "The following json response was not able to be parsed by the GeoBlacklightHarvester\n#{json}"
+         end
+       end
+     end
+
+     class LegacyBlacklightResponse
+       attr_reader :base_url
+       attr_accessor :response, :page
+
+       def initialize(response:, base_url:)
+         @base_url = base_url
+         @response = response
+         @page = 1
+       end
+
+       def documents
+         return enum_for(:documents) unless block_given?
+
+         while current_page && total_pages && (current_page <= total_pages)
+           yield response.dig('response', 'docs')
+
+           break if current_page == total_pages
+
+           self.page += 1
+           puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+
+           begin
+             self.response = JSON.parse(Net::HTTP.get(URI(url)))
+           rescue StandardError => e
+             puts "Request for #{url} failed with #{e}"
+             self.response = nil
+           end
+         end
+       end
+
+       private
+
+       def url
+         "#{base_url}&page=#{page}"
+       end
+
+       def current_page
+         response.dig('response', 'pages', 'current_page')
+       end
+
+       def total_pages
+         response.dig('response', 'pages', 'total_pages')
+       end
+     end
+
+     ##
+     # Class to return documents from the Blacklight API (v7 and above)
+     class ModernBlacklightResponse
+       attr_reader :base_url
+       attr_accessor :response, :page
+
+       def initialize(response:, base_url:)
+         @base_url = base_url
+         @response = response
+         @page = 1
+       end
+
+       def documents
+         return enum_for(:documents) unless block_given?
+
+         while response && response['data'].any?
+           document_urls = response['data'].collect { |data| data.dig('links', 'self') }.compact
+
+           yield documents_from_urls(document_urls)
+
+           url = response.dig('links', 'next')
+           break unless url
+
+           url = "#{url}&format=json"
+           self.page += 1
+           puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+           begin
+             self.response = JSON.parse(Net::HTTP.get(URI(url)))
+           rescue StandardError => e
+             puts "Request for #{url} failed with #{e}"
+             self.response = nil
+           end
+         end
+       end
+
+       private
+
+       def documents_from_urls(urls)
+         puts "Fetching #{urls.count} documents for page #{page}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+         urls.map do |url|
+           JSON.parse(Net::HTTP.get(URI("#{url}/raw")))
+         rescue StandardError => e
+           puts "Fetching \"#{url}/raw\" failed with #{e}"
+
+           nil
+         end.compact
+       end
+     end
+
+     private
+
+     def base_url
+       "#{site[:host]}?#{default_params.to_query}"
+     end
+
+     def solr_connection
+       solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
+
+       RSolr.connect url: solr_url, adapter: :net_http_persistent
+     end
+
+     def commit_within
+       self.class.config[:commit_within] || '5000'
+     end
+
+     def crawl_delay
+       site[:crawl_delay] || self.class.config[:crawl_delay]
+     end
+
+     def default_params
+       {
+         per_page: 100,
+         format: :json
+       }.merge(site[:params])
+     end
+   end
+ end
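
A minimal usage sketch for the new harvester, assembled from the class comment and methods above; the SITE key, host, and facet value are illustrative placeholders, not values shipped with the gem:

  # Register a site to harvest; top-level keys such as debug and
  # crawl_delay are read back through GeoBlacklightHarvester.config.
  GeoCombine::GeoBlacklightHarvester.configure do
    {
      SITE: {
        host: 'https://geoblacklight.example.edu',
        params: { f: { dct_provenance_s: ['SITE'] } },
        crawl_delay: 1 # optional per-site override of the global delay
      },
      debug: true
    }
  end

  # Pages through the site's JSON results and posts each batch of
  # documents to the Solr instance named by ENV['SOLR_URL'].
  GeoCombine::GeoBlacklightHarvester.new(:SITE).index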
data/lib/geo_combine/geoblacklight.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  require 'active_support/core_ext/object/blank'
  require 'active_support/core_ext/hash/except'
  require 'open-uri'
@@ -10,8 +12,8 @@ module GeoCombine
 
    attr_reader :metadata
 
-   GEOBLACKLIGHT_VERSION = 'v1.1.0'
-   SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight/#{GEOBLACKLIGHT_VERSION}/schema/geoblacklight-schema.json".freeze
+   GEOBLACKLIGHT_VERSION = '1.0'
+   SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight/main/schema/geoblacklight-schema-#{GEOBLACKLIGHT_VERSION}.json"
    DEPRECATED_KEYS_V1 = %w[
      uuid
      georss_polygon_s
@@ -29,7 +31,6 @@ module GeoCombine
    # @param [Hash] fields enhancements to metadata that are merged with @metadata
    def initialize(metadata, fields = {})
      @metadata = JSON.parse(metadata).merge(fields)
-     @schema = nil
    end
 
    ##
@@ -58,8 +59,7 @@ module GeoCombine
    # Validates a GeoBlacklight-Schema json document
    # @return [Boolean]
    def valid?
-     @schema ||= JSON.parse(open(SCHEMA_JSON_URL).read)
-     JSON::Validator.validate!(@schema, to_json, fragment: '#/properties/layer') &&
+     JSON::Validator.validate!(schema, to_json, fragment: '#/definitions/layer') &&
        dct_references_validate! &&
        spatial_validate!
    end
@@ -69,9 +69,14 @@ module GeoCombine
    # @return [Boolean]
    def dct_references_validate!
      return true unless metadata.key?('dct_references_s') # TODO: shouldn't we require this field?
+
      begin
        ref = JSON.parse(metadata['dct_references_s'])
-       raise GeoCombine::Exceptions::InvalidDCTReferences, 'dct_references must be parsed to a Hash' unless ref.is_a?(Hash)
+       unless ref.is_a?(Hash)
+         raise GeoCombine::Exceptions::InvalidDCTReferences,
+               'dct_references must be parsed to a Hash'
+       end
+
        true
      rescue JSON::ParserError => e
        raise e, "Invalid JSON in dct_references_s: #{e.message}"
@@ -89,6 +94,7 @@ module GeoCombine
    # GeoBlacklight-Schema format
    def translate_formats(key, value)
      return unless key == 'dc_format_s' && formats.include?(value)
+
      metadata[key] = formats[value]
    end
 
@@ -96,6 +102,7 @@ module GeoCombine
    # Enhances the 'layer_geom_type_s' field by translating from known types
    def translate_geometry_type(key, value)
      return unless key == 'layer_geom_type_s' && geometry_types.include?(value)
+
      metadata[key] = geometry_types[value]
    end
 
@@ -104,6 +111,7 @@ module GeoCombine
    # categories
    def enhance_subjects(key, value)
      return unless key == 'dc_subject_sm'
+
      metadata[key] = value.map do |val|
        if subjects.include?(val)
          subjects[val]
@@ -118,11 +126,13 @@ module GeoCombine
    # and ISO8601 (for indexing into Solr)
    def format_proper_date(key, value)
      return unless key == 'layer_modified_dt'
+
      metadata[key] = Time.parse(value).utc.iso8601
    end
 
    def fields_should_be_array(key, value)
      return unless should_be_array.include?(key) && !value.is_a?(Array)
+
      metadata[key] = [value]
    end
 
@@ -155,5 +165,9 @@ module GeoCombine
      # ensure we have a proper v1 record
      valid?
    end
+
+   def schema
+     @schema ||= JSON.parse(URI.open(SCHEMA_JSON_URL).read)
+   end
  end
 end
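With this change the schema is no longer fetched inline in valid?: the new private #schema method lazily downloads the pinned 1.0 schema via URI.open and memoizes it, and validation now targets the '#/definitions/layer' fragment. A sketch of how a caller exercises it (the one-field document here is hypothetical; JSON::Validator.validate! raises on documents that do not conform):

  record = GeoCombine::Geoblacklight.new('{"layer_slug_s": "example-layer"}')
  record.valid? # fetches and caches the schema on first call, then runs
                # dct_references_validate! and spatial_validate!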
data/lib/geo_combine/geometry_types.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GeoCombine
    module GeometryTypes
      def geometry_types
data/lib/geo_combine/iso19139.rb CHANGED
@@ -1,6 +1,7 @@
+ # frozen_string_literal: true
+
  module GeoCombine
    class Iso19139 < Metadata
-
      ##
      # Returns a Nokogiri::XSLT object containing the ISO19139 to GeoBlacklight
      # XSL
data/lib/geo_combine/ogp.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  require 'active_support/core_ext/object/blank'
  require 'cgi'
 
@@ -77,15 +79,13 @@ module GeoCombine
    end
 
    def date
-     begin
-       DateTime.rfc3339(metadata['ContentDate'])
-     rescue
-       nil
-     end
+     DateTime.rfc3339(metadata['ContentDate'])
+   rescue StandardError
+     nil
    end
 
    def year
-     date.year unless date.nil?
+     date&.year
    end
 
    ##
@@ -104,9 +104,9 @@ module GeoCombine
    def ogp_formats
      case metadata['DataType']
      when 'Paper Map', 'Raster'
-       return 'GeoTIFF'
+       'GeoTIFF'
      when 'Polygon', 'Point', 'Line'
-       return 'Shapefile'
+       'Shapefile'
      else
        raise ArgumentError, metadata['DataType']
      end
@@ -128,6 +128,7 @@ module GeoCombine
      north >= -90 && north <= 90 &&
      south >= -90 && south <= 90 &&
      west <= east && south <= north
+
      "ENVELOPE(#{west}, #{east}, #{north}, #{south})"
    end
 
@@ -165,6 +166,7 @@ module GeoCombine
 
    def download_uri
      return 'http://schema.org/DownloadAction' if institution == 'Harvard'
+
      'http://schema.org/downloadUrl'
    end
 
@@ -205,7 +207,7 @@ module GeoCombine
      sluggify(filter_name(name))
    end
 
-   SLUG_BLACKLIST = %w[
+   SLUG_STRIP_VALUES = %w[
      SDE_DATA.
      SDE.
      SDE2.
@@ -216,8 +218,8 @@ module GeoCombine
 
    def filter_name(name)
      # strip out schema and usernames
-     SLUG_BLACKLIST.each do |blacklisted|
-       name.sub!(blacklisted, '')
+     SLUG_STRIP_VALUES.each do |strip_val|
+       name.sub!(strip_val, '')
      end
      unless name.size > 1
        # use first word of title is empty name
data/lib/geo_combine/railtie.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GeoCombine
    class Railtie < Rails::Railtie
      rake_tasks do
data/lib/geo_combine/subjects.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GeoCombine
    ##
    # Translation dictionary to ISO topics
data/lib/geo_combine/version.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GeoCombine
-   VERSION = '0.4.0'
+   VERSION = '0.6.0'
  end
data/lib/geo_combine.rb CHANGED
@@ -1,10 +1,11 @@
+ # frozen_string_literal: true
+
  require 'nokogiri'
  require 'json'
  require 'json-schema'
  require 'sanitize'
 
  module GeoCombine
-
    ##
    # TODO: Create a parse method that can interpret the type of metadata being
    # passed in.
@@ -23,7 +24,7 @@ module GeoCombine
    # @param [String] metadata can be a File path
    # "./tmp/edu.stanford.purl/bb/338/jh/0716/iso19139.xml" or a String of XML
    # metadata
-   def initialize metadata
+   def initialize(metadata)
      metadata = File.read metadata if File.readable? metadata
      metadata = Nokogiri::XML(metadata) if metadata.instance_of? String
      @metadata = metadata
@@ -35,7 +36,7 @@ module GeoCombine
    # GeoCombine::Geoblacklight on its instantiation
    # @return [GeoCombine::Geoblacklight] the data transformed into
    # geoblacklight schema, returned as a GeoCombine::Geoblacklight
-   def to_geoblacklight fields = {}
+   def to_geoblacklight(fields = {})
      GeoCombine::Geoblacklight.new(xsl_geoblacklight.apply_to(@metadata), fields)
    end
 
@@ -68,6 +69,9 @@ require 'geo_combine/esri_open_data'
  require 'geo_combine/ckan_metadata'
  require 'geo_combine/ogp'
 
+ # Require harvesting/indexing files
+ require 'geo_combine/geo_blacklight_harvester'
+
  # Require gem files
  require 'geo_combine/version'
  require 'geo_combine/railtie' if defined?(Rails)
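
The signature cleanups above sit on the gem's main entry points, and the flow itself is unchanged. A short sketch of that flow under the same API (the fixture path and field override are illustrative):

  # Parse an ISO 19139 record and transform it to a GeoBlacklight document,
  # merging in an extra field at transformation time.
  iso = GeoCombine::Iso19139.new('./spec/fixtures/docs/stanford_iso.xml')
  record = iso.to_geoblacklight('dc_identifier_s' => 'example-id')
  record.to_json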
data/lib/tasks/geo_combine.rake CHANGED
@@ -1,16 +1,12 @@
+ # frozen_string_literal: true
+
  require 'net/http'
  require 'json'
  require 'rsolr'
  require 'find'
+ require 'geo_combine/geo_blacklight_harvester'
 
  namespace :geocombine do
-   commit_within = (ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
-   ogm_path = ENV['OGM_PATH'] || 'tmp/opengeometadata'
-   solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
-   whitelist = %w[
-     https://github.com/OpenGeoMetadata/big-ten.git
-   ]
-
    desc 'Clone OpenGeoMetadata repositories'
    task :clone, [:repo] do |_t, args|
      if args.repo
@@ -18,46 +14,81 @@ namespace :geocombine do
      else
        ogm_api_uri = URI('https://api.github.com/orgs/opengeometadata/repos')
        ogm_repos = JSON.parse(Net::HTTP.get(ogm_api_uri)).map do |repo|
-         repo['clone_url'] if repo['size'] > 0
+         repo['clone_url'] if (repo['size']).positive?
        end.compact
-       ogm_repos.select! { |repo| whitelist.include?(repo) || repo =~ /(edu|org|uk)\..*\.git$/ }
+       ogm_repos.reject! { |repo| GeoCombineRake.denylist.include?(repo) }
      end
      ogm_repos.each do |repo|
-       system "echo #{repo} && mkdir -p #{ogm_path} && cd #{ogm_path} && git clone --depth 1 #{repo}"
+       Kernel.system "echo #{repo} && mkdir -p #{GeoCombineRake.ogm_path} && cd #{GeoCombineRake.ogm_path} && git clone --depth 1 #{repo}"
      end
    end
 
    desc '"git pull" OpenGeoMetadata repositories'
    task :pull, [:repo] do |_t, args|
      paths = if args.repo
-               [File.join(ogm_path, args.repo)]
+               [File.join(GeoCombineRake.ogm_path, args.repo)]
              else
-               Dir.glob("#{ogm_path}/*")
+               Dir.glob("#{GeoCombineRake.ogm_path}/*")
              end
      paths.each do |path|
        next unless File.directory?(path)
-       system "echo #{path} && cd #{path} && git pull origin"
+
+       Kernel.system "echo #{path} && cd #{path} && git pull origin"
      end
    end
 
-   desc 'Index all of the GeoBlacklight JSON documents'
+   desc 'Index all JSON documents except Layers.json'
    task :index do
-     puts "Indexing #{ogm_path} into #{solr_url}"
-     solr = RSolr.connect url: solr_url, adapter: :net_http_persistent
-     Find.find(ogm_path) do |path|
-       next unless File.basename(path) == 'geoblacklight.json'
+     puts "Indexing #{GeoCombineRake.ogm_path} into #{GeoCombineRake.solr_url}"
+     solr = RSolr.connect url: GeoCombineRake.solr_url, adapter: :net_http_persistent
+     Find.find(GeoCombineRake.ogm_path) do |path|
+       next unless File.basename(path).include?('.json') && File.basename(path) != 'layers.json'
+
       doc = JSON.parse(File.read(path))
       [doc].flatten.each do |record|
-         begin
-           puts "Indexing #{record['layer_slug_s']}: #{path}" if $DEBUG
-           solr.update params: { commitWithin: commit_within, overwrite: true },
-                       data: [record].to_json,
-                       headers: { 'Content-Type' => 'application/json' }
-         rescue RSolr::Error::Http => error
-           puts error
-         end
+         puts "Indexing #{record['layer_slug_s']}: #{path}" if $DEBUG
+         solr.update params: { commitWithin: GeoCombineRake.commit_within, overwrite: true },
+                     data: [record].to_json,
+                     headers: { 'Content-Type' => 'application/json' }
+       rescue RSolr::Error::Http => e
+         puts e
       end
     end
     solr.commit
   end
+
+   namespace :geoblacklight_harvester do
+     desc 'Harvest documents from a configured GeoBlacklight instance'
+     task :index, [:site] => [:environment] do |_t, args|
+       raise ArgumentError, 'A site argument is required' unless args.site
+
+       GeoCombine::GeoBlacklightHarvester.new(args.site.to_sym).index
+     end
+   end
+ end
+
+ # Class to hold helper methods for use in GeoCombine rake tasks
+ class GeoCombineRake
+   def self.commit_within
+     (ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
+   end
+
+   def self.denylist
+     [
+       'https://github.com/OpenGeoMetadata/GeoCombine.git',
+       'https://github.com/OpenGeoMetadata/aardvark.git',
+       'https://github.com/OpenGeoMetadata/metadata-issues.git',
+       'https://github.com/OpenGeoMetadata/ogm_utils-python.git',
+       'https://github.com/OpenGeoMetadata/opengeometadata.github.io.git',
+       'https://github.com/OpenGeoMetadata/opengeometadata-rails.git'
+     ]
+   end
+
+   def self.ogm_path
+     ENV['OGM_PATH'] || 'tmp/opengeometadata'
+   end
+
+   def self.solr_url
+     ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
+   end
 end
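
After this refactor, the tasks read their settings through GeoCombineRake class methods (backed by the OGM_PATH, SOLR_URL, and SOLR_COMMIT_WITHIN environment variables) rather than namespace-local variables, and repository filtering switches from a whitelist plus domain regex to an explicit denylist of non-metadata repositories. Invocation is unchanged: rake geocombine:clone, rake geocombine:pull, and rake geocombine:index work as before, and the new rake geocombine:geoblacklight_harvester:index[SITE] task requires a site argument matching a key configured on GeoCombine::GeoBlacklightHarvester.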