geo_combine 0.4.0 → 0.6.0

Files changed (61)
  1. checksums.yaml +5 -5
  2. data/.github/workflows/ruby.yml +53 -0
  3. data/.gitignore +2 -0
  4. data/.rubocop.yml +20 -0
  5. data/.rubocop_todo.yml +165 -0
  6. data/Gemfile +3 -1
  7. data/README.md +80 -1
  8. data/Rakefile +4 -2
  9. data/bin/geocombine +1 -0
  10. data/geo_combine.gemspec +5 -0
  11. data/lib/geo_combine/bounding_box.rb +7 -1
  12. data/lib/geo_combine/ckan_metadata.rb +10 -8
  13. data/lib/geo_combine/cli.rb +3 -1
  14. data/lib/geo_combine/esri_open_data.rb +2 -0
  15. data/lib/geo_combine/exceptions.rb +3 -0
  16. data/lib/geo_combine/fgdc.rb +2 -2
  17. data/lib/geo_combine/formats.rb +2 -0
  18. data/lib/geo_combine/formatting.rb +3 -1
  19. data/lib/geo_combine/geo_blacklight_harvester.rb +211 -0
  20. data/lib/geo_combine/geoblacklight.rb +20 -6
  21. data/lib/geo_combine/geometry_types.rb +2 -0
  22. data/lib/geo_combine/iso19139.rb +2 -1
  23. data/lib/geo_combine/ogp.rb +13 -11
  24. data/lib/geo_combine/railtie.rb +2 -0
  25. data/lib/geo_combine/subjects.rb +2 -0
  26. data/lib/geo_combine/version.rb +3 -1
  27. data/lib/geo_combine.rb +7 -3
  28. data/lib/tasks/geo_combine.rake +57 -26
  29. data/lib/xslt/fgdc2html.xsl +38 -9
  30. data/lib/xslt/iso2html.xsl +1107 -1070
  31. data/spec/features/fgdc2html_spec.rb +53 -1
  32. data/spec/features/iso2html_spec.rb +17 -2
  33. data/spec/fixtures/docs/princeton_fgdc.xml +374 -0
  34. data/spec/fixtures/docs/repos.json +3224 -0
  35. data/spec/fixtures/docs/simple_xml.xml +10 -0
  36. data/spec/fixtures/docs/simple_xslt.xsl +11 -0
  37. data/spec/fixtures/docs/stanford_iso.xml +652 -0
  38. data/spec/fixtures/docs/tufts_fgdc.xml +977 -0
  39. data/spec/fixtures/indexing/basic_geoblacklight.json +27 -0
  40. data/spec/fixtures/indexing/geoblacklight.json +33 -0
  41. data/spec/fixtures/indexing/layers.json +16119 -0
  42. data/spec/fixtures/indexing/test.txt +1 -0
  43. data/spec/fixtures/json_docs.rb +2 -0
  44. data/spec/fixtures/xml_docs.rb +9 -1659
  45. data/spec/helpers.rb +7 -7
  46. data/spec/lib/geo_combine/bounding_box_spec.rb +18 -0
  47. data/spec/lib/geo_combine/ckan_metadata_spec.rb +34 -11
  48. data/spec/lib/geo_combine/esri_open_data_spec.rb +23 -2
  49. data/spec/lib/geo_combine/fgdc_spec.rb +41 -10
  50. data/spec/lib/geo_combine/formatting_spec.rb +13 -5
  51. data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb +194 -0
  52. data/spec/lib/geo_combine/geoblacklight_spec.rb +41 -11
  53. data/spec/lib/geo_combine/iso19139_spec.rb +26 -14
  54. data/spec/lib/geo_combine/ogp_spec.rb +28 -8
  55. data/spec/lib/geo_combine_spec.rb +7 -4
  56. data/spec/lib/tasks/geo_combine_spec.rb +45 -0
  57. data/spec/spec_helper.rb +19 -84
  58. data/spec/support/fixtures.rb +9 -0
  59. metadata +103 -6
  60. data/.coveralls.yml +0 -1
  61. data/.travis.yml +0 -7
data/lib/geo_combine/geo_blacklight_harvester.rb ADDED
@@ -0,0 +1,211 @@
+ # frozen_string_literal: true
+
+ module GeoCombine
+   ##
+   # A class to harvest and index results from GeoBlacklight sites
+   # You can configure the sites to be harvested via a configure command.
+   # GeoCombine::GeoBlacklightHarvester.configure do
+   #   {
+   #     SITE: { host: 'https://example.com', params: { f: { dct_provenance_s: ['SITE'] } } }
+   #   }
+   # end
+   # The class configuration also allows for various other things to be configured:
+   #   - A debug parameter to print out details of what is being harvested and indexed
+   #   - crawl delays for each page of results (globally or on a per site basis)
+   #   - Solr's commitWithin parameter (defaults to 5000)
+   #   - A document transformer proc to modify a document before indexing (defaults to removing _version_, score, and timestamp)
+   # Example: GeoCombine::GeoBlacklightHarvester.new('SITE').index
+   class GeoBlacklightHarvester
+     require 'active_support/core_ext/object/to_query'
+
+     class << self
+       attr_writer :document_transformer
+
+       def configure(&block)
+         @config = yield block
+       end
+
+       def config
+         @config || {}
+       end
+
+       def document_transformer
+         @document_transformer || lambda do |document|
+           document.delete('_version_')
+           document.delete('score')
+           document.delete('timestamp')
+           document.delete('solr_bboxtype__minX')
+           document.delete('solr_bboxtype__minY')
+           document.delete('solr_bboxtype__maxX')
+           document.delete('solr_bboxtype__maxY')
+           document
+         end
+       end
+     end
+
+     attr_reader :site, :site_key
+
+     def initialize(site_key)
+       @site_key = site_key
+       @site = self.class.config[site_key]
+
+       raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
+     end
+
+     def index
+       puts "Fetching page 1 @ #{base_url}&page=1" if self.class.config[:debug]
+       response = JSON.parse(Net::HTTP.get(URI("#{base_url}&page=1")))
+       response_class = BlacklightResponseVersionFactory.call(response)
+
+       response_class.new(response: response, base_url: base_url).documents.each do |docs|
+         docs.map! do |document|
+           self.class.document_transformer&.call(document)
+         end.compact
+
+         puts "Adding #{docs.count} documents to solr" if self.class.config[:debug]
+         solr_connection.update params: { commitWithin: commit_within, overwrite: true },
+                                data: docs.to_json,
+                                headers: { 'Content-Type' => 'application/json' }
+
+         sleep(crawl_delay.to_i) if crawl_delay
+       end
+     end
+
+     ##
+     # A "factory" class to determine the blacklight response version to use
+     class BlacklightResponseVersionFactory
+       def self.call(json)
+         keys = json.keys
+         if keys.include?('response')
+           LegacyBlacklightResponse
+         elsif keys.any? && %w[links data].all? { |param| keys.include?(param) }
+           ModernBlacklightResponse
+         else
+           raise NotImplementedError,
+                 "The following json response was not able to be parsed by the GeoBlacklightHarvester\n#{json}"
+         end
+       end
+     end
+
+     class LegacyBlacklightResponse
+       attr_reader :base_url
+       attr_accessor :response, :page
+
+       def initialize(response:, base_url:)
+         @base_url = base_url
+         @response = response
+         @page = 1
+       end
+
+       def documents
+         return enum_for(:documents) unless block_given?
+
+         while current_page && total_pages && (current_page <= total_pages)
+           yield response.dig('response', 'docs')
+
+           break if current_page == total_pages
+
+           self.page += 1
+           puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+
+           begin
+             self.response = JSON.parse(Net::HTTP.get(URI(url)))
+           rescue StandardError => e
+             puts "Request for #{url} failed with #{e}"
+             self.response = nil
+           end
+         end
+       end
+
+       private
+
+       def url
+         "#{base_url}&page=#{page}"
+       end
+
+       def current_page
+         response.dig('response', 'pages', 'current_page')
+       end
+
+       def total_pages
+         response.dig('response', 'pages', 'total_pages')
+       end
+     end
+
+     ##
+     # Class to return documents from the Blacklight API (v7 and above)
+     class ModernBlacklightResponse
+       attr_reader :base_url
+       attr_accessor :response, :page
+
+       def initialize(response:, base_url:)
+         @base_url = base_url
+         @response = response
+         @page = 1
+       end
+
+       def documents
+         return enum_for(:documents) unless block_given?
+
+         while response && response['data'].any?
+           document_urls = response['data'].collect { |data| data.dig('links', 'self') }.compact
+
+           yield documents_from_urls(document_urls)
+
+           url = response.dig('links', 'next')
+           break unless url
+
+           url = "#{url}&format=json"
+           self.page += 1
+           puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+           begin
+             self.response = JSON.parse(Net::HTTP.get(URI(url)))
+           rescue StandardError => e
+             puts "Request for #{url} failed with #{e}"
+             self.response = nil
+           end
+         end
+       end
+
+       private
+
+       def documents_from_urls(urls)
+         puts "Fetching #{urls.count} documents for page #{page}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
+         urls.map do |url|
+           JSON.parse(Net::HTTP.get(URI("#{url}/raw")))
+         rescue StandardError => e
+           puts "Fetching \"#{url}/raw\" failed with #{e}"
+
+           nil
+         end.compact
+       end
+     end
+
+     private
+
+     def base_url
+       "#{site[:host]}?#{default_params.to_query}"
+     end
+
+     def solr_connection
+       solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
+
+       RSolr.connect url: solr_url, adapter: :net_http_persistent
+     end
+
+     def commit_within
+       self.class.config[:commit_within] || '5000'
+     end
+
+     def crawl_delay
+       site[:crawl_delay] || self.class.config[:crawl_delay]
+     end
+
+     def default_params
+       {
+         per_page: 100,
+         format: :json
+       }.merge(site[:params])
+     end
+   end
+ end
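A minimal usage sketch of the new harvester, assuming a hypothetical :SITE key and host; the Solr target comes from ENV['SOLR_URL'], falling back to the local blacklight-core:

```ruby
require 'geo_combine/geo_blacklight_harvester'

# Hypothetical site key and host; params are passed through to the
# GeoBlacklight JSON API when fetching each page of results.
GeoCombine::GeoBlacklightHarvester.configure do
  {
    SITE: { host: 'https://geo.example.edu', params: { f: { dct_provenance_s: ['SITE'] } } },
    debug: true,           # print each page fetch and Solr batch
    crawl_delay: 1,        # seconds to sleep between result pages
    commit_within: 10_000  # Solr commitWithin, in milliseconds
  }
end

# Harvests every page of results and posts the transformed documents to
# Solr (ENV['SOLR_URL'] or http://127.0.0.1:8983/solr/blacklight-core).
GeoCombine::GeoBlacklightHarvester.new(:SITE).index
```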
data/lib/geo_combine/geoblacklight.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  require 'active_support/core_ext/object/blank'
  require 'active_support/core_ext/hash/except'
  require 'open-uri'
@@ -10,8 +12,8 @@ module GeoCombine

    attr_reader :metadata

-   GEOBLACKLIGHT_VERSION = 'v1.1.0'
-   SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight/#{GEOBLACKLIGHT_VERSION}/schema/geoblacklight-schema.json".freeze
+   GEOBLACKLIGHT_VERSION = '1.0'
+   SCHEMA_JSON_URL = "https://raw.githubusercontent.com/geoblacklight/geoblacklight/main/schema/geoblacklight-schema-#{GEOBLACKLIGHT_VERSION}.json"
    DEPRECATED_KEYS_V1 = %w[
      uuid
      georss_polygon_s
@@ -29,7 +31,6 @@ module GeoCombine
    # @param [Hash] fields enhancements to metadata that are merged with @metadata
    def initialize(metadata, fields = {})
      @metadata = JSON.parse(metadata).merge(fields)
-     @schema = nil
    end

    ##
@@ -58,8 +59,7 @@ module GeoCombine
    # Validates a GeoBlacklight-Schema json document
    # @return [Boolean]
    def valid?
-     @schema ||= JSON.parse(open(SCHEMA_JSON_URL).read)
-     JSON::Validator.validate!(@schema, to_json, fragment: '#/properties/layer') &&
+     JSON::Validator.validate!(schema, to_json, fragment: '#/definitions/layer') &&
        dct_references_validate! &&
        spatial_validate!
    end
@@ -69,9 +69,14 @@ module GeoCombine
    # @return [Boolean]
    def dct_references_validate!
      return true unless metadata.key?('dct_references_s') # TODO: shouldn't we require this field?
+
      begin
        ref = JSON.parse(metadata['dct_references_s'])
-       raise GeoCombine::Exceptions::InvalidDCTReferences, 'dct_references must be parsed to a Hash' unless ref.is_a?(Hash)
+       unless ref.is_a?(Hash)
+         raise GeoCombine::Exceptions::InvalidDCTReferences,
+               'dct_references must be parsed to a Hash'
+       end
+
        true
      rescue JSON::ParserError => e
        raise e, "Invalid JSON in dct_references_s: #{e.message}"
@@ -89,6 +94,7 @@ module GeoCombine
    # GeoBlacklight-Schema format
    def translate_formats(key, value)
      return unless key == 'dc_format_s' && formats.include?(value)
+
      metadata[key] = formats[value]
    end

@@ -96,6 +102,7 @@ module GeoCombine
    # Enhances the 'layer_geom_type_s' field by translating from known types
    def translate_geometry_type(key, value)
      return unless key == 'layer_geom_type_s' && geometry_types.include?(value)
+
      metadata[key] = geometry_types[value]
    end

@@ -104,6 +111,7 @@ module GeoCombine
    # categories
    def enhance_subjects(key, value)
      return unless key == 'dc_subject_sm'
+
      metadata[key] = value.map do |val|
        if subjects.include?(val)
          subjects[val]
@@ -118,11 +126,13 @@ module GeoCombine
    # and ISO8601 (for indexing into Solr)
    def format_proper_date(key, value)
      return unless key == 'layer_modified_dt'
+
      metadata[key] = Time.parse(value).utc.iso8601
    end

    def fields_should_be_array(key, value)
      return unless should_be_array.include?(key) && !value.is_a?(Array)
+
      metadata[key] = [value]
    end

@@ -155,5 +165,9 @@ module GeoCombine
      # ensure we have a proper v1 record
      valid?
    end
+
+   def schema
+     @schema ||= JSON.parse(URI.open(SCHEMA_JSON_URL).read)
+   end
  end
  end
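The validation changes above lazy-load the published schema and validate against the '#/definitions/layer' fragment instead of '#/properties/layer'. A minimal sketch of how valid? is exercised, assuming a GeoBlacklight 1.0 JSON document on disk (the filename is hypothetical, and enhance_metadata is the gem's existing normalization helper):

```ruby
# Read a GeoBlacklight JSON document and validate it. valid? fetches the
# schema once (memoized in #schema) and raises on validation failure.
record = GeoCombine::Geoblacklight.new(File.read('geoblacklight.json'))
record.enhance_metadata # normalizes formats, geometry types, subjects, dates
record.valid?           # => true, or raises JSON::Schema::ValidationError
```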
data/lib/geo_combine/geometry_types.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GeoCombine
    module GeometryTypes
      def geometry_types
data/lib/geo_combine/iso19139.rb CHANGED
@@ -1,6 +1,7 @@
+ # frozen_string_literal: true
+
  module GeoCombine
    class Iso19139 < Metadata
-
      ##
      # Returns a Nokogiri::XSLT object containing the ISO19139 to GeoBlacklight
      # XSL
data/lib/geo_combine/ogp.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  require 'active_support/core_ext/object/blank'
  require 'cgi'

@@ -77,15 +79,13 @@ module GeoCombine
    end

    def date
-     begin
-       DateTime.rfc3339(metadata['ContentDate'])
-     rescue
-       nil
-     end
+     DateTime.rfc3339(metadata['ContentDate'])
+   rescue StandardError
+     nil
    end

    def year
-     date.year unless date.nil?
+     date&.year
    end

    ##
@@ -104,9 +104,9 @@ module GeoCombine
    def ogp_formats
      case metadata['DataType']
      when 'Paper Map', 'Raster'
-       return 'GeoTIFF'
+       'GeoTIFF'
      when 'Polygon', 'Point', 'Line'
-       return 'Shapefile'
+       'Shapefile'
      else
        raise ArgumentError, metadata['DataType']
      end
@@ -128,6 +128,7 @@ module GeoCombine
        north >= -90 && north <= 90 &&
        south >= -90 && south <= 90 &&
        west <= east && south <= north
+
      "ENVELOPE(#{west}, #{east}, #{north}, #{south})"
    end

@@ -165,6 +166,7 @@ module GeoCombine

    def download_uri
      return 'http://schema.org/DownloadAction' if institution == 'Harvard'
+
      'http://schema.org/downloadUrl'
    end

@@ -205,7 +207,7 @@ module GeoCombine
      sluggify(filter_name(name))
    end

-   SLUG_BLACKLIST = %w[
+   SLUG_STRIP_VALUES = %w[
      SDE_DATA.
      SDE.
      SDE2.
@@ -216,8 +218,8 @@ module GeoCombine

    def filter_name(name)
      # strip out schema and usernames
-     SLUG_BLACKLIST.each do |blacklisted|
-       name.sub!(blacklisted, '')
+     SLUG_STRIP_VALUES.each do |strip_val|
+       name.sub!(strip_val, '')
      end
      unless name.size > 1
      # use first word of title is empty name
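For reference, the ENVELOPE string built above is Solr's bounding-box syntax, ordered west, east, north, south. A tiny sketch with hypothetical bounds:

```ruby
# A box spanning longitudes -122..-121 and latitudes 37..38.
west, east, north, south = -122.0, -121.0, 38.0, 37.0
"ENVELOPE(#{west}, #{east}, #{north}, #{south})"
# => "ENVELOPE(-122.0, -121.0, 38.0, 37.0)"
```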
data/lib/geo_combine/railtie.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GeoCombine
    class Railtie < Rails::Railtie
      rake_tasks do
data/lib/geo_combine/subjects.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GeoCombine
    ##
    # Translation dictionary to ISO topics
data/lib/geo_combine/version.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module GeoCombine
-   VERSION = '0.4.0'
+   VERSION = '0.6.0'
  end
data/lib/geo_combine.rb CHANGED
@@ -1,10 +1,11 @@
+ # frozen_string_literal: true
+
  require 'nokogiri'
  require 'json'
  require 'json-schema'
  require 'sanitize'

  module GeoCombine
-
    ##
    # TODO: Create a parse method that can interpret the type of metadata being
    # passed in.
@@ -23,7 +24,7 @@ module GeoCombine
    # @param [String] metadata can be a File path
    # "./tmp/edu.stanford.purl/bb/338/jh/0716/iso19139.xml" or a String of XML
    # metadata
-   def initialize metadata
+   def initialize(metadata)
      metadata = File.read metadata if File.readable? metadata
      metadata = Nokogiri::XML(metadata) if metadata.instance_of? String
      @metadata = metadata
@@ -35,7 +36,7 @@ module GeoCombine
    # GeoCombine::Geoblacklight on its instantiation
    # @return [GeoCombine::Geoblacklight] the data transformed into
    # geoblacklight schema, returned as a GeoCombine::Geoblacklight
-   def to_geoblacklight fields = {}
+   def to_geoblacklight(fields = {})
      GeoCombine::Geoblacklight.new(xsl_geoblacklight.apply_to(@metadata), fields)
    end

@@ -68,6 +69,9 @@ require 'geo_combine/esri_open_data'
  require 'geo_combine/ckan_metadata'
  require 'geo_combine/ogp'

+ # Require harvesting/indexing files
+ require 'geo_combine/geo_blacklight_harvester'
+
  # Require gem files
  require 'geo_combine/version'
  require 'geo_combine/railtie' if defined?(Rails)
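With the parenthesized method signatures above, transforming a metadata file still reads the same way; a minimal sketch using the ISO 19139 subclass (the path is the illustrative one from the docstring):

```ruby
require 'geo_combine'

# Parse an ISO 19139 record from disk and transform it to the
# GeoBlacklight schema, merging in an extra field.
iso = GeoCombine::Iso19139.new('./tmp/edu.stanford.purl/bb/338/jh/0716/iso19139.xml')
record = iso.to_geoblacklight('layer_geom_type_s' => 'Polygon')
puts record.to_json
```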
data/lib/tasks/geo_combine.rake CHANGED
@@ -1,16 +1,12 @@
+ # frozen_string_literal: true
+
  require 'net/http'
  require 'json'
  require 'rsolr'
  require 'find'
+ require 'geo_combine/geo_blacklight_harvester'

  namespace :geocombine do
-   commit_within = (ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
-   ogm_path = ENV['OGM_PATH'] || 'tmp/opengeometadata'
-   solr_url = ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
-   whitelist = %w[
-     https://github.com/OpenGeoMetadata/big-ten.git
-   ]
-
    desc 'Clone OpenGeoMetadata repositories'
    task :clone, [:repo] do |_t, args|
      if args.repo
@@ -18,46 +14,81 @@ namespace :geocombine do
      else
        ogm_api_uri = URI('https://api.github.com/orgs/opengeometadata/repos')
        ogm_repos = JSON.parse(Net::HTTP.get(ogm_api_uri)).map do |repo|
-         repo['clone_url'] if repo['size'] > 0
+         repo['clone_url'] if (repo['size']).positive?
        end.compact
-       ogm_repos.select! { |repo| whitelist.include?(repo) || repo =~ /(edu|org|uk)\..*\.git$/ }
+       ogm_repos.reject! { |repo| GeoCombineRake.denylist.include?(repo) }
      end
      ogm_repos.each do |repo|
-       system "echo #{repo} && mkdir -p #{ogm_path} && cd #{ogm_path} && git clone --depth 1 #{repo}"
+       Kernel.system "echo #{repo} && mkdir -p #{GeoCombineRake.ogm_path} && cd #{GeoCombineRake.ogm_path} && git clone --depth 1 #{repo}"
      end
    end

    desc '"git pull" OpenGeoMetadata repositories'
    task :pull, [:repo] do |_t, args|
      paths = if args.repo
-               [File.join(ogm_path, args.repo)]
+               [File.join(GeoCombineRake.ogm_path, args.repo)]
              else
-               Dir.glob("#{ogm_path}/*")
+               Dir.glob("#{GeoCombineRake.ogm_path}/*")
              end
      paths.each do |path|
        next unless File.directory?(path)
-       system "echo #{path} && cd #{path} && git pull origin"
+
+       Kernel.system "echo #{path} && cd #{path} && git pull origin"
      end
    end

-   desc 'Index all of the GeoBlacklight JSON documents'
+   desc 'Index all JSON documents except Layers.json'
    task :index do
-     puts "Indexing #{ogm_path} into #{solr_url}"
-     solr = RSolr.connect url: solr_url, adapter: :net_http_persistent
-     Find.find(ogm_path) do |path|
-       next unless File.basename(path) == 'geoblacklight.json'
+     puts "Indexing #{GeoCombineRake.ogm_path} into #{GeoCombineRake.solr_url}"
+     solr = RSolr.connect url: GeoCombineRake.solr_url, adapter: :net_http_persistent
+     Find.find(GeoCombineRake.ogm_path) do |path|
+       next unless File.basename(path).include?('.json') && File.basename(path) != 'layers.json'
+
        doc = JSON.parse(File.read(path))
        [doc].flatten.each do |record|
-         begin
-           puts "Indexing #{record['layer_slug_s']}: #{path}" if $DEBUG
-           solr.update params: { commitWithin: commit_within, overwrite: true },
-                       data: [record].to_json,
-                       headers: { 'Content-Type' => 'application/json' }
-         rescue RSolr::Error::Http => error
-           puts error
-         end
+         puts "Indexing #{record['layer_slug_s']}: #{path}" if $DEBUG
+         solr.update params: { commitWithin: GeoCombineRake.commit_within, overwrite: true },
+                     data: [record].to_json,
+                     headers: { 'Content-Type' => 'application/json' }
+       rescue RSolr::Error::Http => e
+         puts e
        end
      end
      solr.commit
    end
+
+   namespace :geoblacklight_harvester do
+     desc 'Harvest documents from a configured GeoBlacklight instance'
+     task :index, [:site] => [:environment] do |_t, args|
+       raise ArgumentError, 'A site argument is required' unless args.site
+
+       GeoCombine::GeoBlacklightHarvester.new(args.site.to_sym).index
+     end
+   end
+ end
+
+ # Class to hold helper methods for use in GeoCombine rake tasks
+ class GeoCombineRake
+   def self.commit_within
+     (ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
+   end
+
+   def self.denylist
+     [
+       'https://github.com/OpenGeoMetadata/GeoCombine.git',
+       'https://github.com/OpenGeoMetadata/aardvark.git',
+       'https://github.com/OpenGeoMetadata/metadata-issues.git',
+       'https://github.com/OpenGeoMetadata/ogm_utils-python.git',
+       'https://github.com/OpenGeoMetadata/opengeometadata.github.io.git',
+       'https://github.com/OpenGeoMetadata/opengeometadata-rails.git'
+     ]
+   end
+
+   def self.ogm_path
+     ENV['OGM_PATH'] || 'tmp/opengeometadata'
+   end
+
+   def self.solr_url
+     ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
+   end
  end
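The new GeoCombineRake class replaces the task-local variables with helpers that read the environment at call time, each falling back to the default shown in the diff. A small sketch:

```ruby
# Defaults apply when the ENV vars are unset.
ENV.delete('SOLR_COMMIT_WITHIN')
GeoCombineRake.commit_within # => 5000
GeoCombineRake.ogm_path      # => "tmp/opengeometadata"
GeoCombineRake.solr_url      # => "http://127.0.0.1:8983/solr/blacklight-core"

# Overrides are picked up on the next call, since nothing is memoized.
ENV['SOLR_COMMIT_WITHIN'] = '10000'
GeoCombineRake.commit_within # => 10000
```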