geo_combine 0.7.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +7 -16
- data/.gitignore +1 -0
- data/.rubocop.yml +5 -1
- data/.rubocop_todo.yml +34 -36
- data/README.md +47 -22
- data/geo_combine.gemspec +2 -0
- data/lib/geo_combine/ckan_metadata.rb +5 -4
- data/lib/geo_combine/formatting.rb +1 -1
- data/lib/geo_combine/geo_blacklight_harvester.rb +17 -12
- data/lib/geo_combine/geoblacklight.rb +1 -1
- data/lib/geo_combine/harvester.rb +132 -0
- data/lib/geo_combine/indexer.rb +126 -0
- data/lib/geo_combine/logger.rb +16 -0
- data/lib/geo_combine/migrators/v1_aardvark_migrator.rb +118 -0
- data/lib/geo_combine/ogp.rb +1 -1
- data/lib/geo_combine/railtie.rb +1 -0
- data/lib/geo_combine/version.rb +1 -1
- data/lib/geo_combine.rb +3 -0
- data/lib/tasks/geo_combine.rake +10 -65
- data/spec/fixtures/docs/full_geoblacklight.json +8 -1
- data/spec/fixtures/docs/full_geoblacklight_aardvark.json +51 -0
- data/spec/fixtures/indexing/aardvark.json +57 -0
- data/spec/fixtures/json_docs.rb +6 -0
- data/spec/lib/geo_combine/bounding_box_spec.rb +1 -1
- data/spec/lib/geo_combine/geo_blacklight_harvester_spec.rb +5 -4
- data/spec/lib/geo_combine/geoblacklight_spec.rb +3 -3
- data/spec/lib/geo_combine/harvester_spec.rb +133 -0
- data/spec/lib/geo_combine/indexer_spec.rb +134 -0
- data/spec/lib/geo_combine/migrators/v1_aardvark_migrator_spec.rb +46 -0
- data/spec/lib/geo_combine_spec.rb +20 -17
- data/spec/spec_helper.rb +1 -2
- metadata +46 -9
- data/bin/geocombine +0 -6
- data/lib/geo_combine/cli.rb +0 -27
- data/spec/lib/tasks/geo_combine_spec.rb +0 -45
@@ -0,0 +1,132 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
require 'find'
|
5
|
+
require 'git'
|
6
|
+
require 'net/http'
|
7
|
+
require 'geo_combine/logger'
|
8
|
+
|
9
|
+
module GeoCombine
  # Harvests GeoBlacklight documents from OpenGeoMetadata repositories for indexing.
  class Harvester
    attr_reader :ogm_path, :schema_version

    # Non-metadata repositories that shouldn't be harvested
    def self.denylist
      [
        'GeoCombine',
        'aardvark',
        'metadata-issues',
        'ogm_utils-python',
        'opengeometadata.github.io',
        'opengeometadata-rails',
        'gbl-1_to_aardvark'
      ]
    end

    # GitHub API endpoint for OpenGeoMetadata repositories
    def self.ogm_api_uri
      URI('https://api.github.com/orgs/opengeometadata/repos?per_page=1000')
    end

    # @param ogm_path [String] directory where repositories are cloned (OGM_PATH)
    # @param schema_version [String] geoblacklight schema version to harvest (SCHEMA_VERSION)
    # @param logger [::Logger] destination for progress/diagnostic messages
    def initialize(
      ogm_path: ENV.fetch('OGM_PATH', 'tmp/opengeometadata'),
      schema_version: ENV.fetch('SCHEMA_VERSION', '1.0'),
      logger: GeoCombine::Logger.logger
    )
      @ogm_path = ogm_path
      @schema_version = schema_version
      @logger = logger
    end

    # Enumerate [record, path] pairs to index, for passing to an indexer.
    # Walks ogm_path recursively; skips layers.json manifests, non-JSON files,
    # unparseable JSON, and records whose schema version doesn't match.
    # @yieldparam record [Hash] a single parsed geoblacklight document
    # @yieldparam path [String] the file the record came from
    # @return [Enumerator] when no block is given
    def docs_to_index
      return to_enum(:docs_to_index) unless block_given?

      @logger.info "loading documents from #{ogm_path}"
      Find.find(@ogm_path) do |path|
        # skip non-json and layers.json files
        if File.basename(path) == 'layers.json' || !File.basename(path).end_with?('.json')
          @logger.debug "skipping #{path}; not a geoblacklight JSON document"
          next
        end

        # A single malformed file shouldn't abort the whole harvest: log and move on.
        begin
          doc = JSON.parse(File.read(path))
        rescue JSON::ParserError => e
          @logger.error "skipping #{path}; failed to parse JSON: #{e.message}"
          next
        end

        # A file may hold one record or an array of records
        [doc].flatten.each do |record|
          # skip indexing if this record has a different schema version than what we want
          record_schema = record['gbl_mdVersion_s'] || record['geoblacklight_version']
          record_id = record['layer_slug_s'] || record['dc_identifier_s']
          if record_schema != @schema_version
            @logger.debug "skipping #{record_id}; schema version #{record_schema} doesn't match #{@schema_version}"
            next
          end

          @logger.debug "found record #{record_id} at #{path}"
          yield record, path
        end
      end
    end

    # Update a repository via git.
    # If the repository doesn't exist locally, clone it first.
    # @return [String] the repository name
    def pull(repo)
      repo_path = File.join(@ogm_path, repo)
      clone(repo) unless File.directory? repo_path

      Git.open(repo_path).pull
      @logger.info "updated #{repo}"
      repo
    end

    # Update all repositories.
    # @return [Array<String>] names of repositories updated
    def pull_all
      updated = repositories.map(&method(:pull)).compact
      @logger.info "updated #{updated.size} repositories"
      updated
    end

    # Clone a repository via git (shallow, depth 1).
    # If the repository already exists locally, skip it.
    # @return [String, nil] the repository name, or nil when skipped
    def clone(repo)
      repo_path = File.join(@ogm_path, repo)
      repo_info = repository_info(repo)
      repo_url = "https://github.com/OpenGeoMetadata/#{repo}.git"

      # Skip if exists; warn if archived or empty
      if File.directory? repo_path
        @logger.warn "skipping clone to #{repo_path}; directory exists"
        return nil
      end
      @logger.warn "repository is archived: #{repo_url}" if repo_info['archived']
      @logger.warn "repository is empty: #{repo_url}" if repo_info['size'].zero?

      Git.clone(repo_url, nil, path: ogm_path, depth: 1)
      @logger.info "cloned #{repo_url} to #{repo_path}"
      repo
    end

    # Clone all repositories via git.
    # @return [Array<String>] names of repositories cloned
    def clone_all
      cloned = repositories.map(&method(:clone)).compact
      @logger.info "cloned #{cloned.size} repositories"
      cloned
    end

    private

    # List of harvestable repository names from the GitHub API,
    # excluding empty, archived, and denylisted repositories. Memoized.
    def repositories
      @repositories ||= JSON.parse(Net::HTTP.get(self.class.ogm_api_uri))
                            .filter { |repo| repo['size'].positive? }
                            .reject { |repo| repo['archived'] }
                            .map { |repo| repo['name'] }
                            .reject { |name| self.class.denylist.include? name }
    end

    # Fetch metadata for a single repository from the GitHub API
    def repository_info(repo_name)
      JSON.parse(Net::HTTP.get(URI("https://api.github.com/repos/opengeometadata/#{repo_name}")))
    end
  end
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rsolr'
|
4
|
+
require 'faraday/retry'
|
5
|
+
require 'faraday/net_http_persistent'
|
6
|
+
require 'geo_combine/logger'
|
7
|
+
|
8
|
+
module GeoCombine
  # Indexes GeoBlacklight documents into Solr in batches.
  class Indexer
    attr_reader :solr

    # @param solr [RSolr::Client, nil] connection to use; built from SOLR_URL when nil
    # @param logger [::Logger] destination for progress/diagnostic messages
    def initialize(solr: nil, logger: GeoCombine::Logger.logger)
      @logger = logger
      # Batch size is configurable via the SOLR_BATCH_SIZE environment variable
      @batch_size = ENV.fetch('SOLR_BATCH_SIZE', 100).to_i

      # If SOLR_URL is set, use it; if in a Geoblacklight app, use its solr core
      solr_url = ENV.fetch('SOLR_URL', nil)
      solr_url ||= Blacklight.default_index.connection.base_uri.to_s if defined? Blacklight

      # If neither, warn and try to use local Blacklight default solr core
      if solr_url.nil?
        @logger.warn 'SOLR_URL not set; using Blacklight default'
        solr_url = 'http://localhost:8983/solr/blacklight-core'
      end

      @solr = solr || RSolr.connect(client, url: solr_url)
    end

    # Index everything and return the number of docs successfully indexed.
    # @param docs [Enumerable] yields [doc_hash, path] pairs (see Harvester#docs_to_index)
    # @return [Integer] count of documents successfully indexed
    def index(docs)
      # Track total indexed and time spent
      @logger.info "indexing into #{solr_url}"
      total_indexed = 0
      start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)

      # Accumulate every doc into the batch, flushing whenever it fills up.
      # NOTE: the previous version flushed *instead of* appending when the batch
      # was full, silently dropping the doc that triggered the flush.
      batch = []
      docs.each do |doc, path|
        batch << [doc, path]
        if batch.size >= @batch_size
          total_indexed += index_batch(batch)
          batch = []
        end
      end
      # Flush any remaining partial batch
      total_indexed += index_batch(batch) unless batch.empty?

      # Issue a commit to make sure all documents are indexed
      @solr.commit
      end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      sec = end_time - start_time
      @logger.info format('indexed %<total_indexed>d documents in %<sec>.2f seconds', total_indexed: total_indexed, sec: sec)
      total_indexed
    end

    # URL to the solr instance being used
    def solr_url
      @solr.options[:url]
    end

    private

    # Index a batch of documents; if the batch fails, retry them individually.
    # @return [Integer] count of documents successfully indexed
    def index_batch(batch)
      docs = batch.map(&:first)
      @solr.update(data: batch_json(docs), params: params, headers: headers)
      @logger.debug "indexed batch (#{batch.size} docs)"
      batch.size
    rescue RSolr::Error::Http => e
      @logger.error "error indexing batch (#{batch.size} docs): #{format_error(e)}"
      @logger.warn 'retrying documents individually'
      batch.map { |doc, path| index_single(doc, path) }.compact.size
    end

    # Index a single document; if it fails, log the error and continue.
    # @return [Hash, nil] the doc on success, nil on failure
    def index_single(doc, path)
      @solr.add(doc, params: params, headers: headers)
      @logger.debug "indexed #{path}"
      doc
    rescue RSolr::Error::Http => e
      @logger.error "error indexing #{path}: #{format_error(e)}"
      nil
    end

    # Generate a JSON string to send to solr update API for a batch of documents.
    # Solr's JSON update format uses a repeated "add" key, which is why this is
    # built by hand instead of with a Hash.
    def batch_json(batch)
      batch.map { |doc| "add: { doc: #{doc.to_json} }" }.join(",\n").prepend('{ ').concat(' }')
    end

    # Generate a friendly error message for logging including status code and message
    def format_error(error)
      code = error.response[:status]
      status_info = "#{code} #{RSolr::Error::Http::STATUS_CODES[code.to_i]}"
      error_info = parse_solr_error(error)
      [status_info, error_info].compact.join(' - ')
    end

    # Extract the specific error message from a solr JSON error response, if any
    def parse_solr_error(error)
      JSON.parse(error.response[:body]).dig('error', 'msg')
    rescue StandardError
      nil
    end

    def headers
      { 'Content-Type' => 'application/json' }
    end

    def params
      { overwrite: true }
    end

    # Faraday connection with bounded retries and a persistent adapter,
    # used when we build our own RSolr client.
    def client
      @client ||= Faraday.new do |conn|
        conn.request :retry, max: 3, interval: 1, backoff_factor: 2, exceptions: [
          Faraday::TimeoutError,
          Faraday::ConnectionFailed,
          Faraday::TooManyRequestsError
        ]
        conn.response :raise_error
        conn.adapter :net_http_persistent
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'logger'
|
4
|
+
|
5
|
+
module GeoCombine
  # Shared logging for the gem
  class Logger
    # Lazily builds and memoizes a single ::Logger writing to STDERR.
    # Severity comes from the LOG_LEVEL environment variable ('info' default).
    def self.logger
      @logger ||= begin
        severity = ENV.fetch('LOG_LEVEL', 'info').to_sym
        ::Logger.new($stderr, progname: 'GeoCombine', level: severity)
      end
    end
  end
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'active_support'
|
4
|
+
|
5
|
+
module GeoCombine
  module Migrators
    # Migrates a record from the v1 schema to the aardvark schema
    class V1AardvarkMigrator
      attr_reader :v1_hash

      # @param v1_hash [Hash] parsed json in the v1 schema
      # @param collection_id_map [Hash] a hash mapping collection names to ids for converting dct_isPartOf_sm
      def initialize(v1_hash:, collection_id_map: {})
        @v1_hash = v1_hash
        # Work on a shallow copy so migration never mutates the caller's hash
        # (the previous version aliased it and rewrote its keys in place)
        @v2_hash = v1_hash.dup
        @collection_id_map = collection_id_map
      end

      # Perform the migration and return the aardvark-schema hash.
      # @return [Hash] the migrated record
      def run
        # Return unchanged if already in the aardvark schema
        return @v2_hash if @v2_hash['gbl_mdVersion_s'] == 'Aardvark'

        # Convert the record
        convert_keys
        convert_single_to_multi_valued_fields
        convert_non_crosswalked_fields
        remove_deprecated_fields

        # Mark the record as converted and return it
        @v2_hash['gbl_mdVersion_s'] = 'Aardvark'
        @v2_hash
      end

      # Namespace and URI changes to fields (non-destructive toward v1_hash)
      def convert_keys
        @v2_hash = @v2_hash.transform_keys do |k|
          SCHEMA_FIELD_MAP[k] || k
        end
      end

      # Fields that need to be converted from single to multi-valued
      # (any scalar whose key ends in a multi-valued suffix like _sm/_im)
      def convert_single_to_multi_valued_fields
        @v2_hash = @v2_hash.each_with_object({}) do |(k, v), h|
          h[k] = if !v.is_a?(Array) && k.match?(/.*_[s|i]m/)
                   [v]
                 else
                   v
                 end
        end
      end

      # Convert non-crosswalked fields via lookup tables
      def convert_non_crosswalked_fields
        # Keys may or may not include whitespace, so we normalize them.
        # Resource class is required so we default to "Other"; resource type is not required.
        @v2_hash['gbl_resourceClass_sm'] = RESOURCE_CLASS_MAP[@v1_hash['dc_type_s']&.gsub(/\s+/, '')] || ['Other']
        resource_type = RESOURCE_TYPE_MAP[@v1_hash['layer_geom_type_s']&.gsub(/\s+/, '')]
        @v2_hash['gbl_resourceType_sm'] = resource_type unless resource_type.nil?

        # If the user specified a collection id map, use it to convert the collection names to ids.
        # Plain nil/empty check instead of ActiveSupport's #present? so this works
        # without requiring the core extensions.
        is_part_of = @v1_hash['dct_isPartOf_sm']&.map { |name| @collection_id_map[name] }&.compact
        if is_part_of.nil? || is_part_of.empty?
          @v2_hash.delete('dct_isPartOf_sm')
        else
          @v2_hash['dct_isPartOf_sm'] = is_part_of
        end
      end

      # Remove fields that are no longer used
      def remove_deprecated_fields
        @v2_hash = @v2_hash.except(*SCHEMA_FIELD_MAP.keys, 'dc_type_s', 'layer_geom_type_s')
      end

      SCHEMA_FIELD_MAP = {
        'dc_title_s' => 'dct_title_s', # new namespace
        'dc_description_s' => 'dct_description_sm', # new namespace; single to multi-valued
        'dc_language_s' => 'dct_language_sm', # new namespace; single to multi-valued
        'dc_language_sm' => 'dct_language_sm', # new namespace
        'dc_creator_sm' => 'dct_creator_sm', # new namespace
        'dc_publisher_s' => 'dct_publisher_sm', # new namespace; single to multi-valued
        'dct_provenance_s' => 'schema_provider_s', # new URI name
        'dc_subject_sm' => 'dct_subject_sm', # new namespace
        'solr_geom' => 'dcat_bbox', # new URI name
        'solr_year_i' => 'gbl_indexYear_im', # new URI name; single to multi-valued
        'dc_source_sm' => 'dct_source_sm', # new namespace
        'dc_rights_s' => 'dct_accessRights_s', # new URI name
        'dc_format_s' => 'dct_format_s', # new namespace
        'layer_id_s' => 'gbl_wxsIdentifier_s', # new URI name
        'layer_slug_s' => 'id', # new URI name
        'dc_identifier_s' => 'dct_identifier_sm', # new namespace; single to multi-valued
        'layer_modified_dt' => 'gbl_mdModified_dt', # new URI name
        'geoblacklight_version' => 'gbl_mdVersion_s', # new URI name
        'suppressed_b' => 'gbl_suppressed_b' # new namespace
      }.freeze

      # Map Dublin Core types to Aardvark resource class sets
      # See: https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark/resource-class.md
      RESOURCE_CLASS_MAP = {
        'Collection' => ['Collections'],
        'Dataset' => ['Datasets'],
        'Image' => ['Imagery'],
        'InteractiveResource' => ['Websites'],
        'Service' => ['Web services'],
        'StillImage' => ['Imagery']
      }.freeze

      # Map geometry types to Aardvark resource type sets
      # See: https://github.com/OpenGeoMetadata/opengeometadata.github.io/blob/main/docs/ogm-aardvark/resource-type.md
      RESOURCE_TYPE_MAP = {
        'Point' => ['Point data'],
        'Line' => ['Line data'],
        'Polygon' => ['Polygon data'],
        'Raster' => ['Raster data'],
        'Table' => ['Table data']
      }.freeze
    end
  end
end
|
data/lib/geo_combine/ogp.rb
CHANGED
data/lib/geo_combine/railtie.rb
CHANGED
data/lib/geo_combine/version.rb
CHANGED
data/lib/geo_combine.rb
CHANGED
@@ -72,6 +72,9 @@ require 'geo_combine/ogp'
|
|
72
72
|
# Require harvesting/indexing files
|
73
73
|
require 'geo_combine/geo_blacklight_harvester'
|
74
74
|
|
75
|
+
# Migrators
|
76
|
+
require 'geo_combine/migrators/v1_aardvark_migrator'
|
77
|
+
|
75
78
|
# Require gem files
|
76
79
|
require 'geo_combine/version'
|
77
80
|
require 'geo_combine/railtie' if defined?(Rails)
|
data/lib/tasks/geo_combine.rake
CHANGED
@@ -3,58 +3,29 @@
|
|
3
3
|
require 'json'
|
4
4
|
require 'rsolr'
|
5
5
|
require 'find'
|
6
|
-
require 'geo_combine/geo_blacklight_harvester'
|
7
6
|
require 'faraday/net_http_persistent'
|
7
|
+
require 'geo_combine/harvester'
|
8
|
+
require 'geo_combine/indexer'
|
9
|
+
require 'geo_combine/geo_blacklight_harvester'
|
8
10
|
|
9
11
|
namespace :geocombine do
|
10
12
|
desc 'Clone OpenGeoMetadata repositories'
|
11
13
|
task :clone, [:repo] do |_t, args|
|
12
|
-
|
13
|
-
|
14
|
-
else
|
15
|
-
ogm_api_uri = URI('https://api.github.com/orgs/opengeometadata/repos')
|
16
|
-
ogm_repos = JSON.parse(Net::HTTP.get(ogm_api_uri)).map do |repo|
|
17
|
-
repo['clone_url'] if (repo['size']).positive?
|
18
|
-
end.compact
|
19
|
-
ogm_repos.reject! { |repo| GeoCombineRake.denylist.include?(repo) }
|
20
|
-
end
|
21
|
-
ogm_repos.each do |repo|
|
22
|
-
Kernel.system "echo #{repo} && mkdir -p #{GeoCombineRake.ogm_path} && cd #{GeoCombineRake.ogm_path} && git clone --depth 1 #{repo}"
|
23
|
-
end
|
14
|
+
harvester = GeoCombine::Harvester.new
|
15
|
+
args[:repo] ? harvester.clone(args.repo) : harvester.clone_all
|
24
16
|
end
|
25
17
|
|
26
18
|
desc '"git pull" OpenGeoMetadata repositories'
|
27
19
|
task :pull, [:repo] do |_t, args|
|
28
|
-
|
29
|
-
|
30
|
-
else
|
31
|
-
Dir.glob("#{GeoCombineRake.ogm_path}/*")
|
32
|
-
end
|
33
|
-
paths.each do |path|
|
34
|
-
next unless File.directory?(path)
|
35
|
-
|
36
|
-
Kernel.system "echo #{path} && cd #{path} && git pull origin"
|
37
|
-
end
|
20
|
+
harvester = GeoCombine::Harvester.new
|
21
|
+
args[:repo] ? harvester.pull(args.repo) : harvester.pull_all
|
38
22
|
end
|
39
23
|
|
40
24
|
desc 'Index all JSON documents except Layers.json'
|
41
25
|
task :index do
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
next unless File.basename(path).include?('.json') && File.basename(path) != 'layers.json'
|
46
|
-
|
47
|
-
doc = JSON.parse(File.read(path))
|
48
|
-
[doc].flatten.each do |record|
|
49
|
-
puts "Indexing #{record['layer_slug_s']}: #{path}" if $DEBUG
|
50
|
-
solr.update params: { commitWithin: GeoCombineRake.commit_within, overwrite: true },
|
51
|
-
data: [record].to_json,
|
52
|
-
headers: { 'Content-Type' => 'application/json' }
|
53
|
-
rescue RSolr::Error::Http => e
|
54
|
-
puts e
|
55
|
-
end
|
56
|
-
end
|
57
|
-
solr.commit
|
26
|
+
harvester = GeoCombine::Harvester.new
|
27
|
+
indexer = GeoCombine::Indexer.new
|
28
|
+
indexer.index(harvester.docs_to_index)
|
58
29
|
end
|
59
30
|
|
60
31
|
namespace :geoblacklight_harvester do
|
@@ -66,29 +37,3 @@ namespace :geocombine do
|
|
66
37
|
end
|
67
38
|
end
|
68
39
|
end
|
69
|
-
|
70
|
-
# Class to hold helper methods for use in GeoCombine rake tasks
|
71
|
-
class GeoCombineRake
|
72
|
-
def self.commit_within
|
73
|
-
(ENV['SOLR_COMMIT_WITHIN'] || 5000).to_i
|
74
|
-
end
|
75
|
-
|
76
|
-
def self.denylist
|
77
|
-
[
|
78
|
-
'https://github.com/OpenGeoMetadata/GeoCombine.git',
|
79
|
-
'https://github.com/OpenGeoMetadata/aardvark.git',
|
80
|
-
'https://github.com/OpenGeoMetadata/metadata-issues.git',
|
81
|
-
'https://github.com/OpenGeoMetadata/ogm_utils-python.git',
|
82
|
-
'https://github.com/OpenGeoMetadata/opengeometadata.github.io.git',
|
83
|
-
'https://github.com/OpenGeoMetadata/opengeometadata-rails.git'
|
84
|
-
]
|
85
|
-
end
|
86
|
-
|
87
|
-
def self.ogm_path
|
88
|
-
ENV['OGM_PATH'] || 'tmp/opengeometadata'
|
89
|
-
end
|
90
|
-
|
91
|
-
def self.solr_url
|
92
|
-
ENV['SOLR_URL'] || 'http://127.0.0.1:8983/solr/blacklight-core'
|
93
|
-
end
|
94
|
-
end
|
@@ -28,6 +28,13 @@
|
|
28
28
|
"dct_spatial_sm":[
|
29
29
|
"Uganda"
|
30
30
|
],
|
31
|
+
"dct_isPartOf_sm":[
|
32
|
+
"Uganda GIS Maps and Data, 2000-2010"
|
33
|
+
],
|
34
|
+
"dc_source_sm": [
|
35
|
+
"stanford-rb371kw9607"
|
36
|
+
],
|
31
37
|
"solr_geom":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)",
|
32
|
-
"solr_year_i":2005
|
38
|
+
"solr_year_i":2005,
|
39
|
+
"suppressed_b":false
|
33
40
|
}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
{
|
2
|
+
"gbl_mdVersion_s":"Aardvark",
|
3
|
+
"dct_identifier_sm":[
|
4
|
+
"http://purl.stanford.edu/cz128vq0535"
|
5
|
+
],
|
6
|
+
"dct_title_s":"2005 Rural Poverty GIS Database: Uganda",
|
7
|
+
"dct_description_sm":[
|
8
|
+
"This polygon shapefile contains 2005 poverty data for 855 rural subcounties in Uganda. These data are intended for researchers, students, policy makers and the general public for reference and mapping purposes, and may be used for basic applications such as viewing, querying, and map output production."
|
9
|
+
],
|
10
|
+
"dct_accessRights_s":"Public",
|
11
|
+
"schema_provider_s":"Stanford",
|
12
|
+
"dct_references_s":"{\"http://schema.org/url\":\"http://purl.stanford.edu/cz128vq0535\",\"http://schema.org/downloadUrl\":\"http://stacks.stanford.edu/file/druid:cz128vq0535/data.zip\",\"http://www.loc.gov/mods/v3\":\"http://purl.stanford.edu/cz128vq0535.mods\",\"http://www.isotc211.org/schemas/2005/gmd/\":\"http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/druid:cz128vq0535/iso19139.xml\",\"http://www.w3.org/1999/xhtml\":\"http://opengeometadata.stanford.edu/metadata/edu.stanford.purl/druid:cz128vq0535/default.html\",\"http://www.opengis.net/def/serviceType/ogc/wfs\":\"https://geowebservices.stanford.edu/geoserver/wfs\",\"http://www.opengis.net/def/serviceType/ogc/wms\":\"https://geowebservices.stanford.edu/geoserver/wms\"}",
|
13
|
+
"gbl_wxsIdentifier_s":"druid:cz128vq0535",
|
14
|
+
"id":"stanford-cz128vq0535",
|
15
|
+
"gbl_resourceType_sm": [
|
16
|
+
"Polygon data"
|
17
|
+
],
|
18
|
+
"gbl_mdModified_dt":"2015-01-13T18:46:38Z",
|
19
|
+
"dct_format_s":"Shapefile",
|
20
|
+
"dct_language_sm":[
|
21
|
+
"English"
|
22
|
+
],
|
23
|
+
"gbl_resourceClass_sm":[
|
24
|
+
"Datasets"
|
25
|
+
],
|
26
|
+
"dct_publisher_sm":[
|
27
|
+
"Uganda Bureau of Statistics"
|
28
|
+
],
|
29
|
+
"dct_creator_sm":[
|
30
|
+
"Uganda Bureau of Statistics"
|
31
|
+
],
|
32
|
+
"dct_subject_sm":[
|
33
|
+
"Poverty",
|
34
|
+
"Statistics"
|
35
|
+
],
|
36
|
+
"dct_issued_s":"2005",
|
37
|
+
"dct_temporal_sm":[
|
38
|
+
"2005"
|
39
|
+
],
|
40
|
+
"dct_spatial_sm":[
|
41
|
+
"Uganda"
|
42
|
+
],
|
43
|
+
"dct_source_sm": [
|
44
|
+
"stanford-rb371kw9607"
|
45
|
+
],
|
46
|
+
"dcat_bbox":"ENVELOPE(29.572742, 35.000308, 4.234077, -1.478794)",
|
47
|
+
"gbl_indexYear_im":[
|
48
|
+
2005
|
49
|
+
],
|
50
|
+
"gbl_suppressed_b":false
|
51
|
+
}
|
@@ -0,0 +1,57 @@
|
|
1
|
+
{
|
2
|
+
"dct_title_s": "A description of the coast & city of Goa.",
|
3
|
+
"dct_alternative_sm": [
|
4
|
+
"A description of the coast & city of Goa."
|
5
|
+
],
|
6
|
+
"dct_description_sm": [
|
7
|
+
"Photocopy. Some relief shown pictorially. North oriented to the left. \"The city of Goa & all its dependencies doth justly belong to the crown of England by the mariage [sic] of King Charles the Second with Queen Catherine\"--upper right. \"21\"--upper right. Outside of original margin is image of British Museum rule and \"7-Tab-125.\" Dimensions: 51 x 61 centimeters"
|
8
|
+
],
|
9
|
+
"dct_language_sm": [
|
10
|
+
"eng"
|
11
|
+
],
|
12
|
+
"dct_publisher_sm": [
|
13
|
+
"British Museum? (London?)"
|
14
|
+
],
|
15
|
+
"schema_provider_s": "University of Minnesota",
|
16
|
+
"gbl_resourceClass_sm": [
|
17
|
+
"Maps"
|
18
|
+
],
|
19
|
+
"dcat_keyword_sm": [
|
20
|
+
"Velha Goa (India) Maps",
|
21
|
+
"Maps"
|
22
|
+
],
|
23
|
+
"dct_temporal_sm": [
|
24
|
+
"1900-1999"
|
25
|
+
],
|
26
|
+
"dct_issued_s": "1900 - 1999?",
|
27
|
+
"gbl_indexYear_im": [
|
28
|
+
"1900"
|
29
|
+
],
|
30
|
+
"gbl_dateRange_drsim": [
|
31
|
+
"[1900 TO 1999]"
|
32
|
+
],
|
33
|
+
"dct_spatial_sm": [
|
34
|
+
"India"
|
35
|
+
],
|
36
|
+
"locn_geometry": "ENVELOPE(-2.36,-2.06,11.73,11.5101)",
|
37
|
+
"dcat_bbox": "ENVELOPE(-2.36,-2.06,11.73,11.5101)",
|
38
|
+
"dcat_centroid": "11.620049999999999,-2.21",
|
39
|
+
"pcdm_memberOf_sm": [
|
40
|
+
"64bd8c4c-8e60-4956-b43d-bdc3f93db488"
|
41
|
+
],
|
42
|
+
"dct_isPartOf_sm": [
|
43
|
+
"05d-01"
|
44
|
+
],
|
45
|
+
"dct_rights_sm": [
|
46
|
+
"Use of this item may be governed by US and international copyright laws. You may be able to use this item, but copyright and other considerations may apply. For possible additional information or guidance on your use, please contact the contributing organization."
|
47
|
+
],
|
48
|
+
"dct_accessRights_s": "Public",
|
49
|
+
"dct_format_s": "JPEG",
|
50
|
+
"dct_references_s": "{\"http://schema.org/downloadUrl\":\"http://cdm16022.contentdm.oclc.org/utils/getfile/collection/p16022coll205/id/236/filename/print/page/download/fparams/forcedownload\",\"http://schema.org/url\":\"https://umedia.lib.umn.edu/item/p16022coll205:236\",\"http://iiif.io/api/presentation#manifest\":\"https://cdm16022.contentdm.oclc.org/iiif/info/p16022coll205/236/manifest.json\"}",
|
51
|
+
"id": "p16022coll205:236",
|
52
|
+
"dct_identifier_sm": [
|
53
|
+
"UMN_ALMA:9949551790001701"
|
54
|
+
],
|
55
|
+
"gbl_mdModified_dt": "2022-04-01T15:27:13Z",
|
56
|
+
"gbl_mdVersion_s": "Aardvark"
|
57
|
+
}
|
data/spec/fixtures/json_docs.rb
CHANGED
@@ -15,6 +15,12 @@ module JsonDocs
|
|
15
15
|
File.read(File.join(File.dirname(__FILE__), './docs/full_geoblacklight.json'))
|
16
16
|
end
|
17
17
|
|
18
|
+
##
|
19
|
+
# full_geoblacklight fixture converted to the aardvark schema
|
20
|
+
def full_geoblacklight_aardvark
|
21
|
+
File.read(File.join(File.dirname(__FILE__), './docs/full_geoblacklight_aardvark.json'))
|
22
|
+
end
|
23
|
+
|
18
24
|
##
|
19
25
|
# A sample Esri OpenData metadata record
|
20
26
|
def esri_opendata_metadata
|