search_solr_tools 6.0.0 → 6.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/bin/search_solr_tools +1 -13
- data/lib/search_solr_tools/config/environments.yaml +0 -32
- data/lib/search_solr_tools/harvesters/base.rb +0 -1
- data/lib/search_solr_tools/helpers/solr_format.rb +0 -15
- data/lib/search_solr_tools/helpers/translate_spatial_coverage.rb +0 -1
- data/lib/search_solr_tools/version.rb +1 -1
- data/lib/search_solr_tools.rb +1 -2
- data/search_solr_tools.gemspec +6 -6
- metadata +14 -56
- data/lib/search_solr_tools/harvesters/adc.rb +0 -49
- data/lib/search_solr_tools/harvesters/ade_auto_suggest.rb +0 -46
- data/lib/search_solr_tools/harvesters/bcodmo.rb +0 -64
- data/lib/search_solr_tools/harvesters/data_one.rb +0 -49
- data/lib/search_solr_tools/harvesters/echo.rb +0 -52
- data/lib/search_solr_tools/harvesters/eol.rb +0 -51
- data/lib/search_solr_tools/harvesters/gtnp.rb +0 -67
- data/lib/search_solr_tools/harvesters/ices.rb +0 -58
- data/lib/search_solr_tools/harvesters/ncdc_paleo.rb +0 -62
- data/lib/search_solr_tools/harvesters/nmi.rb +0 -34
- data/lib/search_solr_tools/harvesters/nodc.rb +0 -75
- data/lib/search_solr_tools/harvesters/oai.rb +0 -62
- data/lib/search_solr_tools/harvesters/pdc.rb +0 -40
- data/lib/search_solr_tools/harvesters/r2r.rb +0 -61
- data/lib/search_solr_tools/harvesters/rda.rb +0 -35
- data/lib/search_solr_tools/harvesters/tdar.rb +0 -71
- data/lib/search_solr_tools/harvesters/usgs.rb +0 -76
- data/lib/search_solr_tools/helpers/csw_iso_query_builder.rb +0 -29
- data/lib/search_solr_tools/helpers/data_one_format.rb +0 -74
- data/lib/search_solr_tools/helpers/iso_to_solr.rb +0 -97
- data/lib/search_solr_tools/helpers/iso_to_solr_format.rb +0 -197
- data/lib/search_solr_tools/helpers/ncdc_paleo_format.rb +0 -61
- data/lib/search_solr_tools/helpers/query_builder.rb +0 -13
- data/lib/search_solr_tools/helpers/r2r_format.rb +0 -25
- data/lib/search_solr_tools/helpers/selectors.rb +0 -22
- data/lib/search_solr_tools/helpers/tdar_format.rb +0 -70
- data/lib/search_solr_tools/helpers/usgs_format.rb +0 -50
- data/lib/search_solr_tools/selectors/adc.rb +0 -96
- data/lib/search_solr_tools/selectors/data_one.rb +0 -96
- data/lib/search_solr_tools/selectors/echo_iso.rb +0 -112
- data/lib/search_solr_tools/selectors/ices_iso.rb +0 -108
- data/lib/search_solr_tools/selectors/ncdc_paleo.rb +0 -90
- data/lib/search_solr_tools/selectors/nmi.rb +0 -107
- data/lib/search_solr_tools/selectors/nodc_iso.rb +0 -108
- data/lib/search_solr_tools/selectors/pdc_iso.rb +0 -109
- data/lib/search_solr_tools/selectors/r2r.rb +0 -115
- data/lib/search_solr_tools/selectors/rda.rb +0 -107
- data/lib/search_solr_tools/selectors/tdar_opensearch.rb +0 -91
- data/lib/search_solr_tools/selectors/usgs_iso.rb +0 -107
- data/lib/search_solr_tools/translators/bcodmo_json.rb +0 -89
- data/lib/search_solr_tools/translators/eol_to_solr.rb +0 -84
- data/lib/search_solr_tools/translators/gtnp_json.rb +0 -59
| @@ -1,51 +0,0 @@ | |
| 1 | 
            -
            require_relative 'base'
         | 
| 2 | 
            -
            require 'json'
         | 
| 3 | 
            -
            require 'rgeo/geo_json'
         | 
| 4 | 
            -
             | 
| 5 | 
            -
            module SearchSolrTools
         | 
| 6 | 
            -
              module Harvesters
         | 
| 7 | 
            -
                # Harvests EOL THREDDS catalog entries, translates them and adds them to Solr.
                class Eol < Base
                  # URI.open (used in open_xml_document) is provided by open-uri.
                  require 'open-uri'

                  # Both arguments are forwarded unchanged to the parent initializer.
                  def initialize(env = 'development', die_on_failure = false)
                    super env, die_on_failure
                    @translator = SearchSolrTools::Translators::EolToSolr.new
                  end

                  # Prints the configured EOL endpoints, then calls the parent harvest_and_delete
                  # with the EOL harvest method and a data_centers query built from the EOL long name.
                  def harvest_and_delete
                    puts 'Running harvest of EOL catalog using the following configured EOL URLs:'
                    SearchSolrTools::SolrEnvironments[:common][:eol].each { |x| puts x }
                    super(method(:harvest_eol_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:EOL][:long_name]}\"")
                  end

                  # Builds one JSON "add" document per dataset URL and hands the list to
                  # insert_solr_docs. A dataset that fails to load or translate is logged and
                  # leaves a nil entry in the list, unless @die_on_failure is set, in which
                  # case the error is re-raised.
                  def harvest_eol_into_solr
                    solr_add_queries = eol_dataset_urls.map do |dataset|
                      begin
                        doc = open_xml_document(dataset)
                        if doc.xpath('//xmlns:metadata').size > 1
                          # THREDDS allows for a dataset of datasets, EOL should not utilize this
                          raise "Complex dataset encountered at #{doc.xpath('//xmlns:catalog').to_html}"
                        end
                        metadata_doc = open_xml_document(doc.xpath('//xmlns:metadata')[0]['xlink:href'])
                        { 'add' => { 'doc' => @translator.translate(doc, metadata_doc) } }
                      rescue => e
                        puts "ERROR: #{e}"
                        # doc / metadata_doc are nil here when the failure happened before they were assigned
                        puts "Failed to translate this record: #{doc} -> #{metadata_doc}"
                        raise e if @die_on_failure
                        # NOTE(review): the nil entry is kept on purpose; presumably insert_solr_docs
                        # reports it as a failed document - confirm against Base before compacting.
                        next
                      end
                    end
                    insert_solr_docs solr_add_queries, Base::JSON_CONTENT_TYPE
                  end

                  # Collects the xlink:href of every catalogRef node found at each configured endpoint.
                  def eol_dataset_urls
                    SearchSolrTools::SolrEnvironments[:common][:eol].flat_map do |endpoint|
                      doc = open_xml_document(endpoint)
                      doc.xpath('//xmlns:catalogRef').map { |node| node['xlink:href'] }
                    end
                  end

                  # Fetches +url+ and parses it as strict XML.
                  #
                  # URI.open replaces Kernel#open: Kernel#open no longer fetches URLs on
                  # Ruby 3.0+ and interprets a leading "|" as a command to run. The block
                  # form closes the handle once Nokogiri has read the document.
                  def open_xml_document(url)
                    URI.open(url) { |io| Nokogiri::XML(io, &:strict) }
                  end
                end
         | 
| 50 | 
            -
              end
         | 
| 51 | 
            -
            end
         | 
| @@ -1,67 +0,0 @@ | |
| 1 | 
            -
            require_relative 'base'
         | 
| 2 | 
            -
            require 'json'
         | 
| 3 | 
            -
            require 'rest-client'
         | 
| 4 | 
            -
             | 
| 5 | 
            -
            module SearchSolrTools
         | 
| 6 | 
            -
              module Harvesters
         | 
| 7 | 
            -
                # Pulls JSON from the configured GTN-P endpoints, translates it and adds it to Solr
                class GtnP < Base
                  def initialize(env = 'development', die_on_failure = false)
                    super(env, die_on_failure)
                    @translator = Translators::GtnpJsonToSolr.new
                  end

                  # Returns the parsed JSON response of every configured GTN-P endpoint,
                  # one array element per endpoint.
                  def gtnp_service_urls
                    SearchSolrTools::SolrEnvironments[:common][:gtnp].map { |endpoint| request_json(endpoint) }
                  end

                  # Lists the configured endpoints, then calls the parent harvest_and_delete with
                  # the GTN-P harvest method and a data_centers query for the GTN-P long name.
                  def harvest_and_delete
                    puts 'Running harvest of GTN-P catalog using the following configured GTN-P URLs:'
                    SearchSolrTools::SolrEnvironments[:common][:gtnp].each { |url| puts url }
                    super(method(:harvest_gtnp_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:GTNP][:long_name]}\"")
                  end

                  # Inserts every translated document, then raises if any record failed to translate.
                  def harvest_gtnp_into_solr
                    outcome = translate_gtnp
                    insert_solr_docs(outcome[:add_docs], Base::JSON_CONTENT_TYPE)
                    return if outcome[:failure_ids].length.zero?

                    fail 'Failed to harvest some records from the provider'
                  end

                  # Merges the per-record results of parse_record into a single
                  # { add_docs:, failure_ids: } hash.
                  def translate_gtnp
                    add_docs = []
                    failed = []
                    gtnp_service_urls.each do |record|
                      parsed = parse_record(record)
                      add_docs.concat(parsed[:documents])
                      failed.concat(parsed[:failure_ids])
                    end
                    { add_docs: add_docs, failure_ids: failed }
                  end

                  # GETs +url+ and parses the response body as JSON.
                  def request_json(url)
                    response = RestClient.get(url)
                    JSON.parse(response)
                  end

                  # Translates every element after the first of +record+, passing the first
                  # element along to the translator. On any error the record is logged and its
                  # title is added to failure_ids; documents translated before the error are kept.
                  def parse_record(record)
                    translated = []
                    failed = []
                    begin
                      record.drop(1).each do |dataset|
                        translated << { 'add' => { 'doc' => @translator.translate(dataset, record[0]) } }
                      end
                    rescue => e
                      # NOTE(review): request_json uses JSON.parse without symbolize_names, so if
                      # record[0] is a Hash this symbol lookup likely yields nil - confirm.
                      title = record[0][:title]
                      puts "Failed to add record #{title} with error #{e} (#{e.message}) : #{e.backtrace.join("\n")}"
                      failed << title
                    end
                    { documents: translated, failure_ids: failed }
                  end
                end
         | 
| 66 | 
            -
              end
         | 
| 67 | 
            -
            end
         | 
| @@ -1,58 +0,0 @@ | |
| 1 | 
            -
            require_relative 'base'
         | 
| 2 | 
            -
            require_relative '../helpers/csw_iso_query_builder'
         | 
| 3 | 
            -
             | 
| 4 | 
            -
            module SearchSolrTools
         | 
| 5 | 
            -
              module Harvesters
         | 
| 6 | 
            -
                # Pages through the ICES CSW catalog, translates each entry and inserts it into Solr
                class Ices < Base
                  def initialize(env = 'development', die_on_failure = false)
                    super(env, die_on_failure)
                    @page_size = 100
                    @translator = Helpers::IsoToSolr.new(:ices)
                  end

                  # Announces the catalog URL, then calls the parent harvest_and_delete with the
                  # ICES harvest method and a data_centers query for the ICES long name.
                  def harvest_and_delete
                    puts "Running harvest of ICES catalog from #{ices_url}"
                    super(method(:harvest_ices_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:ICES][:long_name]}\"")
                  end

                  # Main entry point: requests one page at a time, starting at position 1 and
                  # advancing by @page_size, until a page comes back nil/false or empty.
                  # A page that fails is logged and skipped unless @die_on_failure is set.
                  def harvest_ices_into_solr
                    position = 1
                    entries = get_results_from_ices(position)
                    while entries && entries.length > 0
                      begin
                        insert_solr_docs get_docs_with_translated_entries_from_ices(entries)
                      rescue => e
                        puts "ERROR: #{e}"
                        raise e if @die_on_failure
                      end
                      position += @page_size
                      entries = get_results_from_ices(position)
                    end
                  end

                  # ICES endpoint configured for the current environment.
                  def ices_url
                    SolrEnvironments[@environment][:ices_url]
                  end

                  # Fetches one page of gmd:MD_Metadata nodes beginning at +start_index+.
                  def get_results_from_ices(start_index)
                    request = build_csw_request('results', @page_size, start_index)
                    get_results request, '//gmd:MD_Metadata'
                  end

                  # Translates each entry and wraps the translated root in a Solr add document.
                  def get_docs_with_translated_entries_from_ices(entries)
                    entries.map do |entry|
                      translated_root = @translator.translate(entry).root
                      create_new_solr_add_doc_with_child(translated_root)
                    end
                  end

                  # Builds the CSW query string for the ICES endpoint from the given paging values.
                  def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
                    Helpers::CswIsoQueryBuilder.get_query_string(
                      ices_url,
                      'resultType' => resultType,
                      'maxRecords' => maxRecords,
                      'startPosition' => startPosition,
                      'constraintLanguage' => 'CQL_TEXT',
                      'outputSchema' => 'http://www.isotc211.org/2005/gmd'
                    )
                  end
                end
         | 
| 57 | 
            -
              end
         | 
| 58 | 
            -
            end
         | 
| @@ -1,62 +0,0 @@ | |
| 1 | 
            -
            require_relative 'base'
         | 
| 2 | 
            -
            require_relative '../helpers/csw_iso_query_builder'
         | 
| 3 | 
            -
             | 
| 4 | 
            -
            module SearchSolrTools
         | 
| 5 | 
            -
              module Harvesters
         | 
| 6 | 
            -
                # Harvests data from NCDC Paleo and inserts it into Solr after it has been translated
                class NcdcPaleo < Base
                  # Both arguments are forwarded unchanged to the parent initializer.
                  def initialize(env = 'development', die_on_failure = false)
                    super env, die_on_failure
                    @page_size = 50
                    @translator = Helpers::IsoToSolr.new :ncdc_paleo
                  end

                  # Announces the catalog URL, then calls the parent harvest_and_delete with the
                  # NCDC Paleo harvest method and a data_centers query for the NCDC_PALEO long name.
                  def harvest_and_delete
                    puts "Running harvest of NCDC Paleo catalog from #{ncdc_paleo_url}"
                    super(method(:harvest_ncdc_paleo_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NCDC_PALEO][:long_name]}\"")
                  end

                  # Requests pages of @page_size records, starting at position 1, until a page
                  # comes back nil/false or empty. A failing page is logged and skipped unless
                  # @die_on_failure is set, in which case the error is re-raised.
                  def harvest_ncdc_paleo_into_solr
                    start_index = 1
                    while (entries = get_results_from_ncdc_paleo_url(start_index)) && (entries.length > 0)
                      begin
                        insert_solr_docs get_docs_with_translated_entries_from_ncdc_paleo(entries)
                      rescue => e
                        puts "ERROR: #{e}"
                        raise e if @die_on_failure
                      end
                      start_index += @page_size
                    end
                  end

                  # NCDC Paleo endpoint configured for the current environment.
                  def ncdc_paleo_url
                    SolrEnvironments[@environment][:ncdc_paleo_url]
                  end

                  # Fetches one page of csw:Record nodes beginning at +start_index+.
                  def get_results_from_ncdc_paleo_url(start_index)
                    get_results build_csw_request('results', @page_size, start_index), '//csw:Record'
                  end

                  # For every entry, reads its DocID identifier, fetches the full record for that
                  # id, translates it, and appends authoritative_id and dataset_url fields to the
                  # resulting Solr doc. Returns the root node of each add document.
                  def get_docs_with_translated_entries_from_ncdc_paleo(entries)
                    auth_ids = entries.map { |e| e.xpath("./dc:identifier[@scheme='urn:x-esri:specification:ServiceType:ArcIMS:Metadata:DocID']").text }

                    auth_ids.map do |record|
                      result_xml = get_results("https://gis.ncdc.noaa.gov/gptpaleo/csw?getxml=#{record}",
                                               '/rdf:RDF/rdf:Description').first
                      solr_doc = create_new_solr_add_doc_with_child(@translator.translate(result_xml).root)
                      insert_node = solr_doc.at_xpath('//doc')
                      insert_node.add_child("<field name='authoritative_id'>#{record}</field>")
                      # The closing </field> tag was missing here, leaving the fragment malformed.
                      insert_node.add_child("<field name='dataset_url'>https://gis.ncdc.noaa.gov/gptpaleo/catalog/search/resource/details.page?uuid=#{record}</field>")
                      solr_doc.root
                    end
                  end

                  # Builds the CSW query string for the NCDC Paleo endpoint from the given paging values.
                  def build_csw_request(resultType = 'results', maxRecords = '1000', startPosition = '1')
                    Helpers::CswIsoQueryBuilder.get_query_string(ncdc_paleo_url,
                                                                 'resultType' => resultType,
                                                                 'maxRecords' => maxRecords,
                                                                 'startPosition' => startPosition)
                  end
                end
         | 
| 61 | 
            -
              end
         | 
| 62 | 
            -
            end
         | 
| @@ -1,34 +0,0 @@ | |
| 1 | 
            -
            require_relative 'oai'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            module SearchSolrTools
         | 
| 4 | 
            -
              module Harvesters
         | 
| 5 | 
            -
                # OAI harvester configured for the NMI feed
                class Nmi < Oai
                  def initialize(env = 'development', die_on_failure = false)
                    super
                    @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:NMI][:long_name]
                    @translator = Helpers::IsoToSolr.new(:nmi)
                  end

                  # NMI endpoint configured for the current environment.
                  def metadata_url
                    SolrEnvironments[@environment][:nmi_url]
                  end

                  # NMI's feed returns every record in a single response and never supplies a
                  # resumption token, so the token is set to '' up front; an empty token is
                  # what stops the harvest loop.
                  def results
                    @resumption_token = ''
                    get_results(request_string, '//oai:ListRecords', '')
                      .xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
                  end

                  private

                  # Query parameters sent with the ListRecords request.
                  def request_params
                    { verb: 'ListRecords', metadataPrefix: 'dif' }
                  end
                end
         | 
| 33 | 
            -
              end
         | 
| 34 | 
            -
            end
         | 
| @@ -1,75 +0,0 @@ | |
| 1 | 
            -
            require_relative 'base'
         | 
| 2 | 
            -
            require_relative '../helpers/csw_iso_query_builder'
         | 
| 3 | 
            -
             | 
| 4 | 
            -
            module SearchSolrTools
         | 
| 5 | 
            -
              module Harvesters
         | 
| 6 | 
            -
                # Harvests data from NODC and inserts it into Solr after it has been translated
                class Nodc < Base
                  # Both arguments are forwarded unchanged to the parent initializer.
                  def initialize(env = 'development', die_on_failure = false)
                    super env, die_on_failure
                    @page_size = 50
                    @translator = Helpers::IsoToSolr.new :nodc
                  end

                  # Announces the catalog URL, then calls the parent harvest_and_delete with the
                  # NODC harvest method and a data_centers query for the NODC long name.
                  def harvest_and_delete
                    puts "Running harvest of NODC catalog from #{nodc_url}"
                    super(method(:harvest_nodc_into_solr), "data_centers:\"#{Helpers::SolrFormat::DATA_CENTER_NAMES[:NODC][:long_name]}\"")
                  end

                  # Main entry point: gets translated entries from NODC and adds them to Solr.
                  # Requests pages of @page_size records, starting at position 1, until a page
                  # comes back nil/false or empty. A failing page is logged and skipped unless
                  # @die_on_failure is set, in which case the error is re-raised.
                  def harvest_nodc_into_solr
                    start_index = 1
                    while (entries = get_results_from_nodc(start_index)) && (entries.length > 0)
                      begin
                        insert_solr_docs get_docs_with_translated_entries_from_nodc(entries)
                      rescue => e
                        puts "ERROR: #{e}"
                        raise e if @die_on_failure
                      end
                      start_index += @page_size
                    end
                  end

                  # NODC endpoint configured for the current environment.
                  def nodc_url
                    SolrEnvironments[@environment][:nodc_url]
                  end

                  # Fetches one page of gmi:MI_Metadata nodes beginning at +start_index+.
                  def get_results_from_nodc(start_index)
                    get_results build_csw_request('results', @page_size, start_index), '//gmi:MI_Metadata'
                  end

                  # Translates each entry and wraps the translated root in a Solr add document.
                  def get_docs_with_translated_entries_from_nodc(entries)
                    entries.map do |entry|
                      create_new_solr_add_doc_with_child(@translator.translate(entry).root)
                    end
                  end

                  # Builds the CSW query string for the NODC endpoint, including the
                  # bounding-box constraint from bbox_constraint.
                  def build_csw_request(resultType = 'results', maxRecords = '25', startPosition = '1')
                    Helpers::CswIsoQueryBuilder.get_query_string(nodc_url,
                                                                 'resultType' => resultType,
                                                                 'maxRecords' => maxRecords,
                                                                 'startPosition' => startPosition,
                                                                 'constraint' => bbox_constraint,
                                                                 'outputSchema' => 'http://www.isotc211.org/2005/gmd')
                  end

                  # Returns a percent-encoded OGC BBOX filter with lower corner "-180 45" and
                  # upper corner "180 90".
                  #
                  # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape performs the
                  # same escaping and is also available on older Rubies.
                  def bbox_constraint
                    bbox = {
                      west: '-180',
                      south: '45',
                      east: '180',
                      north: '90'
                    }

                    filter = '<Filter xmlns:ogc="http://www.opengis.net/ogc" ' \
                             'xmlns:gml="http://www.opengis.net/gml" ' \
                             'xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0">' \
                             '<ogc:BBOX><PropertyName>apiso:BoundingBox</PropertyName><gml:Envelope>' \
                             "<gml:lowerCorner>#{bbox[:west]} #{bbox[:south]}</gml:lowerCorner>" \
                             "<gml:upperCorner>#{bbox[:east]} #{bbox[:north]}</gml:upperCorner>" \
                             '</gml:Envelope></ogc:BBOX></Filter>'

                    URI::DEFAULT_PARSER.escape(filter)
                  end
                end
         | 
| 74 | 
            -
              end
         | 
| 75 | 
            -
            end
         | 
| @@ -1,62 +0,0 @@ | |
| 1 | 
            -
            require_relative 'base'
         | 
| 2 | 
            -
            require_relative '../helpers/query_builder'
         | 
| 3 | 
            -
             | 
| 4 | 
            -
            require 'json'
         | 
| 5 | 
            -
             | 
| 6 | 
            -
            module SearchSolrTools
              module Harvesters
                # Base class for harvesting OAI feeds into Solr.
                #
                # Subclasses must implement #results, #metadata_url and #request_params.
                # #results is expected to update @resumption_token: #harvest keeps
                # requesting pages while the token is nil (no request made yet) or
                # non-empty, and stops once it is an empty string.
                class Oai < Base
                  # @param env [String] environment name, forwarded to Base
                  # @param die_on_failure [Boolean] when true, harvest errors are re-raised
                  def initialize(env = 'development', die_on_failure = false)
                    super env, die_on_failure
                    # This is updated when we harvest based on the response
                    # from the server.
                    @resumption_token = nil
                  end

                  # Percent-escapes a data provider URL.
                  #
                  # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape applies
                  # the same RFC 2396 escaping.
                  def encode_data_provider_url(url)
                    URI::DEFAULT_PARSER.escape(url)
                  end

                  # Runs #harvest through the superclass's harvest_and_delete, passing a
                  # data_centers constraint built from @data_centers.
                  def harvest_and_delete
                    puts "Running #{self.class.name} at #{metadata_url}"
                    super(method(:harvest), %(data_centers:"#{@data_centers}"))
                  end

                  # Fetches, translates and inserts one page of records per iteration
                  # until the resumption token is an empty string.
                  #
                  # Errors are logged and, when @die_on_failure is set, re-raised.
                  # Otherwise the loop continues with the next page, unless the failed
                  # attempt left the resumption token unchanged: in that case the next
                  # iteration would issue the very same request again, so the loop is
                  # abandoned instead of spinning forever.
                  def harvest
                    while @resumption_token.nil? || !@resumption_token.empty?
                      token_before_attempt = @resumption_token
                      begin
                        insert_solr_docs(translated_docs(results))
                      rescue => e
                        puts "ERROR: #{e.class} #{e}"
                        raise e if @die_on_failure
                        break if @resumption_token == token_before_attempt
                      end
                    end
                  end

                  # Returns the records of the next page; must be provided by subclasses.
                  def results
                    fail NotImplementedError
                  end

                  # Returns the feed URL; must be provided by subclasses.
                  def metadata_url
                    fail NotImplementedError
                  end

                  # Translates each entry with @translator and wraps the root of the
                  # translated document via create_new_solr_add_doc_with_child.
                  def translated_docs(entries)
                    entries.map { |e| create_new_solr_add_doc_with_child(@translator.translate(e).root) }
                  end

                  private

                  # Returns the query parameters for the next request; must be provided
                  # by subclasses.
                  def request_params
                    fail NotImplementedError
                  end

                  # Full request URL: metadata_url followed by the query string built
                  # from request_params.
                  def request_string
                    "#{metadata_url}#{Helpers::QueryBuilder.build(request_params)}"
                  end
                end
              end
            end
         | 
| @@ -1,40 +0,0 @@ | |
| 1 | 
            -
            require_relative 'oai'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            module SearchSolrTools
              module Harvesters
                # Harvests data from Polar data catalogue and inserts it into
                # Solr after it has been translated
                class Pdc < Oai
                  # @param env [String] environment name, forwarded to the superclass
                  # @param die_on_failure [Boolean] forwarded to the superclass
                  def initialize(env = 'development', die_on_failure = false)
                    super
                    @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:PDC][:long_name]
                    @translator = Helpers::IsoToSolr.new :pdc
                  end

                  # URL of the PDC feed for the current environment.
                  def metadata_url
                    SolrEnvironments[@environment][:pdc_url]
                  end

                  # Requests the next ListRecords page, stores its resumption token in
                  # @resumption_token and returns the page's oai:record nodes.
                  #
                  # A response without any <resumptionToken> element is treated as the
                  # end of the list (empty token). Previously this case raised
                  # NoMethodError on nil, which discarded the page's records and left the
                  # previous token in place.
                  def results
                    list_records_oai_response = get_results(request_string, '//oai:ListRecords', '')

                    token_node = list_records_oai_response.xpath('.//oai:resumptionToken', Helpers::IsoNamespaces.namespaces).first
                    @resumption_token = token_node.nil? ? '' : token_node.text

                    list_records_oai_response.xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
                  end

                  private

                  # Query parameters for the next request. Entries whose value is nil are
                  # dropped, so metadataPrefix is only sent on the first request and
                  # resumptionToken only on subsequent ones.
                  def request_params
                    # If a 'resumptionToken' is supplied with any arguments other than 'verb',
                    # the response from PDC gives a badArgument error, saying "The argument
                    # 'resumptionToken' must be supplied without other arguments"
                    {
                      verb: 'ListRecords',
                      metadataPrefix: @resumption_token.nil? ? 'iso' : nil,
                      resumptionToken: @resumption_token
                    }.delete_if { |_k, v| v.nil? }
                  end
                end
              end
            end
         | 
| @@ -1,61 +0,0 @@ | |
| 1 | 
            -
            require_relative 'base'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            require 'nokogiri'
         | 
| 4 | 
            -
            require 'rest-client'
         | 
| 5 | 
            -
             | 
| 6 | 
            -
            module SearchSolrTools
              module Harvesters
                # Harvests the R2R record listing found at the configured r2r_url and
                # inserts the translated records into Solr.
                class R2R < Base
                  # @param env [String] environment name, forwarded to Base
                  # @param die_on_failure [Boolean] forwarded to Base
                  def initialize(env = 'development', die_on_failure = false)
                    super(env, die_on_failure)
                    @metadata_url = SolrEnvironments[@environment][:r2r_url]
                    @translator = Helpers::IsoToSolr.new :r2r
                    @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:R2R][:long_name]
                  end

                  # Runs #harvest through the superclass's harvest_and_delete, passing a
                  # data_centers constraint built from @data_centers.
                  def harvest_and_delete
                    puts "Running #{self.class.name} at #{@metadata_url}"
                    super(method(:harvest), %(data_centers:"#{@data_centers}"))
                  end

                  # rubocop: disable MethodLength
                  # rubocop: disable AbcSize

                  # Reads the listing page at @metadata_url (e.g.
                  # http://get.rvdata.us/services/cruise/), treats every <a> href on it as
                  # a record path relative to @metadata_url, and processes the resulting
                  # URLs in batches of 50. A non-200 listing response skips the harvest.
                  def harvest
                    puts "Getting list of records from #{@data_centers}"
                    RestClient.get(@metadata_url) do |response, _request, _result, &_block|
                      if response.code != 200
                        puts "Got code #{response.code} from #{@metadata_url}, skipping R2R harvest."
                        next
                      end

                      listing = Nokogiri::HTML(response.body)
                      record_urls = listing.xpath('//a').map { |anchor| "#{@metadata_url}#{anchor.attr('href')}" }

                      record_urls.each_slice(50) do |batch|
                        # each record is the first node matching the root
                        # <gmi:MI_Metadata> element of the fetched document
                        records = batch.map { |record_url| get_results(record_url, '//gmi:MI_Metadata').first }

                        begin
                          solr_docs = records.map do |record|
                            create_new_solr_add_doc_with_child(@translator.translate(record).root)
                          end
                          insert_solr_docs(solr_docs)
                        rescue => e
                          puts "ERROR: #{e}"
                          raise e if @die_on_failure
                        end
                      end
                    end
                  end
                end
              end
            end
         | 
| @@ -1,35 +0,0 @@ | |
| 1 | 
            -
            require_relative 'oai'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            module SearchSolrTools
              module Harvesters
                # Harvester for the RDA OAI feed.
                class Rda < Oai
                  # @param env [String] environment name, forwarded to the superclass
                  # @param die_on_failure [Boolean] forwarded to the superclass
                  def initialize(env = 'development', die_on_failure = false)
                    super(env, die_on_failure)
                    @data_centers = Helpers::SolrFormat::DATA_CENTER_NAMES[:RDA][:long_name]
                    @translator = Helpers::IsoToSolr.new(:rda)
                  end

                  # URL of the RDA feed for the current environment.
                  def metadata_url
                    SolrEnvironments[@environment][:rda_url]
                  end

                  # Returns the oai:record nodes of the ListRecords response.
                  #
                  # RDA's feed provides no resumption token and delivers every record in
                  # one response, so the token is set to an empty string up front; an
                  # empty token is what ends the harvest loop.
                  def results
                    @resumption_token = ''
                    get_results(request_string, '//oai:ListRecords', '')
                      .xpath('.//oai:record', Helpers::IsoNamespaces.namespaces)
                  end

                  private

                  # Fixed query parameters for the single ListRecords request.
                  def request_params
                    { verb: 'ListRecords', metadataPrefix: 'dif' }
                  end
                end
              end
            end
         | 
| @@ -1,71 +0,0 @@ | |
| 1 | 
            -
            require_relative 'base'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            module SearchSolrTools
              module Harvesters
                # Harvester for the TDAR catalog: pages through the feed, translates the
                # entries and inserts them into Solr.
                class Tdar < Base
                  # @param env [String] environment name, forwarded to Base
                  # @param die_on_failure [Boolean] forwarded to Base
                  def initialize(env = 'development', die_on_failure = false)
                    super(env, die_on_failure)
                    @translator = Helpers::IsoToSolr.new(:tdar)
                    @page_size = 100
                  end

                  # Runs #harvest_tdar_into_solr through the superclass's
                  # harvest_and_delete, passing a data_centers constraint for TDAR.
                  def harvest_and_delete
                    puts "Running harvest of TDAR catalog from #{tdar_url}"
                    data_center = Helpers::SolrFormat::DATA_CENTER_NAMES[:TDAR][:long_name]
                    super(method(:harvest_tdar_into_solr), %(data_centers:"#{data_center}"))
                  end

                  # Requests pages of @page_size entries until a page comes back empty or
                  # the number of entries seen reaches the total reported by the feed.
                  # Insert/translate errors are logged and re-raised only when
                  # @die_on_failure is set.
                  def harvest_tdar_into_solr
                    expected_count = total_results
                    harvested_count = 0
                    offset = 0

                    while (entries = get_results_from_tdar(offset)) && !entries.length.zero?
                      begin
                        insert_solr_docs(get_docs_with_translated_entries_from_tdar(entries))
                      rescue => e
                        puts "ERROR: #{e}\n\n"
                        raise e if @die_on_failure
                      end

                      # once every expected record has been seen, stop: asking for
                      # another page would result in an error
                      harvested_count += entries.length
                      break unless harvested_count < expected_count

                      offset += @page_size
                    end
                  end

                  # URL of the TDAR feed for the current environment.
                  def tdar_url
                    SolrEnvironments[@environment][:tdar_url]
                  end

                  # Fetches the atom:entry nodes of the page starting at start_record.
                  def get_results_from_tdar(start_record)
                    page_request = build_request(@page_size, start_record)
                    get_results(page_request, './/atom:entry', 'application/xml')
                  end

                  # Translates each entry with @translator and wraps the root of the
                  # translated document via create_new_solr_add_doc_with_child.
                  def get_docs_with_translated_entries_from_tdar(entries)
                    entries.map { |entry| create_new_solr_add_doc_with_child(@translator.translate(entry).root) }
                  end

                  # Builds the search URL: dataset resources inside the envelope
                  # longitude -180..180, latitude 45..90, with the given paging values.
                  def build_request(max_records = '25', start_record = '0')
                    query = [
                      '_tDAR.searchType=ACADIS_RSS',
                      'resourceTypes=DATASET',
                      'groups[0].latitudeLongitudeBoxes[0].maximumLongitude=180',
                      'groups[0].latitudeLongitudeBoxes[0].minimumLatitude=45',
                      'groups[0].latitudeLongitudeBoxes[0].minimumLongitude=-180',
                      'groups[0].latitudeLongitudeBoxes[0].maximumLatitude=90',
                      'geoMode=ENVELOPE',
                      "recordsPerPage=#{max_records}",
                      "startRecord=#{start_record}"
                    ].join('&')

                    tdar_url + "?#{query}"
                  end

                  # Total number of matching records, read from the
                  # opensearch:totalResults element of a zero-record request.
                  def total_results
                    count_request = build_request(0, 0)
                    get_results(count_request, './/opensearch:totalResults').text.to_i
                  end
                end
              end
            end
         |