maltese 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +19 -0
- data/.gitignore +55 -0
- data/.rubocop.yml +1156 -0
- data/.travis.yml +26 -0
- data/CHANGELOG.md +5 -0
- data/Dockerfile +16 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +128 -0
- data/LICENSE.md +21 -0
- data/README.md +51 -0
- data/bin/maltese +5 -0
- data/lib/maltese/cli.rb +30 -0
- data/lib/maltese/sitemap.rb +140 -0
- data/lib/maltese/utils.rb +87 -0
- data/lib/maltese/version.rb +3 -0
- data/lib/maltese.rb +8 -0
- data/maltese.gemspec +37 -0
- data/public/sitemap.xml.gz +0 -0
- data/spec/cli_spec.rb +43 -0
- data/spec/fixtures/sitemap.json +7574 -0
- data/spec/fixtures/sitemap_nil.json +11 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_no_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +86 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +86 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +270 -0
- data/spec/sitemap_spec.rb +111 -0
- data/spec/spec_helper.rb +95 -0
- metadata +310 -0
    
        data/.travis.yml
    ADDED
    
    | @@ -0,0 +1,26 @@ | |
| 1 | 
            +
            language: ruby
         | 
| 2 | 
            +
            rvm:
         | 
| 3 | 
            +
            - 2.3.3
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            addons:
         | 
| 6 | 
            +
              code_climate:
         | 
| 7 | 
            +
                repo_token: "$CODECLIMATE_REPO_TOKEN"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            install:
         | 
| 10 | 
            +
            - travis_retry bundle install
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            script:
         | 
| 13 | 
            +
            - bundle exec rspec
         | 
| 14 | 
            +
            - bundle exec codeclimate-test-reporter
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            notifications:
         | 
| 17 | 
            +
              email: false
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            deploy:
         | 
| 20 | 
            +
              provider: rubygems
         | 
| 21 | 
            +
              api_key:
         | 
| 22 | 
            +
                secure: uT9YVkRp1usg+glYDSG7KJkm8CQGI8pZDbHlmbPz6ibbA8DVyjbBtYjGbvODCoRkisC24kSy31gMqBSmIxLG0ICv2tOy/iaoiuVeUk6NFfP4dcVGsDueQXjqd6Fjw6fCBg42sojwAVWzvDP2EVjQnbcZqROasLPmKuC2qrm+f9aSYLXmGyBtpvJ5FsfpW33OvE3qJD3y0AlPMdCihPe03FVzSiLNMmGuYOH97MucuWGbUJN+tSFiBfqIrAGT2TQXFrdiT3HtxEt+vNH0cGoLQAKgTgx4XPAcKEjg/cML5yhY/OcPR0uNgqdjxqS3faaH31r1xZaGGfHTf9dj++123YLNHbI8odyA9eF+jYU/3D8UnmMpsTNGZXCFUS8xVUobDcejhPBNhqGPLruLtbvIaqpVZ2bF9BOY1F0ILp4GERzUUUxws+BB1EJ6zFpNrDl7MHlqrc+gRZWcWlazQ82BmLQsTVHiab3ZerGCP4+kYiNeyEnsa3wmVDDd2iffU05Bse44/W1/BKmlzV0QfYl1iMA8lkCrgqmslFecCf0xA01v4CF2Hv63PxOeNmNvZm4VIkgy9uPBjD91AVdscSzCRuTc149OluBqUoxUToX9rEegheUXhWs6ww6DHtlRQI+OBNauRUCo7Fb2zV+gTNzUCSln0fE+z9aLhduuA8JLAKk=
         | 
| 23 | 
            +
              gem: maltese
         | 
| 24 | 
            +
              on:
         | 
| 25 | 
            +
                tags: true
         | 
| 26 | 
            +
                repo: datacite/maltese
         | 
    
        data/CHANGELOG.md
    ADDED
    
    
    
        data/Dockerfile
    ADDED
    
    | @@ -0,0 +1,16 @@ | |
| 1 | 
            +
            FROM phusion/passenger-full:0.9.20
         | 
| 2 | 
            +
            MAINTAINER Martin Fenner "mfenner@datacite.org"
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Install Ruby 2.3.3
         | 
| 5 | 
            +
            RUN bash -lc 'rvm --default use ruby-2.3.3'
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            ENV PATH="/usr/local/rvm/gems/ruby-2.3.3/bin:${PATH}"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # Update installed APT packages, clean up APT when done.
         | 
| 10 | 
            +
            RUN apt-get update && apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \
         | 
| 11 | 
            +
                apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            # Install maltese gem
         | 
| 14 | 
            +
            RUN /sbin/setuser app gem install maltese
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            CMD maltese sitemap --sitemap_bucket $SITEMAP_BUCKET --sitemap_url $SITEMAP_URL --from_date $FROM_DATE --until_date $UNTIL_DATE
         | 
    
        data/Gemfile
    ADDED
    
    
    
        data/Gemfile.lock
    ADDED
    
    | @@ -0,0 +1,128 @@ | |
| 1 | 
            +
            PATH
         | 
| 2 | 
            +
              remote: .
         | 
| 3 | 
            +
              specs:
         | 
| 4 | 
            +
                maltese (0.1.2)
         | 
| 5 | 
            +
                  activesupport (~> 4.2, >= 4.2.5)
         | 
| 6 | 
            +
                  fog-aws (~> 0.7.6)
         | 
| 7 | 
            +
                  maremma (~> 3.5)
         | 
| 8 | 
            +
                  mime-types (~> 3.1)
         | 
| 9 | 
            +
                  sitemap_generator (~> 5.1)
         | 
| 10 | 
            +
                  thor (~> 0.19)
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            GEM
         | 
| 13 | 
            +
              remote: https://rubygems.org/
         | 
| 14 | 
            +
              specs:
         | 
| 15 | 
            +
                activesupport (4.2.8)
         | 
| 16 | 
            +
                  i18n (~> 0.7)
         | 
| 17 | 
            +
                  minitest (~> 5.1)
         | 
| 18 | 
            +
                  thread_safe (~> 0.3, >= 0.3.4)
         | 
| 19 | 
            +
                  tzinfo (~> 1.1)
         | 
| 20 | 
            +
                addressable (2.5.0)
         | 
| 21 | 
            +
                  public_suffix (~> 2.0, >= 2.0.2)
         | 
| 22 | 
            +
                builder (3.2.3)
         | 
| 23 | 
            +
                codeclimate-test-reporter (1.0.6)
         | 
| 24 | 
            +
                  simplecov
         | 
| 25 | 
            +
                crack (0.4.3)
         | 
| 26 | 
            +
                  safe_yaml (~> 1.0.0)
         | 
| 27 | 
            +
                diff-lcs (1.3)
         | 
| 28 | 
            +
                docile (1.1.5)
         | 
| 29 | 
            +
                excon (0.45.4)
         | 
| 30 | 
            +
                faraday (0.9.2)
         | 
| 31 | 
            +
                  multipart-post (>= 1.2, < 3)
         | 
| 32 | 
            +
                faraday-encoding (0.0.4)
         | 
| 33 | 
            +
                  faraday
         | 
| 34 | 
            +
                faraday_middleware (0.10.1)
         | 
| 35 | 
            +
                  faraday (>= 0.7.4, < 1.0)
         | 
| 36 | 
            +
                fog-aws (0.7.6)
         | 
| 37 | 
            +
                  fog-core (~> 1.27)
         | 
| 38 | 
            +
                  fog-json (~> 1.0)
         | 
| 39 | 
            +
                  fog-xml (~> 0.1)
         | 
| 40 | 
            +
                  ipaddress (~> 0.8)
         | 
| 41 | 
            +
                fog-core (1.37.0)
         | 
| 42 | 
            +
                  builder
         | 
| 43 | 
            +
                  excon (~> 0.45)
         | 
| 44 | 
            +
                  formatador (~> 0.2)
         | 
| 45 | 
            +
                fog-json (1.0.2)
         | 
| 46 | 
            +
                  fog-core (~> 1.0)
         | 
| 47 | 
            +
                  multi_json (~> 1.10)
         | 
| 48 | 
            +
                fog-xml (0.1.2)
         | 
| 49 | 
            +
                  fog-core
         | 
| 50 | 
            +
                  nokogiri (~> 1.5, >= 1.5.11)
         | 
| 51 | 
            +
                formatador (0.2.5)
         | 
| 52 | 
            +
                hashdiff (0.3.2)
         | 
| 53 | 
            +
                i18n (0.8.1)
         | 
| 54 | 
            +
                ipaddress (0.8.3)
         | 
| 55 | 
            +
                json (2.0.3)
         | 
| 56 | 
            +
                maremma (3.5.1)
         | 
| 57 | 
            +
                  activesupport (~> 4.2, >= 4.2.5)
         | 
| 58 | 
            +
                  addressable (>= 2.3.6)
         | 
| 59 | 
            +
                  builder (~> 3.2, >= 3.2.2)
         | 
| 60 | 
            +
                  excon (~> 0.45.0)
         | 
| 61 | 
            +
                  faraday (~> 0.9.2)
         | 
| 62 | 
            +
                  faraday-encoding (~> 0.0.1)
         | 
| 63 | 
            +
                  faraday_middleware (~> 0.10.0)
         | 
| 64 | 
            +
                  multi_json (~> 1.11.2)
         | 
| 65 | 
            +
                  nokogiri (~> 1.6.7)
         | 
| 66 | 
            +
                  oj (~> 2.18, >= 2.18.1)
         | 
| 67 | 
            +
                mime-types (3.1)
         | 
| 68 | 
            +
                  mime-types-data (~> 3.2015)
         | 
| 69 | 
            +
                mime-types-data (3.2016.0521)
         | 
| 70 | 
            +
                mini_portile2 (2.1.0)
         | 
| 71 | 
            +
                minitest (5.10.1)
         | 
| 72 | 
            +
                multi_json (1.11.3)
         | 
| 73 | 
            +
                multipart-post (2.0.0)
         | 
| 74 | 
            +
                nokogiri (1.6.8.1)
         | 
| 75 | 
            +
                  mini_portile2 (~> 2.1.0)
         | 
| 76 | 
            +
                oj (2.18.1)
         | 
| 77 | 
            +
                public_suffix (2.0.5)
         | 
| 78 | 
            +
                rack (2.0.1)
         | 
| 79 | 
            +
                rack-test (0.6.3)
         | 
| 80 | 
            +
                  rack (>= 1.0)
         | 
| 81 | 
            +
                rake (12.0.0)
         | 
| 82 | 
            +
                rspec (3.5.0)
         | 
| 83 | 
            +
                  rspec-core (~> 3.5.0)
         | 
| 84 | 
            +
                  rspec-expectations (~> 3.5.0)
         | 
| 85 | 
            +
                  rspec-mocks (~> 3.5.0)
         | 
| 86 | 
            +
                rspec-core (3.5.4)
         | 
| 87 | 
            +
                  rspec-support (~> 3.5.0)
         | 
| 88 | 
            +
                rspec-expectations (3.5.0)
         | 
| 89 | 
            +
                  diff-lcs (>= 1.2.0, < 2.0)
         | 
| 90 | 
            +
                  rspec-support (~> 3.5.0)
         | 
| 91 | 
            +
                rspec-mocks (3.5.0)
         | 
| 92 | 
            +
                  diff-lcs (>= 1.2.0, < 2.0)
         | 
| 93 | 
            +
                  rspec-support (~> 3.5.0)
         | 
| 94 | 
            +
                rspec-support (3.5.0)
         | 
| 95 | 
            +
                safe_yaml (1.0.4)
         | 
| 96 | 
            +
                simplecov (0.12.0)
         | 
| 97 | 
            +
                  docile (~> 1.1.0)
         | 
| 98 | 
            +
                  json (>= 1.8, < 3)
         | 
| 99 | 
            +
                  simplecov-html (~> 0.10.0)
         | 
| 100 | 
            +
                simplecov-html (0.10.0)
         | 
| 101 | 
            +
                sitemap_generator (5.3.0)
         | 
| 102 | 
            +
                  builder (~> 3.0)
         | 
| 103 | 
            +
                thor (0.19.4)
         | 
| 104 | 
            +
                thread_safe (0.3.6)
         | 
| 105 | 
            +
                tzinfo (1.2.2)
         | 
| 106 | 
            +
                  thread_safe (~> 0.1)
         | 
| 107 | 
            +
                vcr (3.0.3)
         | 
| 108 | 
            +
                webmock (1.24.6)
         | 
| 109 | 
            +
                  addressable (>= 2.3.6)
         | 
| 110 | 
            +
                  crack (>= 0.3.2)
         | 
| 111 | 
            +
                  hashdiff
         | 
| 112 | 
            +
             | 
| 113 | 
            +
            PLATFORMS
         | 
| 114 | 
            +
              ruby
         | 
| 115 | 
            +
             | 
| 116 | 
            +
            DEPENDENCIES
         | 
| 117 | 
            +
              bundler (~> 1.0)
         | 
| 118 | 
            +
              codeclimate-test-reporter (~> 1.0, >= 1.0.0)
         | 
| 119 | 
            +
              maltese!
         | 
| 120 | 
            +
              rack-test (~> 0)
         | 
| 121 | 
            +
              rake (~> 12.0)
         | 
| 122 | 
            +
              rspec (~> 3.4)
         | 
| 123 | 
            +
              simplecov (~> 0.12.0)
         | 
| 124 | 
            +
              vcr (~> 3.0, >= 3.0.3)
         | 
| 125 | 
            +
              webmock (~> 1.22, >= 1.22.3)
         | 
| 126 | 
            +
             | 
| 127 | 
            +
            BUNDLED WITH
         | 
| 128 | 
            +
               1.12.5
         | 
    
        data/LICENSE.md
    ADDED
    
    | @@ -0,0 +1,21 @@ | |
| 1 | 
            +
            MIT License
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Copyright (c) 2017 DataCite
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         | 
| 6 | 
            +
            of this software and associated documentation files (the "Software"), to deal
         | 
| 7 | 
            +
            in the Software without restriction, including without limitation the rights
         | 
| 8 | 
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         | 
| 9 | 
            +
            copies of the Software, and to permit persons to whom the Software is
         | 
| 10 | 
            +
            furnished to do so, subject to the following conditions:
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            The above copyright notice and this permission notice shall be included in all
         | 
| 13 | 
            +
            copies or substantial portions of the Software.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         | 
| 16 | 
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         | 
| 17 | 
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         | 
| 18 | 
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         | 
| 19 | 
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         | 
| 20 | 
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         | 
| 21 | 
            +
            SOFTWARE.
         | 
    
        data/README.md
    ADDED
    
    | @@ -0,0 +1,51 @@ | |
| 1 | 
            +
            [](https://travis-ci.org/datacite/maltese)
         | 
| 2 | 
            +
            [](https://codeclimate.com/github/datacite/maltese)
         | 
| 3 | 
            +
            [](https://codeclimate.com/github/datacite/maltese/coverage)
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            # Maltese
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            Ruby gem and command-line tool for generating sitemap files from the DataCite REST API. Uses the [SitemapGenerator](https://github.com/kjvarga/sitemap_generator) gem and can be run as a Docker container, e.g. using ECS and triggered by AWS Lambda, as described [here](https://medium.com/@pahud/ecs-task-runner-with-lambda-4594b72ccb#.5xpmf2inz).
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            Run as a command-line tool:
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            ```
         | 
| 12 | 
            +
            maltese sitemap --from_date 2017-02-15
         | 
| 13 | 
            +
            ```
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            ## Installation
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            Requires Ruby 2.2 or later. Then add the following to your `Gemfile` to install the
         | 
| 18 | 
            +
            latest version:
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            ```ruby
         | 
| 21 | 
            +
            gem 'maltese'
         | 
| 22 | 
            +
            ```
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            Then run `bundle install` to install into your environment.
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            You can also install the gem system-wide in the usual way:
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            ```bash
         | 
| 29 | 
            +
            gem install maltese
         | 
| 30 | 
            +
            ```
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            ## Development
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            We use rspec for unit testing:
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            ```
         | 
| 37 | 
            +
            bundle exec rspec
         | 
| 38 | 
            +
            ```
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            Follow along via [GitHub Issues](https://github.com/datacite/maltese/issues).
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            ### Note on Patches/Pull Requests
         | 
| 43 | 
            +
             | 
| 44 | 
            +
            * Fork the project
         | 
| 45 | 
            +
            * Write tests for your new feature or a test that reproduces a bug
         | 
| 46 | 
            +
            * Implement your feature or make a bug fix
         | 
| 47 | 
            +
            * Do not mess with Rakefile, version or history
         | 
| 48 | 
            +
            * Commit, push and make a pull request. Bonus points for topical branches.
         | 
| 49 | 
            +
             | 
| 50 | 
            +
            ## License
         | 
| 51 | 
            +
            **maltese** is released under the [MIT License](https://github.com/datacite/maltese/blob/master/LICENSE.md).
         | 
    
        data/bin/maltese
    ADDED
    
    
    
        data/lib/maltese/cli.rb
    ADDED
    
    | @@ -0,0 +1,30 @@ | |
| 1 | 
            +
            # encoding: UTF-8
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require "thor"
         | 
| 4 | 
            +
            require_relative 'sitemap'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            module Maltese
              # Thor-based command-line interface for the maltese gem.
              #
              # Exposes two commands: a version printer (`--version` / `-v`) and
              # `sitemap`, which builds a Maltese::Sitemap from the given options
              # and runs it.
              class CLI < Thor
                # Tell Thor to treat command failures as fatal for the process.
                def self.exit_on_failure?
                  true
                end

                # from http://stackoverflow.com/questions/22809972/adding-a-version-option-to-a-ruby-thor-cli
                map %w[--version -v] => :__print_version

                desc "--version, -v", "print the version"
                # Print the version of this gem.
                #
                # Uses Maltese::VERSION (presumably defined in maltese/version.rb);
                # the previous reference to Toccatore::VERSION pointed at a constant
                # from a different gem.
                def __print_version
                  puts Maltese::VERSION
                end

                desc "sitemap", "generate sitemap for DataCite Search"
                # NOTE: the defaults below are evaluated once, when this class is
                # loaded, not each time the command is invoked.
                method_option :sitemap_bucket, type: :string, default: ENV['SITEMAP_BUCKET']
                method_option :sitemap_url, type: :string, default: ENV['SITEMAP_URL']
                method_option :from_date, type: :string, default: (Time.now.to_date - 1.day).iso8601
                method_option :until_date, type: :string, default: Time.now.to_date.iso8601
                # Build a sitemap generator from the parsed command-line options
                # and queue the sitemap jobs.
                def sitemap
                  sitemap = Maltese::Sitemap.new(options)
                  sitemap.queue_jobs
                end
              end
            end
         | 
| @@ -0,0 +1,140 @@ | |
| 1 | 
            +
            module Maltese
         | 
| 2 | 
            +
              class Sitemap
         | 
| 3 | 
            +
                attr_reader :sitemap_bucket, :sitemap_url, :from_date, :until_date
         | 
| 4 | 
            +
             | 
| 5 | 
            +
                def initialize(attributes={})
         | 
| 6 | 
            +
                  @sitemap_bucket = attributes[:sitemap_bucket]|| "sitemaps.datacite.org"
         | 
| 7 | 
            +
                  @sitemap_url = attributes[:sitemap_url] || "https://search.datacite.org"
         | 
| 8 | 
            +
                  @from_date = attributes[:from_date].presence || (Time.now.to_date - 1.day).iso8601
         | 
| 9 | 
            +
                  @until_date = attributes[:until_date].presence || Time.now.to_date.iso8601
         | 
| 10 | 
            +
                end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                # load ENV variables from container environment if json file exists
         | 
| 13 | 
            +
                # see https://github.com/phusion/baseimage-docker#envvar_dumps
         | 
| 14 | 
            +
                env_json_file = "/etc/container_environment.json"
         | 
| 15 | 
            +
                if File.size?(env_json_file).to_i > 2
         | 
| 16 | 
            +
                  env_vars = JSON.parse(File.read(env_json_file))
         | 
| 17 | 
            +
                  env_vars.each { |k, v| ENV[k] = v }
         | 
| 18 | 
            +
                end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                def search_path
         | 
| 21 | 
            +
                  "#{sitemap_url}/api?"
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                def sitemaps_host
         | 
| 25 | 
            +
                  "http://#{sitemap_bucket}.s3.amazonaws.com/"
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                def sitemaps_path
         | 
| 29 | 
            +
                  'sitemaps/'
         | 
| 30 | 
            +
                end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                def timeout
         | 
| 33 | 
            +
                  120
         | 
| 34 | 
            +
                end
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                def job_batch_size
         | 
| 37 | 
            +
                  50000
         | 
| 38 | 
            +
                end
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                def sitemap
         | 
| 41 | 
            +
                  @sitemap ||= SitemapGenerator::LinkSet.new(
         | 
| 42 | 
            +
                    default_host: sitemap_url,
         | 
| 43 | 
            +
                    adapter: s3_adapter,
         | 
| 44 | 
            +
                    sitemaps_host: sitemaps_host,
         | 
| 45 | 
            +
                    sitemaps_path: sitemaps_path,
         | 
| 46 | 
            +
                    finalize: false)
         | 
| 47 | 
            +
                end
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                def s3_adapter
         | 
| 50 | 
            +
                  SitemapGenerator::S3Adapter.new(fog_provider: 'AWS',
         | 
| 51 | 
            +
                                                  aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
         | 
| 52 | 
            +
                                                  aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
         | 
| 53 | 
            +
                                                  fog_directory: sitemap_bucket,
         | 
| 54 | 
            +
                                                  fog_region: ENV['AWS_REGION'])
         | 
| 55 | 
            +
                end
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                def fog_storage
         | 
| 58 | 
            +
                  Fog::Storage.new(provider: 'AWS',
         | 
| 59 | 
            +
                                   aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
         | 
| 60 | 
            +
                                   aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'])
         | 
| 61 | 
            +
                end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                def queue_jobs(options={})
         | 
| 64 | 
            +
                  total = get_total(options)
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                  if total > 0
         | 
| 67 | 
            +
                    puts process_data(options.merge(total: total))
         | 
| 68 | 
            +
                  else
         | 
| 69 | 
            +
                    puts "No works found for date range #{from_date} - #{until_date}."
         | 
| 70 | 
            +
                  end
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                  # return number of works queued
         | 
| 73 | 
            +
                  total
         | 
| 74 | 
            +
                end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                def get_total(options={})
         | 
| 77 | 
            +
                  query_url = get_query_url(options.merge(rows: 0))
         | 
| 78 | 
            +
                  result = Maremma.get(query_url, options)
         | 
| 79 | 
            +
                  result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
         | 
| 80 | 
            +
                end
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                def get_query_url(options={})
         | 
| 83 | 
            +
                  options[:offset] = options[:offset].to_i || 0
         | 
| 84 | 
            +
                  options[:rows] = options[:rows].presence || job_batch_size
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                  updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
         | 
| 87 | 
            +
                  fq = "#{updated} AND has_metadata:true AND is_active:true"
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                  params = { q: "*:*",
         | 
| 90 | 
            +
                             fq: fq,
         | 
| 91 | 
            +
                             start: options[:offset],
         | 
| 92 | 
            +
                             rows: options[:rows],
         | 
| 93 | 
            +
                             fl: "doi,updated",
         | 
| 94 | 
            +
                             sort: "updated asc",
         | 
| 95 | 
            +
                             wt: "json" }
         | 
| 96 | 
            +
                  search_path + URI.encode_www_form(params)
         | 
| 97 | 
            +
                end
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                def process_data(options = {})
         | 
| 100 | 
            +
                  options[:start_time] = Time.now
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                  # walk through paginated results
         | 
| 103 | 
            +
                  total_pages = (options[:total].to_f / job_batch_size).ceil
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                  (0...total_pages).each do |page|
         | 
| 106 | 
            +
                    options[:offset] = page * job_batch_size
         | 
| 107 | 
            +
                    data = get_data(options.merge(timeout: timeout))
         | 
| 108 | 
            +
                    parse_data(data)
         | 
| 109 | 
            +
                  end
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                  push_data(options)
         | 
| 112 | 
            +
                end
         | 
| 113 | 
            +
             | 
| 114 | 
            +
                def get_data(options={})
         | 
| 115 | 
            +
                  query_url = get_query_url(options)
         | 
| 116 | 
            +
                  Maremma.get(query_url, options)
         | 
| 117 | 
            +
                end
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                # Add one sitemap entry per document found in the response.
                #
                # result  - response object whose #body is a Hash
                # options - accepted but not used here
                #
                # Returns the "errors" value of the body when it is present,
                # otherwise the sitemap's current link count.
                def parse_data(result, options={})
                  errors = result.body.fetch("errors", nil)
                  return result.body.fetch("errors") if errors.present?

                  response = result.body.fetch("data", {}).fetch('response', {})
                  docs = response.fetch('docs', nil)

                  # Array() turns a missing docs list (nil) into an empty loop
                  Array(docs).each do |doc|
                    path = "/works/" + doc.fetch("doi")
                    sitemap.add path, changefreq: "monthly", lastmod: doc.fetch("updated")
                  end

                  sitemap.sitemap.link_count
                end
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                # Finalize the sitemap and report how long the run took.
                #
                # options - Hash; :start_time is read, and set to the current time
                #           on the given hash when it is missing.
                #
                # Returns the sitemap's link count.
                def push_data(options={})
                  # sync time with AWS S3 before uploading
                  fog_storage.sync_clock

                  sitemap.finalize!

                  started_at = (options[:start_time] ||= Time.now)
                  elapsed = Time.now - started_at
                  sitemap.sitemap_index.stats_summary(time_taken: elapsed)

                  sitemap.sitemap.link_count
                end
         | 
| 139 | 
            +
              end
         | 
| 140 | 
            +
            end
         | 
| @@ -0,0 +1,87 @@ | |
| 1 | 
            +
            module Maltese
              # Helper methods for querying works and writing them to a sitemap.
              #
              # The methods call from_date, until_date, url, job_batch_size, timeout,
              # sitemap and fog_storage, none of which are defined in this module;
              # the including object has to provide them.
              module Utils
                # load ENV variables from container environment if json file exists
                # see https://github.com/phusion/baseimage-docker#envvar_dumps
                env_json_file = "/etc/container_environment.json"
                if File.size?(env_json_file).to_i > 2
                  env_vars = JSON.parse(File.read(env_json_file))
                  env_vars.each { |k, v| ENV[k] = v }
                end

                # Count the matching works and, when there are any, process them.
                #
                # options - Hash; :offset and :rows are normalised in place
                #           (:offset to an Integer, :rows to job_batch_size when blank).
                #
                # Prints the result of process_data, or a "No works found" message.
                # Returns the total number of works found.
                def queue_jobs(options={})
                  # nil.to_i is already 0, so no further fallback is needed
                  options[:offset] = options[:offset].to_i
                  options[:rows] = options[:rows].presence || job_batch_size

                  total = get_total(options)

                  if total > 0
                    puts process_data(options.merge(total: total))
                  else
                    puts "No works found for date range #{from_date} - #{until_date}."
                  end

                  # return number of works queued
                  total
                end

                # Ask for zero rows and read the hit count from the response.
                #
                # Returns the "numFound" value, or 0 when the response lacks it.
                def get_total(options={})
                  query_url = get_query_url(options.merge(rows: 0))
                  result = Maremma.get(query_url, options)
                  result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
                end

                # Build the query URL for active works with metadata that were
                # updated between from_date and until_date.
                #
                # options - Hash; :offset becomes the "start" parameter and :rows
                #           the "rows" parameter.
                #
                # Returns url with the form-encoded parameters appended.
                def get_query_url(options={})
                  updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
                  fq = "#{updated} AND has_metadata:true AND is_active:true"

                  params = { q: "*:*",
                             fq: fq,
                             start: options[:offset],
                             rows: options[:rows],
                             fl: "doi,updated",
                             sort: "updated asc",
                             wt: "json" }
                  url + URI.encode_www_form(params)
                end

                # Fetch every page of results, hand each page to parse_data, then
                # call push_data.
                #
                # options - Hash; reads :total and :rows, and sets :start_time and
                #           :offset on the hash it is given.
                #
                # Returns the result of push_data.
                def process_data(options = {})
                  options[:start_time] = Time.now

                  # Page by the number of rows each request asks for. Stepping by
                  # job_batch_size while requesting a different :rows value would
                  # skip records (rows < batch size) or fetch them twice.
                  page_size = options[:rows].to_i
                  page_size = job_batch_size unless page_size > 0

                  # walk through paginated results
                  total_pages = (options[:total].to_f / page_size).ceil

                  (0...total_pages).each do |page|
                    options[:offset] = page * page_size
                    data = get_data(options.merge(timeout: timeout))
                    parse_data(data)
                  end

                  push_data(options)
                end

                # Request one page of results for the given options.
                #
                # Returns whatever Maremma.get returns.
                def get_data(options={})
                  query_url = get_query_url(options)
                  Maremma.get(query_url, options)
                end

                # Add one sitemap entry per document found in the response.
                #
                # result  - response object whose #body is a Hash
                # options - accepted but not used here
                #
                # Returns the "errors" value of the body when it is present,
                # otherwise the sitemap's current link count.
                def parse_data(result, options={})
                  return result.body.fetch("errors") if result.body.fetch("errors", nil).present?

                  items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
                  Array(items).each do |item|
                    loc = "/works/" + item.fetch("doi")
                    sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
                  end
                  sitemap.sitemap.link_count
                end

                # Finalize the sitemap and report how long the run took.
                #
                # options - Hash; :start_time is optional and defaults to the
                #           current time, so the method can be called on its own.
                #
                # Returns the result of stats_summary.
                def push_data(options={})
                  # sync time with AWS S3 before uploading
                  fog_storage.sync_clock

                  sitemap.finalize!
                  start_time = options[:start_time] || Time.now
                  time_taken = Time.now - start_time
                  sitemap.sitemap_index.stats_summary(:time_taken => time_taken)
                end
              end
            end
         | 
    
        data/lib/maltese.rb
    ADDED
    
    
    
        data/maltese.gemspec
    ADDED
    
    | @@ -0,0 +1,37 @@ | |
| 1 | 
            +
            require "date"
            require File.expand_path("../lib/maltese/version", __FILE__)

            # Gem specification for maltese.
            Gem::Specification.new do |s|
              s.authors       = ["Martin Fenner"]
              s.email         = "mfenner@datacite.org"
              s.name          = "maltese"
              s.homepage      = "https://github.com/datacite/maltese"
              s.summary       = "Ruby library to generate sitemap for DataCite Search"
              s.date          = Date.today
              s.description   = "Ruby library to generate sitemap for DataCite Search."
              s.require_paths = ["lib"]
              s.version       = Maltese::VERSION
              s.extra_rdoc_files = ["README.md"]
              s.license       = 'MIT'

              # Declare dependencies here, rather than in the Gemfile
              s.add_dependency 'maremma', '~> 3.5'
              s.add_dependency 'activesupport', '~> 4.2', '>= 4.2.5'
              s.add_dependency 'thor', '~> 0.19'
              s.add_dependency 'sitemap_generator', '~> 5.1'
              s.add_dependency 'fog-aws', '~> 0.7.6'
              s.add_dependency 'mime-types', '~> 3.1'
              s.add_development_dependency 'bundler', '~> 1.0'
              s.add_development_dependency 'rspec', '~> 3.4'
              s.add_development_dependency 'rake', '~> 12.0'
              s.add_development_dependency 'rack-test', '~> 0'
              s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
              s.add_development_dependency 'webmock', '~> 1.22', '>= 1.22.3'
              s.add_development_dependency 'codeclimate-test-reporter', '~> 1.0', '>= 1.0.0'
              s.add_development_dependency 'simplecov', '~> 0.12.0'

              # File lists come from git, one path per output line
              s.files       = `git ls-files`.split("\n")
              s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
              s.executables = ["maltese"]
            end
         | 
| Binary file | 
    
        data/spec/cli_spec.rb
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 | 
            +
            require 'maltese/cli'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Exercises the CLI's sitemap command for a date range with works and
            # for one without.
            describe Maltese::CLI do
              subject { described_class.new }

              let(:push_url) { ENV['VOLPINO_URL'] }
              let(:access_token) { ENV['VOLPINO_TOKEN'] }
              let(:from_date) { "2015-04-07" }
              let(:until_date) { "2015-04-08" }
              let(:cli_options) { { push_url: push_url,
                                    access_token: access_token,
                                    from_date: from_date,
                                    until_date: until_date } }

              describe "sitemap", vcr: true, :order => :defined do
                it 'should succeed' do
                  subject.options = cli_options
                  expect { subject.sitemap }.to output(/2522 links/).to_stdout

                  # the generated file must contain one <url> entry per reported link
                  sitemap = Zlib::GzipReader.open("public/sitemap.xml.gz") { |gz| gz.read }
                  doc = Nokogiri::XML(sitemap)
                  expect(doc.xpath("//xmlns:url").size).to eq(2522)
                  expect(doc.xpath("//xmlns:loc").last.text).to eq("https://search.datacite.org/works/10.6084/M9.FIGSHARE.1371139")
                end

                it 'should succeed with no works' do
                  # same options as above, only the date range differs
                  subject.options = cli_options.merge(from_date: "2005-04-07",
                                                      until_date: "2005-04-08")
                  expect { subject.sitemap }.to output("No works found for date range 2005-04-07 - 2005-04-08.\n").to_stdout
                end

                # it 'should fail' do
                #   subject.options = cli_options.except(:access_token)
                #   expect { subject.sitemap }.to output(/An error occured: Access token missing.\n/).to_stdout
                # end
              end
            end
         |