maltese 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +19 -0
  3. data/.gitignore +55 -0
  4. data/.rubocop.yml +1156 -0
  5. data/.travis.yml +26 -0
  6. data/CHANGELOG.md +5 -0
  7. data/Dockerfile +16 -0
  8. data/Gemfile +3 -0
  9. data/Gemfile.lock +128 -0
  10. data/LICENSE.md +21 -0
  11. data/README.md +51 -0
  12. data/bin/maltese +5 -0
  13. data/lib/maltese/cli.rb +30 -0
  14. data/lib/maltese/sitemap.rb +140 -0
  15. data/lib/maltese/utils.rb +87 -0
  16. data/lib/maltese/version.rb +3 -0
  17. data/lib/maltese.rb +8 -0
  18. data/maltese.gemspec +37 -0
  19. data/public/sitemap.xml.gz +0 -0
  20. data/spec/cli_spec.rb +43 -0
  21. data/spec/fixtures/sitemap.json +7574 -0
  22. data/spec/fixtures/sitemap_nil.json +11 -0
  23. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
  24. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +38 -0
  25. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_no_works.yml +38 -0
  26. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_works.yml +38 -0
  27. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +86 -0
  28. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +86 -0
  29. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
  30. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +270 -0
  31. data/spec/sitemap_spec.rb +111 -0
  32. data/spec/spec_helper.rb +95 -0
  33. metadata +310 -0
data/.travis.yml ADDED
@@ -0,0 +1,26 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.3
4
+
5
+ addons:
6
+ code_climate:
7
+ repo_token: "$CODECLIMATE_REPO_TOKEN"
8
+
9
+ install:
10
+ - travis_retry bundle install
11
+
12
+ script:
13
+ - bundle exec rspec
14
+ - bundle exec codeclimate-test-reporter
15
+
16
+ notifications:
17
+ email: false
18
+
19
+ deploy:
20
+ provider: rubygems
21
+ api_key:
22
+ secure: uT9YVkRp1usg+glYDSG7KJkm8CQGI8pZDbHlmbPz6ibbA8DVyjbBtYjGbvODCoRkisC24kSy31gMqBSmIxLG0ICv2tOy/iaoiuVeUk6NFfP4dcVGsDueQXjqd6Fjw6fCBg42sojwAVWzvDP2EVjQnbcZqROasLPmKuC2qrm+f9aSYLXmGyBtpvJ5FsfpW33OvE3qJD3y0AlPMdCihPe03FVzSiLNMmGuYOH97MucuWGbUJN+tSFiBfqIrAGT2TQXFrdiT3HtxEt+vNH0cGoLQAKgTgx4XPAcKEjg/cML5yhY/OcPR0uNgqdjxqS3faaH31r1xZaGGfHTf9dj++123YLNHbI8odyA9eF+jYU/3D8UnmMpsTNGZXCFUS8xVUobDcejhPBNhqGPLruLtbvIaqpVZ2bF9BOY1F0ILp4GERzUUUxws+BB1EJ6zFpNrDl7MHlqrc+gRZWcWlazQ82BmLQsTVHiab3ZerGCP4+kYiNeyEnsa3wmVDDd2iffU05Bse44/W1/BKmlzV0QfYl1iMA8lkCrgqmslFecCf0xA01v4CF2Hv63PxOeNmNvZm4VIkgy9uPBjD91AVdscSzCRuTc149OluBqUoxUToX9rEegheUXhWs6ww6DHtlRQI+OBNauRUCo7Fb2zV+gTNzUCSln0fE+z9aLhduuA8JLAKk=
23
+ gem: maltese
24
+ on:
25
+ tags: true
26
+ repo: datacite/maltese
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## v.0.1 (February 25, 2017)
2
+
3
+ [maltese 0.1](https://github.com/datacite/maltese/releases/tag/v.0.1) was released on February 25, 2017:
4
+
5
+ * initial release
data/Dockerfile ADDED
@@ -0,0 +1,16 @@
1
FROM phusion/passenger-full:0.9.20
# MAINTAINER is deprecated in favour of a label carrying the same information.
LABEL maintainer="Martin Fenner <mfenner@datacite.org>"

# Install Ruby 2.3.3
RUN bash -lc 'rvm --default use ruby-2.3.3'

ENV PATH="/usr/local/rvm/gems/ruby-2.3.3/bin:${PATH}"

# Update installed APT packages, clean up APT when done.
RUN apt-get update && apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \
    apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Install maltese gem
RUN /sbin/setuser app gem install maltese

# Only pass an option when its environment variable is set and non-empty.
# Otherwise an unset variable expands to nothing and the option is left without
# a value, so the defaults built into the maltese CLI never get a chance to
# apply. Values are quoted so that whitespace cannot split them into several
# arguments.
CMD maltese sitemap \
    ${SITEMAP_BUCKET:+--sitemap_bucket "$SITEMAP_BUCKET"} \
    ${SITEMAP_URL:+--sitemap_url "$SITEMAP_URL"} \
    ${FROM_DATE:+--from_date "$FROM_DATE"} \
    ${UNTIL_DATE:+--until_date "$UNTIL_DATE"}
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,128 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ maltese (0.1.2)
5
+ activesupport (~> 4.2, >= 4.2.5)
6
+ fog-aws (~> 0.7.6)
7
+ maremma (~> 3.5)
8
+ mime-types (~> 3.1)
9
+ sitemap_generator (~> 5.1)
10
+ thor (~> 0.19)
11
+
12
+ GEM
13
+ remote: https://rubygems.org/
14
+ specs:
15
+ activesupport (4.2.8)
16
+ i18n (~> 0.7)
17
+ minitest (~> 5.1)
18
+ thread_safe (~> 0.3, >= 0.3.4)
19
+ tzinfo (~> 1.1)
20
+ addressable (2.5.0)
21
+ public_suffix (~> 2.0, >= 2.0.2)
22
+ builder (3.2.3)
23
+ codeclimate-test-reporter (1.0.6)
24
+ simplecov
25
+ crack (0.4.3)
26
+ safe_yaml (~> 1.0.0)
27
+ diff-lcs (1.3)
28
+ docile (1.1.5)
29
+ excon (0.45.4)
30
+ faraday (0.9.2)
31
+ multipart-post (>= 1.2, < 3)
32
+ faraday-encoding (0.0.4)
33
+ faraday
34
+ faraday_middleware (0.10.1)
35
+ faraday (>= 0.7.4, < 1.0)
36
+ fog-aws (0.7.6)
37
+ fog-core (~> 1.27)
38
+ fog-json (~> 1.0)
39
+ fog-xml (~> 0.1)
40
+ ipaddress (~> 0.8)
41
+ fog-core (1.37.0)
42
+ builder
43
+ excon (~> 0.45)
44
+ formatador (~> 0.2)
45
+ fog-json (1.0.2)
46
+ fog-core (~> 1.0)
47
+ multi_json (~> 1.10)
48
+ fog-xml (0.1.2)
49
+ fog-core
50
+ nokogiri (~> 1.5, >= 1.5.11)
51
+ formatador (0.2.5)
52
+ hashdiff (0.3.2)
53
+ i18n (0.8.1)
54
+ ipaddress (0.8.3)
55
+ json (2.0.3)
56
+ maremma (3.5.1)
57
+ activesupport (~> 4.2, >= 4.2.5)
58
+ addressable (>= 2.3.6)
59
+ builder (~> 3.2, >= 3.2.2)
60
+ excon (~> 0.45.0)
61
+ faraday (~> 0.9.2)
62
+ faraday-encoding (~> 0.0.1)
63
+ faraday_middleware (~> 0.10.0)
64
+ multi_json (~> 1.11.2)
65
+ nokogiri (~> 1.6.7)
66
+ oj (~> 2.18, >= 2.18.1)
67
+ mime-types (3.1)
68
+ mime-types-data (~> 3.2015)
69
+ mime-types-data (3.2016.0521)
70
+ mini_portile2 (2.1.0)
71
+ minitest (5.10.1)
72
+ multi_json (1.11.3)
73
+ multipart-post (2.0.0)
74
+ nokogiri (1.6.8.1)
75
+ mini_portile2 (~> 2.1.0)
76
+ oj (2.18.1)
77
+ public_suffix (2.0.5)
78
+ rack (2.0.1)
79
+ rack-test (0.6.3)
80
+ rack (>= 1.0)
81
+ rake (12.0.0)
82
+ rspec (3.5.0)
83
+ rspec-core (~> 3.5.0)
84
+ rspec-expectations (~> 3.5.0)
85
+ rspec-mocks (~> 3.5.0)
86
+ rspec-core (3.5.4)
87
+ rspec-support (~> 3.5.0)
88
+ rspec-expectations (3.5.0)
89
+ diff-lcs (>= 1.2.0, < 2.0)
90
+ rspec-support (~> 3.5.0)
91
+ rspec-mocks (3.5.0)
92
+ diff-lcs (>= 1.2.0, < 2.0)
93
+ rspec-support (~> 3.5.0)
94
+ rspec-support (3.5.0)
95
+ safe_yaml (1.0.4)
96
+ simplecov (0.12.0)
97
+ docile (~> 1.1.0)
98
+ json (>= 1.8, < 3)
99
+ simplecov-html (~> 0.10.0)
100
+ simplecov-html (0.10.0)
101
+ sitemap_generator (5.3.0)
102
+ builder (~> 3.0)
103
+ thor (0.19.4)
104
+ thread_safe (0.3.6)
105
+ tzinfo (1.2.2)
106
+ thread_safe (~> 0.1)
107
+ vcr (3.0.3)
108
+ webmock (1.24.6)
109
+ addressable (>= 2.3.6)
110
+ crack (>= 0.3.2)
111
+ hashdiff
112
+
113
+ PLATFORMS
114
+ ruby
115
+
116
+ DEPENDENCIES
117
+ bundler (~> 1.0)
118
+ codeclimate-test-reporter (~> 1.0, >= 1.0.0)
119
+ maltese!
120
+ rack-test (~> 0)
121
+ rake (~> 12.0)
122
+ rspec (~> 3.4)
123
+ simplecov (~> 0.12.0)
124
+ vcr (~> 3.0, >= 3.0.3)
125
+ webmock (~> 1.22, >= 1.22.3)
126
+
127
+ BUNDLED WITH
128
+ 1.12.5
data/LICENSE.md ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2017 DataCite
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,51 @@
1
+ [![Build Status](https://travis-ci.org/datacite/maltese.svg?branch=master)](https://travis-ci.org/datacite/maltese)
2
+ [![Code Climate](https://codeclimate.com/github/datacite/maltese/badges/gpa.svg)](https://codeclimate.com/github/datacite/maltese)
3
+ [![Test Coverage](https://codeclimate.com/github/datacite/maltese/badges/coverage.svg)](https://codeclimate.com/github/datacite/maltese/coverage)
4
+
5
+ # Maltese
6
+
7
+ Ruby gem and command-line tool for generating sitemap files from the DataCite REST API. Uses the [SitemapGenerator](https://github.com/kjvarga/sitemap_generator) gem and can be run as a Docker container, e.g. using ECS and triggered by AWS Lambda, as described [here](https://medium.com/@pahud/ecs-task-runner-with-lambda-4594b72ccb#.5xpmf2inz).
8
+
9
+ Run as a command-line tool:
10
+
11
+ ```
12
+ maltese sitemap --from_date 2017-02-15
13
+ ```
14
+
15
+ ## Installation
16
+
17
+ Requires Ruby 2.2 or later. Then add the following to your `Gemfile` to install the
18
+ latest version:
19
+
20
+ ```ruby
21
+ gem 'maltese'
22
+ ```
23
+
24
+ Then run `bundle install` to install into your environment.
25
+
26
+ You can also install the gem system-wide in the usual way:
27
+
28
+ ```bash
29
+ gem install maltese
30
+ ```
31
+
32
+ ## Development
33
+
34
+ We use rspec for unit testing:
35
+
36
+ ```
37
+ bundle exec rspec
38
+ ```
39
+
40
+ Follow along via [GitHub Issues](https://github.com/datacite/maltese/issues).
41
+
42
+ ### Note on Patches/Pull Requests
43
+
44
+ * Fork the project
45
+ * Write tests for your new feature or a test that reproduces a bug
46
+ * Implement your feature or make a bug fix
47
+ * Do not mess with Rakefile, version or history
48
+ * Commit, push and make a pull request. Bonus points for topical branches.
49
+
50
+ ## License
51
+ **maltese** is released under the [MIT License](https://github.com/datacite/maltese/blob/master/LICENSE.md).
data/bin/maltese ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Resolve the library entry point relative to this script (two levels up from
# the script path, then into lib/), load it, and hand control to the Thor CLI.
library_entry = File.expand_path(File.join("..", "..", "lib", "maltese"), __FILE__)
require library_entry

Maltese::CLI.start
@@ -0,0 +1,30 @@
1
+ # encoding: UTF-8
2
+
3
require "thor"
require_relative 'sitemap'
require_relative 'version'
5
+
6
module Maltese
  # Thor-based command-line interface of the maltese gem.
  #
  # Commands:
  #   maltese --version | -v   print the gem version
  #   maltese sitemap          generate the sitemap for DataCite Search
  class CLI < Thor
    # Thor hook; returning true asks Thor to exit the process when a command
    # fails instead of only printing the error.
    def self.exit_on_failure?
      true
    end

    # from http://stackoverflow.com/questions/22809972/adding-a-version-option-to-a-ruby-thor-cli
    map %w[--version -v] => :__print_version

    desc "--version, -v", "print the version"
    # Prints the version of this gem (Maltese::VERSION, see maltese/version.rb).
    # The previous implementation referenced Toccatore::VERSION, a constant of
    # a different gem that is not loaded here, so the command raised NameError.
    def __print_version
      puts Maltese::VERSION
    end

    desc "sitemap", "generate sitemap for DataCite Search"
    method_option :sitemap_bucket, type: :string, default: ENV['SITEMAP_BUCKET']
    method_option :sitemap_url, type: :string, default: ENV['SITEMAP_URL']
    method_option :from_date, type: :string, default: (Time.now.to_date - 1.day).iso8601
    method_option :until_date, type: :string, default: Time.now.to_date.iso8601
    # Builds a Maltese::Sitemap from the parsed command-line options and runs
    # it. Returns whatever Sitemap#queue_jobs returns (the number of works).
    def sitemap
      Maltese::Sitemap.new(options).queue_jobs
    end
  end
end
@@ -0,0 +1,140 @@
1
module Maltese
  # Generates a sitemap of DataCite works updated within a date range.
  #
  # Works are fetched page by page from the search API at `sitemap_url`, added
  # to a SitemapGenerator::LinkSet and uploaded through the S3 adapter to
  # `sitemap_bucket`.
  class Sitemap
    attr_reader :sitemap_bucket, :sitemap_url, :from_date, :until_date

    # @param attributes [Hash] optional :sitemap_bucket, :sitemap_url,
    #   :from_date and :until_date. Blank values (nil or empty string) fall
    #   back to the defaults; the date range defaults to yesterday..today.
    def initialize(attributes={})
      # `.presence` is applied to every attribute so that an empty string
      # cannot override a default.
      @sitemap_bucket = attributes[:sitemap_bucket].presence || "sitemaps.datacite.org"
      @sitemap_url = attributes[:sitemap_url].presence || "https://search.datacite.org"
      @from_date = attributes[:from_date].presence || (Time.now.to_date - 1.day).iso8601
      @until_date = attributes[:until_date].presence || Time.now.to_date.iso8601
    end

    # load ENV variables from container environment if json file exists
    # see https://github.com/phusion/baseimage-docker#envvar_dumps
    # (runs once, when this class body is evaluated)
    env_json_file = "/etc/container_environment.json"
    if File.size?(env_json_file).to_i > 2
      env_vars = JSON.parse(File.read(env_json_file))
      env_vars.each { |k, v| ENV[k] = v }
    end

    # Base URL of the search API, ending in "?" so that the encoded query
    # string can be appended directly.
    def search_path
      "#{sitemap_url}/api?"
    end

    # Public host under which the uploaded sitemap files are served.
    def sitemaps_host
      "http://#{sitemap_bucket}.s3.amazonaws.com/"
    end

    # Path prefix for sitemap files below sitemaps_host.
    def sitemaps_path
      'sitemaps/'
    end

    # Timeout value passed along to Maremma.get when fetching a page.
    def timeout
      120
    end

    # Number of works requested per page.
    def job_batch_size
      50_000
    end

    # Memoized link set; `finalize: false` leaves finalizing to push_data.
    def sitemap
      @sitemap ||= SitemapGenerator::LinkSet.new(
        default_host: sitemap_url,
        adapter: s3_adapter,
        sitemaps_host: sitemaps_host,
        sitemaps_path: sitemaps_path,
        finalize: false)
    end

    # S3 upload adapter configured from the AWS_* environment variables.
    def s3_adapter
      SitemapGenerator::S3Adapter.new(fog_provider: 'AWS',
                                      aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
                                      aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
                                      fog_directory: sitemap_bucket,
                                      fog_region: ENV['AWS_REGION'])
    end

    # Fog storage connection; used by push_data to sync the clock with S3.
    def fog_storage
      Fog::Storage.new(provider: 'AWS',
                       aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
                       aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'])
    end

    # Entry point: counts the works in the date range, processes them when
    # there are any, and prints either the result or a "no works" message.
    #
    # @return [Integer] number of works found
    def queue_jobs(options={})
      total = get_total(options)

      if total > 0
        puts process_data(options.merge(total: total))
      else
        puts "No works found for date range #{from_date} - #{until_date}."
      end

      # return number of works queued
      total
    end

    # Asks the API for the number of matching works (rows: 0 requests no
    # documents). Returns 0 when the response carries no count.
    def get_total(options={})
      query_url = get_query_url(options.merge(rows: 0))
      result = Maremma.get(query_url, options)
      result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
    end

    # Builds the search URL for one page of results.
    #
    # @param options [Hash] :offset (default 0) and :rows (default
    #   job_batch_size). The hash is only read, never modified.
    # @return [String] search_path followed by the encoded query string
    def get_query_url(options={})
      offset = options[:offset].to_i # nil.to_i is already 0
      rows = options[:rows].presence || job_batch_size

      updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
      fq = "#{updated} AND has_metadata:true AND is_active:true"

      params = { q: "*:*",
                 fq: fq,
                 start: offset,
                 rows: rows,
                 fl: "doi,updated",
                 sort: "updated asc",
                 wt: "json" }
      search_path + URI.encode_www_form(params)
    end

    # Walks through all result pages, adds every work to the sitemap and
    # uploads it. Expects options[:total]; returns the result of push_data.
    def process_data(options = {})
      options[:start_time] = Time.now

      # walk through paginated results
      total_pages = (options[:total].to_f / job_batch_size).ceil

      (0...total_pages).each do |page|
        options[:offset] = page * job_batch_size
        data = get_data(options.merge(timeout: timeout))
        # NOTE(review): parse_data returns the API errors when a page fails;
        # that return value is not inspected here.
        parse_data(data)
      end

      push_data(options)
    end

    # Fetches one page of works.
    def get_data(options={})
      query_url = get_query_url(options)
      Maremma.get(query_url, options)
    end

    # Adds every document of a response to the sitemap.
    #
    # @return the "errors" entry of the response body when present, otherwise
    #   the link count of the current sitemap file
    def parse_data(result, options={})
      return result.body.fetch("errors") if result.body.fetch("errors", nil).present?

      items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
      Array(items).each do |item|
        loc = "/works/#{item.fetch('doi')}"
        sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
      end
      sitemap.sitemap.link_count
    end

    # Finalizes the sitemap (which writes it out through the adapter) and
    # reports statistics.
    #
    # @return [Integer] link count of the current sitemap file
    def push_data(options={})
      # sync time with AWS S3 before uploading
      fog_storage.sync_clock

      sitemap.finalize!
      options[:start_time] ||= Time.now
      sitemap.sitemap_index.stats_summary(:time_taken => Time.now - options[:start_time])
      sitemap.sitemap.link_count
    end
  end
end
@@ -0,0 +1,87 @@
1
module Maltese
  # Mixin with the fetch / parse / upload workflow for building a sitemap.
  #
  # The including class must provide `from_date`, `until_date`,
  # `job_batch_size`, `timeout`, `sitemap` and `fog_storage`.
  # NOTE(review): get_query_url also calls `url`, which is not defined in this
  # module; presumably the including class supplies the search API base URL
  # ending in "?" -- confirm against the caller.
  module Utils
    # load ENV variables from container environment if json file exists
    # see https://github.com/phusion/baseimage-docker#envvar_dumps
    # (runs once, when this module body is evaluated)
    env_json_file = "/etc/container_environment.json"
    if File.size?(env_json_file).to_i > 2
      env_vars = JSON.parse(File.read(env_json_file))
      env_vars.each { |k, v| ENV[k] = v }
    end

    # Entry point: counts the works in the date range, processes them when
    # there are any, and prints either the result or a "no works" message.
    #
    # @param options [Hash] optional :offset (default 0) and :rows (default
    #   job_batch_size); the caller's hash is not modified
    # @return [Integer] number of works found
    def queue_jobs(options={})
      # work on a copy so the defaults do not leak into the caller's hash;
      # nil.to_i is already 0, no further fallback is needed
      options = options.merge(offset: options[:offset].to_i,
                              rows: options[:rows].presence || job_batch_size)

      total = get_total(options)

      if total > 0
        puts process_data(options.merge(total: total))
      else
        puts "No works found for date range #{from_date} - #{until_date}."
      end

      # return number of works queued
      total
    end

    # Asks the API for the number of matching works (rows: 0 requests no
    # documents). Returns 0 when the response carries no count.
    def get_total(options={})
      query_url = get_query_url(options.merge(rows: 0))
      result = Maremma.get(query_url, options)
      result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
    end

    # Builds the search URL for one page, using options[:offset] and
    # options[:rows] as given.
    def get_query_url(options={})
      updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
      fq = "#{updated} AND has_metadata:true AND is_active:true"

      params = { q: "*:*",
                 fq: fq,
                 start: options[:offset],
                 rows: options[:rows],
                 fl: "doi,updated",
                 sort: "updated asc",
                 wt: "json" }
      url + URI.encode_www_form(params)
    end

    # Walks through all result pages, adds every work to the sitemap and
    # uploads it. Expects options[:total]; returns the result of push_data.
    def process_data(options = {})
      options[:start_time] = Time.now

      # walk through paginated results
      total_pages = (options[:total].to_f / job_batch_size).ceil

      (0...total_pages).each do |page|
        options[:offset] = page * job_batch_size
        data = get_data(options.merge(timeout: timeout))
        parse_data(data)
      end

      push_data(options)
    end

    # Fetches one page of works.
    def get_data(options={})
      query_url = get_query_url(options)
      Maremma.get(query_url, options)
    end

    # Adds every document of a response to the sitemap.
    #
    # @return the "errors" entry of the response body when present, otherwise
    #   the link count of the current sitemap file
    def parse_data(result, options={})
      return result.body.fetch("errors") if result.body.fetch("errors", nil).present?

      items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
      Array(items).each do |item|
        loc = "/works/#{item.fetch('doi')}"
        sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
      end
      sitemap.sitemap.link_count
    end

    # Finalizes the sitemap and reports statistics.
    #
    # @param options [Hash] optional :start_time; when absent the elapsed
    #   time is measured from now instead of raising on `Time - nil`
    # @return the value returned by stats_summary
    def push_data(options={})
      # sync time with AWS S3 before uploading
      fog_storage.sync_clock

      sitemap.finalize!
      started_at = options[:start_time] || Time.now
      time_taken = Time.now - started_at
      sitemap.sitemap_index.stats_summary(:time_taken => time_taken)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Maltese
  # Version of the maltese gem; read by the gemspec.
  VERSION = '0.1.2'
end
data/lib/maltese.rb ADDED
@@ -0,0 +1,8 @@
1
+ require 'sitemap_generator'
2
+ require 'maremma'
3
+ require 'fog/aws'
4
+ require 'mime/types'
5
+ require 'active_support/all'
6
+
7
+ require "maltese/sitemap"
8
+ require "maltese/cli"
data/maltese.gemspec ADDED
@@ -0,0 +1,37 @@
1
+ require "date"
2
+ require File.expand_path("../lib/maltese/version", __FILE__)
3
+
4
# Gem specification for maltese. The version is read from
# lib/maltese/version.rb, which is required at the top of this file.
Gem::Specification.new do |s|
  s.name = "maltese"
  s.version = Maltese::VERSION
  s.authors = ["Martin Fenner"]
  s.email = ["mfenner@datacite.org"]
  s.homepage = "https://github.com/datacite/maltese"
  s.summary = "Ruby library to generate sitemap for DataCite Search"
  s.description = "Ruby library to generate sitemap for DataCite Search."
  s.license = 'MIT'
  s.extra_rdoc_files = ["README.md"]
  # s.date is intentionally not set: RubyGems fills it in when the gem is
  # packaged.

  # Declare dependencies here, rather than in the Gemfile
  s.add_dependency 'maremma', '~> 3.5'
  s.add_dependency 'activesupport', '~> 4.2', '>= 4.2.5'
  s.add_dependency 'thor', '~> 0.19'
  s.add_dependency 'sitemap_generator', '~> 5.1'
  s.add_dependency 'fog-aws', '~> 0.7.6'
  s.add_dependency 'mime-types', '~> 3.1'
  s.add_development_dependency 'bundler', '~> 1.0'
  s.add_development_dependency 'rspec', '~> 3.4'
  s.add_development_dependency 'rake', '~> 12.0'
  s.add_development_dependency 'rack-test', '~> 0'
  s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
  s.add_development_dependency 'webmock', '~> 1.22', '>= 1.22.3'
  s.add_development_dependency 'codeclimate-test-reporter', '~> 1.0', '>= 1.0.0'
  s.add_development_dependency 'simplecov', '~> 0.12.0'

  # require_paths was previously assigned twice with the same value.
  s.require_paths = ["lib"]
  # The file lists come from git, so the gem has to be built inside a checkout.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = ["maltese"]
end
Binary file
data/spec/cli_spec.rb ADDED
@@ -0,0 +1,43 @@
1
+ require 'spec_helper'
2
+ require 'maltese/cli'
3
+
4
# Specs for the `sitemap` command of the CLI; HTTP traffic is replayed from
# VCR cassettes.
describe Maltese::CLI do
  subject { described_class.new }

  let(:push_url) { ENV['VOLPINO_URL'] }
  let(:access_token) { ENV['VOLPINO_TOKEN'] }
  let(:from_date) { "2015-04-07" }
  let(:until_date) { "2015-04-08" }
  let(:cli_options) do
    {
      push_url: push_url,
      access_token: access_token,
      from_date: from_date,
      until_date: until_date
    }
  end

  describe "sitemap", vcr: true, :order => :defined do
    it 'should succeed' do
      subject.options = cli_options
      expect { subject.sitemap }.to output(/2522 links/).to_stdout

      # inspect the generated file: one <url> entry per work
      xml = Zlib::GzipReader.open("public/sitemap.xml.gz") { |gz| gz.read }
      doc = Nokogiri::XML(xml)
      expect(doc.xpath("//xmlns:url").size).to eq(2522)
      expect(doc.xpath("//xmlns:loc").last.text).to eq("https://search.datacite.org/works/10.6084/M9.FIGSHARE.1371139")
    end

    it 'should succeed with no works' do
      # a date range for which the recorded response contains no works
      subject.options = { push_url: push_url,
                          access_token: access_token,
                          from_date: "2005-04-07",
                          until_date: "2005-04-08" }
      expect { subject.sitemap }.to output("No works found for date range 2005-04-07 - 2005-04-08.\n").to_stdout
    end

    # it 'should fail' do
    #   subject.options = cli_options.except(:access_token)
    #   expect { subject.sitemap }.to output(/An error occured: Access token missing.\n/).to_stdout
    # end
  end
end