maltese 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/.codeclimate.yml +19 -0
  3. data/.gitignore +55 -0
  4. data/.rubocop.yml +1156 -0
  5. data/.travis.yml +26 -0
  6. data/CHANGELOG.md +5 -0
  7. data/Dockerfile +16 -0
  8. data/Gemfile +3 -0
  9. data/Gemfile.lock +128 -0
  10. data/LICENSE.md +21 -0
  11. data/README.md +51 -0
  12. data/bin/maltese +5 -0
  13. data/lib/maltese/cli.rb +30 -0
  14. data/lib/maltese/sitemap.rb +140 -0
  15. data/lib/maltese/utils.rb +87 -0
  16. data/lib/maltese/version.rb +3 -0
  17. data/lib/maltese.rb +8 -0
  18. data/maltese.gemspec +37 -0
  19. data/public/sitemap.xml.gz +0 -0
  20. data/spec/cli_spec.rb +43 -0
  21. data/spec/fixtures/sitemap.json +7574 -0
  22. data/spec/fixtures/sitemap_nil.json +11 -0
  23. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
  24. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +38 -0
  25. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_no_works.yml +38 -0
  26. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_works.yml +38 -0
  27. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +86 -0
  28. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +86 -0
  29. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
  30. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +270 -0
  31. data/spec/sitemap_spec.rb +111 -0
  32. data/spec/spec_helper.rb +95 -0
  33. metadata +310 -0
data/.travis.yml ADDED
@@ -0,0 +1,26 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.3
4
+
5
+ addons:
6
+ code_climate:
7
+ repo_token: "$CODECLIMATE_REPO_TOKEN"
8
+
9
+ install:
10
+ - travis_retry bundle install
11
+
12
+ script:
13
+ - bundle exec rspec
14
+ - bundle exec codeclimate-test-reporter
15
+
16
+ notifications:
17
+ email: false
18
+
19
+ deploy:
20
+ provider: rubygems
21
+ api_key:
22
+ secure: uT9YVkRp1usg+glYDSG7KJkm8CQGI8pZDbHlmbPz6ibbA8DVyjbBtYjGbvODCoRkisC24kSy31gMqBSmIxLG0ICv2tOy/iaoiuVeUk6NFfP4dcVGsDueQXjqd6Fjw6fCBg42sojwAVWzvDP2EVjQnbcZqROasLPmKuC2qrm+f9aSYLXmGyBtpvJ5FsfpW33OvE3qJD3y0AlPMdCihPe03FVzSiLNMmGuYOH97MucuWGbUJN+tSFiBfqIrAGT2TQXFrdiT3HtxEt+vNH0cGoLQAKgTgx4XPAcKEjg/cML5yhY/OcPR0uNgqdjxqS3faaH31r1xZaGGfHTf9dj++123YLNHbI8odyA9eF+jYU/3D8UnmMpsTNGZXCFUS8xVUobDcejhPBNhqGPLruLtbvIaqpVZ2bF9BOY1F0ILp4GERzUUUxws+BB1EJ6zFpNrDl7MHlqrc+gRZWcWlazQ82BmLQsTVHiab3ZerGCP4+kYiNeyEnsa3wmVDDd2iffU05Bse44/W1/BKmlzV0QfYl1iMA8lkCrgqmslFecCf0xA01v4CF2Hv63PxOeNmNvZm4VIkgy9uPBjD91AVdscSzCRuTc149OluBqUoxUToX9rEegheUXhWs6ww6DHtlRQI+OBNauRUCo7Fb2zV+gTNzUCSln0fE+z9aLhduuA8JLAKk=
23
+ gem: maltese
24
+ on:
25
+ tags: true
26
+ repo: datacite/maltese
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## v.0.1 (February 25, 2017)
2
+
3
+ [maltese 0.1](https://github.com/datacite/maltese/releases/tag/v.0.1) was released on February 25, 2017:
4
+
5
+ * initial release
data/Dockerfile ADDED
@@ -0,0 +1,16 @@
1
+ FROM phusion/passenger-full:0.9.20
2
+ MAINTAINER Martin Fenner "mfenner@datacite.org"
3
+
4
+ # Install Ruby 2.3.3
5
+ RUN bash -lc 'rvm --default use ruby-2.3.3'
6
+
7
+ ENV PATH="/usr/local/rvm/gems/ruby-2.3.3/bin:${PATH}"
8
+
9
+ # Update installed APT packages, clean up APT when done.
10
+ RUN apt-get update && apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \
11
+ apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
12
+
13
+ # Install maltese gem
14
+ RUN /sbin/setuser app gem install maltese
15
+
16
+ CMD maltese sitemap --sitemap_bucket $SITEMAP_BUCKET --sitemap_url $SITEMAP_URL --from_date $FROM_DATE --until_date $UNTIL_DATE
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,128 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ maltese (0.1.2)
5
+ activesupport (~> 4.2, >= 4.2.5)
6
+ fog-aws (~> 0.7.6)
7
+ maremma (~> 3.5)
8
+ mime-types (~> 3.1)
9
+ sitemap_generator (~> 5.1)
10
+ thor (~> 0.19)
11
+
12
+ GEM
13
+ remote: https://rubygems.org/
14
+ specs:
15
+ activesupport (4.2.8)
16
+ i18n (~> 0.7)
17
+ minitest (~> 5.1)
18
+ thread_safe (~> 0.3, >= 0.3.4)
19
+ tzinfo (~> 1.1)
20
+ addressable (2.5.0)
21
+ public_suffix (~> 2.0, >= 2.0.2)
22
+ builder (3.2.3)
23
+ codeclimate-test-reporter (1.0.6)
24
+ simplecov
25
+ crack (0.4.3)
26
+ safe_yaml (~> 1.0.0)
27
+ diff-lcs (1.3)
28
+ docile (1.1.5)
29
+ excon (0.45.4)
30
+ faraday (0.9.2)
31
+ multipart-post (>= 1.2, < 3)
32
+ faraday-encoding (0.0.4)
33
+ faraday
34
+ faraday_middleware (0.10.1)
35
+ faraday (>= 0.7.4, < 1.0)
36
+ fog-aws (0.7.6)
37
+ fog-core (~> 1.27)
38
+ fog-json (~> 1.0)
39
+ fog-xml (~> 0.1)
40
+ ipaddress (~> 0.8)
41
+ fog-core (1.37.0)
42
+ builder
43
+ excon (~> 0.45)
44
+ formatador (~> 0.2)
45
+ fog-json (1.0.2)
46
+ fog-core (~> 1.0)
47
+ multi_json (~> 1.10)
48
+ fog-xml (0.1.2)
49
+ fog-core
50
+ nokogiri (~> 1.5, >= 1.5.11)
51
+ formatador (0.2.5)
52
+ hashdiff (0.3.2)
53
+ i18n (0.8.1)
54
+ ipaddress (0.8.3)
55
+ json (2.0.3)
56
+ maremma (3.5.1)
57
+ activesupport (~> 4.2, >= 4.2.5)
58
+ addressable (>= 2.3.6)
59
+ builder (~> 3.2, >= 3.2.2)
60
+ excon (~> 0.45.0)
61
+ faraday (~> 0.9.2)
62
+ faraday-encoding (~> 0.0.1)
63
+ faraday_middleware (~> 0.10.0)
64
+ multi_json (~> 1.11.2)
65
+ nokogiri (~> 1.6.7)
66
+ oj (~> 2.18, >= 2.18.1)
67
+ mime-types (3.1)
68
+ mime-types-data (~> 3.2015)
69
+ mime-types-data (3.2016.0521)
70
+ mini_portile2 (2.1.0)
71
+ minitest (5.10.1)
72
+ multi_json (1.11.3)
73
+ multipart-post (2.0.0)
74
+ nokogiri (1.6.8.1)
75
+ mini_portile2 (~> 2.1.0)
76
+ oj (2.18.1)
77
+ public_suffix (2.0.5)
78
+ rack (2.0.1)
79
+ rack-test (0.6.3)
80
+ rack (>= 1.0)
81
+ rake (12.0.0)
82
+ rspec (3.5.0)
83
+ rspec-core (~> 3.5.0)
84
+ rspec-expectations (~> 3.5.0)
85
+ rspec-mocks (~> 3.5.0)
86
+ rspec-core (3.5.4)
87
+ rspec-support (~> 3.5.0)
88
+ rspec-expectations (3.5.0)
89
+ diff-lcs (>= 1.2.0, < 2.0)
90
+ rspec-support (~> 3.5.0)
91
+ rspec-mocks (3.5.0)
92
+ diff-lcs (>= 1.2.0, < 2.0)
93
+ rspec-support (~> 3.5.0)
94
+ rspec-support (3.5.0)
95
+ safe_yaml (1.0.4)
96
+ simplecov (0.12.0)
97
+ docile (~> 1.1.0)
98
+ json (>= 1.8, < 3)
99
+ simplecov-html (~> 0.10.0)
100
+ simplecov-html (0.10.0)
101
+ sitemap_generator (5.3.0)
102
+ builder (~> 3.0)
103
+ thor (0.19.4)
104
+ thread_safe (0.3.6)
105
+ tzinfo (1.2.2)
106
+ thread_safe (~> 0.1)
107
+ vcr (3.0.3)
108
+ webmock (1.24.6)
109
+ addressable (>= 2.3.6)
110
+ crack (>= 0.3.2)
111
+ hashdiff
112
+
113
+ PLATFORMS
114
+ ruby
115
+
116
+ DEPENDENCIES
117
+ bundler (~> 1.0)
118
+ codeclimate-test-reporter (~> 1.0, >= 1.0.0)
119
+ maltese!
120
+ rack-test (~> 0)
121
+ rake (~> 12.0)
122
+ rspec (~> 3.4)
123
+ simplecov (~> 0.12.0)
124
+ vcr (~> 3.0, >= 3.0.3)
125
+ webmock (~> 1.22, >= 1.22.3)
126
+
127
+ BUNDLED WITH
128
+ 1.12.5
data/LICENSE.md ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2017 DataCite
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,51 @@
1
+ [![Build Status](https://travis-ci.org/datacite/maltese.svg?branch=master)](https://travis-ci.org/datacite/maltese)
2
+ [![Code Climate](https://codeclimate.com/github/datacite/maltese/badges/gpa.svg)](https://codeclimate.com/github/datacite/maltese)
3
+ [![Test Coverage](https://codeclimate.com/github/datacite/maltese/badges/coverage.svg)](https://codeclimate.com/github/datacite/maltese/coverage)
4
+
5
+ # Maltese
6
+
7
+ Ruby gem and command-line tool for generating sitemap files from the DataCite REST API. Uses the [SitemapGenerator](https://github.com/kjvarga/sitemap_generator) gem and can be run as a Docker container, e.g. using ECS and triggered by AWS Lambda, as described [here](https://medium.com/@pahud/ecs-task-runner-with-lambda-4594b72ccb#.5xpmf2inz).
8
+
9
+ Run as a command-line tool:
10
+
11
+ ```
12
+ maltese sitemap --from_date 2017-02-15
13
+ ```
14
+
15
+ ## Installation
16
+
17
+ Requires Ruby 2.2 or later. Then add the following to your `Gemfile` to install the
18
+ latest version:
19
+
20
+ ```ruby
21
+ gem 'maltese'
22
+ ```
23
+
24
+ Then run `bundle install` to install into your environment.
25
+
26
+ You can also install the gem system-wide in the usual way:
27
+
28
+ ```bash
29
+ gem install maltese
30
+ ```
31
+
32
+ ## Development
33
+
34
+ We use rspec for unit testing:
35
+
36
+ ```
37
+ bundle exec rspec
38
+ ```
39
+
40
+ Follow along via [GitHub Issues](https://github.com/datacite/maltese/issues).
41
+
42
+ ### Note on Patches/Pull Requests
43
+
44
+ * Fork the project
45
+ * Write tests for your new feature or a test that reproduces a bug
46
+ * Implement your feature or make a bug fix
47
+ * Do not mess with Rakefile, version or history
48
+ * Commit, push and make a pull request. Bonus points for topical branches.
49
+
50
+ ## License
51
+ **maltese** is released under the [MIT License](https://github.com/datacite/maltese/blob/master/LICENSE.md).
data/bin/maltese ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path("../../lib/maltese", __FILE__)
4
+
5
+ Maltese::CLI.start
@@ -0,0 +1,30 @@
1
# encoding: UTF-8

require "thor"
require_relative 'sitemap'
require_relative 'version'

module Maltese
  # Thor-based command-line interface for the maltese gem.
  #
  # The date defaults below use `1.day` and `Time#to_date`; these come from
  # ActiveSupport, which lib/maltese.rb loads before this file.
  class CLI < Thor
    # Thor hook; returning true asks Thor to terminate the process when a
    # command fails instead of only printing the error.
    def self.exit_on_failure?
      true
    end

    # from http://stackoverflow.com/questions/22809972/adding-a-version-option-to-a-ruby-thor-cli
    map %w[--version -v] => :__print_version

    desc "--version, -v", "print the version"
    # Prints the gem version (Maltese::VERSION) to stdout.
    def __print_version
      puts Maltese::VERSION
    end

    desc "sitemap", "generate sitemap for DataCite Search"
    # NOTE: the defaults are evaluated once, when this class is loaded.
    method_option :sitemap_bucket, type: :string, default: ENV['SITEMAP_BUCKET']
    method_option :sitemap_url, type: :string, default: ENV['SITEMAP_URL']
    method_option :from_date, type: :string, default: (Time.now.to_date - 1.day).iso8601
    method_option :until_date, type: :string, default: Time.now.to_date.iso8601
    # Builds a Maltese::Sitemap from the parsed options and runs it.
    # Returns whatever Sitemap#queue_jobs returns.
    def sitemap
      generator = Maltese::Sitemap.new(options)
      generator.queue_jobs
    end
  end
end
@@ -0,0 +1,140 @@
1
require 'json'
require 'uri'

module Maltese
  # Builds a sitemap from works returned by the search API at `sitemap_url`
  # and uploads it via the SitemapGenerator S3 adapter.
  #
  # Besides the standard library this class relies on ActiveSupport core
  # extensions (`presence`, `present?`, `1.day`, `Time#to_date`) and on
  # Maremma, Fog and SitemapGenerator being loaded by the caller.
  class Sitemap
    attr_reader :sitemap_bucket, :sitemap_url, :from_date, :until_date

    # @param attributes [Hash] optional :sitemap_bucket, :sitemap_url,
    #   :from_date and :until_date. Blank values (nil or "") fall back to
    #   the defaults: the DataCite bucket/URL, yesterday and today.
    def initialize(attributes={})
      @sitemap_bucket = attributes[:sitemap_bucket].presence || "sitemaps.datacite.org"
      @sitemap_url = attributes[:sitemap_url].presence || "https://search.datacite.org"
      @from_date = attributes[:from_date].presence || (Time.now.to_date - 1.day).iso8601
      @until_date = attributes[:until_date].presence || Time.now.to_date.iso8601
    end

    # load ENV variables from container environment if json file exists
    # see https://github.com/phusion/baseimage-docker#envvar_dumps
    # Runs once, when the class body is evaluated. Files of two bytes or
    # fewer (e.g. "{}") and missing files are skipped.
    env_json_file = "/etc/container_environment.json"
    if File.size?(env_json_file).to_i > 2
      env_vars = JSON.parse(File.read(env_json_file))
      env_vars.each { |k, v| ENV[k] = v }
    end

    # Base URL of the search API, ready to have a query string appended.
    def search_path
      "#{sitemap_url}/api?"
    end

    # Host under which the uploaded sitemap files are served.
    def sitemaps_host
      "http://#{sitemap_bucket}.s3.amazonaws.com/"
    end

    # Path prefix for the sitemap files.
    def sitemaps_path
      'sitemaps/'
    end

    # Timeout passed to Maremma for each page request
    # (presumably seconds - confirm against Maremma).
    def timeout
      120
    end

    # Number of works requested per API page.
    def job_batch_size
      50000
    end

    # Memoized SitemapGenerator link set; `finalize: false` leaves
    # finalizing to #push_data.
    def sitemap
      @sitemap ||= SitemapGenerator::LinkSet.new(
        default_host: sitemap_url,
        adapter: s3_adapter,
        sitemaps_host: sitemaps_host,
        sitemaps_path: sitemaps_path,
        finalize: false)
    end

    # S3 adapter for SitemapGenerator, configured from the AWS_* ENV variables.
    def s3_adapter
      SitemapGenerator::S3Adapter.new(fog_provider: 'AWS',
                                      aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
                                      aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
                                      fog_directory: sitemap_bucket,
                                      fog_region: ENV['AWS_REGION'])
    end

    # Fog storage connection; used by #push_data to sync the clock.
    def fog_storage
      Fog::Storage.new(provider: 'AWS',
                       aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
                       aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'])
    end

    # Looks up the number of works in the date range and, when there are
    # any, processes them and prints the result of #process_data.
    #
    # @return [Integer] number of works found
    def queue_jobs(options={})
      total = get_total(options)

      if total > 0
        puts process_data(options.merge(total: total))
      else
        puts "No works found for date range #{from_date} - #{until_date}."
      end

      # return number of works queued
      total
    end

    # Asks the API for zero rows and reads `numFound` from the response.
    #
    # @return [Integer] 0 when the response carries no count
    def get_total(options={})
      query_url = get_query_url(options.merge(rows: 0))
      result = Maremma.get(query_url, options)
      result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
    end

    # Builds the query URL for one page of works updated in the date range.
    # The options hash passed in is not modified.
    #
    # @param options [Hash] :offset (defaults to 0) and :rows (defaults to
    #   #job_batch_size; an explicit 0 is kept)
    # @return [String]
    def get_query_url(options={})
      offset = options[:offset].to_i
      rows = options[:rows].presence || job_batch_size

      updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
      fq = "#{updated} AND has_metadata:true AND is_active:true"

      params = { q: "*:*",
                 fq: fq,
                 start: offset,
                 rows: rows,
                 fl: "doi,updated",
                 sort: "updated asc",
                 wt: "json" }
      search_path + URI.encode_www_form(params)
    end

    # Walks through all result pages, adds every work to the sitemap and
    # uploads it. Works on a copy of `options`.
    #
    # @param options [Hash] must contain :total (number of works)
    # @return [Integer] link count reported by #push_data
    def process_data(options = {})
      options = options.merge(start_time: Time.now)

      # walk through paginated results
      total_pages = (options[:total].to_f / job_batch_size).ceil

      (0...total_pages).each do |page|
        page_options = options.merge(offset: page * job_batch_size, timeout: timeout)
        # NOTE(review): parse_data returns the API errors instead of a count
        # when a page fails; that return value is ignored here, as before.
        parse_data(get_data(page_options))
      end

      push_data(options)
    end

    # Fetches one page of works from the API.
    def get_data(options={})
      query_url = get_query_url(options)
      Maremma.get(query_url, options)
    end

    # Adds every doc of an API response to the sitemap.
    #
    # @return the response's "errors" value when present, otherwise the
    #   current link count of the sitemap
    def parse_data(result, options={})
      return result.body.fetch("errors") if result.body.fetch("errors", nil).present?

      items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
      Array(items).each do |item|
        loc = "/works/" + item.fetch("doi")
        sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
      end
      sitemap.sitemap.link_count
    end

    # Finalizes the sitemap and reports the statistics.
    #
    # @param options [Hash] :start_time used for the elapsed-time report;
    #   defaults to now when missing
    # @return [Integer] link count of the sitemap
    def push_data(options={})
      # sync time with AWS S3 before uploading
      fog_storage.sync_clock

      sitemap.finalize!
      started_at = options[:start_time] || Time.now
      sitemap.sitemap_index.stats_summary(:time_taken => Time.now - started_at)
      sitemap.sitemap.link_count
    end
  end
end
@@ -0,0 +1,87 @@
1
require 'json'
require 'uri'

module Maltese
  # Mixin with the query/pagination workflow for building a sitemap.
  #
  # The including class has to provide `from_date`, `until_date`,
  # `search_path`, `job_batch_size`, `timeout`, `sitemap` and `fog_storage`;
  # none of them is defined in this module. ActiveSupport (`presence`,
  # `present?`) and Maremma must be loaded by the caller.
  module Utils
    # load ENV variables from container environment if json file exists
    # see https://github.com/phusion/baseimage-docker#envvar_dumps
    # Runs once, when the module body is evaluated.
    env_json_file = "/etc/container_environment.json"
    if File.size?(env_json_file).to_i > 2
      env_vars = JSON.parse(File.read(env_json_file))
      env_vars.each { |k, v| ENV[k] = v }
    end

    # Normalizes :offset and :rows, looks up the number of works and, when
    # there are any, processes them. Works on a copy of `options`.
    #
    # @return [Integer] number of works found
    def queue_jobs(options={})
      options = options.merge(offset: options[:offset].to_i,
                              rows: options[:rows].presence || job_batch_size)

      total = get_total(options)

      if total > 0
        puts process_data(options.merge(total: total))
      else
        puts "No works found for date range #{from_date} - #{until_date}."
      end

      # return number of works queued
      total
    end

    # Asks the API for zero rows and reads `numFound` from the response.
    def get_total(options={})
      query_url = get_query_url(options.merge(rows: 0))
      result = Maremma.get(query_url, options)
      result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
    end

    # Builds the query URL for one page of works updated in the date range.
    # :offset and :rows are taken from `options` as given.
    def get_query_url(options={})
      updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
      fq = "#{updated} AND has_metadata:true AND is_active:true"

      params = { q: "*:*",
                 fq: fq,
                 start: options[:offset],
                 rows: options[:rows],
                 fl: "doi,updated",
                 sort: "updated asc",
                 wt: "json" }
      # `search_path` is the base-URL helper Maltese::Sitemap defines; the
      # previously referenced `url` is not defined anywhere in this gem.
      search_path + URI.encode_www_form(params)
    end

    # Walks through all result pages, adds every work to the sitemap and
    # pushes it. Works on a copy of `options`, which must contain :total.
    def process_data(options = {})
      options = options.merge(start_time: Time.now)

      # walk through paginated results
      total_pages = (options[:total].to_f / job_batch_size).ceil

      (0...total_pages).each do |page|
        page_options = options.merge(offset: page * job_batch_size, timeout: timeout)
        parse_data(get_data(page_options))
      end

      push_data(options)
    end

    # Fetches one page of works from the API.
    def get_data(options={})
      query_url = get_query_url(options)
      Maremma.get(query_url, options)
    end

    # Adds every doc of an API response to the sitemap; returns the
    # response's "errors" value when present, otherwise the link count.
    def parse_data(result, options={})
      return result.body.fetch("errors") if result.body.fetch("errors", nil).present?

      items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
      Array(items).each do |item|
        loc = "/works/" + item.fetch("doi")
        sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
      end
      sitemap.sitemap.link_count
    end

    # Finalizes the sitemap and reports the statistics. A missing
    # :start_time is treated as "now" instead of raising on `Time - nil`.
    def push_data(options={})
      # sync time with AWS S3 before uploading
      fog_storage.sync_clock

      sitemap.finalize!
      started_at = options[:start_time] || Time.now
      time_taken = Time.now - started_at
      sitemap.sitemap_index.stats_summary(:time_taken => time_taken)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Maltese
  # Gem version; read by maltese.gemspec. Frozen so the constant cannot be
  # mutated in place.
  VERSION = "0.1.2".freeze
end
data/lib/maltese.rb ADDED
@@ -0,0 +1,8 @@
1
+ require 'sitemap_generator'
2
+ require 'maremma'
3
+ require 'fog/aws'
4
+ require 'mime/types'
5
+ require 'active_support/all'
6
+
7
+ require "maltese/sitemap"
8
+ require "maltese/cli"
data/maltese.gemspec ADDED
@@ -0,0 +1,37 @@
1
require "date"
require File.expand_path("../lib/maltese/version", __FILE__)

# Gem specification for maltese. The file lists are taken from
# `git ls-files`, so the gem has to be built from a git checkout.
Gem::Specification.new do |s|
  s.authors       = "Martin Fenner"
  s.email         = "mfenner@datacite.org"
  s.name          = "maltese"
  s.homepage      = "https://github.com/datacite/maltese"
  s.summary       = "Ruby library to generate sitemap for DataCite Search"
  s.date          = Date.today
  s.description   = "Ruby library to generate sitemap for DataCite Search."
  s.require_paths = ["lib"]
  s.version       = Maltese::VERSION
  s.extra_rdoc_files = ["README.md"]
  s.license       = 'MIT'

  # Declare dependencies here, rather than in the Gemfile
  s.add_dependency 'maremma', '~> 3.5'
  s.add_dependency 'activesupport', '~> 4.2', '>= 4.2.5'
  s.add_dependency 'thor', '~> 0.19'
  s.add_dependency 'sitemap_generator', '~> 5.1'
  s.add_dependency 'fog-aws', '~> 0.7.6'
  s.add_dependency 'mime-types', '~> 3.1'
  s.add_development_dependency 'bundler', '~> 1.0'
  s.add_development_dependency 'rspec', '~> 3.4'
  s.add_development_dependency 'rake', '~> 12.0'
  s.add_development_dependency 'rack-test', '~> 0'
  s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
  s.add_development_dependency 'webmock', '~> 1.22', '>= 1.22.3'
  s.add_development_dependency 'codeclimate-test-reporter', '~> 1.0', '>= 1.0.0'
  s.add_development_dependency 'simplecov', '~> 0.12.0'

  # `git ls-files` prints one path per line; both lists split on "\n".
  s.files       = `git ls-files`.split("\n")
  s.test_files  = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = ["maltese"]
end
Binary file
data/spec/cli_spec.rb ADDED
@@ -0,0 +1,43 @@
1
require 'spec_helper'
require 'maltese/cli'

# Exercises the `sitemap` command against recorded HTTP traffic (vcr: true).
# Example names are left untouched because the cassettes are named after them.
describe Maltese::CLI do
  subject { described_class.new }

  let(:push_url) { ENV['VOLPINO_URL'] }
  let(:access_token) { ENV['VOLPINO_TOKEN'] }
  let(:from_date) { "2015-04-07" }
  let(:until_date) { "2015-04-08" }
  let(:cli_options) do
    {
      push_url: push_url,
      access_token: access_token,
      from_date: from_date,
      until_date: until_date
    }
  end

  describe "sitemap", vcr: true, :order => :defined do
    it 'should succeed' do
      subject.options = cli_options
      expect { subject.sitemap }.to output(/2522 links/).to_stdout

      # the generated file is read back and checked entry by entry
      xml = Zlib::GzipReader.open("public/sitemap.xml.gz", &:read)
      doc = Nokogiri::XML(xml)
      expect(doc.xpath("//xmlns:url").size).to eq(2522)
      expect(doc.xpath("//xmlns:loc").last.text).to eq("https://search.datacite.org/works/10.6084/M9.FIGSHARE.1371139")
    end

    it 'should succeed with no works' do
      # same options as above, but with a date range that has no works
      subject.options = cli_options.merge(from_date: "2005-04-07",
                                          until_date: "2005-04-08")
      expect { subject.sitemap }.to output("No works found for date range 2005-04-07 - 2005-04-08.\n").to_stdout
    end

    # it 'should fail' do
    #   subject.options = cli_options.except(:access_token)
    #   expect { subject.sitemap }.to output(/An error occured: Access token missing.\n/).to_stdout
    # end
  end
end