maltese 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.codeclimate.yml +19 -0
- data/.gitignore +55 -0
- data/.rubocop.yml +1156 -0
- data/.travis.yml +26 -0
- data/CHANGELOG.md +5 -0
- data/Dockerfile +16 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +128 -0
- data/LICENSE.md +21 -0
- data/README.md +51 -0
- data/bin/maltese +5 -0
- data/lib/maltese/cli.rb +30 -0
- data/lib/maltese/sitemap.rb +140 -0
- data/lib/maltese/utils.rb +87 -0
- data/lib/maltese/version.rb +3 -0
- data/lib/maltese.rb +8 -0
- data/maltese.gemspec +37 -0
- data/public/sitemap.xml.gz +0 -0
- data/spec/cli_spec.rb +43 -0
- data/spec/fixtures/sitemap.json +7574 -0
- data/spec/fixtures/sitemap_nil.json +11 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_no_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +86 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +86 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +270 -0
- data/spec/sitemap_spec.rb +111 -0
- data/spec/spec_helper.rb +95 -0
- metadata +310 -0
data/.travis.yml
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 2.3.3
|
4
|
+
|
5
|
+
addons:
|
6
|
+
code_climate:
|
7
|
+
repo_token: "$CODECLIMATE_REPO_TOKEN"
|
8
|
+
|
9
|
+
install:
|
10
|
+
- travis_retry bundle install
|
11
|
+
|
12
|
+
script:
|
13
|
+
- bundle exec rspec
|
14
|
+
- bundle exec codeclimate-test-reporter
|
15
|
+
|
16
|
+
notifications:
|
17
|
+
email: false
|
18
|
+
|
19
|
+
deploy:
|
20
|
+
provider: rubygems
|
21
|
+
api_key:
|
22
|
+
secure: uT9YVkRp1usg+glYDSG7KJkm8CQGI8pZDbHlmbPz6ibbA8DVyjbBtYjGbvODCoRkisC24kSy31gMqBSmIxLG0ICv2tOy/iaoiuVeUk6NFfP4dcVGsDueQXjqd6Fjw6fCBg42sojwAVWzvDP2EVjQnbcZqROasLPmKuC2qrm+f9aSYLXmGyBtpvJ5FsfpW33OvE3qJD3y0AlPMdCihPe03FVzSiLNMmGuYOH97MucuWGbUJN+tSFiBfqIrAGT2TQXFrdiT3HtxEt+vNH0cGoLQAKgTgx4XPAcKEjg/cML5yhY/OcPR0uNgqdjxqS3faaH31r1xZaGGfHTf9dj++123YLNHbI8odyA9eF+jYU/3D8UnmMpsTNGZXCFUS8xVUobDcejhPBNhqGPLruLtbvIaqpVZ2bF9BOY1F0ILp4GERzUUUxws+BB1EJ6zFpNrDl7MHlqrc+gRZWcWlazQ82BmLQsTVHiab3ZerGCP4+kYiNeyEnsa3wmVDDd2iffU05Bse44/W1/BKmlzV0QfYl1iMA8lkCrgqmslFecCf0xA01v4CF2Hv63PxOeNmNvZm4VIkgy9uPBjD91AVdscSzCRuTc149OluBqUoxUToX9rEegheUXhWs6ww6DHtlRQI+OBNauRUCo7Fb2zV+gTNzUCSln0fE+z9aLhduuA8JLAKk=
|
23
|
+
gem: maltese
|
24
|
+
on:
|
25
|
+
tags: true
|
26
|
+
repo: datacite/maltese
|
data/CHANGELOG.md
ADDED
data/Dockerfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
FROM phusion/passenger-full:0.9.20
|
2
|
+
MAINTAINER Martin Fenner "mfenner@datacite.org"
|
3
|
+
|
4
|
+
# Install Ruby 2.3.3
|
5
|
+
RUN bash -lc 'rvm --default use ruby-2.3.3'
|
6
|
+
|
7
|
+
ENV PATH="/usr/local/rvm/gems/ruby-2.3.3/bin:${PATH}"
|
8
|
+
|
9
|
+
# Update installed APT packages, clean up APT when done.
|
10
|
+
RUN apt-get update && apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \
|
11
|
+
apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
12
|
+
|
13
|
+
# Install maltese gem
|
14
|
+
RUN /sbin/setuser app gem install maltese
|
15
|
+
|
16
|
+
CMD maltese sitemap --sitemap_bucket $SITEMAP_BUCKET --sitemap_url $SITEMAP_URL --from_date $FROM_DATE --until_date $UNTIL_DATE
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
maltese (0.1.2)
|
5
|
+
activesupport (~> 4.2, >= 4.2.5)
|
6
|
+
fog-aws (~> 0.7.6)
|
7
|
+
maremma (~> 3.5)
|
8
|
+
mime-types (~> 3.1)
|
9
|
+
sitemap_generator (~> 5.1)
|
10
|
+
thor (~> 0.19)
|
11
|
+
|
12
|
+
GEM
|
13
|
+
remote: https://rubygems.org/
|
14
|
+
specs:
|
15
|
+
activesupport (4.2.8)
|
16
|
+
i18n (~> 0.7)
|
17
|
+
minitest (~> 5.1)
|
18
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
19
|
+
tzinfo (~> 1.1)
|
20
|
+
addressable (2.5.0)
|
21
|
+
public_suffix (~> 2.0, >= 2.0.2)
|
22
|
+
builder (3.2.3)
|
23
|
+
codeclimate-test-reporter (1.0.6)
|
24
|
+
simplecov
|
25
|
+
crack (0.4.3)
|
26
|
+
safe_yaml (~> 1.0.0)
|
27
|
+
diff-lcs (1.3)
|
28
|
+
docile (1.1.5)
|
29
|
+
excon (0.45.4)
|
30
|
+
faraday (0.9.2)
|
31
|
+
multipart-post (>= 1.2, < 3)
|
32
|
+
faraday-encoding (0.0.4)
|
33
|
+
faraday
|
34
|
+
faraday_middleware (0.10.1)
|
35
|
+
faraday (>= 0.7.4, < 1.0)
|
36
|
+
fog-aws (0.7.6)
|
37
|
+
fog-core (~> 1.27)
|
38
|
+
fog-json (~> 1.0)
|
39
|
+
fog-xml (~> 0.1)
|
40
|
+
ipaddress (~> 0.8)
|
41
|
+
fog-core (1.37.0)
|
42
|
+
builder
|
43
|
+
excon (~> 0.45)
|
44
|
+
formatador (~> 0.2)
|
45
|
+
fog-json (1.0.2)
|
46
|
+
fog-core (~> 1.0)
|
47
|
+
multi_json (~> 1.10)
|
48
|
+
fog-xml (0.1.2)
|
49
|
+
fog-core
|
50
|
+
nokogiri (~> 1.5, >= 1.5.11)
|
51
|
+
formatador (0.2.5)
|
52
|
+
hashdiff (0.3.2)
|
53
|
+
i18n (0.8.1)
|
54
|
+
ipaddress (0.8.3)
|
55
|
+
json (2.0.3)
|
56
|
+
maremma (3.5.1)
|
57
|
+
activesupport (~> 4.2, >= 4.2.5)
|
58
|
+
addressable (>= 2.3.6)
|
59
|
+
builder (~> 3.2, >= 3.2.2)
|
60
|
+
excon (~> 0.45.0)
|
61
|
+
faraday (~> 0.9.2)
|
62
|
+
faraday-encoding (~> 0.0.1)
|
63
|
+
faraday_middleware (~> 0.10.0)
|
64
|
+
multi_json (~> 1.11.2)
|
65
|
+
nokogiri (~> 1.6.7)
|
66
|
+
oj (~> 2.18, >= 2.18.1)
|
67
|
+
mime-types (3.1)
|
68
|
+
mime-types-data (~> 3.2015)
|
69
|
+
mime-types-data (3.2016.0521)
|
70
|
+
mini_portile2 (2.1.0)
|
71
|
+
minitest (5.10.1)
|
72
|
+
multi_json (1.11.3)
|
73
|
+
multipart-post (2.0.0)
|
74
|
+
nokogiri (1.6.8.1)
|
75
|
+
mini_portile2 (~> 2.1.0)
|
76
|
+
oj (2.18.1)
|
77
|
+
public_suffix (2.0.5)
|
78
|
+
rack (2.0.1)
|
79
|
+
rack-test (0.6.3)
|
80
|
+
rack (>= 1.0)
|
81
|
+
rake (12.0.0)
|
82
|
+
rspec (3.5.0)
|
83
|
+
rspec-core (~> 3.5.0)
|
84
|
+
rspec-expectations (~> 3.5.0)
|
85
|
+
rspec-mocks (~> 3.5.0)
|
86
|
+
rspec-core (3.5.4)
|
87
|
+
rspec-support (~> 3.5.0)
|
88
|
+
rspec-expectations (3.5.0)
|
89
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
90
|
+
rspec-support (~> 3.5.0)
|
91
|
+
rspec-mocks (3.5.0)
|
92
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
93
|
+
rspec-support (~> 3.5.0)
|
94
|
+
rspec-support (3.5.0)
|
95
|
+
safe_yaml (1.0.4)
|
96
|
+
simplecov (0.12.0)
|
97
|
+
docile (~> 1.1.0)
|
98
|
+
json (>= 1.8, < 3)
|
99
|
+
simplecov-html (~> 0.10.0)
|
100
|
+
simplecov-html (0.10.0)
|
101
|
+
sitemap_generator (5.3.0)
|
102
|
+
builder (~> 3.0)
|
103
|
+
thor (0.19.4)
|
104
|
+
thread_safe (0.3.6)
|
105
|
+
tzinfo (1.2.2)
|
106
|
+
thread_safe (~> 0.1)
|
107
|
+
vcr (3.0.3)
|
108
|
+
webmock (1.24.6)
|
109
|
+
addressable (>= 2.3.6)
|
110
|
+
crack (>= 0.3.2)
|
111
|
+
hashdiff
|
112
|
+
|
113
|
+
PLATFORMS
|
114
|
+
ruby
|
115
|
+
|
116
|
+
DEPENDENCIES
|
117
|
+
bundler (~> 1.0)
|
118
|
+
codeclimate-test-reporter (~> 1.0, >= 1.0.0)
|
119
|
+
maltese!
|
120
|
+
rack-test (~> 0)
|
121
|
+
rake (~> 12.0)
|
122
|
+
rspec (~> 3.4)
|
123
|
+
simplecov (~> 0.12.0)
|
124
|
+
vcr (~> 3.0, >= 3.0.3)
|
125
|
+
webmock (~> 1.22, >= 1.22.3)
|
126
|
+
|
127
|
+
BUNDLED WITH
|
128
|
+
1.12.5
|
data/LICENSE.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2017 DataCite
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/datacite/maltese.svg?branch=master)](https://travis-ci.org/datacite/maltese)
|
2
|
+
[![Code Climate](https://codeclimate.com/github/datacite/maltese/badges/gpa.svg)](https://codeclimate.com/github/datacite/maltese)
|
3
|
+
[![Test Coverage](https://codeclimate.com/github/datacite/maltese/badges/coverage.svg)](https://codeclimate.com/github/datacite/maltese/coverage)
|
4
|
+
|
5
|
+
# Maltese
|
6
|
+
|
7
|
+
Ruby gem and command-line tool for generating sitemap files from the DataCite REST API. Uses the [SitemapGenerator](https://github.com/kjvarga/sitemap_generator) gem and can be run as a Docker container, e.g. using ECS and triggered by AWS Lambda, as described [here](https://medium.com/@pahud/ecs-task-runner-with-lambda-4594b72ccb#.5xpmf2inz).
|
8
|
+
|
9
|
+
Run as a command-line tool:
|
10
|
+
|
11
|
+
```
|
12
|
+
maltese sitemap --from_date 2017-02-15
|
13
|
+
```
|
14
|
+
|
15
|
+
## Installation
|
16
|
+
|
17
|
+
Requires Ruby 2.2 or later. Then add the following to your `Gemfile` to install the
|
18
|
+
latest version:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem 'maltese'
|
22
|
+
```
|
23
|
+
|
24
|
+
Then run `bundle install` to install into your environment.
|
25
|
+
|
26
|
+
You can also install the gem system-wide in the usual way:
|
27
|
+
|
28
|
+
```bash
|
29
|
+
gem install maltese
|
30
|
+
```
|
31
|
+
|
32
|
+
## Development
|
33
|
+
|
34
|
+
We use rspec for unit testing:
|
35
|
+
|
36
|
+
```
|
37
|
+
bundle exec rspec
|
38
|
+
```
|
39
|
+
|
40
|
+
Follow along via [GitHub Issues](https://github.com/datacite/maltese/issues).
|
41
|
+
|
42
|
+
### Note on Patches/Pull Requests
|
43
|
+
|
44
|
+
* Fork the project
|
45
|
+
* Write tests for your new feature or a test that reproduces a bug
|
46
|
+
* Implement your feature or make a bug fix
|
47
|
+
* Do not mess with Rakefile, version or history
|
48
|
+
* Commit, push and make a pull request. Bonus points for topical branches.
|
49
|
+
|
50
|
+
## License
|
51
|
+
**maltese** is released under the [MIT License](https://github.com/datacite/maltese/blob/master/LICENSE.md).
|
data/bin/maltese
ADDED
data/lib/maltese/cli.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require "thor"
|
4
|
+
require_relative 'sitemap'
|
5
|
+
|
6
|
+
module Maltese
|
7
|
+
class CLI < Thor
|
8
|
+
def self.exit_on_failure?
|
9
|
+
true
|
10
|
+
end
|
11
|
+
|
12
|
+
# from http://stackoverflow.com/questions/22809972/adding-a-version-option-to-a-ruby-thor-cli
|
13
|
+
map %w[--version -v] => :__print_version
|
14
|
+
|
15
|
+
desc "--version, -v", "print the version"
|
16
|
+
# Thor command behind the `--version` / `-v` flags (wired up by the `map` call
# above): writes this gem's version string to stdout.
def __print_version
  # The version constant lives in this gem's own namespace. The earlier
  # reference to Toccatore::VERSION named a constant this gem never defines,
  # so the command raised NameError instead of printing anything.
  puts Maltese::VERSION
end
|
19
|
+
|
20
|
+
desc "sitemap", "generate sitemap for DataCite Search"
|
21
|
+
method_option :sitemap_bucket, type: :string, default: ENV['SITEMAP_BUCKET']
|
22
|
+
method_option :sitemap_url, type: :string, default: ENV['SITEMAP_URL']
|
23
|
+
method_option :from_date, type: :string, default: (Time.now.to_date - 1.day).iso8601
|
24
|
+
method_option :until_date, type: :string, default: Time.now.to_date.iso8601
|
25
|
+
def sitemap
|
26
|
+
sitemap = Maltese::Sitemap.new(options)
|
27
|
+
sitemap.queue_jobs
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
module Maltese
|
2
|
+
class Sitemap
|
3
|
+
attr_reader :sitemap_bucket, :sitemap_url, :from_date, :until_date
|
4
|
+
|
5
|
+
def initialize(attributes={})
|
6
|
+
@sitemap_bucket = attributes[:sitemap_bucket]|| "sitemaps.datacite.org"
|
7
|
+
@sitemap_url = attributes[:sitemap_url] || "https://search.datacite.org"
|
8
|
+
@from_date = attributes[:from_date].presence || (Time.now.to_date - 1.day).iso8601
|
9
|
+
@until_date = attributes[:until_date].presence || Time.now.to_date.iso8601
|
10
|
+
end
|
11
|
+
|
12
|
+
# load ENV variables from container environment if json file exists
|
13
|
+
# see https://github.com/phusion/baseimage-docker#envvar_dumps
|
14
|
+
env_json_file = "/etc/container_environment.json"
|
15
|
+
if File.size?(env_json_file).to_i > 2
|
16
|
+
env_vars = JSON.parse(File.read(env_json_file))
|
17
|
+
env_vars.each { |k, v| ENV[k] = v }
|
18
|
+
end
|
19
|
+
|
20
|
+
def search_path
|
21
|
+
"#{sitemap_url}/api?"
|
22
|
+
end
|
23
|
+
|
24
|
+
def sitemaps_host
|
25
|
+
"http://#{sitemap_bucket}.s3.amazonaws.com/"
|
26
|
+
end
|
27
|
+
|
28
|
+
def sitemaps_path
|
29
|
+
'sitemaps/'
|
30
|
+
end
|
31
|
+
|
32
|
+
def timeout
|
33
|
+
120
|
34
|
+
end
|
35
|
+
|
36
|
+
def job_batch_size
|
37
|
+
50000
|
38
|
+
end
|
39
|
+
|
40
|
+
def sitemap
|
41
|
+
@sitemap ||= SitemapGenerator::LinkSet.new(
|
42
|
+
default_host: sitemap_url,
|
43
|
+
adapter: s3_adapter,
|
44
|
+
sitemaps_host: sitemaps_host,
|
45
|
+
sitemaps_path: sitemaps_path,
|
46
|
+
finalize: false)
|
47
|
+
end
|
48
|
+
|
49
|
+
def s3_adapter
|
50
|
+
SitemapGenerator::S3Adapter.new(fog_provider: 'AWS',
|
51
|
+
aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
|
52
|
+
aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
|
53
|
+
fog_directory: sitemap_bucket,
|
54
|
+
fog_region: ENV['AWS_REGION'])
|
55
|
+
end
|
56
|
+
|
57
|
+
def fog_storage
|
58
|
+
Fog::Storage.new(provider: 'AWS',
|
59
|
+
aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
|
60
|
+
aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'])
|
61
|
+
end
|
62
|
+
|
63
|
+
def queue_jobs(options={})
|
64
|
+
total = get_total(options)
|
65
|
+
|
66
|
+
if total > 0
|
67
|
+
puts process_data(options.merge(total: total))
|
68
|
+
else
|
69
|
+
puts "No works found for date range #{from_date} - #{until_date}."
|
70
|
+
end
|
71
|
+
|
72
|
+
# return number of works queued
|
73
|
+
total
|
74
|
+
end
|
75
|
+
|
76
|
+
def get_total(options={})
|
77
|
+
query_url = get_query_url(options.merge(rows: 0))
|
78
|
+
result = Maremma.get(query_url, options)
|
79
|
+
result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
|
80
|
+
end
|
81
|
+
|
82
|
+
# Builds the URL for one page of the works query.
#
# @param options [Hash] recognised keys:
#   :offset - index of the first row; coerced with to_i, so nil becomes 0
#   :rows   - page size; falls back to job_batch_size when blank
# @return [String] search_path followed by the URL-encoded query parameters
def get_query_url(options={})
  # Work on locals instead of writing the normalised values back into
  # +options+, so the hash passed in by the caller is left untouched.
  offset = options[:offset].to_i
  rows = options[:rows].presence || job_batch_size

  # Restrict to works updated inside the configured date range (whole days).
  updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
  fq = "#{updated} AND has_metadata:true AND is_active:true"

  params = { q: "*:*",
             fq: fq,
             start: offset,
             rows: rows,
             fl: "doi,updated",
             sort: "updated asc",
             wt: "json" }
  search_path + URI.encode_www_form(params)
end
|
98
|
+
|
99
|
+
def process_data(options = {})
|
100
|
+
options[:start_time] = Time.now
|
101
|
+
|
102
|
+
# walk through paginated results
|
103
|
+
total_pages = (options[:total].to_f / job_batch_size).ceil
|
104
|
+
|
105
|
+
(0...total_pages).each do |page|
|
106
|
+
options[:offset] = page * job_batch_size
|
107
|
+
data = get_data(options.merge(timeout: timeout))
|
108
|
+
parse_data(data)
|
109
|
+
end
|
110
|
+
|
111
|
+
push_data(options)
|
112
|
+
end
|
113
|
+
|
114
|
+
def get_data(options={})
|
115
|
+
query_url = get_query_url(options)
|
116
|
+
Maremma.get(query_url, options)
|
117
|
+
end
|
118
|
+
|
119
|
+
def parse_data(result, options={})
|
120
|
+
return result.body.fetch("errors") if result.body.fetch("errors", nil).present?
|
121
|
+
|
122
|
+
items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
|
123
|
+
Array(items).each do |item|
|
124
|
+
loc = "/works/" + item.fetch("doi")
|
125
|
+
sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
|
126
|
+
end
|
127
|
+
sitemap.sitemap.link_count
|
128
|
+
end
|
129
|
+
|
130
|
+
def push_data(options={})
|
131
|
+
# sync time with AWS S3 before uploading
|
132
|
+
fog_storage.sync_clock
|
133
|
+
|
134
|
+
sitemap.finalize!
|
135
|
+
options[:start_time] ||= Time.now
|
136
|
+
sitemap.sitemap_index.stats_summary(:time_taken => Time.now - options[:start_time])
|
137
|
+
sitemap.sitemap.link_count
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module Maltese
|
2
|
+
module Utils
|
3
|
+
# load ENV variables from container environment if json file exists
|
4
|
+
# see https://github.com/phusion/baseimage-docker#envvar_dumps
|
5
|
+
env_json_file = "/etc/container_environment.json"
|
6
|
+
if File.size?(env_json_file).to_i > 2
|
7
|
+
env_vars = JSON.parse(File.read(env_json_file))
|
8
|
+
env_vars.each { |k, v| ENV[k] = v }
|
9
|
+
end
|
10
|
+
|
11
|
+
def queue_jobs(options={})
|
12
|
+
options[:offset] = options[:offset].to_i || 0
|
13
|
+
options[:rows] = options[:rows].presence || job_batch_size
|
14
|
+
|
15
|
+
total = get_total(options)
|
16
|
+
|
17
|
+
if total > 0
|
18
|
+
puts process_data(options.merge(total: total))
|
19
|
+
else
|
20
|
+
puts "No works found for date range #{from_date} - #{until_date}."
|
21
|
+
end
|
22
|
+
|
23
|
+
# return number of works queued
|
24
|
+
total
|
25
|
+
end
|
26
|
+
|
27
|
+
def get_total(options={})
|
28
|
+
query_url = get_query_url(options.merge(rows: 0))
|
29
|
+
result = Maremma.get(query_url, options)
|
30
|
+
result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
|
31
|
+
end
|
32
|
+
|
33
|
+
def get_query_url(options={})
|
34
|
+
updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
|
35
|
+
fq = "#{updated} AND has_metadata:true AND is_active:true"
|
36
|
+
|
37
|
+
params = { q: "*:*",
|
38
|
+
fq: fq,
|
39
|
+
start: options[:offset],
|
40
|
+
rows: options[:rows],
|
41
|
+
fl: "doi,updated",
|
42
|
+
sort: "updated asc",
|
43
|
+
wt: "json" }
|
44
|
+
url + URI.encode_www_form(params)
|
45
|
+
end
|
46
|
+
|
47
|
+
def process_data(options = {})
|
48
|
+
options[:start_time] = Time.now
|
49
|
+
|
50
|
+
# walk through paginated results
|
51
|
+
total_pages = (options[:total].to_f / job_batch_size).ceil
|
52
|
+
|
53
|
+
(0...total_pages).each do |page|
|
54
|
+
options[:offset] = page * job_batch_size
|
55
|
+
data = get_data(options.merge(timeout: timeout))
|
56
|
+
parse_data(data)
|
57
|
+
end
|
58
|
+
|
59
|
+
push_data(options)
|
60
|
+
end
|
61
|
+
|
62
|
+
def get_data(options={})
|
63
|
+
query_url = get_query_url(options)
|
64
|
+
Maremma.get(query_url, options)
|
65
|
+
end
|
66
|
+
|
67
|
+
def parse_data(result, options={})
|
68
|
+
return result.body.fetch("errors") if result.body.fetch("errors", nil).present?
|
69
|
+
|
70
|
+
items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
|
71
|
+
Array(items).each do |item|
|
72
|
+
loc = "/works/" + item.fetch("doi")
|
73
|
+
sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
|
74
|
+
end
|
75
|
+
sitemap.sitemap.link_count
|
76
|
+
end
|
77
|
+
|
78
|
+
# Finalizes the sitemap and reports how long the run took.
#
# @param options [Hash] :start_time (Time) marks when processing began;
#   when absent the current time is used
# @return the value returned by sitemap.sitemap_index.stats_summary
def push_data(options={})
  # sync time with AWS S3 before uploading
  fog_storage.sync_clock

  sitemap.finalize!
  # Subtracting nil from a Time raises TypeError, so fall back to "now" when
  # the caller did not supply :start_time.
  start_time = options[:start_time] || Time.now
  time_taken = Time.now - start_time
  sitemap.sitemap_index.stats_summary(:time_taken => time_taken)
end
|
86
|
+
end
|
87
|
+
end
|
data/lib/maltese.rb
ADDED
data/maltese.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require "date"
|
2
|
+
require File.expand_path("../lib/maltese/version", __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.authors = "Martin Fenner"
|
6
|
+
s.email = "mfenner@datacite.org"
|
7
|
+
s.name = "maltese"
|
8
|
+
s.homepage = "https://github.com/datacite/maltese"
|
9
|
+
s.summary = "Ruby library to generate sitemap for DataCite Search"
|
10
|
+
s.date = Date.today
|
11
|
+
s.description = "Ruby library to generate sitemap for DataCite Search."
|
12
|
+
s.require_paths = ["lib"]
|
13
|
+
s.version = Maltese::VERSION
|
14
|
+
s.extra_rdoc_files = ["README.md"]
|
15
|
+
s.license = 'MIT'
|
16
|
+
|
17
|
+
# Declare dependencies here, rather than in the Gemfile
|
18
|
+
s.add_dependency 'maremma', '~> 3.5'
|
19
|
+
s.add_dependency 'activesupport', '~> 4.2', '>= 4.2.5'
|
20
|
+
s.add_dependency 'thor', '~> 0.19'
|
21
|
+
s.add_dependency 'sitemap_generator', '~> 5.1'
|
22
|
+
s.add_dependency 'fog-aws', '~> 0.7.6'
|
23
|
+
s.add_dependency 'mime-types', '~> 3.1'
|
24
|
+
s.add_development_dependency 'bundler', '~> 1.0'
|
25
|
+
s.add_development_dependency 'rspec', '~> 3.4'
|
26
|
+
s.add_development_dependency 'rake', '~> 12.0'
|
27
|
+
s.add_development_dependency 'rack-test', '~> 0'
|
28
|
+
s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
|
29
|
+
s.add_development_dependency 'webmock', '~> 1.22', '>= 1.22.3'
|
30
|
+
s.add_development_dependency 'codeclimate-test-reporter', '~> 1.0', '>= 1.0.0'
|
31
|
+
s.add_development_dependency 'simplecov', '~> 0.12.0'
|
32
|
+
|
33
|
+
s.require_paths = ["lib"]
|
34
|
+
s.files = `git ls-files`.split($/)
|
35
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
36
|
+
s.executables = ["maltese"]
|
37
|
+
end
|
Binary file
|
data/spec/cli_spec.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'maltese/cli'
|
3
|
+
|
4
|
+
describe Maltese::CLI do
|
5
|
+
let(:subject) do
|
6
|
+
described_class.new
|
7
|
+
end
|
8
|
+
|
9
|
+
let(:push_url) { ENV['VOLPINO_URL'] }
|
10
|
+
let(:access_token) { ENV['VOLPINO_TOKEN'] }
|
11
|
+
let(:from_date) { "2015-04-07" }
|
12
|
+
let(:until_date) { "2015-04-08" }
|
13
|
+
let(:cli_options) { { push_url: push_url,
|
14
|
+
access_token: access_token,
|
15
|
+
from_date: from_date,
|
16
|
+
until_date: until_date } }
|
17
|
+
|
18
|
+
describe "sitemap", vcr: true, :order => :defined do
|
19
|
+
it 'should succeed' do
|
20
|
+
subject.options = cli_options
|
21
|
+
expect { subject.sitemap }.to output(/2522 links/).to_stdout
|
22
|
+
sitemap = Zlib::GzipReader.open("public/sitemap.xml.gz") { |gz| gz.read }
|
23
|
+
doc = Nokogiri::XML(sitemap)
|
24
|
+
expect(doc.xpath("//xmlns:url").size).to eq(2522)
|
25
|
+
expect(doc.xpath("//xmlns:loc").last.text).to eq("https://search.datacite.org/works/10.6084/M9.FIGSHARE.1371139")
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should succeed with no works' do
|
29
|
+
from_date = "2005-04-07"
|
30
|
+
until_date = "2005-04-08"
|
31
|
+
subject.options = { push_url: push_url,
|
32
|
+
access_token: access_token,
|
33
|
+
from_date: from_date,
|
34
|
+
until_date: until_date }
|
35
|
+
expect { subject.sitemap }.to output("No works found for date range 2005-04-07 - 2005-04-08.\n").to_stdout
|
36
|
+
end
|
37
|
+
|
38
|
+
# it 'should fail' do
|
39
|
+
# subject.options = cli_options.except(:access_token)
|
40
|
+
# expect { subject.sitemap }.to output(/An error occured: Access token missing.\n/).to_stdout
|
41
|
+
# end
|
42
|
+
end
|
43
|
+
end
|