maltese 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +19 -0
- data/.gitignore +55 -0
- data/.rubocop.yml +1156 -0
- data/.travis.yml +26 -0
- data/CHANGELOG.md +5 -0
- data/Dockerfile +16 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +128 -0
- data/LICENSE.md +21 -0
- data/README.md +51 -0
- data/bin/maltese +5 -0
- data/lib/maltese/cli.rb +30 -0
- data/lib/maltese/sitemap.rb +140 -0
- data/lib/maltese/utils.rb +87 -0
- data/lib/maltese/version.rb +3 -0
- data/lib/maltese.rb +8 -0
- data/maltese.gemspec +37 -0
- data/public/sitemap.xml.gz +0 -0
- data/spec/cli_spec.rb +43 -0
- data/spec/fixtures/sitemap.json +7574 -0
- data/spec/fixtures/sitemap_nil.json +11 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_no_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +86 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +86 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +270 -0
- data/spec/sitemap_spec.rb +111 -0
- data/spec/spec_helper.rb +95 -0
- metadata +310 -0
data/.travis.yml
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 2.3.3
|
4
|
+
|
5
|
+
addons:
|
6
|
+
code_climate:
|
7
|
+
repo_token: "$CODECLIMATE_REPO_TOKEN"
|
8
|
+
|
9
|
+
install:
|
10
|
+
- travis_retry bundle install
|
11
|
+
|
12
|
+
script:
|
13
|
+
- bundle exec rspec
|
14
|
+
- bundle exec codeclimate-test-reporter
|
15
|
+
|
16
|
+
notifications:
|
17
|
+
email: false
|
18
|
+
|
19
|
+
deploy:
|
20
|
+
provider: rubygems
|
21
|
+
api_key:
|
22
|
+
secure: uT9YVkRp1usg+glYDSG7KJkm8CQGI8pZDbHlmbPz6ibbA8DVyjbBtYjGbvODCoRkisC24kSy31gMqBSmIxLG0ICv2tOy/iaoiuVeUk6NFfP4dcVGsDueQXjqd6Fjw6fCBg42sojwAVWzvDP2EVjQnbcZqROasLPmKuC2qrm+f9aSYLXmGyBtpvJ5FsfpW33OvE3qJD3y0AlPMdCihPe03FVzSiLNMmGuYOH97MucuWGbUJN+tSFiBfqIrAGT2TQXFrdiT3HtxEt+vNH0cGoLQAKgTgx4XPAcKEjg/cML5yhY/OcPR0uNgqdjxqS3faaH31r1xZaGGfHTf9dj++123YLNHbI8odyA9eF+jYU/3D8UnmMpsTNGZXCFUS8xVUobDcejhPBNhqGPLruLtbvIaqpVZ2bF9BOY1F0ILp4GERzUUUxws+BB1EJ6zFpNrDl7MHlqrc+gRZWcWlazQ82BmLQsTVHiab3ZerGCP4+kYiNeyEnsa3wmVDDd2iffU05Bse44/W1/BKmlzV0QfYl1iMA8lkCrgqmslFecCf0xA01v4CF2Hv63PxOeNmNvZm4VIkgy9uPBjD91AVdscSzCRuTc149OluBqUoxUToX9rEegheUXhWs6ww6DHtlRQI+OBNauRUCo7Fb2zV+gTNzUCSln0fE+z9aLhduuA8JLAKk=
|
23
|
+
gem: maltese
|
24
|
+
on:
|
25
|
+
tags: true
|
26
|
+
repo: datacite/maltese
|
data/CHANGELOG.md
ADDED
data/Dockerfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
FROM phusion/passenger-full:0.9.20
|
2
|
+
MAINTAINER Martin Fenner "mfenner@datacite.org"
|
3
|
+
|
4
|
+
# Install Ruby 2.3.3
|
5
|
+
RUN bash -lc 'rvm --default use ruby-2.3.3'
|
6
|
+
|
7
|
+
ENV PATH="/usr/local/rvm/gems/ruby-2.3.3/bin:${PATH}"
|
8
|
+
|
9
|
+
# Update installed APT packages, clean up APT when done.
|
10
|
+
RUN apt-get update && apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \
|
11
|
+
apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
12
|
+
|
13
|
+
# Install maltese gem
|
14
|
+
RUN /sbin/setuser app gem install maltese
|
15
|
+
|
16
|
+
CMD maltese sitemap --sitemap_bucket $SITEMAP_BUCKET --sitemap_url $SITEMAP_URL --from_date $FROM_DATE --until_date $UNTIL_DATE
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
maltese (0.1.2)
|
5
|
+
activesupport (~> 4.2, >= 4.2.5)
|
6
|
+
fog-aws (~> 0.7.6)
|
7
|
+
maremma (~> 3.5)
|
8
|
+
mime-types (~> 3.1)
|
9
|
+
sitemap_generator (~> 5.1)
|
10
|
+
thor (~> 0.19)
|
11
|
+
|
12
|
+
GEM
|
13
|
+
remote: https://rubygems.org/
|
14
|
+
specs:
|
15
|
+
activesupport (4.2.8)
|
16
|
+
i18n (~> 0.7)
|
17
|
+
minitest (~> 5.1)
|
18
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
19
|
+
tzinfo (~> 1.1)
|
20
|
+
addressable (2.5.0)
|
21
|
+
public_suffix (~> 2.0, >= 2.0.2)
|
22
|
+
builder (3.2.3)
|
23
|
+
codeclimate-test-reporter (1.0.6)
|
24
|
+
simplecov
|
25
|
+
crack (0.4.3)
|
26
|
+
safe_yaml (~> 1.0.0)
|
27
|
+
diff-lcs (1.3)
|
28
|
+
docile (1.1.5)
|
29
|
+
excon (0.45.4)
|
30
|
+
faraday (0.9.2)
|
31
|
+
multipart-post (>= 1.2, < 3)
|
32
|
+
faraday-encoding (0.0.4)
|
33
|
+
faraday
|
34
|
+
faraday_middleware (0.10.1)
|
35
|
+
faraday (>= 0.7.4, < 1.0)
|
36
|
+
fog-aws (0.7.6)
|
37
|
+
fog-core (~> 1.27)
|
38
|
+
fog-json (~> 1.0)
|
39
|
+
fog-xml (~> 0.1)
|
40
|
+
ipaddress (~> 0.8)
|
41
|
+
fog-core (1.37.0)
|
42
|
+
builder
|
43
|
+
excon (~> 0.45)
|
44
|
+
formatador (~> 0.2)
|
45
|
+
fog-json (1.0.2)
|
46
|
+
fog-core (~> 1.0)
|
47
|
+
multi_json (~> 1.10)
|
48
|
+
fog-xml (0.1.2)
|
49
|
+
fog-core
|
50
|
+
nokogiri (~> 1.5, >= 1.5.11)
|
51
|
+
formatador (0.2.5)
|
52
|
+
hashdiff (0.3.2)
|
53
|
+
i18n (0.8.1)
|
54
|
+
ipaddress (0.8.3)
|
55
|
+
json (2.0.3)
|
56
|
+
maremma (3.5.1)
|
57
|
+
activesupport (~> 4.2, >= 4.2.5)
|
58
|
+
addressable (>= 2.3.6)
|
59
|
+
builder (~> 3.2, >= 3.2.2)
|
60
|
+
excon (~> 0.45.0)
|
61
|
+
faraday (~> 0.9.2)
|
62
|
+
faraday-encoding (~> 0.0.1)
|
63
|
+
faraday_middleware (~> 0.10.0)
|
64
|
+
multi_json (~> 1.11.2)
|
65
|
+
nokogiri (~> 1.6.7)
|
66
|
+
oj (~> 2.18, >= 2.18.1)
|
67
|
+
mime-types (3.1)
|
68
|
+
mime-types-data (~> 3.2015)
|
69
|
+
mime-types-data (3.2016.0521)
|
70
|
+
mini_portile2 (2.1.0)
|
71
|
+
minitest (5.10.1)
|
72
|
+
multi_json (1.11.3)
|
73
|
+
multipart-post (2.0.0)
|
74
|
+
nokogiri (1.6.8.1)
|
75
|
+
mini_portile2 (~> 2.1.0)
|
76
|
+
oj (2.18.1)
|
77
|
+
public_suffix (2.0.5)
|
78
|
+
rack (2.0.1)
|
79
|
+
rack-test (0.6.3)
|
80
|
+
rack (>= 1.0)
|
81
|
+
rake (12.0.0)
|
82
|
+
rspec (3.5.0)
|
83
|
+
rspec-core (~> 3.5.0)
|
84
|
+
rspec-expectations (~> 3.5.0)
|
85
|
+
rspec-mocks (~> 3.5.0)
|
86
|
+
rspec-core (3.5.4)
|
87
|
+
rspec-support (~> 3.5.0)
|
88
|
+
rspec-expectations (3.5.0)
|
89
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
90
|
+
rspec-support (~> 3.5.0)
|
91
|
+
rspec-mocks (3.5.0)
|
92
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
93
|
+
rspec-support (~> 3.5.0)
|
94
|
+
rspec-support (3.5.0)
|
95
|
+
safe_yaml (1.0.4)
|
96
|
+
simplecov (0.12.0)
|
97
|
+
docile (~> 1.1.0)
|
98
|
+
json (>= 1.8, < 3)
|
99
|
+
simplecov-html (~> 0.10.0)
|
100
|
+
simplecov-html (0.10.0)
|
101
|
+
sitemap_generator (5.3.0)
|
102
|
+
builder (~> 3.0)
|
103
|
+
thor (0.19.4)
|
104
|
+
thread_safe (0.3.6)
|
105
|
+
tzinfo (1.2.2)
|
106
|
+
thread_safe (~> 0.1)
|
107
|
+
vcr (3.0.3)
|
108
|
+
webmock (1.24.6)
|
109
|
+
addressable (>= 2.3.6)
|
110
|
+
crack (>= 0.3.2)
|
111
|
+
hashdiff
|
112
|
+
|
113
|
+
PLATFORMS
|
114
|
+
ruby
|
115
|
+
|
116
|
+
DEPENDENCIES
|
117
|
+
bundler (~> 1.0)
|
118
|
+
codeclimate-test-reporter (~> 1.0, >= 1.0.0)
|
119
|
+
maltese!
|
120
|
+
rack-test (~> 0)
|
121
|
+
rake (~> 12.0)
|
122
|
+
rspec (~> 3.4)
|
123
|
+
simplecov (~> 0.12.0)
|
124
|
+
vcr (~> 3.0, >= 3.0.3)
|
125
|
+
webmock (~> 1.22, >= 1.22.3)
|
126
|
+
|
127
|
+
BUNDLED WITH
|
128
|
+
1.12.5
|
data/LICENSE.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2017 DataCite
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
[Build Status](https://travis-ci.org/datacite/maltese)
|
2
|
+
[Code Climate](https://codeclimate.com/github/datacite/maltese)
|
3
|
+
[Test Coverage](https://codeclimate.com/github/datacite/maltese/coverage)
|
4
|
+
|
5
|
+
# Maltese
|
6
|
+
|
7
|
+
Ruby gem and command-line tool for generating sitemap files from the DataCite REST API. Uses the [SitemapGenerator](https://github.com/kjvarga/sitemap_generator) gem and can be run as a Docker container, e.g. using ECS and triggered by AWS Lambda, as described [here](https://medium.com/@pahud/ecs-task-runner-with-lambda-4594b72ccb#.5xpmf2inz).
|
8
|
+
|
9
|
+
Run as a command-line tool:
|
10
|
+
|
11
|
+
```
|
12
|
+
maltese sitemap --from_date 2017-02-15
|
13
|
+
```
|
14
|
+
|
15
|
+
## Installation
|
16
|
+
|
17
|
+
Requires Ruby 2.2 or later. Then add the following to your `Gemfile` to install the
|
18
|
+
latest version:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem 'maltese'
|
22
|
+
```
|
23
|
+
|
24
|
+
Then run `bundle install` to install into your environment.
|
25
|
+
|
26
|
+
You can also install the gem system-wide in the usual way:
|
27
|
+
|
28
|
+
```bash
|
29
|
+
gem install maltese
|
30
|
+
```
|
31
|
+
|
32
|
+
## Development
|
33
|
+
|
34
|
+
We use rspec for unit testing:
|
35
|
+
|
36
|
+
```
|
37
|
+
bundle exec rspec
|
38
|
+
```
|
39
|
+
|
40
|
+
Follow along via [Github Issues](https://github.com/datacite/maltese/issues).
|
41
|
+
|
42
|
+
### Note on Patches/Pull Requests
|
43
|
+
|
44
|
+
* Fork the project
|
45
|
+
* Write tests for your new feature or a test that reproduces a bug
|
46
|
+
* Implement your feature or make a bug fix
|
47
|
+
* Do not mess with Rakefile, version or history
|
48
|
+
* Commit, push and make a pull request. Bonus points for topical branches.
|
49
|
+
|
50
|
+
## License
|
51
|
+
**maltese** is released under the [MIT License](https://github.com/datacite/maltese/blob/master/LICENSE.md).
|
data/bin/maltese
ADDED
data/lib/maltese/cli.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require "thor"
|
4
|
+
require_relative 'sitemap'
|
5
|
+
|
6
|
+
module Maltese
|
7
|
+
class CLI < Thor
|
8
|
+
def self.exit_on_failure?
|
9
|
+
true
|
10
|
+
end
|
11
|
+
|
12
|
+
# from http://stackoverflow.com/questions/22809972/adding-a-version-option-to-a-ruby-thor-cli
|
13
|
+
map %w[--version -v] => :__print_version
|
14
|
+
|
15
|
+
desc "--version, -v", "print the version"
|
16
|
+
# Print the gem's version string to stdout.
#
# Mapped to the `--version` / `-v` switches above. The constant must be
# Maltese::VERSION: this gem does not define a Toccatore namespace, so
# referencing it raised NameError instead of printing the version.
def __print_version
  puts Maltese::VERSION
end
|
19
|
+
|
20
|
+
desc "sitemap", "generate sitemap for DataCite Search"
|
21
|
+
method_option :sitemap_bucket, type: :string, default: ENV['SITEMAP_BUCKET']
|
22
|
+
method_option :sitemap_url, type: :string, default: ENV['SITEMAP_URL']
|
23
|
+
method_option :from_date, type: :string, default: (Time.now.to_date - 1.day).iso8601
|
24
|
+
method_option :until_date, type: :string, default: Time.now.to_date.iso8601
|
25
|
+
def sitemap
|
26
|
+
sitemap = Maltese::Sitemap.new(options)
|
27
|
+
sitemap.queue_jobs
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
module Maltese
|
2
|
+
class Sitemap
|
3
|
+
attr_reader :sitemap_bucket, :sitemap_url, :from_date, :until_date
|
4
|
+
|
5
|
+
def initialize(attributes={})
|
6
|
+
@sitemap_bucket = attributes[:sitemap_bucket]|| "sitemaps.datacite.org"
|
7
|
+
@sitemap_url = attributes[:sitemap_url] || "https://search.datacite.org"
|
8
|
+
@from_date = attributes[:from_date].presence || (Time.now.to_date - 1.day).iso8601
|
9
|
+
@until_date = attributes[:until_date].presence || Time.now.to_date.iso8601
|
10
|
+
end
|
11
|
+
|
12
|
+
# load ENV variables from container environment if json file exists
|
13
|
+
# see https://github.com/phusion/baseimage-docker#envvar_dumps
|
14
|
+
env_json_file = "/etc/container_environment.json"
|
15
|
+
if File.size?(env_json_file).to_i > 2
|
16
|
+
env_vars = JSON.parse(File.read(env_json_file))
|
17
|
+
env_vars.each { |k, v| ENV[k] = v }
|
18
|
+
end
|
19
|
+
|
20
|
+
def search_path
|
21
|
+
"#{sitemap_url}/api?"
|
22
|
+
end
|
23
|
+
|
24
|
+
def sitemaps_host
|
25
|
+
"http://#{sitemap_bucket}.s3.amazonaws.com/"
|
26
|
+
end
|
27
|
+
|
28
|
+
def sitemaps_path
|
29
|
+
'sitemaps/'
|
30
|
+
end
|
31
|
+
|
32
|
+
def timeout
|
33
|
+
120
|
34
|
+
end
|
35
|
+
|
36
|
+
def job_batch_size
|
37
|
+
50000
|
38
|
+
end
|
39
|
+
|
40
|
+
def sitemap
|
41
|
+
@sitemap ||= SitemapGenerator::LinkSet.new(
|
42
|
+
default_host: sitemap_url,
|
43
|
+
adapter: s3_adapter,
|
44
|
+
sitemaps_host: sitemaps_host,
|
45
|
+
sitemaps_path: sitemaps_path,
|
46
|
+
finalize: false)
|
47
|
+
end
|
48
|
+
|
49
|
+
def s3_adapter
|
50
|
+
SitemapGenerator::S3Adapter.new(fog_provider: 'AWS',
|
51
|
+
aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
|
52
|
+
aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
|
53
|
+
fog_directory: sitemap_bucket,
|
54
|
+
fog_region: ENV['AWS_REGION'])
|
55
|
+
end
|
56
|
+
|
57
|
+
def fog_storage
|
58
|
+
Fog::Storage.new(provider: 'AWS',
|
59
|
+
aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
|
60
|
+
aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'])
|
61
|
+
end
|
62
|
+
|
63
|
+
def queue_jobs(options={})
|
64
|
+
total = get_total(options)
|
65
|
+
|
66
|
+
if total > 0
|
67
|
+
puts process_data(options.merge(total: total))
|
68
|
+
else
|
69
|
+
puts "No works found for date range #{from_date} - #{until_date}."
|
70
|
+
end
|
71
|
+
|
72
|
+
# return number of works queued
|
73
|
+
total
|
74
|
+
end
|
75
|
+
|
76
|
+
def get_total(options={})
|
77
|
+
query_url = get_query_url(options.merge(rows: 0))
|
78
|
+
result = Maremma.get(query_url, options)
|
79
|
+
result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
|
80
|
+
end
|
81
|
+
|
82
|
+
def get_query_url(options={})
|
83
|
+
options[:offset] = options[:offset].to_i || 0
|
84
|
+
options[:rows] = options[:rows].presence || job_batch_size
|
85
|
+
|
86
|
+
updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
|
87
|
+
fq = "#{updated} AND has_metadata:true AND is_active:true"
|
88
|
+
|
89
|
+
params = { q: "*:*",
|
90
|
+
fq: fq,
|
91
|
+
start: options[:offset],
|
92
|
+
rows: options[:rows],
|
93
|
+
fl: "doi,updated",
|
94
|
+
sort: "updated asc",
|
95
|
+
wt: "json" }
|
96
|
+
search_path + URI.encode_www_form(params)
|
97
|
+
end
|
98
|
+
|
99
|
+
def process_data(options = {})
|
100
|
+
options[:start_time] = Time.now
|
101
|
+
|
102
|
+
# walk through paginated results
|
103
|
+
total_pages = (options[:total].to_f / job_batch_size).ceil
|
104
|
+
|
105
|
+
(0...total_pages).each do |page|
|
106
|
+
options[:offset] = page * job_batch_size
|
107
|
+
data = get_data(options.merge(timeout: timeout))
|
108
|
+
parse_data(data)
|
109
|
+
end
|
110
|
+
|
111
|
+
push_data(options)
|
112
|
+
end
|
113
|
+
|
114
|
+
def get_data(options={})
|
115
|
+
query_url = get_query_url(options)
|
116
|
+
Maremma.get(query_url, options)
|
117
|
+
end
|
118
|
+
|
119
|
+
def parse_data(result, options={})
|
120
|
+
return result.body.fetch("errors") if result.body.fetch("errors", nil).present?
|
121
|
+
|
122
|
+
items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
|
123
|
+
Array(items).each do |item|
|
124
|
+
loc = "/works/" + item.fetch("doi")
|
125
|
+
sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
|
126
|
+
end
|
127
|
+
sitemap.sitemap.link_count
|
128
|
+
end
|
129
|
+
|
130
|
+
def push_data(options={})
|
131
|
+
# sync time with AWS S3 before uploading
|
132
|
+
fog_storage.sync_clock
|
133
|
+
|
134
|
+
sitemap.finalize!
|
135
|
+
options[:start_time] ||= Time.now
|
136
|
+
sitemap.sitemap_index.stats_summary(:time_taken => Time.now - options[:start_time])
|
137
|
+
sitemap.sitemap.link_count
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module Maltese
|
2
|
+
module Utils
|
3
|
+
# load ENV variables from container environment if json file exists
|
4
|
+
# see https://github.com/phusion/baseimage-docker#envvar_dumps
|
5
|
+
env_json_file = "/etc/container_environment.json"
|
6
|
+
if File.size?(env_json_file).to_i > 2
|
7
|
+
env_vars = JSON.parse(File.read(env_json_file))
|
8
|
+
env_vars.each { |k, v| ENV[k] = v }
|
9
|
+
end
|
10
|
+
|
11
|
+
def queue_jobs(options={})
|
12
|
+
options[:offset] = options[:offset].to_i || 0
|
13
|
+
options[:rows] = options[:rows].presence || job_batch_size
|
14
|
+
|
15
|
+
total = get_total(options)
|
16
|
+
|
17
|
+
if total > 0
|
18
|
+
puts process_data(options.merge(total: total))
|
19
|
+
else
|
20
|
+
puts "No works found for date range #{from_date} - #{until_date}."
|
21
|
+
end
|
22
|
+
|
23
|
+
# return number of works queued
|
24
|
+
total
|
25
|
+
end
|
26
|
+
|
27
|
+
def get_total(options={})
|
28
|
+
query_url = get_query_url(options.merge(rows: 0))
|
29
|
+
result = Maremma.get(query_url, options)
|
30
|
+
result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
|
31
|
+
end
|
32
|
+
|
33
|
+
def get_query_url(options={})
|
34
|
+
updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
|
35
|
+
fq = "#{updated} AND has_metadata:true AND is_active:true"
|
36
|
+
|
37
|
+
params = { q: "*:*",
|
38
|
+
fq: fq,
|
39
|
+
start: options[:offset],
|
40
|
+
rows: options[:rows],
|
41
|
+
fl: "doi,updated",
|
42
|
+
sort: "updated asc",
|
43
|
+
wt: "json" }
|
44
|
+
url + URI.encode_www_form(params)
|
45
|
+
end
|
46
|
+
|
47
|
+
def process_data(options = {})
|
48
|
+
options[:start_time] = Time.now
|
49
|
+
|
50
|
+
# walk through paginated results
|
51
|
+
total_pages = (options[:total].to_f / job_batch_size).ceil
|
52
|
+
|
53
|
+
(0...total_pages).each do |page|
|
54
|
+
options[:offset] = page * job_batch_size
|
55
|
+
data = get_data(options.merge(timeout: timeout))
|
56
|
+
parse_data(data)
|
57
|
+
end
|
58
|
+
|
59
|
+
push_data(options)
|
60
|
+
end
|
61
|
+
|
62
|
+
def get_data(options={})
|
63
|
+
query_url = get_query_url(options)
|
64
|
+
Maremma.get(query_url, options)
|
65
|
+
end
|
66
|
+
|
67
|
+
def parse_data(result, options={})
|
68
|
+
return result.body.fetch("errors") if result.body.fetch("errors", nil).present?
|
69
|
+
|
70
|
+
items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
|
71
|
+
Array(items).each do |item|
|
72
|
+
loc = "/works/" + item.fetch("doi")
|
73
|
+
sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
|
74
|
+
end
|
75
|
+
sitemap.sitemap.link_count
|
76
|
+
end
|
77
|
+
|
78
|
+
# Finalize the sitemap (which writes it out through the configured adapter)
# and report how long the run took.
#
# options - Hash of run options:
#           :start_time - Time the run started (optional). When absent the
#                         elapsed time is measured from now, i.e. ~0 seconds.
#
# Returns whatever sitemap_index.stats_summary returns.
def push_data(options={})
  # sync time with AWS S3 before uploading
  fog_storage.sync_clock

  sitemap.finalize!

  # options defaults to {}, so :start_time may be nil; subtracting nil from a
  # Time raises TypeError. Fall back to "now" without mutating the caller's hash.
  started_at = options[:start_time] || Time.now
  time_taken = Time.now - started_at
  sitemap.sitemap_index.stats_summary(:time_taken => time_taken)
end
|
86
|
+
end
|
87
|
+
end
|
data/lib/maltese.rb
ADDED
data/maltese.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require "date"
|
2
|
+
require File.expand_path("../lib/maltese/version", __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.authors = "Martin Fenner"
|
6
|
+
s.email = "mfenner@datacite.org"
|
7
|
+
s.name = "maltese"
|
8
|
+
s.homepage = "https://github.com/datacite/maltese"
|
9
|
+
s.summary = "Ruby library to generate sitemap for DataCite Search"
|
10
|
+
s.date = Date.today
|
11
|
+
s.description = "Ruby library to generate sitemap for DataCite Search."
|
12
|
+
s.require_paths = ["lib"]
|
13
|
+
s.version = Maltese::VERSION
|
14
|
+
s.extra_rdoc_files = ["README.md"]
|
15
|
+
s.license = 'MIT'
|
16
|
+
|
17
|
+
# Declare dependencies here, rather than in the Gemfile
|
18
|
+
s.add_dependency 'maremma', '~> 3.5'
|
19
|
+
s.add_dependency 'activesupport', '~> 4.2', '>= 4.2.5'
|
20
|
+
s.add_dependency 'thor', '~> 0.19'
|
21
|
+
s.add_dependency 'sitemap_generator', '~> 5.1'
|
22
|
+
s.add_dependency 'fog-aws', '~> 0.7.6'
|
23
|
+
s.add_dependency 'mime-types', '~> 3.1'
|
24
|
+
s.add_development_dependency 'bundler', '~> 1.0'
|
25
|
+
s.add_development_dependency 'rspec', '~> 3.4'
|
26
|
+
s.add_development_dependency 'rake', '~> 12.0'
|
27
|
+
s.add_development_dependency 'rack-test', '~> 0'
|
28
|
+
s.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
|
29
|
+
s.add_development_dependency 'webmock', '~> 1.22', '>= 1.22.3'
|
30
|
+
s.add_development_dependency 'codeclimate-test-reporter', '~> 1.0', '>= 1.0.0'
|
31
|
+
s.add_development_dependency 'simplecov', '~> 0.12.0'
|
32
|
+
|
33
|
+
s.require_paths = ["lib"]
|
34
|
+
s.files = `git ls-files`.split($/)
|
35
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
36
|
+
s.executables = ["maltese"]
|
37
|
+
end
|
Binary file
|
data/spec/cli_spec.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'maltese/cli'
|
3
|
+
|
4
|
+
describe Maltese::CLI do
|
5
|
+
let(:subject) do
|
6
|
+
described_class.new
|
7
|
+
end
|
8
|
+
|
9
|
+
let(:push_url) { ENV['VOLPINO_URL'] }
|
10
|
+
let(:access_token) { ENV['VOLPINO_TOKEN'] }
|
11
|
+
let(:from_date) { "2015-04-07" }
|
12
|
+
let(:until_date) { "2015-04-08" }
|
13
|
+
let(:cli_options) { { push_url: push_url,
|
14
|
+
access_token: access_token,
|
15
|
+
from_date: from_date,
|
16
|
+
until_date: until_date } }
|
17
|
+
|
18
|
+
describe "sitemap", vcr: true, :order => :defined do
|
19
|
+
it 'should succeed' do
|
20
|
+
subject.options = cli_options
|
21
|
+
expect { subject.sitemap }.to output(/2522 links/).to_stdout
|
22
|
+
sitemap = Zlib::GzipReader.open("public/sitemap.xml.gz") { |gz| gz.read }
|
23
|
+
doc = Nokogiri::XML(sitemap)
|
24
|
+
expect(doc.xpath("//xmlns:url").size).to eq(2522)
|
25
|
+
expect(doc.xpath("//xmlns:loc").last.text).to eq("https://search.datacite.org/works/10.6084/M9.FIGSHARE.1371139")
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should succeed with no works' do
|
29
|
+
from_date = "2005-04-07"
|
30
|
+
until_date = "2005-04-08"
|
31
|
+
subject.options = { push_url: push_url,
|
32
|
+
access_token: access_token,
|
33
|
+
from_date: from_date,
|
34
|
+
until_date: until_date }
|
35
|
+
expect { subject.sitemap }.to output("No works found for date range 2005-04-07 - 2005-04-08.\n").to_stdout
|
36
|
+
end
|
37
|
+
|
38
|
+
# it 'should fail' do
|
39
|
+
# subject.options = cli_options.except(:access_token)
|
40
|
+
# expect { subject.sitemap }.to output(/An error occured: Access token missing.\n/).to_stdout
|
41
|
+
# end
|
42
|
+
end
|
43
|
+
end
|