maltese 0.2.4 → 0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +137 -1035
  3. data/.travis.yml +1 -1
  4. data/Gemfile.lock +39 -38
  5. data/README.md +1 -1
  6. data/lib/maltese/sitemap.rb +17 -33
  7. data/lib/maltese/version.rb +1 -1
  8. data/spec/cli_spec.rb +4 -17
  9. data/spec/fixtures/sitemap.json +69666 -7571
  10. data/spec/fixtures/sitemap_nil.json +8 -8
  11. data/spec/fixtures/vcr_cassettes/Maltese_CLI/sitemap/should_succeed.yml +37 -73
  12. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_works_returned_by_the_Datacite_REST_API.yml +49 -0
  13. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_works.yml +24 -19
  14. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_no_works_returned_by_the_Datacite_REST_API.yml +59 -0
  15. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_works_returned_by_the_Datacite_REST_API.yml +59 -0
  16. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_REST_API.yml +105 -0
  17. data/spec/sitemap_spec.rb +28 -52
  18. data/spec/spec_helper.rb +0 -1
  19. metadata +6 -10
  20. data/spec/fixtures/vcr_cassettes/Maltese_CLI/sitemap/should_succeed_with_no_works.yml +0 -44
  21. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +0 -44
  22. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +0 -44
  23. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/get_total/with_no_works.yml +0 -44
  24. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +0 -59
  25. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +0 -59
  26. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Solr_API.yml +0 -44
  27. data/spec/fixtures/vcr_cassettes/Maltese_Sitemap/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Solr_API.yml +0 -141
data/.travis.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.3.3
3
+ - 2.4.4
4
4
 
5
5
  addons:
6
6
  code_climate:
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- maltese (0.2.4)
4
+ maltese (0.8)
5
5
  activesupport (>= 4.2.5, < 6)
6
6
  aws-sdk-s3 (~> 1.19)
7
7
  dotenv (~> 2.1, >= 2.1.1)
@@ -13,50 +13,51 @@ PATH
13
13
  GEM
14
14
  remote: https://rubygems.org/
15
15
  specs:
16
- activesupport (5.2.1)
16
+ activesupport (5.2.3)
17
17
  concurrent-ruby (~> 1.0, >= 1.0.2)
18
18
  i18n (>= 0.7, < 2)
19
19
  minitest (~> 5.1)
20
20
  tzinfo (~> 1.1)
21
- addressable (2.5.2)
21
+ addressable (2.6.0)
22
22
  public_suffix (>= 2.0.2, < 4.0)
23
- aws-eventstream (1.0.1)
24
- aws-partitions (1.103.0)
25
- aws-sdk-core (3.27.0)
26
- aws-eventstream (~> 1.0)
23
+ aws-eventstream (1.0.3)
24
+ aws-partitions (1.170.0)
25
+ aws-sdk-core (3.54.1)
26
+ aws-eventstream (~> 1.0, >= 1.0.2)
27
27
  aws-partitions (~> 1.0)
28
- aws-sigv4 (~> 1.0)
28
+ aws-sigv4 (~> 1.1)
29
29
  jmespath (~> 1.0)
30
- aws-sdk-kms (1.9.0)
31
- aws-sdk-core (~> 3, >= 3.26.0)
32
- aws-sigv4 (~> 1.0)
33
- aws-sdk-s3 (1.19.0)
34
- aws-sdk-core (~> 3, >= 3.26.0)
30
+ aws-sdk-kms (1.21.0)
31
+ aws-sdk-core (~> 3, >= 3.53.0)
32
+ aws-sigv4 (~> 1.1)
33
+ aws-sdk-s3 (1.41.0)
34
+ aws-sdk-core (~> 3, >= 3.53.0)
35
35
  aws-sdk-kms (~> 1)
36
- aws-sigv4 (~> 1.0)
37
- aws-sigv4 (1.0.3)
36
+ aws-sigv4 (~> 1.1)
37
+ aws-sigv4 (1.1.0)
38
+ aws-eventstream (~> 1.0, >= 1.0.2)
38
39
  builder (3.2.3)
39
- codeclimate-test-reporter (1.0.8)
40
+ codeclimate-test-reporter (1.0.9)
40
41
  simplecov (<= 0.13)
41
- concurrent-ruby (1.0.5)
42
+ concurrent-ruby (1.1.5)
42
43
  crack (0.4.3)
43
44
  safe_yaml (~> 1.0.0)
44
45
  diff-lcs (1.3)
45
46
  docile (1.1.5)
46
- dotenv (2.5.0)
47
- excon (0.62.0)
48
- faraday (0.15.2)
47
+ dotenv (2.7.2)
48
+ excon (0.64.0)
49
+ faraday (0.15.4)
49
50
  multipart-post (>= 1.2, < 3)
50
- faraday-encoding (0.0.4)
51
+ faraday-encoding (0.0.5)
51
52
  faraday
52
53
  faraday_middleware (0.12.2)
53
54
  faraday (>= 0.7.4, < 1.0)
54
- hashdiff (0.3.7)
55
- i18n (1.1.0)
55
+ hashdiff (0.4.0)
56
+ i18n (1.6.0)
56
57
  concurrent-ruby (~> 1.0)
57
58
  jmespath (1.4.0)
58
- json (2.1.0)
59
- maremma (4.1.1)
59
+ json (2.2.0)
60
+ maremma (4.2.1)
60
61
  activesupport (>= 4.2.5, < 6)
61
62
  addressable (>= 2.3.6)
62
63
  builder (~> 3.2, >= 3.2.2)
@@ -69,46 +70,46 @@ GEM
69
70
  oj (>= 2.8.3)
70
71
  mime-types (3.2.2)
71
72
  mime-types-data (~> 3.2015)
72
- mime-types-data (3.2018.0812)
73
+ mime-types-data (3.2019.0331)
73
74
  mini_portile2 (2.3.0)
74
75
  minitest (5.11.3)
75
76
  multi_json (1.13.1)
76
- multipart-post (2.0.0)
77
- nokogiri (1.8.4)
77
+ multipart-post (2.1.1)
78
+ nokogiri (1.8.5)
78
79
  mini_portile2 (~> 2.3.0)
79
- oj (3.6.8)
80
- public_suffix (3.0.3)
81
- rack (2.0.5)
80
+ oj (3.7.12)
81
+ public_suffix (3.1.0)
82
+ rack (2.0.7)
82
83
  rack-test (0.8.3)
83
84
  rack (>= 1.0, < 3)
84
- rake (12.3.1)
85
+ rake (12.3.2)
85
86
  rspec (3.8.0)
86
87
  rspec-core (~> 3.8.0)
87
88
  rspec-expectations (~> 3.8.0)
88
89
  rspec-mocks (~> 3.8.0)
89
90
  rspec-core (3.8.0)
90
91
  rspec-support (~> 3.8.0)
91
- rspec-expectations (3.8.1)
92
+ rspec-expectations (3.8.3)
92
93
  diff-lcs (>= 1.2.0, < 2.0)
93
94
  rspec-support (~> 3.8.0)
94
95
  rspec-mocks (3.8.0)
95
96
  diff-lcs (>= 1.2.0, < 2.0)
96
97
  rspec-support (~> 3.8.0)
97
98
  rspec-support (3.8.0)
98
- safe_yaml (1.0.4)
99
+ safe_yaml (1.0.5)
99
100
  simplecov (0.13.0)
100
101
  docile (~> 1.1.0)
101
102
  json (>= 1.8, < 3)
102
103
  simplecov-html (~> 0.10.0)
103
104
  simplecov-html (0.10.2)
104
- sitemap_generator (6.0.1)
105
+ sitemap_generator (6.0.2)
105
106
  builder (~> 3.0)
106
- thor (0.20.0)
107
+ thor (0.20.3)
107
108
  thread_safe (0.3.6)
108
109
  tzinfo (1.2.5)
109
110
  thread_safe (~> 0.1)
110
111
  vcr (3.0.3)
111
- webmock (3.4.2)
112
+ webmock (3.5.1)
112
113
  addressable (>= 2.3.6)
113
114
  crack (>= 0.3.2)
114
115
  hashdiff
@@ -128,4 +129,4 @@ DEPENDENCIES
128
129
  webmock (~> 3.0, >= 3.0.1)
129
130
 
130
131
  BUNDLED WITH
131
- 1.16.1
132
+ 1.17.3
data/README.md CHANGED
@@ -9,7 +9,7 @@ Ruby gem and command-line tool for generating sitemap files from the DataCite RE
9
9
  Run as a command-line tool:
10
10
 
11
11
  ```
12
- maltese sitemap --from_date 2017-02-15
12
+ maltese sitemap
13
13
  ```
14
14
 
15
15
  ## Installation
data/lib/maltese/sitemap.rb CHANGED
@@ -21,8 +21,6 @@ module Maltese
21
21
  @sitemap_bucket = attributes[:sitemap_bucket].presence || "search.test.datacite.org"
22
22
  @from_date = attributes[:from_date].presence || (Time.now.to_date - 1.day).iso8601
23
23
  @until_date = attributes[:until_date].presence || Time.now.to_date.iso8601
24
- @solr_username = ENV['SOLR_USERNAME']
25
- @solr_password = ENV['SOLR_PASSWORD']
26
24
  end
27
25
 
28
26
  def sitemap_url
@@ -34,7 +32,7 @@ module Maltese
34
32
  end
35
33
 
36
34
  def search_path
37
- ENV['RACK_ENV'] == "production" ? "https://solr.datacite.org/api?" : "https://solr.test.datacite.org/api?"
35
+ ENV['RACK_ENV'] == "production" ? "https://api.datacite.org/dois?" : "https://api.test.datacite.org/dois?"
38
36
  end
39
37
 
40
38
  def timeout
@@ -42,7 +40,7 @@ module Maltese
42
40
  end
43
41
 
44
42
  def job_batch_size
45
- 50000
43
+ 1000
46
44
  end
47
45
 
48
46
  def sitemap
@@ -67,7 +65,7 @@ module Maltese
67
65
  if total > 0
68
66
  puts process_data(options.merge(total: total))
69
67
  else
70
- puts "No works found for date range #{from_date} - #{until_date}."
68
+ puts "No works found."
71
69
  end
72
70
 
73
71
  # return number of works queued
@@ -75,28 +73,20 @@ module Maltese
75
73
  end
76
74
 
77
75
  def get_total(options={})
78
- query_url = get_query_url(options.merge(rows: 0))
79
- # Add basic auth options in
80
- options = options.merge(username: @solr_username, password: @solr_password)
76
+ query_url = get_query_url(options.merge(size: 0))
81
77
 
82
78
  result = Maremma.get(query_url, options)
83
- result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
79
+ result.body.dig("meta", "total")
84
80
  end
85
81
 
86
82
  def get_query_url(options={})
87
- options[:offset] = options[:offset].to_i || 0
88
- options[:rows] = options[:rows].presence || job_batch_size
89
-
90
- updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
91
- fq = "#{updated} AND has_metadata:true AND is_active:true"
92
-
93
- params = { q: "*:*",
94
- fq: fq,
95
- start: options[:offset],
96
- rows: options[:rows],
97
- fl: "doi,updated",
98
- sort: "updated asc",
99
- wt: "json"}
83
+ options[:cursor] = options[:cursor] || 1
84
+ options[:size] = options[:size] || job_batch_size
85
+
86
+ params = {
87
+ "page[cursor]": options[:cursor],
88
+ "page[size]": options[:size],
89
+ }
100
90
  search_path + URI.encode_www_form(params)
101
91
  end
102
92
 
@@ -104,11 +94,9 @@ module Maltese
104
94
  options[:start_time] = Time.now
105
95
 
106
96
  # walk through paginated results
107
- total_pages = (options[:total].to_f / job_batch_size).ceil
108
-
109
- (0...total_pages).each do |page|
110
- options[:offset] = page * job_batch_size
97
+ while options[:cursor] do
111
98
  data = get_data(options.merge(timeout: timeout))
99
+ options[:cursor] = data.dig("links", "next")
112
100
  parse_data(data)
113
101
  end
114
102
 
@@ -118,19 +106,15 @@ module Maltese
118
106
  def get_data(options={})
119
107
  query_url = get_query_url(options)
120
108
 
121
- # Add basic auth options in
122
- options = options.merge(username: @solr_username, password: @solr_password)
123
-
124
109
  Maremma.get(query_url, options)
125
110
  end
126
111
 
127
112
  def parse_data(result)
128
113
  return result.body.fetch("errors") if result.body.fetch("errors", nil).present?
129
114
 
130
- items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
131
- Array(items).each do |item|
132
- loc = "/works/" + item.fetch("doi")
133
- sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
115
+ result.body.fetch("data", []).each do |item|
116
+ loc = "/works/" + item.dig("attributes", "doi")
117
+ sitemap.add loc, changefreq: "monthly", lastmod: item.dig("attrributes", "updated")
134
118
  end
135
119
  sitemap.sitemap.link_count
136
120
  end
data/lib/maltese/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Maltese
2
- VERSION = "0.2.4"
2
+ VERSION = "0.8"
3
3
  end
data/spec/cli_spec.rb CHANGED
@@ -6,30 +6,17 @@ describe Maltese::CLI do
6
6
  described_class.new
7
7
  end
8
8
 
9
- let(:from_date) { "2018-03-15" }
10
- let(:until_date) { "2018-04-08" }
11
9
  let(:sitemap_bucket) { "search.test.datacite.org" }
12
- let(:cli_options) { { sitemap_bucket: sitemap_bucket,
13
- from_date: from_date,
14
- until_date: until_date } }
10
+ let(:cli_options) { { sitemap_bucket: sitemap_bucket } }
15
11
 
16
12
  describe "sitemap", vcr: true, :order => :defined do
17
13
  it 'should succeed' do
18
14
  subject.options = cli_options
19
- expect { subject.sitemap }.to output(/266 links/).to_stdout
15
+ expect { subject.sitemap }.to output(/1 links/).to_stdout
20
16
  sitemap = Zlib::GzipReader.open("public/sitemaps/sitemap.xml.gz") { |gz| gz.read }
21
17
  doc = Nokogiri::XML(sitemap)
22
- expect(doc.xpath("//xmlns:url").size).to eq(266)
23
- expect(doc.xpath("//xmlns:loc").last.text).to eq("https://search.test.datacite.org/works/10.0133/37522")
24
- end
25
-
26
- it 'should succeed with no works' do
27
- from_date = "2005-04-07"
28
- until_date = "2005-04-08"
29
- subject.options = { sitemap_bucket: sitemap_bucket,
30
- from_date: from_date,
31
- until_date: until_date }
32
- expect { subject.sitemap }.to output("No works found for date range 2005-04-07 - 2005-04-08.\n").to_stdout
18
+ expect(doc.xpath("//xmlns:url").size).to eq(1)
19
+ expect(doc.xpath("//xmlns:loc").last.text).to eq("https://search.test.datacite.org/")
33
20
  end
34
21
  end
35
22
  end