puree 0.20.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -2
- data/PITCHME.md +43 -21
- data/README.md +72 -18
- data/lib/puree.rb +66 -21
- data/lib/puree/api/api.rb +9 -0
- data/lib/puree/api/authentication.rb +33 -0
- data/lib/puree/api/configuration.rb +43 -0
- data/lib/puree/api/map.rb +76 -0
- data/lib/puree/api/request.rb +116 -0
- data/lib/puree/extractor/collection.rb +131 -0
- data/lib/puree/extractor/dataset.rb +48 -0
- data/lib/puree/extractor/download.rb +71 -0
- data/lib/puree/extractor/event.rb +33 -0
- data/lib/puree/extractor/extractor.rb +10 -0
- data/lib/puree/extractor/journal.rb +29 -0
- data/lib/puree/extractor/organisation.rb +34 -0
- data/lib/puree/extractor/person.rb +32 -0
- data/lib/puree/extractor/project.rb +40 -0
- data/lib/puree/extractor/publication.rb +40 -0
- data/lib/puree/extractor/publisher.rb +27 -0
- data/lib/puree/extractor/resource.rb +69 -0
- data/lib/puree/extractor/server.rb +56 -0
- data/lib/puree/model/address.rb +50 -0
- data/lib/puree/model/copyright_license.rb +26 -0
- data/lib/puree/model/dataset.rb +84 -0
- data/lib/puree/model/download_header.rb +21 -0
- data/lib/puree/model/endeavour_person.rb +34 -0
- data/lib/puree/model/event.rb +31 -0
- data/lib/puree/model/event_header.rb +26 -0
- data/lib/puree/model/file.rb +45 -0
- data/lib/puree/model/helper/validation.rb +15 -0
- data/lib/puree/model/journal.rb +20 -0
- data/lib/puree/model/legal_condition.rb +26 -0
- data/lib/puree/model/link.rb +26 -0
- data/lib/puree/model/model.rb +7 -0
- data/lib/puree/model/organisation.rb +34 -0
- data/lib/puree/model/organisation_header.rb +34 -0
- data/lib/puree/model/person.rb +28 -0
- data/lib/puree/model/person_name.rb +52 -0
- data/lib/puree/model/project.rb +49 -0
- data/lib/puree/model/publication.rb +53 -0
- data/lib/puree/model/publication_status.rb +21 -0
- data/lib/puree/model/publisher.rb +13 -0
- data/lib/puree/model/related_content_header.rb +34 -0
- data/lib/puree/model/resource.rb +42 -0
- data/lib/puree/model/server.rb +13 -0
- data/lib/puree/model/spatial_point.rb +16 -0
- data/lib/puree/model/structure.rb +18 -0
- data/lib/puree/model/temporal_range.rb +15 -0
- data/lib/puree/util/date.rb +86 -0
- data/lib/puree/util/util.rb +8 -0
- data/lib/puree/version.rb +1 -1
- data/lib/puree/xml_extractor/base.rb +47 -0
- data/lib/puree/xml_extractor/collection.rb +40 -0
- data/lib/puree/xml_extractor/dataset.rb +305 -0
- data/lib/puree/xml_extractor/download.rb +42 -0
- data/lib/puree/xml_extractor/event.rb +63 -0
- data/lib/puree/xml_extractor/journal.rb +33 -0
- data/lib/puree/xml_extractor/organisation.rb +75 -0
- data/lib/puree/xml_extractor/person.rb +57 -0
- data/lib/puree/xml_extractor/project.rb +135 -0
- data/lib/puree/xml_extractor/publication.rb +189 -0
- data/lib/puree/xml_extractor/publisher.rb +28 -0
- data/lib/puree/xml_extractor/resource.rb +71 -0
- data/lib/puree/xml_extractor/server.rb +32 -0
- data/lib/puree/xml_extractor/shared.rb +31 -0
- data/lib/puree/xml_extractor/xml_extractor.rb +10 -0
- data/puree.gemspec +11 -8
- data/spec/download_http_spec.rb +31 -0
- data/spec/open_api_dataset_http_spec.rb +15 -0
- data/spec/resource/collection_all_http_spec.rb +77 -0
- data/spec/resource/collection_http_spec.rb +65 -0
- data/spec/resource/dataset_http_spec.rb +104 -0
- data/spec/resource/event_http_spec.rb +52 -0
- data/spec/resource/journal_http_spec.rb +36 -0
- data/spec/resource/organisation_http_spec.rb +52 -0
- data/spec/resource/person_http_spec.rb +48 -0
- data/spec/resource/project_http_spec.rb +76 -0
- data/spec/resource/publication_http_spec.rb +78 -0
- data/spec/resource/publisher_http_spec.rb +26 -0
- data/spec/server_http_spec.rb +26 -0
- data/spec/spec_helper.rb +106 -21
- metadata +110 -46
- data/lib/puree/collection.rb +0 -285
- data/lib/puree/configuration.rb +0 -15
- data/lib/puree/dataset.rb +0 -483
- data/lib/puree/date.rb +0 -63
- data/lib/puree/download.rb +0 -189
- data/lib/puree/event.rb +0 -133
- data/lib/puree/journal.rb +0 -75
- data/lib/puree/map.rb +0 -68
- data/lib/puree/organisation.rb +0 -177
- data/lib/puree/person.rb +0 -136
- data/lib/puree/project.rb +0 -231
- data/lib/puree/publication.rb +0 -258
- data/lib/puree/publisher.rb +0 -64
- data/lib/puree/resource.rb +0 -261
- data/lib/puree/server.rb +0 -156
- data/spec/collection_spec.rb +0 -62
- data/spec/dataset_spec.rb +0 -148
- data/spec/download_spec.rb +0 -33
- data/spec/event_spec.rb +0 -108
- data/spec/journal_spec.rb +0 -92
- data/spec/organisation_spec.rb +0 -112
- data/spec/person_spec.rb +0 -104
- data/spec/project_spec.rb +0 -120
- data/spec/publication_spec.rb +0 -128
- data/spec/publisher_spec.rb +0 -89
- data/spec/server_spec.rb +0 -36
@@ -0,0 +1,28 @@
|
|
1
|
+
module Puree
|
2
|
+
|
3
|
+
module XMLExtractor
|
4
|
+
|
5
|
+
# Publisher XML extractor.
|
6
|
+
#
|
7
|
+
class Publisher < Puree::XMLExtractor::Resource

  # @param xml [String] XML passed straight through to the parent initializer
  def initialize(xml:)
    super(xml: xml)
    # NOTE(review): Server assigns @resource_type *before* calling super;
    # confirm the parent initializer does not need it set earlier here.
    @resource_type = :publisher
  end

  # Publisher name, read from the '/name' element of the content.
  #
  # @return [String, nil]
  def name
    xpath_query_for_single_value('/name')
  end

  # A #type reader is intentionally not provided: per the original author it
  # adds no value because the value is always "Publisher". The query it would
  # have used is kept here for reference:
  #   '/typeClassification/term/localizedString'

end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Puree
|
2
|
+
|
3
|
+
module XMLExtractor
|
4
|
+
|
5
|
+
# Resource XML extractor.
|
6
|
+
#
|
7
|
+
class Resource < Puree::XMLExtractor::Base

  # @param xml [String] XML handed unchanged to the parent initializer
  def initialize(xml:)
    super
  end

  # Content based query: the given path is appended to the content base path
  # of the service response before being evaluated against @doc.
  #
  # @param path [String] XPath fragment, e.g. '/name'
  # @return the result of @doc.xpath (presumably a Nokogiri::XML::NodeSet)
  def xpath_query(path)
    @doc.xpath service_xpath(path)
  end

  # Is there any data after get? For a response that provides a count of the results.
  # True only when the stripped text of the count element is exactly '1'.
  #
  # @return [Boolean]
  def get_data?
    @doc.xpath(service_xpath_count).text.strip == '1'
  end

  # Creation timestamp, read from '/created'.
  # A missing or empty value yields nil rather than letting Time.parse raise.
  #
  # @return [Time, nil]
  def created
    value = xpath_query_for_single_value '/created'
    Time.parse(value) if value && !value.empty?
  end

  # Modification timestamp, read from '/modified'.
  # A missing or empty value yields nil rather than letting Time.parse raise.
  #
  # @return [Time, nil]
  def modified
    value = xpath_query_for_single_value '/modified'
    Time.parse(value) if value && !value.empty?
  end

  # @return [String, nil]
  def uuid
    xpath_query_for_single_value '/@uuid'
  end

  # Locale (e.g. en-GB). Underscores in the source attribute are converted to hyphens.
  #
  # @return [String, nil]
  def locale
    str = xpath_query_for_single_value '/@locale'
    str.tr('_', '-') if str
  end

  private

  # Response element name for the current @resource_type, looked up in @api_map.
  def service_response_name
    @api_map[:resource_type][@resource_type][:response]
  end

  # Path to the content element of the first-level result.
  def service_xpath_base
    service_response_name + '/result/content'
  end

  # Path to the element holding the number of results.
  def service_xpath_count
    service_response_name + '/count'
  end

  # Prefixes a content-relative path fragment with the content base path.
  def service_xpath(str_to_find)
    service_xpath_base + str_to_find
  end

end
|
68
|
+
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Puree
|
2
|
+
|
3
|
+
module XMLExtractor
|
4
|
+
|
5
|
+
# Server XML extractor.
|
6
|
+
#
|
7
|
+
class Server < Puree::XMLExtractor::Base

  # @param xml [String] XML handed unchanged to the parent initializer
  def initialize(xml:)
    # Assigned before super, matching the original statement order.
    @resource_type = :server
    super
  end

  # Stripped text of the baseVersion element of the service response.
  # An absent element yields an empty string, never nil.
  #
  # @return [String]
  def version
    path = "#{service_response_name}/baseVersion"
    @doc.xpath(path).text.strip
  end

  # Is there any data after get?
  #
  # @return [Boolean]
  def get_data?
    # n.b. arbitrary element existence check
    !version.empty?
  end

end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Puree
|
2
|
+
|
3
|
+
module XMLExtractor
|
4
|
+
|
5
|
+
# Shared XML extractor.
|
6
|
+
#
|
7
|
+
module Shared

  # Builds an organisation header from one organisation XML element.
  # Each field is the stripped text of a relative XPath query, so a missing
  # node produces an empty string rather than nil.
  #
  # @param nokogiri_xml_element [#xpath] element to read (expected to be a Nokogiri XML element)
  # @return [Puree::Model::OrganisationHeader]
  def self.organisation_header(nokogiri_xml_element)
    h = Puree::Model::OrganisationHeader.new
    h.uuid = nokogiri_xml_element.xpath('@uuid').text.strip
    h.name = nokogiri_xml_element.xpath('name/localizedString').text.strip
    h.type = nokogiri_xml_element.xpath('typeClassification/term/localizedString').text.strip
    h
  end

  # Builds one header per element, keeping only the first header seen for
  # each uuid (input order is preserved).
  #
  # @param nokogiri_xml_nodeset [Enumerable<#xpath>] elements to read (expected to be a Nokogiri node set)
  # @return [Array<Puree::Model::OrganisationHeader>]
  def self.organisation_multi_header(nokogiri_xml_nodeset)
    nokogiri_xml_nodeset.map { |element| organisation_header(element) }.uniq(&:uuid)
  end

end
|
28
|
+
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
data/puree.gemspec
CHANGED
@@ -4,21 +4,24 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'puree/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
7
|
+
spec.name = 'puree'
|
8
8
|
spec.version = Puree::VERSION
|
9
|
-
spec.authors =
|
10
|
-
spec.email =
|
11
|
-
spec.summary = %q{
|
12
|
-
spec.description = %q{
|
13
|
-
|
14
|
-
spec.
|
9
|
+
spec.authors = 'Adrian Albin-Clark'
|
10
|
+
spec.email = 'a.albin-clark@lancaster.ac.uk'
|
11
|
+
spec.summary = %q{Metadata extraction from the Pure Research Information System.}
|
12
|
+
spec.description = %q{Fetches metadata from the Pure Research Information System and
|
13
|
+
extracts it into Ruby data models.}
|
14
|
+
spec.homepage = 'https://github.com/lulibrary/puree'
|
15
|
+
spec.license = 'MIT'
|
15
16
|
spec.files = `git ls-files -z`.split("\x0")
|
16
17
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
-
spec.require_paths = [
|
19
|
+
spec.require_paths = ['lib']
|
19
20
|
|
20
21
|
spec.required_ruby_version = '~> 2.1'
|
21
22
|
|
22
23
|
spec.add_runtime_dependency 'http', '~> 2.0'
|
23
24
|
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
25
|
+
|
26
|
+
spec.add_development_dependency 'rspec'
|
24
27
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises Puree::Extractor::Download. `config` is not defined in this file
# (presumably supplied by spec_helper).
describe 'Download' do

  it '#new' do
    extractor = Puree::Extractor::Download.new config
    expect(extractor).to be_a Puree::Extractor::Download
  end

  describe 'data retrieval' do
    # Fetch once and share the result between the examples below.
    before(:all) do
      @p = Puree::Extractor::Download.new config
      @metadata = @p.find resource: :dataset,
                          limit: 10
    end

    it '#find returns download headers' do
      expect(@metadata).to all( be_a Puree::Model::DownloadHeader )
    end

    # `all` passes vacuously on an empty collection, so emptiness is checked separately.
    it '#find returns at least one result' do
      expect(@metadata).not_to be_empty
    end

  end

end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Builds the collection extractor for the current @resource_type and stores
# it in @p. `config` is not defined in this file (presumably supplied by
# spec_helper).
def new
  @p = Puree::Extractor::Collection.new(resource: @resource_type, config: config)
end
|
7
|
+
|
8
|
+
# Entry point used by every example: remembers the resource type, builds the
# extractor for it, then walks the whole collection.
#
# @param resource_type [Symbol] e.g. :dataset
def go(resource_type)
  @resource_type = resource_type
  new
  fetch
end
|
13
|
+
|
14
|
+
# Retrieves every resource in the collection one at a time (limit 1, moving
# offset) and expects each to be an instance of the matching model class.
# Progress is printed, then the loop pauses for a second and clears the
# terminal before the next request.
def fetch
  total = @p.count
  (0..total - 1).each do |offset|
    found = @p.find(limit: 1, offset: offset)
    expect(found[0]).to be_a(resource_class)
    puts "#{offset + 1} of #{total} #{@resource_type}s"
    sleep 1
    system 'clear'
  end
end
|
25
|
+
|
26
|
+
# Resolves the model class for the current @resource_type, e.g.
# :dataset => Puree::Model::Dataset. Snake case names are converted to
# CamelCase, so :download_header => Puree::Model::DownloadHeader.
#
# @return [Class]
# @raise [NameError] if no such constant exists
def resource_class
  class_name = @resource_type.to_s.split('_').map(&:capitalize).join
  Object.const_get("Puree::Model::#{class_name}")
end
|
30
|
+
|
31
|
+
# One example group per resource type, each walking the entire collection
# through `go`. Group names and the example description are the same as when
# the groups were written out individually.
%i[dataset event journal organisation person project publication publisher].each do |resource_type|
  describe resource_type.to_s do
    it 'get all, one at a time' do
      go resource_type
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises Puree::Extractor::Collection against the dataset resource.
# `config` is not defined in this file (presumably supplied by spec_helper).
describe 'Collection of datasets' do

  # Builds the extractor under test and stores it in @p.
  def new
    @p = Puree::Extractor::Collection.new resource: :dataset,
                                          config: config
  end

  def setup
    new
  end

  it '#new' do
    new
    expect(@p).to be_a(Puree::Extractor::Collection)
  end

  describe 'data retrieval' do
    before(:all) do
      setup
      @metadata = @p.find limit: 5
    end

    it 'collection' do
      expect(@metadata).to be_a(Array)
    end

  end

  describe 'data retrieval instance' do
    before(:all) do
      setup
      @metadata = @p.find limit: 5
    end

    it 'collection' do
      expect(@metadata).to be_a(Array)
    end

    it 'collection item' do
      expect(@metadata).to all( be_a Puree::Model::Dataset )
    end

  end

  describe 'data retrieval count' do
    before(:all) do
      setup
      @p.find limit: 0
    end

    it '#count' do
      # Integer rather than Fixnum: Fixnum is deprecated since Ruby 2.4 and
      # no longer defined on current Rubies; Integer matches on every version.
      expect(@p.count).to be_a(Integer)
    end

  end

  it '#random_resource' do
    setup
    metadata = @p.random_resource
    expect(metadata).to be_a Puree::Model::Dataset
  end

end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises Puree::Extractor::Dataset and the Puree::Model::Dataset it yields.
# `config`, `request` and `resource_header` are not defined in this file
# (presumably supplied by spec_helper).
describe 'Dataset' do

  it '#new' do
    extractor = Puree::Extractor::Dataset.new config
    expect(extractor).to be_a Puree::Extractor::Dataset
  end

  before(:all) do
    request :dataset
  end

  describe 'data retrieval' do

    resource_header

    it 'data structure' do
      expect(@p).to be_a Puree::Model::Dataset
    end

    # One row per attribute, in the order the examples run.
    #   :optional      - checked against the class only when the value is present
    #   :each          - every element must be an instance of the class
    #   :optional_each - as :each, but only when the value is present
    attribute_checks = [
      [:access,           :optional,      String],
      [:associated,       :each,          Puree::Model::RelatedContentHeader],
      [:available,        :optional,      Time],
      [:description,      :optional,      String],
      [:doi,              :optional,      String],
      [:files,            :each,          Puree::Model::File],
      [:keywords,         :each,          String],
      [:legal_conditions, :each,          Puree::Model::LegalCondition],
      [:links,            :each,          Puree::Model::Link],
      [:persons_internal, :each,          Puree::Model::EndeavourPerson],
      [:persons_external, :each,          Puree::Model::EndeavourPerson],
      [:persons_other,    :each,          Puree::Model::EndeavourPerson],
      [:production,       :optional,      Puree::Model::TemporalRange],
      [:projects,         :each,          Puree::Model::RelatedContentHeader],
      [:publications,     :each,          Puree::Model::RelatedContentHeader],
      [:publisher,        :optional,      String],
      [:spatial_places,   :optional_each, String],
      [:spatial_point,    :optional,      Puree::Model::SpatialPoint],
      [:temporal,         :optional,      Puree::Model::TemporalRange],
      [:title,            :optional,      String]
    ]

    attribute_checks.each do |attribute, shape, klass|
      it "##{attribute}" do
        case shape
        when :each
          expect(@p.public_send(attribute)).to all( be_a klass )
        when :optional
          expect(@p.public_send(attribute)).to be_a klass if @p.public_send(attribute)
        when :optional_each
          expect(@p.public_send(attribute)).to all( be_a klass ) if @p.public_send(attribute)
        end
      end
    end

  end

end
|