pure-extractor 1.1.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b238e51c6777d6dd560f62b11c44e18251042918
4
- data.tar.gz: 70feca57d15b4d1906010215247bd305a43d56b2
3
+ metadata.gz: c13f77c8908044946b4d7939c45e1b925e43a43d
4
+ data.tar.gz: bb730f66444a12fe629e395f2fffe970447d910d
5
5
  SHA512:
6
- metadata.gz: e0351116e8b178eee808f470751460f59056e6d22f1fc3d7ecc13a85a399460102267e1f685d3d3e348ab72d3dfb1fc7a1aa41b5544974f024647a11651d4669
7
- data.tar.gz: 2b63cd36a9e500d843382a14520e48eeaeb4b2fa18ae7e1a46fd268a63d3190423522ffd8986c47ce0c3337f3e15c5377ab2e1fb9cc6792e6fb5f4dcbd1ac1d7
6
+ metadata.gz: d80c1ae5553e9c044a5aa88b9ae61bbb68e8d7c4fba72c46ff08823a86bdeee6e1c45c0ebeb28096ad1d4b5cc91dfaeb3f3d9c112ec915ee9b5db850b8d34625
7
+ data.tar.gz: 69333eb907dffb2e51633907dc350646e1417ced8b2ddf93ef2425d592b01229ee16308ff36cdfac194baeba6683523608b5699ae4750324b933d01de0ddf042
data/.gitignore CHANGED
@@ -9,3 +9,4 @@
9
9
  /tmp/
10
10
  *.gem
11
11
  .rbenv-gemsets
12
+ .byebug_history
data/README.md CHANGED
@@ -44,6 +44,8 @@ pure-extractor -s SERVER_URL -u USERNAME -p PASSWORD -o OUTPUT_DIRECTORY -c CHUN
44
44
  | -u, --username | Username for the Pure WS Rest service, not required if Pure WS requests are unauthenticated |
45
45
  | -p, --password | Password for the Pure WS Rest service, not required if Pure WS requests are unauthenticated |
46
46
  | -c, --chunk-size | The number of entries to return per chunk and store per file, defaults to 200 if not set |
47
+ | -d, --request-delay | Flag to add a random delay between sending API requests so as not to overload the server, defaults to **false** |
48
+ | -i, --interactive | Run in interactive mode, used when not running in Docker, defaults to **false** |
47
49
 
48
50
  ## Development
49
51
 
@@ -1,141 +1,12 @@
1
1
  require "pure/extractor/version"
2
- require "pure/extractor/configure_puree"
3
2
  require "pure/extractor/commands/pure_extractor"
4
3
  require 'ruby-progressbar'
4
+ require 'pure/extractor/formatters'
5
+ require 'pure/extractor/extractors'
6
+ require 'puree'
5
7
 
6
8
  module Pure
7
9
  module Extractor
8
10
 
9
- def self.extract type, chunk_size, output_directory
10
-
11
- collection = Puree::Collection.new resource: type
12
-
13
- collection_count = collection.count
14
-
15
- puts collection_count
16
-
17
- progress_bar = ProgressBar.create(format: "%a %e %b\u{15E7}%i %p%% %t", progress_mark: ' ', remainder_mark: "\u{FF65}", total: collection_count)
18
-
19
- offset = 0
20
- file_id = 0
21
-
22
- if chunk_size.nil? || chunk_size.empty?
23
- chunk_size = 200
24
- end
25
-
26
- chunk_size = chunk_size.to_i
27
-
28
- while offset < collection_count do
29
-
30
- file_id += 1
31
-
32
- filename = type.to_s + "_#{file_id.to_s.rjust(6, '0')}"
33
-
34
- output_file = output_directory + "/#{filename}.json"
35
-
36
- returned_collection = collection.find limit: chunk_size, offset: offset
37
-
38
- returned_collection.each do |item|
39
-
40
- delete_keys_for_type type, item
41
-
42
- end
43
-
44
- formatted_results = format_results_for_type type, returned_collection
45
-
46
- write_results_to_file formatted_results, output_file
47
-
48
- update_progress_bar progress_bar, chunk_size, collection_count
49
-
50
- offset += chunk_size
51
-
52
- end
53
-
54
- end
55
-
56
- def self.format_results_for_type type, results
57
-
58
- formatted_results = []
59
-
60
- case type
61
-
62
- when :organisation
63
-
64
- results.each do |result|
65
-
66
- formatted_result = {
67
- system: {
68
- uuid: result["uuid"],
69
- modified_at: result["modified"]
70
- },
71
- details: {
72
- name: result["name"],
73
- description: nil,
74
- url: result["url"][0],
75
- isni: nil,
76
- type: result["type"]
77
- },
78
- parent: {
79
- uuid: result["parent"]["uuid"]
80
- }
81
- }
82
-
83
- formatted_results.push formatted_result
84
-
85
- end
86
-
87
- else
88
- formatted_results = results
89
-
90
- end
91
-
92
- formatted_results
93
-
94
- end
95
-
96
- def self.delete_keys_for_type type, item
97
-
98
- keys = []
99
- nested_keys = {}
100
-
101
- case type
102
-
103
- when :dataset
104
-
105
- keys = ["keyword", "file", "associated", "link", "spatial"]
106
- nested_keys = { "person" => ["external", "other"] }
107
-
108
- end
109
-
110
- keys.each do |key|
111
- item.delete(key)
112
- end
113
-
114
- nested_keys.each do |key, attribute|
115
- item[key].delete(attribute)
116
- end
117
-
118
- item
119
-
120
- end
121
-
122
- def self.update_progress_bar progress_bar, limit, collection_count
123
-
124
- if (progress_bar.progress + limit) < collection_count
125
- progress_bar.progress += limit
126
- else
127
- progress_bar.progress = collection_count
128
- end
129
-
130
- end
131
-
132
- def self.write_results_to_file results, file
133
-
134
- File.open(file, "w") do |f|
135
- f.write(JSON.pretty_generate(results))
136
- end
137
-
138
- end
139
-
140
11
  end
141
12
  end
@@ -10,7 +10,9 @@ module Pure
10
10
  option ["-u", "--username"], "username", "Username to connect to Pure WS"
11
11
  option ["-p", "--password"], "password", "Password to connect to Pure WS"
12
12
  option ["-c", "--chunk-size"], "chunk-size", "Number of entities to extract per file, defaults to 200"
13
-
13
+ option ["-d", "--request-delay"], :flag, "Add random delay between sending API Requests, default to false", default: false
14
+ option ["-i", "--interactive"], :flag, "Run in interactive mode, displaying progress bar. This is the default mode", default: false
15
+
14
16
  end
15
17
  end
16
18
  end
@@ -5,9 +5,7 @@ module Pure
5
5
  module Extractor
6
6
  module Commands
7
7
  class PureExtractorCommand < PureCommand
8
-
9
- include Pure::Extractor::ConfigurePuree
10
-
8
+
11
9
  valid_extracts = [:organisation, :people, :projects, :publications, :datasets]
12
10
 
13
11
  parameter "EXTRACT", "what to extract from pure, valid options are #{valid_extracts.map{|v| v.to_s}}" do |s|
@@ -26,9 +24,29 @@ module Pure
26
24
 
27
25
  def execute
28
26
 
29
- configure_puree server, username, password
30
-
31
- Pure::Extractor.extract pure_collections[extract], chunk_size, output_dir
27
+ puree_config = {
28
+ url: server,
29
+ username: username,
30
+ password: password,
31
+ collection: pure_collections[extract],
32
+ chunk_size: chunk_size,
33
+ output_directory: output_dir,
34
+ delay: request_delay?
35
+ }
36
+
37
+ if interactive?
38
+
39
+ Pure::Extractor::Extractors::InteractiveExtractor.set_config puree_config
40
+ Pure::Extractor::Extractors::InteractiveExtractor.extract
41
+
42
+ else
43
+
44
+ Pure::Extractor::Extractors::LoggingExtractor.set_config puree_config
45
+ Pure::Extractor::Extractors::LoggingExtractor.extract
46
+
47
+ end
48
+
49
+
32
50
 
33
51
  end
34
52
 
@@ -0,0 +1,11 @@
1
+ require 'pure/extractor/extractors/extractor'
2
+ require 'pure/extractor/extractors/interactive'
3
+ require 'pure/extractor/extractors/logging'
4
+
5
+ module Pure
6
+ module Extractor
7
+ module Extractors
8
+
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,74 @@
1
+ module Pure
2
+ module Extractor
3
+ module Extractors
4
+
5
+ class Extractor
6
+
7
+ @config = {}
8
+
9
+ def self.set_config config
10
+ @config = config
11
+ end
12
+
13
+ def self.extract_collection_to_disk collection, chunk_size, offset, output_file
14
+
15
+ returned_collection = collection.find limit: chunk_size, offset: offset
16
+
17
+ formatted_results = format_results_for_type @config[:collection], returned_collection
18
+
19
+ write_results_to_file formatted_results, output_file
20
+
21
+ end
22
+
23
+ def self.random_delay
24
+
25
+ random_timeout = 60 + Random.rand(120)
26
+
27
+ sleep(random_timeout)
28
+
29
+ end
30
+
31
+ def self.format_results_for_type type, results
32
+
33
+ begin
34
+ formatter = Module.const_get('Pure::Extractor::Formatters::' + type.to_s.capitalize)
35
+ formatter.format_array results
36
+ rescue NameError
37
+ raise 'No formatter for specified area'
38
+ end
39
+
40
+ end
41
+
42
+ def self.write_results_to_file results, file
43
+
44
+ File.open(file, "w") do |f|
45
+ f.write(JSON.pretty_generate(results))
46
+ end
47
+
48
+ end
49
+
50
+ def self.filename_for_id id
51
+ @config[:collection].to_s + "_#{id.to_s.rjust(6, '0')}"
52
+ end
53
+
54
+ def self.output_filepath_for_filename filename
55
+ @config[:output_directory] + "/#{filename}.json"
56
+ end
57
+
58
+ def self.get_chunk_size
59
+
60
+ chunk_size = @config[:chunk_size]
61
+
62
+ if chunk_size.nil? || chunk_size.empty?
63
+ chunk_size = 200
64
+ end
65
+
66
+ chunk_size.to_i
67
+
68
+ end
69
+
70
+ end
71
+
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,60 @@
1
+ require 'pure/extractor/extractors/extractor'
2
+
3
+ module Pure
4
+ module Extractor
5
+ module Extractors
6
+
7
+ class InteractiveExtractor < Extractor
8
+
9
+ def self.extract
10
+
11
+ collection = Puree::Extractor::Collection.new config: @config, resource: @config[:collection]
12
+
13
+ collection_count = collection.count
14
+
15
+ puts "Extracting #{collection_count} records from #{@config[:collection]} collection"
16
+
17
+ progress_bar = ProgressBar.create(format: "%a %e %b\u{15E7}%i %p%% %t", progress_mark: ' ', remainder_mark: "\u{FF65}", total: collection_count)
18
+
19
+ offset = 0
20
+ file_id = 0
21
+
22
+ chunk_size = get_chunk_size
23
+
24
+ while offset < collection_count do
25
+
26
+ random_delay if (offset != 0) && @config[:delay]
27
+
28
+ file_id += 1
29
+
30
+ filename = filename_for_id file_id
31
+
32
+ output_file = output_filepath_for_filename filename
33
+
34
+ extract_collection_to_disk collection, chunk_size, offset, output_file
35
+
36
+ update_progress_bar progress_bar, chunk_size, collection_count
37
+
38
+ offset += chunk_size
39
+
40
+ end
41
+
42
+ puts "Finished extracting #{collection_count} records from #{@config[:collection]} collection"
43
+
44
+ end
45
+
46
+ def self.update_progress_bar progress_bar, limit, collection_count
47
+
48
+ if (progress_bar.progress + limit) < collection_count
49
+ progress_bar.progress += limit
50
+ else
51
+ progress_bar.progress = collection_count
52
+ end
53
+
54
+ end
55
+
56
+ end
57
+
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,68 @@
1
+ require 'pure/extractor/extractors/extractor'
2
+
3
+ module Pure
4
+ module Extractor
5
+ module Extractors
6
+
7
+ class LoggingExtractor < Extractor
8
+
9
+ def self.extract
10
+
11
+ collection = Puree::Extractor::Collection.new config: @config, resource: @config[:collection]
12
+
13
+ collection_count = collection.count
14
+
15
+ puts "Extracting #{collection_count} records from #{@config[:collection]} collection"
16
+
17
+ offset = 0
18
+ file_id = 0
19
+
20
+ chunk_size = get_chunk_size
21
+
22
+ number_of_files = get_number_of_files collection_count
23
+
24
+ while offset < collection_count do
25
+
26
+ random_delay if (offset != 0) && @config[:delay]
27
+
28
+ file_id += 1
29
+
30
+ if (offset + chunk_size) > collection_count
31
+ to_records = collection_count
32
+ else
33
+ to_records = offset + chunk_size
34
+ end
35
+
36
+ puts "Extracting records #{offset} - #{to_records} to file #{file_id} of #{number_of_files}"
37
+
38
+ filename = filename_for_id file_id
39
+
40
+ output_file = output_filepath_for_filename filename
41
+
42
+ extract_collection_to_disk collection, chunk_size, offset, output_file
43
+
44
+ puts "Extracted records to #{output_file}"
45
+
46
+ offset += chunk_size
47
+
48
+ end
49
+
50
+ puts "Finished extracting #{collection_count} records from #{@config[:collection]} collection"
51
+
52
+ end
53
+
54
+ def self.get_number_of_files collection_count
55
+
56
+ full_files, remaining_records = collection_count.divmod(get_chunk_size)
57
+
58
+ return full_files if remaining_records == 0
59
+
60
+ full_files + 1
61
+
62
+ end
63
+
64
+ end
65
+
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,11 @@
1
+ require 'pure/extractor/formatters/resource'
2
+ require 'pure/extractor/formatters/organisation'
3
+ require 'pure/extractor/formatters/person'
4
+
5
+ module Pure
6
+ module Extractor
7
+ module Formatters
8
+
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,47 @@
1
+ require 'pure/extractor/formatters/resource'
2
+
3
+ module Pure
4
+ module Extractor
5
+ module Formatters
6
+
7
+ class Organisation < Resource
8
+
9
+ def self.format unit
10
+
11
+ {
12
+ system: get_system(unit),
13
+ details: get_details(unit),
14
+ parent: {
15
+ uuid: get_parent_uuid(unit)
16
+ }
17
+ }
18
+
19
+ end
20
+
21
+ def self.get_details result
22
+
23
+ {
24
+ name: result.name,
25
+ description: nil,
26
+ url: result.urls.first,
27
+ isni: nil,
28
+ type: result.type
29
+ }
30
+
31
+ end
32
+
33
+ def self.get_parent_uuid result
34
+
35
+ if result.parent.nil?
36
+ nil
37
+ else
38
+ result.parent.uuid
39
+ end
40
+
41
+ end
42
+
43
+ end
44
+
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,28 @@
1
+ require 'pure/extractor/formatters/resource'
2
+
3
+ module Pure
4
+ module Extractor
5
+ module Formatters
6
+
7
+ class Person < Resource
8
+
9
+ def self.format unit
10
+
11
+ {
12
+ system: get_system(unit),
13
+ details: {
14
+ first_name: unit.name.first,
15
+ last_name: unit.name.last,
16
+ email: unit.email_addresses.first,
17
+ image_url: unit.image_urls.first,
18
+ orcid: unit.orcid
19
+ }
20
+ }
21
+
22
+ end
23
+
24
+ end
25
+
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,31 @@
1
+ module Pure
2
+ module Extractor
3
+ module Formatters
4
+
5
+ class Resource
6
+
7
+ def self.format_array array
8
+
9
+ results = []
10
+
11
+ array.each do |result|
12
+ results.push format(result)
13
+ end
14
+
15
+ results
16
+
17
+ end
18
+
19
+ def self.get_system result
20
+
21
+ {
22
+ uuid: result.uuid,
23
+ modified_at: result.modified
24
+ }
25
+
26
+ end
27
+
28
+ end
29
+ end
30
+ end
31
+ end
@@ -1,5 +1,5 @@
1
1
  module Pure
2
2
  module Extractor
3
- VERSION = "1.1.0"
3
+ VERSION = "2.0.0"
4
4
  end
5
5
  end
@@ -27,11 +27,12 @@ Gem::Specification.new do |spec|
27
27
  spec.require_paths = ["lib"]
28
28
 
29
29
  spec.add_dependency "clamp"
30
- spec.add_dependency "puree", "~> 0.19.1"
30
+ spec.add_dependency "puree", "~> 1.3.0"
31
31
  spec.add_dependency "ruby-progressbar"
32
32
 
33
33
  spec.add_dependency "bundler", "~> 1.12"
34
34
 
35
35
  spec.add_development_dependency "rake", "~> 10.0"
36
36
  spec.add_development_dependency "minitest"
37
+ spec.add_development_dependency "byebug"
37
38
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pure-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stephen Robinson
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-05-23 00:00:00.000000000 Z
12
+ date: 2017-06-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: clamp
@@ -31,14 +31,14 @@ dependencies:
31
31
  requirements:
32
32
  - - "~>"
33
33
  - !ruby/object:Gem::Version
34
- version: 0.19.1
34
+ version: 1.3.0
35
35
  type: :runtime
36
36
  prerelease: false
37
37
  version_requirements: !ruby/object:Gem::Requirement
38
38
  requirements:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
- version: 0.19.1
41
+ version: 1.3.0
42
42
  - !ruby/object:Gem::Dependency
43
43
  name: ruby-progressbar
44
44
  requirement: !ruby/object:Gem::Requirement
@@ -95,6 +95,20 @@ dependencies:
95
95
  - - ">="
96
96
  - !ruby/object:Gem::Version
97
97
  version: '0'
98
+ - !ruby/object:Gem::Dependency
99
+ name: byebug
100
+ requirement: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ type: :development
106
+ prerelease: false
107
+ version_requirements: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
98
112
  description: Command line application to extract data from Pure and write to JSON
99
113
  files for DMAO
100
114
  email:
@@ -114,7 +128,14 @@ files:
114
128
  - lib/pure/extractor.rb
115
129
  - lib/pure/extractor/commands/pure_command.rb
116
130
  - lib/pure/extractor/commands/pure_extractor.rb
117
- - lib/pure/extractor/configure_puree.rb
131
+ - lib/pure/extractor/extractors.rb
132
+ - lib/pure/extractor/extractors/extractor.rb
133
+ - lib/pure/extractor/extractors/interactive.rb
134
+ - lib/pure/extractor/extractors/logging.rb
135
+ - lib/pure/extractor/formatters.rb
136
+ - lib/pure/extractor/formatters/organisation.rb
137
+ - lib/pure/extractor/formatters/person.rb
138
+ - lib/pure/extractor/formatters/resource.rb
118
139
  - lib/pure/extractor/version.rb
119
140
  - pure-extractor.gemspec
120
141
  homepage: https://github.com/lulibrary
@@ -1,27 +0,0 @@
1
- require 'puree'
2
-
3
- module Pure
4
- module Extractor
5
- module ConfigurePuree
6
-
7
- def configure_puree server, username, password
8
-
9
- Puree.configure do |config|
10
-
11
- config.base_url = server
12
-
13
- if !username.nil? && !password.nil? && !username.empty? && !password.empty?
14
-
15
- config.username = username
16
- config.password = password
17
- config.basic_auth = true
18
-
19
- end
20
-
21
- end
22
-
23
- end
24
-
25
- end
26
- end
27
- end