pure-extractor 1.1.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b238e51c6777d6dd560f62b11c44e18251042918
4
- data.tar.gz: 70feca57d15b4d1906010215247bd305a43d56b2
3
+ metadata.gz: c13f77c8908044946b4d7939c45e1b925e43a43d
4
+ data.tar.gz: bb730f66444a12fe629e395f2fffe970447d910d
5
5
  SHA512:
6
- metadata.gz: e0351116e8b178eee808f470751460f59056e6d22f1fc3d7ecc13a85a399460102267e1f685d3d3e348ab72d3dfb1fc7a1aa41b5544974f024647a11651d4669
7
- data.tar.gz: 2b63cd36a9e500d843382a14520e48eeaeb4b2fa18ae7e1a46fd268a63d3190423522ffd8986c47ce0c3337f3e15c5377ab2e1fb9cc6792e6fb5f4dcbd1ac1d7
6
+ metadata.gz: d80c1ae5553e9c044a5aa88b9ae61bbb68e8d7c4fba72c46ff08823a86bdeee6e1c45c0ebeb28096ad1d4b5cc91dfaeb3f3d9c112ec915ee9b5db850b8d34625
7
+ data.tar.gz: 69333eb907dffb2e51633907dc350646e1417ced8b2ddf93ef2425d592b01229ee16308ff36cdfac194baeba6683523608b5699ae4750324b933d01de0ddf042
data/.gitignore CHANGED
@@ -9,3 +9,4 @@
9
9
  /tmp/
10
10
  *.gem
11
11
  .rbenv-gemsets
12
+ .byebug_history
data/README.md CHANGED
@@ -44,6 +44,8 @@ pure-extractor -s SERVER_URL -u USERNAME -p PASSWORD -o OUTPUT_DIRECTORY -c CHUN
44
44
  | -u, --username | Username for the Pure WS Rest service, not required if Pure WS requests are unauthenticated |
45
45
  | -p, --password | Password for the Pure WS Rest service, not required if Pure WS requests are unauthenticated |
46
46
  | -c, --chunk-size | The number of entries to return per chunk and store per file, defaults to 200 if not set |
47
+ | -d, --request-delay | Flag to add a random delay between API requests so as not to overload the server, defaults to **false** |
48
+ | -i, --interactive | Run in interactive mode, for use when not running in Docker, defaults to **false** |
47
49
 
48
50
  ## Development
49
51
 
@@ -1,141 +1,12 @@
1
1
  require "pure/extractor/version"
2
- require "pure/extractor/configure_puree"
3
2
  require "pure/extractor/commands/pure_extractor"
4
3
  require 'ruby-progressbar'
4
+ require 'pure/extractor/formatters'
5
+ require 'pure/extractor/extractors'
6
+ require 'puree'
5
7
 
6
8
  module Pure
7
9
  module Extractor
8
10
 
9
- def self.extract type, chunk_size, output_directory
10
-
11
- collection = Puree::Collection.new resource: type
12
-
13
- collection_count = collection.count
14
-
15
- puts collection_count
16
-
17
- progress_bar = ProgressBar.create(format: "%a %e %b\u{15E7}%i %p%% %t", progress_mark: ' ', remainder_mark: "\u{FF65}", total: collection_count)
18
-
19
- offset = 0
20
- file_id = 0
21
-
22
- if chunk_size.nil? || chunk_size.empty?
23
- chunk_size = 200
24
- end
25
-
26
- chunk_size = chunk_size.to_i
27
-
28
- while offset < collection_count do
29
-
30
- file_id += 1
31
-
32
- filename = type.to_s + "_#{file_id.to_s.rjust(6, '0')}"
33
-
34
- output_file = output_directory + "/#{filename}.json"
35
-
36
- returned_collection = collection.find limit: chunk_size, offset: offset
37
-
38
- returned_collection.each do |item|
39
-
40
- delete_keys_for_type type, item
41
-
42
- end
43
-
44
- formatted_results = format_results_for_type type, returned_collection
45
-
46
- write_results_to_file formatted_results, output_file
47
-
48
- update_progress_bar progress_bar, chunk_size, collection_count
49
-
50
- offset += chunk_size
51
-
52
- end
53
-
54
- end
55
-
56
- def self.format_results_for_type type, results
57
-
58
- formatted_results = []
59
-
60
- case type
61
-
62
- when :organisation
63
-
64
- results.each do |result|
65
-
66
- formatted_result = {
67
- system: {
68
- uuid: result["uuid"],
69
- modified_at: result["modified"]
70
- },
71
- details: {
72
- name: result["name"],
73
- description: nil,
74
- url: result["url"][0],
75
- isni: nil,
76
- type: result["type"]
77
- },
78
- parent: {
79
- uuid: result["parent"]["uuid"]
80
- }
81
- }
82
-
83
- formatted_results.push formatted_result
84
-
85
- end
86
-
87
- else
88
- formatted_results = results
89
-
90
- end
91
-
92
- formatted_results
93
-
94
- end
95
-
96
- def self.delete_keys_for_type type, item
97
-
98
- keys = []
99
- nested_keys = {}
100
-
101
- case type
102
-
103
- when :dataset
104
-
105
- keys = ["keyword", "file", "associated", "link", "spatial"]
106
- nested_keys = { "person" => ["external", "other"] }
107
-
108
- end
109
-
110
- keys.each do |key|
111
- item.delete(key)
112
- end
113
-
114
- nested_keys.each do |key, attribute|
115
- item[key].delete(attribute)
116
- end
117
-
118
- item
119
-
120
- end
121
-
122
- def self.update_progress_bar progress_bar, limit, collection_count
123
-
124
- if (progress_bar.progress + limit) < collection_count
125
- progress_bar.progress += limit
126
- else
127
- progress_bar.progress = collection_count
128
- end
129
-
130
- end
131
-
132
- def self.write_results_to_file results, file
133
-
134
- File.open(file, "w") do |f|
135
- f.write(JSON.pretty_generate(results))
136
- end
137
-
138
- end
139
-
140
11
  end
141
12
  end
@@ -10,7 +10,9 @@ module Pure
10
10
  option ["-u", "--username"], "username", "Username to connect to Pure WS"
11
11
  option ["-p", "--password"], "password", "Password to connect to Pure WS"
12
12
  option ["-c", "--chunk-size"], "chunk-size", "Number of entities to extract per file, defaults to 200"
13
-
13
+ option ["-d", "--request-delay"], :flag, "Add random delay between sending API Requests, default to false", default: false
14
+ option ["-i", "--interactive"], :flag, "Run in interactive mode, displaying progress bar. This is the default mode", default: false
15
+
14
16
  end
15
17
  end
16
18
  end
@@ -5,9 +5,7 @@ module Pure
5
5
  module Extractor
6
6
  module Commands
7
7
  class PureExtractorCommand < PureCommand
8
-
9
- include Pure::Extractor::ConfigurePuree
10
-
8
+
11
9
  valid_extracts = [:organisation, :people, :projects, :publications, :datasets]
12
10
 
13
11
  parameter "EXTRACT", "what to extract from pure, valid options are #{valid_extracts.map{|v| v.to_s}}" do |s|
@@ -26,9 +24,29 @@ module Pure
26
24
 
27
25
  def execute
28
26
 
29
- configure_puree server, username, password
30
-
31
- Pure::Extractor.extract pure_collections[extract], chunk_size, output_dir
27
+ puree_config = {
28
+ url: server,
29
+ username: username,
30
+ password: password,
31
+ collection: pure_collections[extract],
32
+ chunk_size: chunk_size,
33
+ output_directory: output_dir,
34
+ delay: request_delay?
35
+ }
36
+
37
+ if interactive?
38
+
39
+ Pure::Extractor::Extractors::InteractiveExtractor.set_config puree_config
40
+ Pure::Extractor::Extractors::InteractiveExtractor.extract
41
+
42
+ else
43
+
44
+ Pure::Extractor::Extractors::LoggingExtractor.set_config puree_config
45
+ Pure::Extractor::Extractors::LoggingExtractor.extract
46
+
47
+ end
48
+
49
+
32
50
 
33
51
  end
34
52
 
@@ -0,0 +1,11 @@
1
+ require 'pure/extractor/extractors/extractor'
2
+ require 'pure/extractor/extractors/interactive'
3
+ require 'pure/extractor/extractors/logging'
4
+
5
+ module Pure
6
+ module Extractor
7
+ module Extractors
8
+
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,74 @@
1
+ module Pure
2
+ module Extractor
3
+ module Extractors
4
+
5
+ class Extractor
6
+
7
+ @config = {}
8
+
9
+ def self.set_config config
10
+ @config = config
11
+ end
12
+
13
+ def self.extract_collection_to_disk collection, chunk_size, offset, output_file
14
+
15
+ returned_collection = collection.find limit: chunk_size, offset: offset
16
+
17
+ formatted_results = format_results_for_type @config[:collection], returned_collection
18
+
19
+ write_results_to_file formatted_results, output_file
20
+
21
+ end
22
+
23
+ def self.random_delay
24
+
25
+ random_timeout = 60 + Random.rand(120)
26
+
27
+ sleep(random_timeout)
28
+
29
+ end
30
+
31
+ def self.format_results_for_type type, results
32
+
33
+ begin
34
+ formatter = Module.const_get('Pure::Extractor::Formatters::' + type.to_s.capitalize)
35
+ formatter.format_array results
36
+ rescue NameError
37
+ raise 'No formatter for specified area'
38
+ end
39
+
40
+ end
41
+
42
+ def self.write_results_to_file results, file
43
+
44
+ File.open(file, "w") do |f|
45
+ f.write(JSON.pretty_generate(results))
46
+ end
47
+
48
+ end
49
+
50
+ def self.filename_for_id id
51
+ @config[:collection].to_s + "_#{id.to_s.rjust(6, '0')}"
52
+ end
53
+
54
+ def self.output_filepath_for_filename filename
55
+ @config[:output_directory] + "/#{filename}.json"
56
+ end
57
+
58
+ def self.get_chunk_size
59
+
60
+ chunk_size = @config[:chunk_size]
61
+
62
+ if chunk_size.nil? || chunk_size.empty?
63
+ chunk_size = 200
64
+ end
65
+
66
+ chunk_size.to_i
67
+
68
+ end
69
+
70
+ end
71
+
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,60 @@
1
+ require 'pure/extractor/extractors/extractor'
2
+
3
+ module Pure
4
+ module Extractor
5
+ module Extractors
6
+
7
+ class InteractiveExtractor < Extractor
8
+
9
+ def self.extract
10
+
11
+ collection = Puree::Extractor::Collection.new config: @config, resource: @config[:collection]
12
+
13
+ collection_count = collection.count
14
+
15
+ puts "Extracting #{collection_count} records from #{@config[:collection]} collection"
16
+
17
+ progress_bar = ProgressBar.create(format: "%a %e %b\u{15E7}%i %p%% %t", progress_mark: ' ', remainder_mark: "\u{FF65}", total: collection_count)
18
+
19
+ offset = 0
20
+ file_id = 0
21
+
22
+ chunk_size = get_chunk_size
23
+
24
+ while offset < collection_count do
25
+
26
+ random_delay if (offset != 0) && @config[:delay]
27
+
28
+ file_id += 1
29
+
30
+ filename = filename_for_id file_id
31
+
32
+ output_file = output_filepath_for_filename filename
33
+
34
+ extract_collection_to_disk collection, chunk_size, offset, output_file
35
+
36
+ update_progress_bar progress_bar, chunk_size, collection_count
37
+
38
+ offset += chunk_size
39
+
40
+ end
41
+
42
+ puts "Finished extracting #{collection_count} records from #{@config[:collection]} collection"
43
+
44
+ end
45
+
46
+ def self.update_progress_bar progress_bar, limit, collection_count
47
+
48
+ if (progress_bar.progress + limit) < collection_count
49
+ progress_bar.progress += limit
50
+ else
51
+ progress_bar.progress = collection_count
52
+ end
53
+
54
+ end
55
+
56
+ end
57
+
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,68 @@
1
+ require 'pure/extractor/extractors/extractor'
2
+
3
+ module Pure
4
+ module Extractor
5
+ module Extractors
6
+
7
+ class LoggingExtractor < Extractor
8
+
9
+ def self.extract
10
+
11
+ collection = Puree::Extractor::Collection.new config: @config, resource: @config[:collection]
12
+
13
+ collection_count = collection.count
14
+
15
+ puts "Extracting #{collection_count} records from #{@config[:collection]} collection"
16
+
17
+ offset = 0
18
+ file_id = 0
19
+
20
+ chunk_size = get_chunk_size
21
+
22
+ number_of_files = get_number_of_files collection_count
23
+
24
+ while offset < collection_count do
25
+
26
+ random_delay if (offset != 0) && @config[:delay]
27
+
28
+ file_id += 1
29
+
30
+ if (offset + chunk_size) > collection_count
31
+ to_records = collection_count
32
+ else
33
+ to_records = offset + chunk_size
34
+ end
35
+
36
+ puts "Extracting records #{offset} - #{to_records} to file #{file_id} of #{number_of_files}"
37
+
38
+ filename = filename_for_id file_id
39
+
40
+ output_file = output_filepath_for_filename filename
41
+
42
+ extract_collection_to_disk collection, chunk_size, offset, output_file
43
+
44
+ puts "Extracted records to #{output_file}"
45
+
46
+ offset += chunk_size
47
+
48
+ end
49
+
50
+ puts "Finished extracting #{collection_count} records from #{@config[:collection]} collection"
51
+
52
+ end
53
+
54
+ def self.get_number_of_files collection_count
55
+
56
+ full_files, remaining_records = collection_count.divmod(get_chunk_size)
57
+
58
+ return full_files if remaining_records == 0
59
+
60
+ full_files + 1
61
+
62
+ end
63
+
64
+ end
65
+
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,11 @@
1
+ require 'pure/extractor/formatters/resource'
2
+ require 'pure/extractor/formatters/organisation'
3
+ require 'pure/extractor/formatters/person'
4
+
5
+ module Pure
6
+ module Extractor
7
+ module Formatters
8
+
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,47 @@
1
+ require 'pure/extractor/formatters/resource'
2
+
3
+ module Pure
4
+ module Extractor
5
+ module Formatters
6
+
7
+ class Organisation < Resource
8
+
9
+ def self.format unit
10
+
11
+ {
12
+ system: get_system(unit),
13
+ details: get_details(unit),
14
+ parent: {
15
+ uuid: get_parent_uuid(unit)
16
+ }
17
+ }
18
+
19
+ end
20
+
21
+ def self.get_details result
22
+
23
+ {
24
+ name: result.name,
25
+ description: nil,
26
+ url: result.urls.first,
27
+ isni: nil,
28
+ type: result.type
29
+ }
30
+
31
+ end
32
+
33
+ def self.get_parent_uuid result
34
+
35
+ if result.parent.nil?
36
+ nil
37
+ else
38
+ result.parent.uuid
39
+ end
40
+
41
+ end
42
+
43
+ end
44
+
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,28 @@
1
+ require 'pure/extractor/formatters/resource'
2
+
3
+ module Pure
4
+ module Extractor
5
+ module Formatters
6
+
7
+ class Person < Resource
8
+
9
+ def self.format unit
10
+
11
+ {
12
+ system: get_system(unit),
13
+ details: {
14
+ first_name: unit.name.first,
15
+ last_name: unit.name.last,
16
+ email: unit.email_addresses.first,
17
+ image_url: unit.image_urls.first,
18
+ orcid: unit.orcid
19
+ }
20
+ }
21
+
22
+ end
23
+
24
+ end
25
+
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,31 @@
1
+ module Pure
2
+ module Extractor
3
+ module Formatters
4
+
5
+ class Resource
6
+
7
+ def self.format_array array
8
+
9
+ results = []
10
+
11
+ array.each do |result|
12
+ results.push format(result)
13
+ end
14
+
15
+ results
16
+
17
+ end
18
+
19
+ def self.get_system result
20
+
21
+ {
22
+ uuid: result.uuid,
23
+ modified_at: result.modified
24
+ }
25
+
26
+ end
27
+
28
+ end
29
+ end
30
+ end
31
+ end
@@ -1,5 +1,5 @@
1
1
  module Pure
2
2
  module Extractor
3
- VERSION = "1.1.0"
3
+ VERSION = "2.0.0"
4
4
  end
5
5
  end
@@ -27,11 +27,12 @@ Gem::Specification.new do |spec|
27
27
  spec.require_paths = ["lib"]
28
28
 
29
29
  spec.add_dependency "clamp"
30
- spec.add_dependency "puree", "~> 0.19.1"
30
+ spec.add_dependency "puree", "~> 1.3.0"
31
31
  spec.add_dependency "ruby-progressbar"
32
32
 
33
33
  spec.add_dependency "bundler", "~> 1.12"
34
34
 
35
35
  spec.add_development_dependency "rake", "~> 10.0"
36
36
  spec.add_development_dependency "minitest"
37
+ spec.add_development_dependency "byebug"
37
38
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pure-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stephen Robinson
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-05-23 00:00:00.000000000 Z
12
+ date: 2017-06-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: clamp
@@ -31,14 +31,14 @@ dependencies:
31
31
  requirements:
32
32
  - - "~>"
33
33
  - !ruby/object:Gem::Version
34
- version: 0.19.1
34
+ version: 1.3.0
35
35
  type: :runtime
36
36
  prerelease: false
37
37
  version_requirements: !ruby/object:Gem::Requirement
38
38
  requirements:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
- version: 0.19.1
41
+ version: 1.3.0
42
42
  - !ruby/object:Gem::Dependency
43
43
  name: ruby-progressbar
44
44
  requirement: !ruby/object:Gem::Requirement
@@ -95,6 +95,20 @@ dependencies:
95
95
  - - ">="
96
96
  - !ruby/object:Gem::Version
97
97
  version: '0'
98
+ - !ruby/object:Gem::Dependency
99
+ name: byebug
100
+ requirement: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ type: :development
106
+ prerelease: false
107
+ version_requirements: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
98
112
  description: Command line application to extract data from Pure and write to JSON
99
113
  files for DMAO
100
114
  email:
@@ -114,7 +128,14 @@ files:
114
128
  - lib/pure/extractor.rb
115
129
  - lib/pure/extractor/commands/pure_command.rb
116
130
  - lib/pure/extractor/commands/pure_extractor.rb
117
- - lib/pure/extractor/configure_puree.rb
131
+ - lib/pure/extractor/extractors.rb
132
+ - lib/pure/extractor/extractors/extractor.rb
133
+ - lib/pure/extractor/extractors/interactive.rb
134
+ - lib/pure/extractor/extractors/logging.rb
135
+ - lib/pure/extractor/formatters.rb
136
+ - lib/pure/extractor/formatters/organisation.rb
137
+ - lib/pure/extractor/formatters/person.rb
138
+ - lib/pure/extractor/formatters/resource.rb
118
139
  - lib/pure/extractor/version.rb
119
140
  - pure-extractor.gemspec
120
141
  homepage: https://github.com/lulibrary
@@ -1,27 +0,0 @@
1
- require 'puree'
2
-
3
- module Pure
4
- module Extractor
5
- module ConfigurePuree
6
-
7
- def configure_puree server, username, password
8
-
9
- Puree.configure do |config|
10
-
11
- config.base_url = server
12
-
13
- if !username.nil? && !password.nil? && !username.empty? && !password.empty?
14
-
15
- config.username = username
16
- config.password = password
17
- config.basic_auth = true
18
-
19
- end
20
-
21
- end
22
-
23
- end
24
-
25
- end
26
- end
27
- end