fsp_harvester 0.1.14 → 0.1.17

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e285f00da696d7e39d80df794be9524af6e63ea01deb4e73f6c30b3694c016ff
4
- data.tar.gz: fb81b5c1c0fac3bb22e078663025855e5accdb355db1811a4687fb1bca54bc61
3
+ metadata.gz: c3f2b3409b575db21edc69a7e0e4bfbf5be09734fbcc4f4b0d5accb5fedad6c2
4
+ data.tar.gz: 27813b4e090515a869d5fbc519a717eb06e7bc0559e42d33d1284093758f229f
5
5
  SHA512:
6
- metadata.gz: 194132eb78246291a3cb96566ca6a283841a0427afcd6a6abb79c590dbc2c54108e3e8cfef9e4802a77008f1a4c9c94ea7862987e81ce1b4b97cd1fdaf25ca23
7
- data.tar.gz: 9765647726c2bfcd7e790ba11929d257610672bc92d8d11756824432e90db4c05036b2cfcede1a55da95f1e74b9e87fd078c78284c356897a5bdc0a17593a3a1
6
+ metadata.gz: 33aeec82ef754f219db35eba08e68e35f0aecc570b2769f943f073cf1c4b5a9cdfed912c7fc40261797e4d3b00d7f668228aa5cd442eec213c272455aab1a275
7
+ data.tar.gz: 80f03c769794a7bf8054d95c667274a3a3ddde26242b0969e15511f78b74cdd5d20508ed7c36e47a446012ff021f0f3ad5a9aa4a44dc91d07c8c86d0da35c59e
data/.rspec_status CHANGED
@@ -40,16 +40,17 @@ example_id | status | run_time |
40
40
  ./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
41
41
  ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
42
  ./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
45
- ./spec/item_spec.rb[1:1:1] | passed | 2.94 seconds |
46
- ./spec/item_spec.rb[1:1:2] | passed | 3 seconds |
47
- ./spec/item_spec.rb[1:1:3] | passed | 1.35 seconds |
48
- ./spec/item_spec.rb[1:1:4] | passed | 1.83 seconds |
49
- ./spec/item_spec.rb[1:1:5] | passed | 2.26 seconds |
50
- ./spec/item_spec.rb[1:1:6] | passed | 2.17 seconds |
51
- ./spec/item_spec.rb[1:1:7] | passed | 2.8 seconds |
52
- ./spec/item_spec.rb[1:1:8] | passed | 0.52869 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00693 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | passed | 3.65 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 9.96 seconds |
46
+ ./spec/item_spec.rb[1:1:1] | passed | 3.19 seconds |
47
+ ./spec/item_spec.rb[1:1:2] | passed | 2.81 seconds |
48
+ ./spec/item_spec.rb[1:1:3] | passed | 1.27 seconds |
49
+ ./spec/item_spec.rb[1:1:4] | passed | 1.76 seconds |
50
+ ./spec/item_spec.rb[1:1:5] | passed | 2.43 seconds |
51
+ ./spec/item_spec.rb[1:1:6] | passed | 2.23 seconds |
52
+ ./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
53
+ ./spec/item_spec.rb[1:1:8] | passed | 0.52517 seconds |
53
54
  ./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
54
55
  ./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
55
56
  ./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.14)
4
+ fsp_harvester (0.1.17)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
7
  linkheaders-processor (~> 0.1.17)
data/README.md CHANGED
@@ -20,6 +20,39 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
+ ```
24
+ require 'fsp_harvester'
25
+
26
+ ENV['EXTRUCT_COMMAND'] = "extruct"
27
+ ENV['RDF_COMMAND'] = "/home/user/.rvm/gems/ruby-3.0.0/bin/rdf" # path to Kellogg's Distiller; the command must end in 'rdf'
28
+ ENV['TIKA_COMMAND'] = "http://localhost:9998/meta" # assumes you are using the Docker version of Tika
29
+
30
+ # to only follow the FAIR signposting specification:
31
+ links, metadata = HarvesterTools::Utils.resolve_guid(guid: guid)
32
+
33
+ links.each do |link|
34
+ puts link.href
35
+ puts link.relation
36
+ end
37
+
38
+ # Note: you don't need to capture the return value here; the metadata object that is passed in is modified in place.
39
+ metadata = FspHarvester::Utils.gather_metadata_from_describedby_links(links: links, metadata: metadata)
40
+
41
+ linkeddata = metadata.graph
42
+ hashdata = metadata.hash
43
+ comments = metadata.comments
44
+ warnings = metadata.warnings
45
+
46
+ # if you want to try other things like content negotiation and "scraping" from HTML, do this:
47
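+ # Note: you don't need to capture the return value here; the metadata object that is passed in is modified in place. (begin_brute_force returns false when no GUID type can be determined, so check the result before reusing it as metadata.)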
48
+ metadata = HarvesterTools::BruteForce.begin_brute_force(guid: guid, metadata: metadata)
49
+
50
+ linkeddata = metadata.graph
51
+ hashdata = metadata.hash
52
+ comments = metadata.comments
53
+ warnings = metadata.warnings
54
+
55
+ ```
23
56
 
24
57
 
25
58
  ## Development
data/lib/constants.rb CHANGED
@@ -78,31 +78,31 @@ GUID_TYPES = {
78
78
  'ark' => Regexp.new(%r{^ark:/[^\s]+$})
79
79
  }
80
80
 
81
- CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
82
- extruct = CONFIG.dig(:extruct, :command)
83
- extruct ||= 'extruct'
81
+ # CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
82
+ # extruct = CONFIG.dig(:extruct, :command)
83
+ extruct = ENV['EXTRUCT_COMMAND'] || 'extruct'
84
84
  extruct.strip!
85
85
  case extruct
86
86
  when /[&|;`$\s]/
87
- abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
87
+ abort 'The Extruct command appears to be subject to command injection. I will not continue'
88
88
  when /echo/i
89
- abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
89
+ abort 'The Extruct command appears to be subject to command injection. I will not continue'
90
90
  end
91
91
  EXTRUCT_COMMAND = extruct
92
92
 
93
- rdf_command = CONFIG.dig(:rdf, :command)
94
- rdf_command ||= 'rdf'
93
+ # rdf_command = CONFIG.dig(:rdf, :command)
94
+ rdf_command = ENV['RDF_COMMAND'] || 'rdf'
95
95
  rdf_command.strip
96
96
  case rdf_command
97
97
  when /[&|;`$\s]/
98
- abort 'The RDF command in the config file appears to be subject to command injection. I will not continue'
98
+ abort 'The RDF command appears to be subject to command injection. I will not continue'
99
99
  when /echo/i
100
- abort 'The RDF command in the config file appears to be subject to command injection. I will not continue'
100
+ abort 'The RDF command appears to be subject to command injection. I will not continue'
101
101
  when !(/rdf$/ =~ $_)
102
102
  abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
103
103
  end
104
104
  RDF_COMMAND = rdf_command
105
105
 
106
- tika_command = CONFIG.dig(:tika, :command)
107
- tika_command ||= 'http://localhost:9998/meta'
106
+ # tika_command = CONFIG.dig(:tika, :command)
107
+ tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
108
108
  TIKA_COMMAND = tika_command
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.14"
4
+ VERSION = "0.1.17"
5
5
  end
@@ -0,0 +1,48 @@
1
+ module HarvesterTools
2
+ class Error < StandardError
3
+ end
4
+
5
+ class BruteForce
6
+ def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
7
+ type, url = HarvesterTools::Utils.convertToURL(guid: guid)
8
+ return false unless type
9
+
10
+ do_content_negotiation(url: url, metadata: metadata)
11
+ metadata
12
+ end
13
+
14
+ def self.do_content_negotiation(url:, metadata:)
15
+ response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
16
+ if response
17
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
18
+ end
19
+ response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
20
+ if response
21
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
22
+ response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
23
+ if response
24
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
25
+ end
26
+ end
27
+ end
28
+
29
+ def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
30
+ @meta = metadata
31
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
32
+ warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
33
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
34
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
35
+
36
+ unless response
37
+ @meta.add_warning(['001', url, headers])
38
+ @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
39
+ @meta.full_response << [url, "No response"]
40
+ false
41
+ end
42
+
43
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
44
+ @meta.full_response << [url, response.body]
45
+ response
46
+ end
47
+ end
48
+ end
@@ -4,8 +4,8 @@ module HarvesterTools
4
4
 
5
5
  class Utils
6
6
 
7
- def self.resolve_guid(guid:)
8
- @meta = HarvesterTools::MetadataObject.new
7
+ def self.resolve_guid(guid:, metadata: HarvesterTools::MetadataObject.new)
8
+ @meta = metadata
9
9
  @meta.all_uris = [guid]
10
10
  type, url = convertToURL(guid: guid)
11
11
  links = Array.new
@@ -1,6 +1,6 @@
1
1
  module HarvesterTools
2
2
  class MetadataObject
3
- attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
3
+ attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :guid, :score # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
4
4
 
5
5
  def initialize(_params = {}) # get a name from the "new" call, or set a default
6
6
  @hash = {}
@@ -10,6 +10,8 @@ module HarvesterTools
10
10
  @full_response = []
11
11
  @links = []
12
12
  @all_uris = []
13
+ @guid = ""
14
+ @score = 0
13
15
  w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
14
16
  #@warn = File.read("./lib/warnings.json")
15
17
  @warn = JSON.parse(w)
File without changes
@@ -2,7 +2,7 @@
2
2
  command="extruct"
3
3
 
4
4
  [rdf]
5
- command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
5
+ command="/usr/local/bundle/bin/rdf"
6
6
 
7
7
  [tika]
8
8
  command="http://tika:9998/meta"
File without changes
data/lib/warnings.json CHANGED
@@ -115,5 +115,12 @@
115
115
  {"Documentation": "http://dx.doi.org/10.17487/RFC8259"},
116
116
  {"Validator": "https://jsononline.net/json-validator"}],
117
117
  "severity": "WARN"
118
+ },
119
+ "600": {
120
+ "message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
121
+ "linkout": [],
122
+ "severity": "FAILURE"
118
123
  }
124
+
125
+
119
126
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.14
4
+ version: 0.1.17
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-08-12 00:00:00.000000000 Z
11
+ date: 2022-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -182,18 +182,19 @@ files:
182
182
  - bin/setup
183
183
  - example_test.rb
184
184
  - launch.json
185
- - lib/config.conf
186
- - lib/config.conf_docker
187
- - lib/config.conf_local
188
185
  - lib/constants.rb
189
186
  - lib/external_tools.rb
190
187
  - lib/fsp_harvester.rb
191
188
  - lib/fsp_harvester/version.rb
192
189
  - lib/harvester.rb
190
+ - lib/harvester_brute.rb
193
191
  - lib/harvester_utils.rb
194
192
  - lib/metadata_harvester.rb
195
193
  - lib/metadata_object.rb
196
194
  - lib/metadata_parser.rb
195
+ - lib/obselete_config.conf
196
+ - lib/obselete_config.conf_docker
197
+ - lib/obselete_config.conf_local
197
198
  - lib/signposting_tests.rb
198
199
  - lib/warnings.json
199
200
  - lib/web_utils.rb