fsp_harvester 0.1.14 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e285f00da696d7e39d80df794be9524af6e63ea01deb4e73f6c30b3694c016ff
4
- data.tar.gz: fb81b5c1c0fac3bb22e078663025855e5accdb355db1811a4687fb1bca54bc61
3
+ metadata.gz: c3f2b3409b575db21edc69a7e0e4bfbf5be09734fbcc4f4b0d5accb5fedad6c2
4
+ data.tar.gz: 27813b4e090515a869d5fbc519a717eb06e7bc0559e42d33d1284093758f229f
5
5
  SHA512:
6
- metadata.gz: 194132eb78246291a3cb96566ca6a283841a0427afcd6a6abb79c590dbc2c54108e3e8cfef9e4802a77008f1a4c9c94ea7862987e81ce1b4b97cd1fdaf25ca23
7
- data.tar.gz: 9765647726c2bfcd7e790ba11929d257610672bc92d8d11756824432e90db4c05036b2cfcede1a55da95f1e74b9e87fd078c78284c356897a5bdc0a17593a3a1
6
+ metadata.gz: 33aeec82ef754f219db35eba08e68e35f0aecc570b2769f943f073cf1c4b5a9cdfed912c7fc40261797e4d3b00d7f668228aa5cd442eec213c272455aab1a275
7
+ data.tar.gz: 80f03c769794a7bf8054d95c667274a3a3ddde26242b0969e15511f78b74cdd5d20508ed7c36e47a446012ff021f0f3ad5a9aa4a44dc91d07c8c86d0da35c59e
data/.rspec_status CHANGED
@@ -40,16 +40,17 @@ example_id | status | run_time |
40
40
  ./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
41
41
  ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
42
  ./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
45
- ./spec/item_spec.rb[1:1:1] | passed | 2.94 seconds |
46
- ./spec/item_spec.rb[1:1:2] | passed | 3 seconds |
47
- ./spec/item_spec.rb[1:1:3] | passed | 1.35 seconds |
48
- ./spec/item_spec.rb[1:1:4] | passed | 1.83 seconds |
49
- ./spec/item_spec.rb[1:1:5] | passed | 2.26 seconds |
50
- ./spec/item_spec.rb[1:1:6] | passed | 2.17 seconds |
51
- ./spec/item_spec.rb[1:1:7] | passed | 2.8 seconds |
52
- ./spec/item_spec.rb[1:1:8] | passed | 0.52869 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00693 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | passed | 3.65 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 9.96 seconds |
46
+ ./spec/item_spec.rb[1:1:1] | passed | 3.19 seconds |
47
+ ./spec/item_spec.rb[1:1:2] | passed | 2.81 seconds |
48
+ ./spec/item_spec.rb[1:1:3] | passed | 1.27 seconds |
49
+ ./spec/item_spec.rb[1:1:4] | passed | 1.76 seconds |
50
+ ./spec/item_spec.rb[1:1:5] | passed | 2.43 seconds |
51
+ ./spec/item_spec.rb[1:1:6] | passed | 2.23 seconds |
52
+ ./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
53
+ ./spec/item_spec.rb[1:1:8] | passed | 0.52517 seconds |
53
54
  ./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
54
55
  ./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
55
56
  ./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.14)
4
+ fsp_harvester (0.1.17)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
7
  linkheaders-processor (~> 0.1.17)
data/README.md CHANGED
@@ -20,6 +20,39 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
+ ```
24
+ require 'fsp_harvester'
25
+
26
+ ENV['EXTRUCT_COMMAND'] = "extruct"
27
+ ENV['RDF_COMMAND'] = "/home/user/.rvm/gems/ruby-3.0.0/bin/rdf" # kelloggs distiller
28
+ ENV['TIKA_COMMAND'] = "http://localhost:9998/meta" # assumes using the docker version of tika
29
+
30
+ # to only follow the FAIR signposting specification:
31
+ links, metadata = HarvesterTools::Utils.resolve_guid(guid: guid)
32
+
33
+ links.each do |link|
34
+ puts link.href
35
+ puts link.relation
36
+ end
37
+
38
+ # note, you don't need to catch the return value here. The metadata object that is passed in will be modified
39
+ metadata = FspHarvester::Utils.gather_metadata_from_describedby_links(links: links, metadata: metadata)
40
+
41
+ linkeddata = metadata.graph
42
+ hashdata = metadata.hash
43
+ comments = metadata.comments
44
+ warnings = metadata.warnings
45
+
46
+ # if you want to try other things like content negotiation and "scraping" from HTML, do this:
47
+ # note, you don't need to catch the return value here. The metadata object that is passed in will be modified
48
+ metadata = HarvesterTools::BruteForce.begin_brute_force(guid: guid, metadata: metadata)
49
+
50
+ linkeddata = metadata.graph
51
+ hashdata = metadata.hash
52
+ comments = metadata.comments
53
+ warnings = metadata.warnings
54
+
55
+ ```
23
56
 
24
57
 
25
58
  ## Development
data/lib/constants.rb CHANGED
@@ -78,31 +78,31 @@ GUID_TYPES = {
78
78
  'ark' => Regexp.new(%r{^ark:/[^\s]+$})
79
79
  }
80
80
 
81
- CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
82
- extruct = CONFIG.dig(:extruct, :command)
83
- extruct ||= 'extruct'
81
+ # CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
82
+ # extruct = CONFIG.dig(:extruct, :command)
83
+ extruct = ENV['EXTRUCT_COMMAND'] || 'extruct'
84
84
  extruct.strip!
85
85
  case extruct
86
86
  when /[&|;`$\s]/
87
- abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
87
+ abort 'The Extruct command appears to be subject to command injection. I will not continue'
88
88
  when /echo/i
89
- abort 'The Extruct command in the config file appears to be subject to command injection. I will not continue'
89
+ abort 'The Extruct command appears to be subject to command injection. I will not continue'
90
90
  end
91
91
  EXTRUCT_COMMAND = extruct
92
92
 
93
- rdf_command = CONFIG.dig(:rdf, :command)
94
- rdf_command ||= 'rdf'
93
+ # rdf_command = CONFIG.dig(:rdf, :command)
94
+ rdf_command = ENV['RDF_COMMAND'] || 'rdf'
95
95
  rdf_command.strip
96
96
  case rdf_command
97
97
  when /[&|;`$\s]/
98
- abort 'The RDF command in the config file appears to be subject to command injection. I will not continue'
98
+ abort 'The RDF command appears to be subject to command injection. I will not continue'
99
99
  when /echo/i
100
- abort 'The RDF command in the config file appears to be subject to command injection. I will not continue'
100
+ abort 'The RDF command appears to be subject to command injection. I will not continue'
101
101
  when !(/rdf$/ =~ $_)
102
102
  abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
103
103
  end
104
104
  RDF_COMMAND = rdf_command
105
105
 
106
- tika_command = CONFIG.dig(:tika, :command)
107
- tika_command ||= 'http://localhost:9998/meta'
106
+ # tika_command = CONFIG.dig(:tika, :command)
107
+ tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
108
108
  TIKA_COMMAND = tika_command
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.14"
4
+ VERSION = "0.1.17"
5
5
  end
@@ -0,0 +1,48 @@
1
+ module HarvesterTools
2
+ class Error < StandardError
3
+ end
4
+
5
+ class BruteForce
6
+ def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
7
+ type, url = HarvesterTools::Utils.convertToURL(guid: guid)
8
+ return false unless type
9
+
10
+ do_content_negotiation(url: url, metadata: metadata)
11
+ metadata
12
+ end
13
+
14
+ def self.do_content_negotiation(url:, metadata:)
15
+ response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
16
+ if response
17
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
18
+ end
19
+ response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
20
+ if response
21
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
22
+ response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
23
+ if response
24
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
25
+ end
26
+ end
27
+ end
28
+
29
+ def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
30
+ @meta = metadata
31
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
32
+ warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
33
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
34
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
35
+
36
+ unless response
37
+ @meta.add_warning(['001', url, headers])
38
+ @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
39
+ @meta.full_response << [url, "No response"]
40
+ false
41
+ end
42
+
43
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
44
+ @meta.full_response << [url, response.body]
45
+ response
46
+ end
47
+ end
48
+ end
@@ -4,8 +4,8 @@ module HarvesterTools
4
4
 
5
5
  class Utils
6
6
 
7
- def self.resolve_guid(guid:)
8
- @meta = HarvesterTools::MetadataObject.new
7
+ def self.resolve_guid(guid:, metadata: HarvesterTools::MetadataObject.new)
8
+ @meta = metadata
9
9
  @meta.all_uris = [guid]
10
10
  type, url = convertToURL(guid: guid)
11
11
  links = Array.new
@@ -1,6 +1,6 @@
1
1
  module HarvesterTools
2
2
  class MetadataObject
3
- attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
3
+ attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :guid, :score # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
4
4
 
5
5
  def initialize(_params = {}) # get a name from the "new" call, or set a default
6
6
  @hash = {}
@@ -10,6 +10,8 @@ module HarvesterTools
10
10
  @full_response = []
11
11
  @links = []
12
12
  @all_uris = []
13
+ @guid = ""
14
+ @score = 0
13
15
  w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
14
16
  #@warn = File.read("./lib/warnings.json")
15
17
  @warn = JSON.parse(w)
File without changes
@@ -2,7 +2,7 @@
2
2
  command="extruct"
3
3
 
4
4
  [rdf]
5
- command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
5
+ command="/usr/local/bundle/bin/rdf"
6
6
 
7
7
  [tika]
8
8
  command="http://tika:9998/meta"
File without changes
data/lib/warnings.json CHANGED
@@ -115,5 +115,12 @@
115
115
  {"Documentation": "http://dx.doi.org/10.17487/RFC8259"},
116
116
  {"Validator": "https://jsononline.net/json-validator"}],
117
117
  "severity": "WARN"
118
+ },
119
+ "600": {
120
+ "message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
121
+ "linkout": [],
122
+ "severity": "FAILURE"
118
123
  }
124
+
125
+
119
126
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.14
4
+ version: 0.1.17
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-08-12 00:00:00.000000000 Z
11
+ date: 2022-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -182,18 +182,19 @@ files:
182
182
  - bin/setup
183
183
  - example_test.rb
184
184
  - launch.json
185
- - lib/config.conf
186
- - lib/config.conf_docker
187
- - lib/config.conf_local
188
185
  - lib/constants.rb
189
186
  - lib/external_tools.rb
190
187
  - lib/fsp_harvester.rb
191
188
  - lib/fsp_harvester/version.rb
192
189
  - lib/harvester.rb
190
+ - lib/harvester_brute.rb
193
191
  - lib/harvester_utils.rb
194
192
  - lib/metadata_harvester.rb
195
193
  - lib/metadata_object.rb
196
194
  - lib/metadata_parser.rb
195
+ - lib/obselete_config.conf
196
+ - lib/obselete_config.conf_docker
197
+ - lib/obselete_config.conf_local
197
198
  - lib/signposting_tests.rb
198
199
  - lib/warnings.json
199
200
  - lib/web_utils.rb