fsp_harvester 0.1.14 → 0.1.17
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec_status +11 -10
- data/Gemfile.lock +1 -1
- data/README.md +33 -0
- data/lib/constants.rb +11 -11
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/harvester_brute.rb +48 -0
- data/lib/harvester_utils.rb +2 -2
- data/lib/metadata_object.rb +3 -1
- data/lib/{config.conf_docker → obselete_config.conf} +0 -0
- data/lib/{config.conf_local → obselete_config.conf_docker} +1 -1
- data/lib/{config.conf → obselete_config.conf_local} +0 -0
- data/lib/warnings.json +7 -0
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c3f2b3409b575db21edc69a7e0e4bfbf5be09734fbcc4f4b0d5accb5fedad6c2
|
4
|
+
data.tar.gz: 27813b4e090515a869d5fbc519a717eb06e7bc0559e42d33d1284093758f229f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 33aeec82ef754f219db35eba08e68e35f0aecc570b2769f943f073cf1c4b5a9cdfed912c7fc40261797e4d3b00d7f668228aa5cd442eec213c272455aab1a275
|
7
|
+
data.tar.gz: 80f03c769794a7bf8054d95c667274a3a3ddde26242b0969e15511f78b74cdd5d20508ed7c36e47a446012ff021f0f3ad5a9aa4a44dc91d07c8c86d0da35c59e
|
data/.rspec_status
CHANGED
@@ -40,16 +40,17 @@ example_id | status | run_time |
|
|
40
40
|
./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
|
41
41
|
./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
|
42
42
|
./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] |
|
45
|
-
./spec/
|
46
|
-
./spec/item_spec.rb[1:1:
|
47
|
-
./spec/item_spec.rb[1:1:
|
48
|
-
./spec/item_spec.rb[1:1:
|
49
|
-
./spec/item_spec.rb[1:1:
|
50
|
-
./spec/item_spec.rb[1:1:
|
51
|
-
./spec/item_spec.rb[1:1:
|
52
|
-
./spec/item_spec.rb[1:1:
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00693 seconds |
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | passed | 3.65 seconds |
|
45
|
+
./spec/fsp_harvester_spec.rb[1:3] | passed | 9.96 seconds |
|
46
|
+
./spec/item_spec.rb[1:1:1] | passed | 3.19 seconds |
|
47
|
+
./spec/item_spec.rb[1:1:2] | passed | 2.81 seconds |
|
48
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.27 seconds |
|
49
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.76 seconds |
|
50
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.43 seconds |
|
51
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.23 seconds |
|
52
|
+
./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
|
53
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.52517 seconds |
|
53
54
|
./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
|
54
55
|
./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
|
55
56
|
./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -20,6 +20,39 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
+
```
|
24
|
+
require 'fsp_harvester'
|
25
|
+
|
26
|
+
ENV['EXTRUCT_COMMAND'] = "extruct"
|
27
|
+
ENV['RDF_COMMAND'] = "/home/user/.rvm/gems/ruby-3.0.0/bin/rdf" # kelloggs distiller
|
28
|
+
ENV['TIKA_COMMAND'] = "http://localhost:9998/meta" # assumes using the docker version of tika
|
29
|
+
|
30
|
+
# to only follow the FAIR signposting specification:
|
31
|
+
links, metadata = HarvesterTools::Utils.resolve_guid(guid: guid)
|
32
|
+
|
33
|
+
links.each do |link|
|
34
|
+
puts link.href
|
35
|
+
puts link.relation
|
36
|
+
end
|
37
|
+
|
38
|
+
# note, you don't need to catch the return value here. The metadata object that is passed in will be modified
|
39
|
+
metadata = FspHarvester::Utils.gather_metadata_from_describedby_links(links: links, metadata: metadata)
|
40
|
+
|
41
|
+
linkeddata = metadata.graph
|
42
|
+
hashdata = metadata.hash
|
43
|
+
comments = metadata.comments
|
44
|
+
warnings = metadata.warnings
|
45
|
+
|
46
|
+
# if you want to try other things like content negotiation and "scraping" from HTML, do this:
|
47
|
+
# note, you don't need to catch the return value here. The metadata object that is passed in will be modified
|
48
|
+
metadata = HarvesterTools::BruteForce.begin_brute_force(guid: guid, metadata: metadata)
|
49
|
+
|
50
|
+
linkeddata = metadata.graph
|
51
|
+
hashdata = metadata.hash
|
52
|
+
comments = metadata.comments
|
53
|
+
warnings = metadata.warnings
|
54
|
+
|
55
|
+
```
|
23
56
|
|
24
57
|
|
25
58
|
## Development
|
data/lib/constants.rb
CHANGED
@@ -78,31 +78,31 @@ GUID_TYPES = {
|
|
78
78
|
'ark' => Regexp.new(%r{^ark:/[^\s]+$})
|
79
79
|
}
|
80
80
|
|
81
|
-
CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
82
|
-
extruct = CONFIG.dig(:extruct, :command)
|
83
|
-
extruct
|
81
|
+
# CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
82
|
+
# extruct = CONFIG.dig(:extruct, :command)
|
83
|
+
# Resolve the Extruct executable name from the EXTRUCT_COMMAND environment
# variable, falling back to 'extruct', and trim surrounding whitespace.
extruct = (ENV['EXTRUCT_COMMAND'] || 'extruct').strip

# Refuse to continue when the value contains shell metacharacters, embedded
# whitespace, or the word "echo" (case-insensitive).
if extruct.match?(/[&|;`$\s]/) || extruct.match?(/echo/i)
  abort 'The Extruct command appears to be subject to command injection. I will not continue'
end

EXTRUCT_COMMAND = extruct
|
92
92
|
|
93
|
-
rdf_command = CONFIG.dig(:rdf, :command)
|
94
|
-
rdf_command
|
93
|
+
# rdf_command = CONFIG.dig(:rdf, :command)
|
94
|
+
# Resolve the RDF distiller executable from the RDF_COMMAND environment
# variable, falling back to 'rdf'.
#
# The stripped value is assigned back: a bare `rdf_command.strip` discards its
# result, which would leave e.g. a trailing newline in place and make the
# whitespace test below reject an otherwise valid command.
rdf_command = (ENV['RDF_COMMAND'] || 'rdf').strip

if rdf_command.match?(/[&|;`$\s]/) || rdf_command.match?(/echo/i)
  # Shell metacharacters, embedded whitespace or "echo" are treated as an
  # attempted command injection.
  abort 'The RDF command appears to be subject to command injection. I will not continue'
elsif !rdf_command.end_with?('rdf')
  # This rule is tested directly on the command. Written as
  # `when !(/rdf$/ =~ $_)` inside `case rdf_command` it can never fire: the
  # `when` operand is a boolean compared to the string with `===`, and `$_`
  # is the last line read by `gets`, not the command.
  abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
end

RDF_COMMAND = rdf_command
|
105
105
|
|
106
|
-
tika_command = CONFIG.dig(:tika, :command)
|
107
|
-
tika_command
|
106
|
+
# tika_command = CONFIG.dig(:tika, :command)
|
107
|
+
# Tika endpoint: taken from the TIKA_COMMAND environment variable when it is
# set, otherwise the localhost URL below is used.
tika_command = ENV.fetch('TIKA_COMMAND', 'http://localhost:9998/meta')
TIKA_COMMAND = tika_command
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module HarvesterTools
  # Error class for the HarvesterTools namespace (not raised by the code in
  # this module definition).
  class Error < StandardError
  end

  # "Brute force" harvesting: resolve a GUID's URL with different HTTP Accept
  # headers and pass every successful response to
  # HarvesterTools::MetadataHarvester.extract_metadata_from_body.
  class BruteForce
    # Entry point for brute-force harvesting.
    #
    # @param guid [String] identifier, converted to a URL via
    #   HarvesterTools::Utils.convertToURL
    # @param metadata [HarvesterTools::MetadataObject] accumulator that is
    #   modified in place; a fresh one is created when omitted
    # @return [HarvesterTools::MetadataObject, false] the metadata object, or
    #   false when convertToURL does not return a type for the GUID
    def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
      type, url = HarvesterTools::Utils.convertToURL(guid: guid)
      return false unless type

      do_content_negotiation(url: url, metadata: metadata)
      metadata
    end

    # Fetches +url+ up to three times and extracts metadata from each response
    # that was obtained:
    # 1. +url+ with ACCEPT_ALL_HEADER
    # 2. +url+ with ACCEPT_STAR_HEADER
    # 3. the final request URL of step 2 with ACCEPT_ALL_HEADER (only when
    #    step 2 produced a response)
    #
    # @param url [String] URL to resolve
    # @param metadata [HarvesterTools::MetadataObject] accumulator, modified in place
    def self.do_content_negotiation(url:, metadata:)
      response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
      if response
        HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
      end
      response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
      if response
        # extract from the landing page
        HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
        # now do content negotiation on the landing page's own URL
        response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
        if response
          # extract from the content-negotiated representation
          HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
        end
      end
    end

    # Fetches +url+ through HarvesterTools::WebUtils.fspfetch and records the
    # outcome on +metadata+ (comments, full_response, and a '001' warning on
    # failure).
    #
    # @param url [String] URL to fetch
    # @param method [Symbol] HTTP method handed to fspfetch
    # @param nolinkheaders [Boolean] accepted for interface compatibility; not
    #   used by this method
    # @param headers [Object] request headers handed to fspfetch
    # @param metadata [HarvesterTools::MetadataObject] accumulator, modified in place
    # @return [Object, false] the response returned by fspfetch, or false when
    #   fspfetch returned nothing
    def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
      # The metadata object is used directly; it is not stored in a class-level
      # instance variable, so no state is carried over between calls.
      metadata.guidtype = 'uri' if metadata.guidtype.nil?
      warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
      response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)

      unless response
        metadata.add_warning(['001', url, headers])
        metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
        metadata.full_response << [url, "No response"]
        # Explicit return: a bare `false` here would be discarded and execution
        # would continue into `response.body` on a nil response.
        return false
      end

      warn "\n\n head #{response.headers.inspect}\n\n"
      metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}. Using the output from this URL for the next few tests..."
      metadata.full_response << [url, response.body]
      response
    end
  end
end
|
data/lib/harvester_utils.rb
CHANGED
@@ -4,8 +4,8 @@ module HarvesterTools
|
|
4
4
|
|
5
5
|
class Utils
|
6
6
|
|
7
|
-
def self.resolve_guid(guid:)
|
8
|
-
@meta =
|
7
|
+
def self.resolve_guid(guid:, metadata: HarvesterTools::MetadataObject.new)
|
8
|
+
@meta = metadata
|
9
9
|
@meta.all_uris = [guid]
|
10
10
|
type, url = convertToURL(guid: guid)
|
11
11
|
links = Array.new
|
data/lib/metadata_object.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module HarvesterTools
|
2
2
|
class MetadataObject
|
3
|
-
attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
3
|
+
attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :guid, :score # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
4
4
|
|
5
5
|
def initialize(_params = {}) # get a name from the "new" call, or set a default
|
6
6
|
@hash = {}
|
@@ -10,6 +10,8 @@ module HarvesterTools
|
|
10
10
|
@full_response = []
|
11
11
|
@links = []
|
12
12
|
@all_uris = []
|
13
|
+
@guid = ""
|
14
|
+
@score = 0
|
13
15
|
w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
|
14
16
|
#@warn = File.read("./lib/warnings.json")
|
15
17
|
@warn = JSON.parse(w)
|
File without changes
|
File without changes
|
data/lib/warnings.json
CHANGED
@@ -115,5 +115,12 @@
|
|
115
115
|
{"Documentation": "http://dx.doi.org/10.17487/RFC8259"},
|
116
116
|
{"Validator": "https://jsononline.net/json-validator"}],
|
117
117
|
"severity": "WARN"
|
118
|
+
},
|
119
|
+
"600": {
|
120
|
+
"message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
|
121
|
+
"linkout": [],
|
122
|
+
"severity": "FAILURE"
|
118
123
|
}
|
124
|
+
|
125
|
+
|
119
126
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fsp_harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -182,18 +182,19 @@ files:
|
|
182
182
|
- bin/setup
|
183
183
|
- example_test.rb
|
184
184
|
- launch.json
|
185
|
-
- lib/config.conf
|
186
|
-
- lib/config.conf_docker
|
187
|
-
- lib/config.conf_local
|
188
185
|
- lib/constants.rb
|
189
186
|
- lib/external_tools.rb
|
190
187
|
- lib/fsp_harvester.rb
|
191
188
|
- lib/fsp_harvester/version.rb
|
192
189
|
- lib/harvester.rb
|
190
|
+
- lib/harvester_brute.rb
|
193
191
|
- lib/harvester_utils.rb
|
194
192
|
- lib/metadata_harvester.rb
|
195
193
|
- lib/metadata_object.rb
|
196
194
|
- lib/metadata_parser.rb
|
195
|
+
- lib/obselete_config.conf
|
196
|
+
- lib/obselete_config.conf_docker
|
197
|
+
- lib/obselete_config.conf_local
|
197
198
|
- lib/signposting_tests.rb
|
198
199
|
- lib/warnings.json
|
199
200
|
- lib/web_utils.rb
|