fsp_harvester 0.1.14 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec_status +11 -10
- data/Gemfile.lock +1 -1
- data/README.md +33 -0
- data/lib/constants.rb +11 -11
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/harvester_brute.rb +48 -0
- data/lib/harvester_utils.rb +2 -2
- data/lib/metadata_object.rb +3 -1
- data/lib/{config.conf_docker → obselete_config.conf} +0 -0
- data/lib/{config.conf_local → obselete_config.conf_docker} +1 -1
- data/lib/{config.conf → obselete_config.conf_local} +0 -0
- data/lib/warnings.json +7 -0
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c3f2b3409b575db21edc69a7e0e4bfbf5be09734fbcc4f4b0d5accb5fedad6c2
|
4
|
+
data.tar.gz: 27813b4e090515a869d5fbc519a717eb06e7bc0559e42d33d1284093758f229f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 33aeec82ef754f219db35eba08e68e35f0aecc570b2769f943f073cf1c4b5a9cdfed912c7fc40261797e4d3b00d7f668228aa5cd442eec213c272455aab1a275
|
7
|
+
data.tar.gz: 80f03c769794a7bf8054d95c667274a3a3ddde26242b0969e15511f78b74cdd5d20508ed7c36e47a446012ff021f0f3ad5a9aa4a44dc91d07c8c86d0da35c59e
|
data/.rspec_status
CHANGED
@@ -40,16 +40,17 @@ example_id | status | run_time |
|
|
40
40
|
./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
|
41
41
|
./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
|
42
42
|
./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] |
|
45
|
-
./spec/
|
46
|
-
./spec/item_spec.rb[1:1:
|
47
|
-
./spec/item_spec.rb[1:1:
|
48
|
-
./spec/item_spec.rb[1:1:
|
49
|
-
./spec/item_spec.rb[1:1:
|
50
|
-
./spec/item_spec.rb[1:1:
|
51
|
-
./spec/item_spec.rb[1:1:
|
52
|
-
./spec/item_spec.rb[1:1:
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00693 seconds |
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | passed | 3.65 seconds |
|
45
|
+
./spec/fsp_harvester_spec.rb[1:3] | passed | 9.96 seconds |
|
46
|
+
./spec/item_spec.rb[1:1:1] | passed | 3.19 seconds |
|
47
|
+
./spec/item_spec.rb[1:1:2] | passed | 2.81 seconds |
|
48
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.27 seconds |
|
49
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.76 seconds |
|
50
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.43 seconds |
|
51
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.23 seconds |
|
52
|
+
./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
|
53
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.52517 seconds |
|
53
54
|
./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
|
54
55
|
./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
|
55
56
|
./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -20,6 +20,39 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
+
```
|
24
|
+
require 'fsp_harvester'
|
25
|
+
|
26
|
+
ENV['EXTRUCT_COMMAND'] = "extruct"
|
27
|
+
ENV['RDF_COMMAND'] = "/home/user/.rvm/gems/ruby-3.0.0/bin/rdf" # kelloggs distiller
|
28
|
+
ENV['TIKA_COMMAND'] = "http://localhost:9998/meta" # assumes using the docker version of tika
|
29
|
+
|
30
|
+
# to only follow the FAIR signposting specification:
|
31
|
+
links, metadata = HarvesterTools::Utils.resolve_guid(guid: guid)
|
32
|
+
|
33
|
+
links.each do |link|
|
34
|
+
puts link.href
|
35
|
+
puts link.relation
|
36
|
+
end
|
37
|
+
|
38
|
+
# note, you don't need to catch the return value here. The metadata object that is passed in will be modified
|
39
|
+
metadata = FspHarvester::Utils.gather_metadata_from_describedby_links(links: links, metadata: metadata)
|
40
|
+
|
41
|
+
linkeddata = metadata.graph
|
42
|
+
hashdata = metadata.hash
|
43
|
+
comments = metadata.comments
|
44
|
+
warnings = metadata.warnings
|
45
|
+
|
46
|
+
# if you want to try other things like content negotiation and "scraping" from HTML, do this:
|
47
|
+
# note, you don't need to catch the return value here. The metadata object that is passed in will be modified
|
48
|
+
metadata = HarvesterTools::BruteForce.begin_brute_force(guid: guid, metadata: metadata)
|
49
|
+
|
50
|
+
linkeddata = metadata.graph
|
51
|
+
hashdata = metadata.hash
|
52
|
+
comments = metadata.comments
|
53
|
+
warnings = metadata.warnings
|
54
|
+
|
55
|
+
```
|
23
56
|
|
24
57
|
|
25
58
|
## Development
|
data/lib/constants.rb
CHANGED
@@ -78,31 +78,31 @@ GUID_TYPES = {
|
|
78
78
|
'ark' => Regexp.new(%r{^ark:/[^\s]+$})
|
79
79
|
}
|
80
80
|
|
81
|
-
CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
82
|
-
extruct = CONFIG.dig(:extruct, :command)
|
83
|
-
extruct
|
81
|
+
# CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
82
|
+
# extruct = CONFIG.dig(:extruct, :command)
|
83
|
+
extruct = ENV['EXTRUCT_COMMAND'] || 'extruct'
|
84
84
|
extruct.strip!
|
85
85
|
case extruct
|
86
86
|
when /[&|;`$\s]/
|
87
|
-
abort 'The Extruct command
|
87
|
+
abort 'The Extruct command appears to be subject to command injection. I will not continue'
|
88
88
|
when /echo/i
|
89
|
-
abort 'The Extruct command
|
89
|
+
abort 'The Extruct command appears to be subject to command injection. I will not continue'
|
90
90
|
end
|
91
91
|
EXTRUCT_COMMAND = extruct
|
92
92
|
|
93
|
-
rdf_command = CONFIG.dig(:rdf, :command)
|
94
|
-
rdf_command
|
93
|
+
# rdf_command = CONFIG.dig(:rdf, :command)
|
94
|
+
rdf_command = ENV['RDF_COMMAND'] || 'rdf'
|
95
95
|
rdf_command.strip
|
96
96
|
case rdf_command
|
97
97
|
when /[&|;`$\s]/
|
98
|
-
abort 'The RDF command
|
98
|
+
abort 'The RDF command appears to be subject to command injection. I will not continue'
|
99
99
|
when /echo/i
|
100
|
-
abort 'The RDF command
|
100
|
+
abort 'The RDF command appears to be subject to command injection. I will not continue'
|
101
101
|
when !(/rdf$/ =~ $_)
|
102
102
|
abort "this software requires that Kelloggs Distiller tool is used. The distiller command must end in 'rdf'"
|
103
103
|
end
|
104
104
|
RDF_COMMAND = rdf_command
|
105
105
|
|
106
|
-
tika_command = CONFIG.dig(:tika, :command)
|
107
|
-
tika_command
|
106
|
+
# tika_command = CONFIG.dig(:tika, :command)
|
107
|
+
tika_command = ENV['TIKA_COMMAND'] || 'http://localhost:9998/meta'
|
108
108
|
TIKA_COMMAND = tika_command
|
data/lib/harvester_brute.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
module HarvesterTools
|
2
|
+
class Error < StandardError
|
3
|
+
end
|
4
|
+
|
5
|
+
class BruteForce
|
6
|
+
def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
|
7
|
+
type, url = HarvesterTools::Utils.convertToURL(guid: guid)
|
8
|
+
return false unless type
|
9
|
+
|
10
|
+
do_content_negotiation(url: url, metadata: metadata)
|
11
|
+
metadata
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.do_content_negotiation(url:, metadata:)
|
15
|
+
response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
|
16
|
+
if response
|
17
|
+
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
|
18
|
+
end
|
19
|
+
response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
|
20
|
+
if response
|
21
|
+
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
|
22
|
+
response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
|
23
|
+
if response
|
24
|
+
HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
|
30
|
+
@meta = metadata
|
31
|
+
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
32
|
+
warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
|
33
|
+
response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
|
34
|
+
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
35
|
+
|
36
|
+
unless response
|
37
|
+
@meta.add_warning(['001', url, headers])
|
38
|
+
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
|
39
|
+
@meta.full_response << [url, "No response"]
|
40
|
+
false
|
41
|
+
end
|
42
|
+
|
43
|
+
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
44
|
+
@meta.full_response << [url, response.body]
|
45
|
+
response
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/lib/harvester_utils.rb
CHANGED
@@ -4,8 +4,8 @@ module HarvesterTools
|
|
4
4
|
|
5
5
|
class Utils
|
6
6
|
|
7
|
-
def self.resolve_guid(guid:)
|
8
|
-
@meta =
|
7
|
+
def self.resolve_guid(guid:, metadata: HarvesterTools::MetadataObject.new)
|
8
|
+
@meta = metadata
|
9
9
|
@meta.all_uris = [guid]
|
10
10
|
type, url = convertToURL(guid: guid)
|
11
11
|
links = Array.new
|
data/lib/metadata_object.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module HarvesterTools
|
2
2
|
class MetadataObject
|
3
|
-
attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
3
|
+
attr_accessor :hash, :graph, :comments, :links, :warnings, :guidtype, :full_response, :all_uris, :guid, :score # a hash of metadata # a RDF.rb graph of metadata # an array of comments # the type of GUID that was detected # will be an array of Net::HTTP::Response
|
4
4
|
|
5
5
|
def initialize(_params = {}) # get a name from the "new" call, or set a default
|
6
6
|
@hash = {}
|
@@ -10,6 +10,8 @@ module HarvesterTools
|
|
10
10
|
@full_response = []
|
11
11
|
@links = []
|
12
12
|
@all_uris = []
|
13
|
+
@guid = ""
|
14
|
+
@score = 0
|
13
15
|
w = RestClient.get("https://raw.githubusercontent.com/markwilkinson/FAIR-Signposting-Harvester/master/lib/warnings.json")
|
14
16
|
#@warn = File.read("./lib/warnings.json")
|
15
17
|
@warn = JSON.parse(w)
|
File without changes
|
File without changes
|
data/lib/warnings.json
CHANGED
@@ -115,5 +115,12 @@
|
|
115
115
|
{"Documentation": "http://dx.doi.org/10.17487/RFC8259"},
|
116
116
|
{"Validator": "https://jsononline.net/json-validator"}],
|
117
117
|
"severity": "WARN"
|
118
|
+
},
|
119
|
+
"600": {
|
120
|
+
"message": "Data identifier cannot be unambiguously determined, therefore cannot be tested against known persistent identifier schemas",
|
121
|
+
"linkout": [],
|
122
|
+
"severity": "FAILURE"
|
118
123
|
}
|
124
|
+
|
125
|
+
|
119
126
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fsp_harvester
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.14
|
4
|
+
version: 0.1.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mark Wilkinson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-08-
|
11
|
+
date: 2022-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -182,18 +182,19 @@ files:
|
|
182
182
|
- bin/setup
|
183
183
|
- example_test.rb
|
184
184
|
- launch.json
|
185
|
-
- lib/config.conf
|
186
|
-
- lib/config.conf_docker
|
187
|
-
- lib/config.conf_local
|
188
185
|
- lib/constants.rb
|
189
186
|
- lib/external_tools.rb
|
190
187
|
- lib/fsp_harvester.rb
|
191
188
|
- lib/fsp_harvester/version.rb
|
192
189
|
- lib/harvester.rb
|
190
|
+
- lib/harvester_brute.rb
|
193
191
|
- lib/harvester_utils.rb
|
194
192
|
- lib/metadata_harvester.rb
|
195
193
|
- lib/metadata_object.rb
|
196
194
|
- lib/metadata_parser.rb
|
195
|
+
- lib/obselete_config.conf
|
196
|
+
- lib/obselete_config.conf_docker
|
197
|
+
- lib/obselete_config.conf_local
|
197
198
|
- lib/signposting_tests.rb
|
198
199
|
- lib/warnings.json
|
199
200
|
- lib/web_utils.rb
|