fsp_harvester 0.1.11 → 0.1.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec_status +51 -51
- data/Gemfile.lock +34 -26
- data/lib/config.conf +8 -0
- data/lib/constants.rb +8 -5
- data/lib/{fsp_metadata_external_tools.rb → external_tools.rb} +17 -15
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +8 -106
- data/lib/harvester.rb +28 -0
- data/lib/harvester_utils.rb +78 -0
- data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb} +51 -33
- data/lib/metadata_object.rb +1 -1
- data/lib/{fsp_metadata_parser.rb → metadata_parser.rb} +28 -13
- data/lib/signposting_tests.rb +9 -6
- data/lib/warnings.json +33 -21
- data/lib/web_utils.rb +7 -7
- metadata +10 -8
- data/lib/swagger.rb +0 -224
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e285f00da696d7e39d80df794be9524af6e63ea01deb4e73f6c30b3694c016ff
|
4
|
+
data.tar.gz: fb81b5c1c0fac3bb22e078663025855e5accdb355db1811a4687fb1bca54bc61
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 194132eb78246291a3cb96566ca6a283841a0427afcd6a6abb79c590dbc2c54108e3e8cfef9e4802a77008f1a4c9c94ea7862987e81ce1b4b97cd1fdaf25ca23
|
7
|
+
data.tar.gz: 9765647726c2bfcd7e790ba11929d257610672bc92d8d11756824432e90db4c05036b2cfcede1a55da95f1e74b9e87fd078c78284c356897a5bdc0a17593a3a1
|
data/.rspec_status
CHANGED
@@ -1,55 +1,55 @@
|
|
1
1
|
example_id | status | run_time |
|
2
2
|
---------------------------------- | ------ | --------------- |
|
3
|
-
./spec/cite-as_spec.rb[1:1:1] | passed | 1.
|
4
|
-
./spec/cite-as_spec.rb[1:1:2] | passed | 1.
|
5
|
-
./spec/cite-as_spec.rb[1:1:3] | passed | 1.
|
6
|
-
./spec/cite-as_spec.rb[1:1:4] | passed | 1.
|
7
|
-
./spec/cite-as_spec.rb[1:1:5] | passed | 2.
|
8
|
-
./spec/cite-as_spec.rb[1:1:6] | passed | 2.
|
9
|
-
./spec/cite-as_spec.rb[1:1:7] | passed |
|
10
|
-
./spec/cite-as_spec.rb[1:1:8] | passed | 2.
|
11
|
-
./spec/cite-as_spec.rb[1:1:9] | passed | 2.
|
12
|
-
./spec/cite-as_spec.rb[1:1:10] | passed | 2.
|
13
|
-
./spec/cite-as_spec.rb[1:1:11] | passed | 3.
|
14
|
-
./spec/cite-as_spec.rb[1:1:12] | passed | 2.
|
15
|
-
./spec/cite-as_spec.rb[1:1:13] | passed | 2.
|
16
|
-
./spec/cite-as_spec.rb[1:1:14] | passed | 2.
|
17
|
-
./spec/cite-as_spec.rb[1:1:15] | passed | 1.
|
18
|
-
./spec/cite-as_spec.rb[1:1:16] | passed | 1.
|
19
|
-
./spec/cite-as_spec.rb[1:1:17] | passed | 1.
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.66 seconds |
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 1.13 seconds |
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 1.08 seconds |
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 1.68 seconds |
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 2.86 seconds |
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 2.11 seconds |
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 3.07 seconds |
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 2.13 seconds |
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 2.73 seconds |
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 2.64 seconds |
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 3.36 seconds |
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.26 seconds |
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 2.9 seconds |
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 2.31 seconds |
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.47 seconds |
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.22 seconds |
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 1.23 seconds |
|
20
20
|
./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
|
21
21
|
./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
|
22
|
-
./spec/cite-as_spec.rb[1:1:20] | passed | 1.
|
23
|
-
./spec/cite-as_spec.rb[1:1:21] | passed | 2.
|
24
|
-
./spec/cite-as_spec.rb[1:1:22] | passed | 1.
|
25
|
-
./spec/cite-as_spec.rb[1:1:23] | passed | 1.
|
26
|
-
./spec/cite-as_spec.rb[1:1:24] | failed | 1.
|
27
|
-
./spec/cite-as_spec.rb[1:1:25] | passed | 0.
|
28
|
-
./spec/describedby_spec.rb[1:1:1] | passed |
|
29
|
-
./spec/describedby_spec.rb[1:1:2] | passed |
|
30
|
-
./spec/describedby_spec.rb[1:1:3] | passed |
|
31
|
-
./spec/describedby_spec.rb[1:1:4] | passed |
|
32
|
-
./spec/describedby_spec.rb[1:1:5] | passed | 1
|
33
|
-
./spec/describedby_spec.rb[1:1:6] | passed |
|
34
|
-
./spec/describedby_spec.rb[1:1:7] | passed |
|
35
|
-
./spec/describedby_spec.rb[1:1:8] | passed |
|
36
|
-
./spec/describedby_spec.rb[1:1:9] | passed |
|
37
|
-
./spec/describedby_spec.rb[1:1:10] | passed |
|
38
|
-
./spec/describedby_spec.rb[1:1:11] | passed | 2.
|
39
|
-
./spec/describedby_spec.rb[1:1:12] | passed | 2.
|
40
|
-
./spec/describedby_spec.rb[1:1:13] | passed | 1.
|
41
|
-
./spec/describedby_spec.rb[1:1:14] | passed |
|
42
|
-
./spec/describedby_spec.rb[1:1:15] | passed |
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] | failed |
|
45
|
-
./spec/item_spec.rb[1:1:1] | passed | 2.
|
46
|
-
./spec/item_spec.rb[1:1:2] | passed |
|
47
|
-
./spec/item_spec.rb[1:1:3] | passed |
|
48
|
-
./spec/item_spec.rb[1:1:4] | passed | 1.
|
49
|
-
./spec/item_spec.rb[1:1:5] | passed |
|
50
|
-
./spec/item_spec.rb[1:1:6] | passed |
|
51
|
-
./spec/item_spec.rb[1:1:7] | passed | 2.
|
52
|
-
./spec/item_spec.rb[1:1:8] | passed | 0.
|
53
|
-
./spec/type_spec.rb[1:1:1] | passed |
|
54
|
-
./spec/type_spec.rb[1:1:2] | passed |
|
55
|
-
./spec/type_spec.rb[1:1:3] | passed |
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 2.5 seconds |
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.54 seconds |
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.25 seconds |
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.35 seconds |
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.50811 seconds |
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 3.45 seconds |
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.3 seconds |
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.22 seconds |
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.15 seconds |
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 1.12 seconds |
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.44 seconds |
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 2.15 seconds |
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 2.19 seconds |
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 2.98 seconds |
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 2.87 seconds |
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
|
45
|
+
./spec/item_spec.rb[1:1:1] | passed | 2.94 seconds |
|
46
|
+
./spec/item_spec.rb[1:1:2] | passed | 3 seconds |
|
47
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.35 seconds |
|
48
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.83 seconds |
|
49
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.26 seconds |
|
50
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.17 seconds |
|
51
|
+
./spec/item_spec.rb[1:1:7] | passed | 2.8 seconds |
|
52
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.52869 seconds |
|
53
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
|
54
|
+
./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
|
55
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fsp_harvester (0.1.
|
4
|
+
fsp_harvester (0.1.14)
|
5
5
|
json (~> 2.0)
|
6
6
|
linkeddata (~> 3.2)
|
7
|
-
linkheaders-processor (~> 0.1.
|
7
|
+
linkheaders-processor (~> 0.1.17)
|
8
8
|
metainspector (~> 5.11.2)
|
9
9
|
parseconfig (~> 1.1)
|
10
10
|
rake (~> 13.0)
|
@@ -56,7 +56,7 @@ GEM
|
|
56
56
|
faraday-encoding (0.0.5)
|
57
57
|
faraday
|
58
58
|
faraday-excon (1.1.0)
|
59
|
-
faraday-http-cache (2.4.
|
59
|
+
faraday-http-cache (2.4.1)
|
60
60
|
faraday (>= 0.8)
|
61
61
|
faraday-httpclient (1.0.1)
|
62
62
|
faraday-multipart (1.0.4)
|
@@ -99,34 +99,35 @@ GEM
|
|
99
99
|
sparql (~> 3.2)
|
100
100
|
sxp (~> 1.2)
|
101
101
|
link_header (0.0.8)
|
102
|
-
linkeddata (3.2.
|
103
|
-
json-ld (~> 3.2)
|
102
|
+
linkeddata (3.2.1)
|
103
|
+
json-ld (~> 3.2, >= 3.2.3)
|
104
104
|
json-ld-preloaded (~> 3.2)
|
105
105
|
ld-patch (~> 3.2)
|
106
|
-
nokogiri (~> 1.
|
107
|
-
rdf (~> 3.2)
|
108
|
-
rdf-aggregate-repo (~> 3.2)
|
106
|
+
nokogiri (~> 1.13, >= 1.13.8)
|
107
|
+
rdf (~> 3.2, >= 3.2.9)
|
108
|
+
rdf-aggregate-repo (~> 3.2, >= 3.2.1)
|
109
109
|
rdf-hamster-repo (~> 3.2)
|
110
|
-
rdf-isomorphic (~> 3.2)
|
110
|
+
rdf-isomorphic (~> 3.2, >= 3.2.1)
|
111
111
|
rdf-json (~> 3.2)
|
112
|
-
rdf-microdata (~> 3.2)
|
113
|
-
rdf-n3 (~> 3.2)
|
112
|
+
rdf-microdata (~> 3.2, >= 3.2.1)
|
113
|
+
rdf-n3 (~> 3.2, >= 3.2.1)
|
114
114
|
rdf-normalize (~> 0.5)
|
115
|
-
rdf-ordered-repo (~> 3.2)
|
115
|
+
rdf-ordered-repo (~> 3.2, >= 3.2.1)
|
116
116
|
rdf-rdfa (~> 3.2)
|
117
117
|
rdf-rdfxml (~> 3.2)
|
118
118
|
rdf-reasoner (~> 0.8)
|
119
|
-
rdf-tabular (~> 3.2)
|
119
|
+
rdf-tabular (~> 3.2, >= 3.2.1)
|
120
120
|
rdf-trig (~> 3.2)
|
121
121
|
rdf-trix (~> 3.2)
|
122
|
-
rdf-turtle (~> 3.2)
|
123
|
-
rdf-vocab (~> 3.2)
|
124
|
-
rdf-xsd (~> 3.2)
|
125
|
-
shacl (~> 0.2)
|
126
|
-
shex (~> 0.7)
|
127
|
-
sparql (~> 3.2)
|
128
|
-
sparql-client (~> 3.2)
|
129
|
-
|
122
|
+
rdf-turtle (~> 3.2, >= 3.2.1)
|
123
|
+
rdf-vocab (~> 3.2, >= 3.2.1)
|
124
|
+
rdf-xsd (~> 3.2, >= 3.2.1)
|
125
|
+
shacl (~> 0.2, >= 0.2.1)
|
126
|
+
shex (~> 0.7, >= 0.7.1)
|
127
|
+
sparql (~> 3.2, >= 3.2.4)
|
128
|
+
sparql-client (~> 3.2, >= 3.2.1)
|
129
|
+
yaml-ld (~> 0.0)
|
130
|
+
linkheaders-processor (0.1.17)
|
130
131
|
json (~> 2.0)
|
131
132
|
json-ld (~> 3.2)
|
132
133
|
json-ld-preloaded (~> 3.2)
|
@@ -159,8 +160,10 @@ GEM
|
|
159
160
|
racc (~> 1.4)
|
160
161
|
parallel (1.22.1)
|
161
162
|
parseconfig (1.1.2)
|
162
|
-
parser (3.1.2.
|
163
|
+
parser (3.1.2.1)
|
163
164
|
ast (~> 2.4.1)
|
165
|
+
psych (4.0.4)
|
166
|
+
stringio
|
164
167
|
public_suffix (4.0.7)
|
165
168
|
racc (1.6.0)
|
166
169
|
rack (2.2.4)
|
@@ -249,17 +252,17 @@ GEM
|
|
249
252
|
diff-lcs (>= 1.2.0, < 2.0)
|
250
253
|
rspec-support (~> 3.11.0)
|
251
254
|
rspec-support (3.11.0)
|
252
|
-
rubocop (1.
|
255
|
+
rubocop (1.34.1)
|
253
256
|
json (~> 2.3)
|
254
257
|
parallel (~> 1.10)
|
255
|
-
parser (>= 3.1.
|
258
|
+
parser (>= 3.1.2.1)
|
256
259
|
rainbow (>= 2.2.2, < 4.0)
|
257
260
|
regexp_parser (>= 1.8, < 3.0)
|
258
261
|
rexml (>= 3.2.5, < 4.0)
|
259
|
-
rubocop-ast (>= 1.
|
262
|
+
rubocop-ast (>= 1.20.0, < 2.0)
|
260
263
|
ruby-progressbar (~> 1.7)
|
261
264
|
unicode-display_width (>= 1.4.0, < 3.0)
|
262
|
-
rubocop-ast (1.
|
265
|
+
rubocop-ast (1.21.0)
|
263
266
|
parser (>= 3.1.1.0)
|
264
267
|
ruby-progressbar (1.11.0)
|
265
268
|
ruby2_keywords (0.0.5)
|
@@ -291,6 +294,7 @@ GEM
|
|
291
294
|
sparql-client (3.2.1)
|
292
295
|
net-http-persistent (~> 4.0, >= 4.0.1)
|
293
296
|
rdf (~> 3.2, >= 3.2.6)
|
297
|
+
stringio (3.0.2)
|
294
298
|
sxp (1.2.2)
|
295
299
|
matrix
|
296
300
|
rdf (~> 3.2)
|
@@ -303,6 +307,10 @@ GEM
|
|
303
307
|
unicode-types (1.7.0)
|
304
308
|
xml-simple (1.1.9)
|
305
309
|
rexml
|
310
|
+
yaml-ld (0.0.1)
|
311
|
+
json-ld (~> 3.2, >= 3.2.2)
|
312
|
+
psych (~> 4.0)
|
313
|
+
rdf (~> 3.2)
|
306
314
|
|
307
315
|
PLATFORMS
|
308
316
|
x86_64-linux
|
data/lib/config.conf
ADDED
data/lib/constants.rb
CHANGED
@@ -69,11 +69,14 @@ SELF_IDENTIFIER_PREDICATES = [
|
|
69
69
|
'https://schema.org/identifier'
|
70
70
|
]
|
71
71
|
|
72
|
-
GUID_TYPES = {
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
72
|
+
GUID_TYPES = {
|
73
|
+
'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
|
74
|
+
'doi' => Regexp.new(%r{^10.\d{4,9}/[-._;()/:A-Z0-9]+$}i),
|
75
|
+
'handle1' => Regexp.new(%r{^[^/]+/[^/]+$}i),
|
76
|
+
'handle2' => Regexp.new(%r{^\d{4,5}/[-._;()/:A-Z0-9]+$}i), # legacy style 12345/AGB47A
|
77
|
+
'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
|
78
|
+
'ark' => Regexp.new(%r{^ark:/[^\s]+$})
|
79
|
+
}
|
77
80
|
|
78
81
|
CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
79
82
|
extruct = CONFIG.dig(:extruct, :command)
|
@@ -1,12 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
module
|
3
|
+
module HarvesterTools
|
4
4
|
class Error < StandardError
|
5
5
|
end
|
6
6
|
|
7
7
|
class ExternalTools
|
8
8
|
|
9
|
-
def initialize(metadata:
|
9
|
+
def initialize(metadata: HarvesterTools::MetadataObject.new)
|
10
10
|
@meta = metadata
|
11
11
|
end
|
12
12
|
|
@@ -25,10 +25,7 @@ module FspHarvester
|
|
25
25
|
file.rewind
|
26
26
|
|
27
27
|
@meta.comments << "INFO: The message body is being examined by Distiller\n"
|
28
|
-
|
29
|
-
command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
30
|
-
# command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
31
|
-
# command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
|
28
|
+
command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
32
29
|
warn "distiller command: #{command}"
|
33
30
|
result, _stderr, _status = Open3.capture3(command)
|
34
31
|
warn ''
|
@@ -41,12 +38,13 @@ module FspHarvester
|
|
41
38
|
if result !~ /@context/i # failure returns nil
|
42
39
|
@meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
|
43
40
|
@meta.add_warning(['018', '', ''])
|
41
|
+
result = "{}"
|
44
42
|
else
|
45
43
|
@meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
|
46
|
-
parse_rdf(result: result, content_type: "application/ld+json")
|
47
44
|
end
|
48
45
|
@@distillerknown[bhash] = true
|
49
46
|
end
|
47
|
+
result
|
50
48
|
end
|
51
49
|
|
52
50
|
def processs_with_extruct(uri:)
|
@@ -55,6 +53,11 @@ module FspHarvester
|
|
55
53
|
stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
|
56
54
|
warn "open3 status: #{status} #{stdout}"
|
57
55
|
result = stderr # absurd that the output comes over stderr! LOL!
|
56
|
+
jsonld = {}
|
57
|
+
microdata = Hash.new
|
58
|
+
microformat = Hash.new
|
59
|
+
opengraph = Hash.new
|
60
|
+
rdfa = Hash.new
|
58
61
|
|
59
62
|
if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
|
60
63
|
@meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
|
@@ -66,17 +69,16 @@ module FspHarvester
|
|
66
69
|
elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
|
67
70
|
json = JSON.parse result
|
68
71
|
@meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
@meta.merge_hash(json.first) if json.first.is_a? Hash
|
72
|
+
jsonld = json['json-ld'].to_json if json['json-ld'].any?
|
73
|
+
microdata = json['microdata'].first if json['microdata'].any
|
74
|
+
microformat = json['microformat'].first if json['microformat'].any?
|
75
|
+
opengraph = json['opengraph'].first if json['opengraph'].any?
|
76
|
+
rdfa = json['rdfa'].to_json if json['rdfa'].any?
|
77
|
+
# @meta.merge_hash(json.first) if json.first.is_a? Hash
|
77
78
|
else
|
78
79
|
@meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
|
79
80
|
end
|
81
|
+
[jsonld, microdata, microformat, opengraph, rdfa]
|
80
82
|
end
|
81
83
|
end
|
82
84
|
end
|
data/lib/fsp_harvester.rb
CHANGED
@@ -1,121 +1,23 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require_relative 'fsp_harvester/version'
|
4
|
-
require 'json/ld'
|
5
|
-
require 'json/ld/preloaded'
|
6
|
-
require 'json'
|
7
|
-
require 'linkheaders/processor'
|
8
|
-
require 'addressable'
|
9
|
-
require 'tempfile'
|
10
|
-
require 'xmlsimple'
|
11
|
-
require 'nokogiri'
|
12
|
-
require 'parseconfig'
|
13
|
-
require 'rest-client'
|
14
|
-
require 'cgi'
|
15
|
-
require 'digest'
|
16
|
-
require 'open3'
|
17
|
-
require 'metainspector'
|
18
|
-
require 'rdf/xsd'
|
19
|
-
require_relative './metadata_object'
|
20
|
-
require_relative './constants'
|
21
|
-
require_relative './web_utils'
|
22
|
-
require_relative './signposting_tests'
|
23
|
-
require_relative './fsp_metadata_harvester'
|
24
|
-
require_relative './fsp_metadata_parser'
|
25
|
-
|
26
1
|
|
2
|
+
require_relative 'harvester'
|
27
3
|
module FspHarvester
|
28
4
|
class Error < StandardError
|
29
5
|
end
|
30
6
|
|
31
7
|
class Utils
|
32
|
-
# @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
|
33
|
-
# @warnings = JSON.parse(File.read("warnings.json"))
|
34
|
-
|
35
|
-
|
36
|
-
def self.resolve_guid(guid:)
|
37
|
-
@meta = FspHarvester::MetadataObject.new
|
38
|
-
@meta.all_uris = [guid]
|
39
|
-
type, url = convertToURL(guid: guid)
|
40
|
-
links = Array.new
|
41
|
-
if type
|
42
|
-
links = resolve_url(url: url)
|
43
|
-
@meta.links << links
|
44
|
-
else
|
45
|
-
@meta.add_warning(['006', guid, ''])
|
46
|
-
@meta.comments << "FATAL: GUID type not recognized.\n"
|
47
|
-
end
|
48
|
-
[links, @meta]
|
49
|
-
end
|
50
8
|
|
51
|
-
def self.gather_metadata_from_describedby_links(links: [], metadata:
|
9
|
+
def self.gather_metadata_from_describedby_links(links: [], metadata: HarvesterTools::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
|
52
10
|
@meta = metadata
|
53
11
|
db = []
|
54
12
|
links.each do |l|
|
55
13
|
db << l if l.relation == 'describedby'
|
56
14
|
end
|
57
|
-
|
15
|
+
HarvesterTools::MetadataHarvester.extract_metadata_from_links(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
|
58
16
|
@meta
|
59
17
|
end
|
60
18
|
|
61
|
-
def self.
|
62
|
-
|
63
|
-
if k == 'inchi' and regex.match(guid)
|
64
|
-
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
65
|
-
elsif k == 'handle1' and regex.match(guid)
|
66
|
-
return 'handle', "http://hdl.handle.net/#{guid}"
|
67
|
-
elsif k == 'handle2' and regex.match(guid)
|
68
|
-
return 'handle', "http://hdl.handle.net/#{guid}"
|
69
|
-
elsif k == 'uri' and regex.match(guid)
|
70
|
-
return 'uri', guid
|
71
|
-
elsif k == 'doi' and regex.match(guid)
|
72
|
-
return 'doi', "https://doi.org/#{guid}"
|
73
|
-
end
|
74
|
-
end
|
75
|
-
[nil, nil]
|
76
|
-
end
|
77
|
-
|
78
|
-
def self.typeit(guid:)
|
79
|
-
Utils::GUID_TYPES.each do |type, regex|
|
80
|
-
return type if regex.match(guid)
|
81
|
-
end
|
82
|
-
false
|
83
|
-
end
|
84
|
-
|
85
|
-
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
|
86
|
-
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
87
|
-
warn "\n\n FETCHING #{url} #{header}\n\n"
|
88
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
|
89
|
-
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
90
|
-
|
91
|
-
unless response
|
92
|
-
@meta.add_warning(['001', url, header])
|
93
|
-
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
94
|
-
return []
|
95
|
-
end
|
96
|
-
|
97
|
-
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
98
|
-
@meta.full_response << response.body
|
99
|
-
|
100
|
-
links = process_link_headers(response: response) unless nolinkheaders
|
101
|
-
links
|
102
|
-
end
|
103
|
-
|
104
|
-
def self.process_link_headers(response:)
|
105
|
-
warn "\n\n parsing #{response.headers}\n\n"
|
106
|
-
|
107
|
-
parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
|
108
|
-
parser.extract_and_parse(response: response)
|
109
|
-
factory = parser.factory # LinkHeaders::LinkFactory
|
110
|
-
|
111
|
-
warn "\n\n length bfore #{factory.all_links.length}\n\n"
|
112
|
-
signpostingcheck(factory: factory)
|
113
|
-
warn "\n\n length aftr #{factory.all_links.length}\n\n"
|
114
|
-
warn "\n\n links #{factory.all_links}\n\n"
|
115
|
-
factory.all_links
|
116
|
-
end
|
117
|
-
|
118
|
-
def self.signpostingcheck(factory:)
|
19
|
+
def self.signpostingcheck(factory:, metadata: HarvesterTools::MetadataObject.new)
|
20
|
+
@meta = metadata
|
119
21
|
citeas = Array.new
|
120
22
|
describedby = Array.new
|
121
23
|
item = Array.new
|
@@ -134,13 +36,13 @@ module FspHarvester
|
|
134
36
|
end
|
135
37
|
end
|
136
38
|
|
137
|
-
check_describedby_rules(describedby: describedby)
|
138
|
-
check_item_rules(item: item)
|
39
|
+
check_describedby_rules(describedby: describedby, metadata: @meta)
|
40
|
+
check_item_rules(item: item, metadata: @meta)
|
139
41
|
|
140
42
|
if citeas.length > 1
|
141
43
|
warn "INFO: multiple cite-as links found. Checking for conflicts\n"
|
142
44
|
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
143
|
-
citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
45
|
+
citeas = check_for_citeas_conflicts(citeas: citeas, metadata: @meta) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
144
46
|
end
|
145
47
|
|
146
48
|
unless citeas.length == 1 && describedby.length > 0
|
data/lib/harvester.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#require_relative 'fsp_harvester/version'
|
4
|
+
require 'json/ld'
|
5
|
+
require 'json/ld/preloaded'
|
6
|
+
require 'json'
|
7
|
+
require 'linkheaders/processor'
|
8
|
+
require 'addressable'
|
9
|
+
require 'tempfile'
|
10
|
+
require 'xmlsimple'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'parseconfig'
|
13
|
+
require 'rest-client'
|
14
|
+
require 'cgi'
|
15
|
+
require 'digest'
|
16
|
+
require 'open3'
|
17
|
+
require 'metainspector'
|
18
|
+
require 'rdf/xsd'
|
19
|
+
require_relative './metadata_object'
|
20
|
+
require_relative './constants'
|
21
|
+
require_relative './web_utils'
|
22
|
+
require_relative './signposting_tests'
|
23
|
+
require_relative './metadata_harvester'
|
24
|
+
require_relative './fsp_harvester'
|
25
|
+
require_relative './harvester_utils'
|
26
|
+
require_relative './harvester_brute'
|
27
|
+
require_relative './external_tools'
|
28
|
+
require_relative './metadata_parser'
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module HarvesterTools
|
2
|
+
class Error < StandardError
|
3
|
+
end
|
4
|
+
|
5
|
+
class Utils
|
6
|
+
|
7
|
+
def self.resolve_guid(guid:)
|
8
|
+
@meta = HarvesterTools::MetadataObject.new
|
9
|
+
@meta.all_uris = [guid]
|
10
|
+
type, url = convertToURL(guid: guid)
|
11
|
+
links = Array.new
|
12
|
+
if type
|
13
|
+
links = resolve_url(url: url, metadata: @meta)
|
14
|
+
@meta.links = @meta.links | links
|
15
|
+
else
|
16
|
+
@meta.add_warning(['006', guid, ''])
|
17
|
+
@meta.comments << "FATAL: GUID type not recognized.\n"
|
18
|
+
end
|
19
|
+
[links, @meta]
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.convertToURL(guid:)
|
23
|
+
GUID_TYPES.each do |k, regex|
|
24
|
+
if k == 'inchi' and regex.match(guid)
|
25
|
+
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
26
|
+
elsif k == 'handle1' and regex.match(guid)
|
27
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
28
|
+
elsif k == 'handle2' and regex.match(guid)
|
29
|
+
return 'handle', "http://hdl.handle.net/#{guid}"
|
30
|
+
elsif k == 'uri' and regex.match(guid)
|
31
|
+
return 'uri', guid
|
32
|
+
elsif k == 'doi' and regex.match(guid)
|
33
|
+
return 'doi', "https://doi.org/#{guid}"
|
34
|
+
elsif k == 'ark' and regex.match(guid)
|
35
|
+
return 'ark', "https://n2t.net/#{guid}"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
[nil, nil]
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.typeit(guid:)
|
42
|
+
GUID_TYPES.each do |type, regex|
|
43
|
+
return type if regex.match(guid)
|
44
|
+
end
|
45
|
+
false
|
46
|
+
end
|
47
|
+
|
48
|
+
def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
|
49
|
+
@meta = metadata
|
50
|
+
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
51
|
+
warn "\n\n FETCHING #{url} #{header}\n\n"
|
52
|
+
response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
|
53
|
+
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
54
|
+
|
55
|
+
unless response
|
56
|
+
@meta.add_warning(['001', url, header])
|
57
|
+
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
58
|
+
return []
|
59
|
+
end
|
60
|
+
|
61
|
+
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
62
|
+
@meta.full_response << response.body
|
63
|
+
|
64
|
+
links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
|
65
|
+
links
|
66
|
+
end
|
67
|
+
|
68
|
+
def self.process_link_headers(response:, metadata:)
|
69
|
+
warn "\n\n parsing #{response.headers}\n\n"
|
70
|
+
|
71
|
+
parser = LinkHeaders::Processor.new(default_anchor: metadata.all_uris.last)
|
72
|
+
parser.extract_and_parse(response: response)
|
73
|
+
factory = parser.factory # LinkHeaders::LinkFactory
|
74
|
+
FspHarvester::Utils.signpostingcheck(factory: factory, metadata: metadata)
|
75
|
+
factory.all_links
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|