fsp_harvester 0.1.11 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec_status +51 -51
- data/Gemfile.lock +34 -26
- data/lib/config.conf +8 -0
- data/lib/constants.rb +8 -5
- data/lib/{fsp_metadata_external_tools.rb → external_tools.rb} +17 -15
- data/lib/fsp_harvester/version.rb +1 -1
- data/lib/fsp_harvester.rb +8 -106
- data/lib/harvester.rb +28 -0
- data/lib/harvester_utils.rb +78 -0
- data/lib/{fsp_metadata_harvester.rb → metadata_harvester.rb} +51 -33
- data/lib/metadata_object.rb +1 -1
- data/lib/{fsp_metadata_parser.rb → metadata_parser.rb} +28 -13
- data/lib/signposting_tests.rb +9 -6
- data/lib/warnings.json +33 -21
- data/lib/web_utils.rb +7 -7
- metadata +10 -8
- data/lib/swagger.rb +0 -224
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e285f00da696d7e39d80df794be9524af6e63ea01deb4e73f6c30b3694c016ff
|
4
|
+
data.tar.gz: fb81b5c1c0fac3bb22e078663025855e5accdb355db1811a4687fb1bca54bc61
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 194132eb78246291a3cb96566ca6a283841a0427afcd6a6abb79c590dbc2c54108e3e8cfef9e4802a77008f1a4c9c94ea7862987e81ce1b4b97cd1fdaf25ca23
|
7
|
+
data.tar.gz: 9765647726c2bfcd7e790ba11929d257610672bc92d8d11756824432e90db4c05036b2cfcede1a55da95f1e74b9e87fd078c78284c356897a5bdc0a17593a3a1
|
data/.rspec_status
CHANGED
@@ -1,55 +1,55 @@
|
|
1
1
|
example_id | status | run_time |
|
2
2
|
---------------------------------- | ------ | --------------- |
|
3
|
-
./spec/cite-as_spec.rb[1:1:1] | passed | 1.
|
4
|
-
./spec/cite-as_spec.rb[1:1:2] | passed | 1.
|
5
|
-
./spec/cite-as_spec.rb[1:1:3] | passed | 1.
|
6
|
-
./spec/cite-as_spec.rb[1:1:4] | passed | 1.
|
7
|
-
./spec/cite-as_spec.rb[1:1:5] | passed | 2.
|
8
|
-
./spec/cite-as_spec.rb[1:1:6] | passed | 2.
|
9
|
-
./spec/cite-as_spec.rb[1:1:7] | passed |
|
10
|
-
./spec/cite-as_spec.rb[1:1:8] | passed | 2.
|
11
|
-
./spec/cite-as_spec.rb[1:1:9] | passed | 2.
|
12
|
-
./spec/cite-as_spec.rb[1:1:10] | passed | 2.
|
13
|
-
./spec/cite-as_spec.rb[1:1:11] | passed | 3.
|
14
|
-
./spec/cite-as_spec.rb[1:1:12] | passed | 2.
|
15
|
-
./spec/cite-as_spec.rb[1:1:13] | passed | 2.
|
16
|
-
./spec/cite-as_spec.rb[1:1:14] | passed | 2.
|
17
|
-
./spec/cite-as_spec.rb[1:1:15] | passed | 1.
|
18
|
-
./spec/cite-as_spec.rb[1:1:16] | passed | 1.
|
19
|
-
./spec/cite-as_spec.rb[1:1:17] | passed | 1.
|
3
|
+
./spec/cite-as_spec.rb[1:1:1] | passed | 1.66 seconds |
|
4
|
+
./spec/cite-as_spec.rb[1:1:2] | passed | 1.13 seconds |
|
5
|
+
./spec/cite-as_spec.rb[1:1:3] | passed | 1.08 seconds |
|
6
|
+
./spec/cite-as_spec.rb[1:1:4] | passed | 1.68 seconds |
|
7
|
+
./spec/cite-as_spec.rb[1:1:5] | passed | 2.86 seconds |
|
8
|
+
./spec/cite-as_spec.rb[1:1:6] | passed | 2.11 seconds |
|
9
|
+
./spec/cite-as_spec.rb[1:1:7] | passed | 3.07 seconds |
|
10
|
+
./spec/cite-as_spec.rb[1:1:8] | passed | 2.13 seconds |
|
11
|
+
./spec/cite-as_spec.rb[1:1:9] | passed | 2.73 seconds |
|
12
|
+
./spec/cite-as_spec.rb[1:1:10] | passed | 2.64 seconds |
|
13
|
+
./spec/cite-as_spec.rb[1:1:11] | passed | 3.36 seconds |
|
14
|
+
./spec/cite-as_spec.rb[1:1:12] | passed | 2.26 seconds |
|
15
|
+
./spec/cite-as_spec.rb[1:1:13] | passed | 2.9 seconds |
|
16
|
+
./spec/cite-as_spec.rb[1:1:14] | passed | 2.31 seconds |
|
17
|
+
./spec/cite-as_spec.rb[1:1:15] | passed | 1.47 seconds |
|
18
|
+
./spec/cite-as_spec.rb[1:1:16] | passed | 1.22 seconds |
|
19
|
+
./spec/cite-as_spec.rb[1:1:17] | passed | 1.23 seconds |
|
20
20
|
./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
|
21
21
|
./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
|
22
|
-
./spec/cite-as_spec.rb[1:1:20] | passed | 1.
|
23
|
-
./spec/cite-as_spec.rb[1:1:21] | passed | 2.
|
24
|
-
./spec/cite-as_spec.rb[1:1:22] | passed | 1.
|
25
|
-
./spec/cite-as_spec.rb[1:1:23] | passed | 1.
|
26
|
-
./spec/cite-as_spec.rb[1:1:24] | failed | 1.
|
27
|
-
./spec/cite-as_spec.rb[1:1:25] | passed | 0.
|
28
|
-
./spec/describedby_spec.rb[1:1:1] | passed |
|
29
|
-
./spec/describedby_spec.rb[1:1:2] | passed |
|
30
|
-
./spec/describedby_spec.rb[1:1:3] | passed |
|
31
|
-
./spec/describedby_spec.rb[1:1:4] | passed |
|
32
|
-
./spec/describedby_spec.rb[1:1:5] | passed | 1
|
33
|
-
./spec/describedby_spec.rb[1:1:6] | passed |
|
34
|
-
./spec/describedby_spec.rb[1:1:7] | passed |
|
35
|
-
./spec/describedby_spec.rb[1:1:8] | passed |
|
36
|
-
./spec/describedby_spec.rb[1:1:9] | passed |
|
37
|
-
./spec/describedby_spec.rb[1:1:10] | passed |
|
38
|
-
./spec/describedby_spec.rb[1:1:11] | passed | 2.
|
39
|
-
./spec/describedby_spec.rb[1:1:12] | passed | 2.
|
40
|
-
./spec/describedby_spec.rb[1:1:13] | passed | 1.
|
41
|
-
./spec/describedby_spec.rb[1:1:14] | passed |
|
42
|
-
./spec/describedby_spec.rb[1:1:15] | passed |
|
43
|
-
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.
|
44
|
-
./spec/fsp_harvester_spec.rb[1:2] | failed |
|
45
|
-
./spec/item_spec.rb[1:1:1] | passed | 2.
|
46
|
-
./spec/item_spec.rb[1:1:2] | passed |
|
47
|
-
./spec/item_spec.rb[1:1:3] | passed |
|
48
|
-
./spec/item_spec.rb[1:1:4] | passed | 1.
|
49
|
-
./spec/item_spec.rb[1:1:5] | passed |
|
50
|
-
./spec/item_spec.rb[1:1:6] | passed |
|
51
|
-
./spec/item_spec.rb[1:1:7] | passed | 2.
|
52
|
-
./spec/item_spec.rb[1:1:8] | passed | 0.
|
53
|
-
./spec/type_spec.rb[1:1:1] | passed |
|
54
|
-
./spec/type_spec.rb[1:1:2] | passed |
|
55
|
-
./spec/type_spec.rb[1:1:3] | passed |
|
22
|
+
./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
|
23
|
+
./spec/cite-as_spec.rb[1:1:21] | passed | 2.5 seconds |
|
24
|
+
./spec/cite-as_spec.rb[1:1:22] | passed | 1.54 seconds |
|
25
|
+
./spec/cite-as_spec.rb[1:1:23] | passed | 1.25 seconds |
|
26
|
+
./spec/cite-as_spec.rb[1:1:24] | failed | 1.35 seconds |
|
27
|
+
./spec/cite-as_spec.rb[1:1:25] | passed | 0.50811 seconds |
|
28
|
+
./spec/describedby_spec.rb[1:1:1] | passed | 3.45 seconds |
|
29
|
+
./spec/describedby_spec.rb[1:1:2] | passed | 1.3 seconds |
|
30
|
+
./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
|
31
|
+
./spec/describedby_spec.rb[1:1:4] | passed | 1.22 seconds |
|
32
|
+
./spec/describedby_spec.rb[1:1:5] | passed | 1.15 seconds |
|
33
|
+
./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
|
34
|
+
./spec/describedby_spec.rb[1:1:7] | passed | 1.12 seconds |
|
35
|
+
./spec/describedby_spec.rb[1:1:8] | passed | 2.44 seconds |
|
36
|
+
./spec/describedby_spec.rb[1:1:9] | passed | 2.15 seconds |
|
37
|
+
./spec/describedby_spec.rb[1:1:10] | passed | 2.19 seconds |
|
38
|
+
./spec/describedby_spec.rb[1:1:11] | passed | 2.98 seconds |
|
39
|
+
./spec/describedby_spec.rb[1:1:12] | passed | 2.87 seconds |
|
40
|
+
./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
|
41
|
+
./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
|
42
|
+
./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
|
43
|
+
./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
|
44
|
+
./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
|
45
|
+
./spec/item_spec.rb[1:1:1] | passed | 2.94 seconds |
|
46
|
+
./spec/item_spec.rb[1:1:2] | passed | 3 seconds |
|
47
|
+
./spec/item_spec.rb[1:1:3] | passed | 1.35 seconds |
|
48
|
+
./spec/item_spec.rb[1:1:4] | passed | 1.83 seconds |
|
49
|
+
./spec/item_spec.rb[1:1:5] | passed | 2.26 seconds |
|
50
|
+
./spec/item_spec.rb[1:1:6] | passed | 2.17 seconds |
|
51
|
+
./spec/item_spec.rb[1:1:7] | passed | 2.8 seconds |
|
52
|
+
./spec/item_spec.rb[1:1:8] | passed | 0.52869 seconds |
|
53
|
+
./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
|
54
|
+
./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
|
55
|
+
./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fsp_harvester (0.1.
|
4
|
+
fsp_harvester (0.1.14)
|
5
5
|
json (~> 2.0)
|
6
6
|
linkeddata (~> 3.2)
|
7
|
-
linkheaders-processor (~> 0.1.
|
7
|
+
linkheaders-processor (~> 0.1.17)
|
8
8
|
metainspector (~> 5.11.2)
|
9
9
|
parseconfig (~> 1.1)
|
10
10
|
rake (~> 13.0)
|
@@ -56,7 +56,7 @@ GEM
|
|
56
56
|
faraday-encoding (0.0.5)
|
57
57
|
faraday
|
58
58
|
faraday-excon (1.1.0)
|
59
|
-
faraday-http-cache (2.4.
|
59
|
+
faraday-http-cache (2.4.1)
|
60
60
|
faraday (>= 0.8)
|
61
61
|
faraday-httpclient (1.0.1)
|
62
62
|
faraday-multipart (1.0.4)
|
@@ -99,34 +99,35 @@ GEM
|
|
99
99
|
sparql (~> 3.2)
|
100
100
|
sxp (~> 1.2)
|
101
101
|
link_header (0.0.8)
|
102
|
-
linkeddata (3.2.
|
103
|
-
json-ld (~> 3.2)
|
102
|
+
linkeddata (3.2.1)
|
103
|
+
json-ld (~> 3.2, >= 3.2.3)
|
104
104
|
json-ld-preloaded (~> 3.2)
|
105
105
|
ld-patch (~> 3.2)
|
106
|
-
nokogiri (~> 1.
|
107
|
-
rdf (~> 3.2)
|
108
|
-
rdf-aggregate-repo (~> 3.2)
|
106
|
+
nokogiri (~> 1.13, >= 1.13.8)
|
107
|
+
rdf (~> 3.2, >= 3.2.9)
|
108
|
+
rdf-aggregate-repo (~> 3.2, >= 3.2.1)
|
109
109
|
rdf-hamster-repo (~> 3.2)
|
110
|
-
rdf-isomorphic (~> 3.2)
|
110
|
+
rdf-isomorphic (~> 3.2, >= 3.2.1)
|
111
111
|
rdf-json (~> 3.2)
|
112
|
-
rdf-microdata (~> 3.2)
|
113
|
-
rdf-n3 (~> 3.2)
|
112
|
+
rdf-microdata (~> 3.2, >= 3.2.1)
|
113
|
+
rdf-n3 (~> 3.2, >= 3.2.1)
|
114
114
|
rdf-normalize (~> 0.5)
|
115
|
-
rdf-ordered-repo (~> 3.2)
|
115
|
+
rdf-ordered-repo (~> 3.2, >= 3.2.1)
|
116
116
|
rdf-rdfa (~> 3.2)
|
117
117
|
rdf-rdfxml (~> 3.2)
|
118
118
|
rdf-reasoner (~> 0.8)
|
119
|
-
rdf-tabular (~> 3.2)
|
119
|
+
rdf-tabular (~> 3.2, >= 3.2.1)
|
120
120
|
rdf-trig (~> 3.2)
|
121
121
|
rdf-trix (~> 3.2)
|
122
|
-
rdf-turtle (~> 3.2)
|
123
|
-
rdf-vocab (~> 3.2)
|
124
|
-
rdf-xsd (~> 3.2)
|
125
|
-
shacl (~> 0.2)
|
126
|
-
shex (~> 0.7)
|
127
|
-
sparql (~> 3.2)
|
128
|
-
sparql-client (~> 3.2)
|
129
|
-
|
122
|
+
rdf-turtle (~> 3.2, >= 3.2.1)
|
123
|
+
rdf-vocab (~> 3.2, >= 3.2.1)
|
124
|
+
rdf-xsd (~> 3.2, >= 3.2.1)
|
125
|
+
shacl (~> 0.2, >= 0.2.1)
|
126
|
+
shex (~> 0.7, >= 0.7.1)
|
127
|
+
sparql (~> 3.2, >= 3.2.4)
|
128
|
+
sparql-client (~> 3.2, >= 3.2.1)
|
129
|
+
yaml-ld (~> 0.0)
|
130
|
+
linkheaders-processor (0.1.17)
|
130
131
|
json (~> 2.0)
|
131
132
|
json-ld (~> 3.2)
|
132
133
|
json-ld-preloaded (~> 3.2)
|
@@ -159,8 +160,10 @@ GEM
|
|
159
160
|
racc (~> 1.4)
|
160
161
|
parallel (1.22.1)
|
161
162
|
parseconfig (1.1.2)
|
162
|
-
parser (3.1.2.
|
163
|
+
parser (3.1.2.1)
|
163
164
|
ast (~> 2.4.1)
|
165
|
+
psych (4.0.4)
|
166
|
+
stringio
|
164
167
|
public_suffix (4.0.7)
|
165
168
|
racc (1.6.0)
|
166
169
|
rack (2.2.4)
|
@@ -249,17 +252,17 @@ GEM
|
|
249
252
|
diff-lcs (>= 1.2.0, < 2.0)
|
250
253
|
rspec-support (~> 3.11.0)
|
251
254
|
rspec-support (3.11.0)
|
252
|
-
rubocop (1.
|
255
|
+
rubocop (1.34.1)
|
253
256
|
json (~> 2.3)
|
254
257
|
parallel (~> 1.10)
|
255
|
-
parser (>= 3.1.
|
258
|
+
parser (>= 3.1.2.1)
|
256
259
|
rainbow (>= 2.2.2, < 4.0)
|
257
260
|
regexp_parser (>= 1.8, < 3.0)
|
258
261
|
rexml (>= 3.2.5, < 4.0)
|
259
|
-
rubocop-ast (>= 1.
|
262
|
+
rubocop-ast (>= 1.20.0, < 2.0)
|
260
263
|
ruby-progressbar (~> 1.7)
|
261
264
|
unicode-display_width (>= 1.4.0, < 3.0)
|
262
|
-
rubocop-ast (1.
|
265
|
+
rubocop-ast (1.21.0)
|
263
266
|
parser (>= 3.1.1.0)
|
264
267
|
ruby-progressbar (1.11.0)
|
265
268
|
ruby2_keywords (0.0.5)
|
@@ -291,6 +294,7 @@ GEM
|
|
291
294
|
sparql-client (3.2.1)
|
292
295
|
net-http-persistent (~> 4.0, >= 4.0.1)
|
293
296
|
rdf (~> 3.2, >= 3.2.6)
|
297
|
+
stringio (3.0.2)
|
294
298
|
sxp (1.2.2)
|
295
299
|
matrix
|
296
300
|
rdf (~> 3.2)
|
@@ -303,6 +307,10 @@ GEM
|
|
303
307
|
unicode-types (1.7.0)
|
304
308
|
xml-simple (1.1.9)
|
305
309
|
rexml
|
310
|
+
yaml-ld (0.0.1)
|
311
|
+
json-ld (~> 3.2, >= 3.2.2)
|
312
|
+
psych (~> 4.0)
|
313
|
+
rdf (~> 3.2)
|
306
314
|
|
307
315
|
PLATFORMS
|
308
316
|
x86_64-linux
|
data/lib/config.conf
ADDED
data/lib/constants.rb
CHANGED
@@ -69,11 +69,14 @@ SELF_IDENTIFIER_PREDICATES = [
|
|
69
69
|
'https://schema.org/identifier'
|
70
70
|
]
|
71
71
|
|
72
|
-
GUID_TYPES = {
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
72
|
+
# Maps each supported GUID type name to the pattern that recognises it.
#
# ORDER MATTERS: Ruby hashes iterate in insertion order and the consumers of
# this table take the FIRST pattern that matches. Specific schemes therefore
# have to be listed before the catch-all ones:
#   * 'ark' must precede 'handle1' ("ark:/12345" looks like prefix/suffix)
#     and 'uri' ("ark:/12345/x" looks like scheme:/path), otherwise an ARK
#     can never be typed as 'ark'.
#   * 'doi' must precede 'handle1' for the same reason.
GUID_TYPES = {
  'inchi' => /^\w{14}-\w{10}-\w$/,
  # the dot after the "10" directory indicator is literal, so it is escaped
  'doi' => %r{^10\.\d{4,9}/[-._;()/:A-Z0-9]+$}i,
  'ark' => %r{^ark:/[^\s]+$},
  'handle1' => %r{^[^/]+/[^/]+$}i,
  'handle2' => %r{^\d{4,5}/[-._;()/:A-Z0-9]+$}i, # legacy style 12345/AGB47A
  'uri' => %r{^\w+:/?/?[^\s]+$}
}.freeze
|
77
80
|
|
78
81
|
CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
|
79
82
|
extruct = CONFIG.dig(:extruct, :command)
|
@@ -1,12 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
module
|
3
|
+
module HarvesterTools
|
4
4
|
class Error < StandardError
|
5
5
|
end
|
6
6
|
|
7
7
|
class ExternalTools
|
8
8
|
|
9
|
-
def initialize(metadata:
|
9
|
+
def initialize(metadata: HarvesterTools::MetadataObject.new)
|
10
10
|
@meta = metadata
|
11
11
|
end
|
12
12
|
|
@@ -25,10 +25,7 @@ module FspHarvester
|
|
25
25
|
file.rewind
|
26
26
|
|
27
27
|
@meta.comments << "INFO: The message body is being examined by Distiller\n"
|
28
|
-
|
29
|
-
command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
30
|
-
# command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
31
|
-
# command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
|
28
|
+
command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
|
32
29
|
warn "distiller command: #{command}"
|
33
30
|
result, _stderr, _status = Open3.capture3(command)
|
34
31
|
warn ''
|
@@ -41,12 +38,13 @@ module FspHarvester
|
|
41
38
|
if result !~ /@context/i # failure returns nil
|
42
39
|
@meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
|
43
40
|
@meta.add_warning(['018', '', ''])
|
41
|
+
result = "{}"
|
44
42
|
else
|
45
43
|
@meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
|
46
|
-
parse_rdf(result: result, content_type: "application/ld+json")
|
47
44
|
end
|
48
45
|
@@distillerknown[bhash] = true
|
49
46
|
end
|
47
|
+
result
|
50
48
|
end
|
51
49
|
|
52
50
|
def processs_with_extruct(uri:)
|
@@ -55,6 +53,11 @@ module FspHarvester
|
|
55
53
|
stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
|
56
54
|
warn "open3 status: #{status} #{stdout}"
|
57
55
|
result = stderr # absurd that the output comes over stderr! LOL!
|
56
|
+
jsonld = {}
|
57
|
+
microdata = Hash.new
|
58
|
+
microformat = Hash.new
|
59
|
+
opengraph = Hash.new
|
60
|
+
rdfa = Hash.new
|
58
61
|
|
59
62
|
if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
|
60
63
|
@meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
|
@@ -66,17 +69,16 @@ module FspHarvester
|
|
66
69
|
elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
|
67
70
|
json = JSON.parse result
|
68
71
|
@meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
@meta.merge_hash(json.first) if json.first.is_a? Hash
|
72
|
+
jsonld = json['json-ld'].to_json if json['json-ld'].any?
|
73
|
+
microdata = json['microdata'].first if json['microdata'].any
|
74
|
+
microformat = json['microformat'].first if json['microformat'].any?
|
75
|
+
opengraph = json['opengraph'].first if json['opengraph'].any?
|
76
|
+
rdfa = json['rdfa'].to_json if json['rdfa'].any?
|
77
|
+
# @meta.merge_hash(json.first) if json.first.is_a? Hash
|
77
78
|
else
|
78
79
|
@meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
|
79
80
|
end
|
81
|
+
[jsonld, microdata, microformat, opengraph, rdfa]
|
80
82
|
end
|
81
83
|
end
|
82
84
|
end
|
data/lib/fsp_harvester.rb
CHANGED
@@ -1,121 +1,23 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require_relative 'fsp_harvester/version'
|
4
|
-
require 'json/ld'
|
5
|
-
require 'json/ld/preloaded'
|
6
|
-
require 'json'
|
7
|
-
require 'linkheaders/processor'
|
8
|
-
require 'addressable'
|
9
|
-
require 'tempfile'
|
10
|
-
require 'xmlsimple'
|
11
|
-
require 'nokogiri'
|
12
|
-
require 'parseconfig'
|
13
|
-
require 'rest-client'
|
14
|
-
require 'cgi'
|
15
|
-
require 'digest'
|
16
|
-
require 'open3'
|
17
|
-
require 'metainspector'
|
18
|
-
require 'rdf/xsd'
|
19
|
-
require_relative './metadata_object'
|
20
|
-
require_relative './constants'
|
21
|
-
require_relative './web_utils'
|
22
|
-
require_relative './signposting_tests'
|
23
|
-
require_relative './fsp_metadata_harvester'
|
24
|
-
require_relative './fsp_metadata_parser'
|
25
|
-
|
26
1
|
|
2
|
+
require_relative 'harvester'
|
27
3
|
module FspHarvester
|
28
4
|
class Error < StandardError
|
29
5
|
end
|
30
6
|
|
31
7
|
class Utils
|
32
|
-
# @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
|
33
|
-
# @warnings = JSON.parse(File.read("warnings.json"))
|
34
|
-
|
35
|
-
|
36
|
-
def self.resolve_guid(guid:)
|
37
|
-
@meta = FspHarvester::MetadataObject.new
|
38
|
-
@meta.all_uris = [guid]
|
39
|
-
type, url = convertToURL(guid: guid)
|
40
|
-
links = Array.new
|
41
|
-
if type
|
42
|
-
links = resolve_url(url: url)
|
43
|
-
@meta.links << links
|
44
|
-
else
|
45
|
-
@meta.add_warning(['006', guid, ''])
|
46
|
-
@meta.comments << "FATAL: GUID type not recognized.\n"
|
47
|
-
end
|
48
|
-
[links, @meta]
|
49
|
-
end
|
50
8
|
|
51
|
-
def self.gather_metadata_from_describedby_links(links: [], metadata:
|
9
|
+
# Harvests metadata from every link whose relation is 'describedby'.
#
# @param links [Array] link objects responding to #relation
# @param metadata [Object] the metadata object to accumulate into. It should
#   normally have been created already by resolve_guid; a fresh
#   HarvesterTools::MetadataObject is used when the caller has none.
# @return [Object] the same metadata object that was passed in (or created)
def self.gather_metadata_from_describedby_links(links: [], metadata: HarvesterTools::MetadataObject.new)
  @meta = metadata
  describedby = links.select { |link| link.relation == 'describedby' }
  # everything is gathered into the @meta metadata object
  HarvesterTools::MetadataHarvester.extract_metadata_from_links(links: describedby, metadata: @meta)
  @meta
end
|
60
18
|
|
61
|
-
def self.
|
62
|
-
|
63
|
-
if k == 'inchi' and regex.match(guid)
|
64
|
-
return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
|
65
|
-
elsif k == 'handle1' and regex.match(guid)
|
66
|
-
return 'handle', "http://hdl.handle.net/#{guid}"
|
67
|
-
elsif k == 'handle2' and regex.match(guid)
|
68
|
-
return 'handle', "http://hdl.handle.net/#{guid}"
|
69
|
-
elsif k == 'uri' and regex.match(guid)
|
70
|
-
return 'uri', guid
|
71
|
-
elsif k == 'doi' and regex.match(guid)
|
72
|
-
return 'doi', "https://doi.org/#{guid}"
|
73
|
-
end
|
74
|
-
end
|
75
|
-
[nil, nil]
|
76
|
-
end
|
77
|
-
|
78
|
-
def self.typeit(guid:)
|
79
|
-
Utils::GUID_TYPES.each do |type, regex|
|
80
|
-
return type if regex.match(guid)
|
81
|
-
end
|
82
|
-
false
|
83
|
-
end
|
84
|
-
|
85
|
-
def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
|
86
|
-
@meta.guidtype = 'uri' if @meta.guidtype.nil?
|
87
|
-
warn "\n\n FETCHING #{url} #{header}\n\n"
|
88
|
-
response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
|
89
|
-
warn "\n\n head #{response.headers.inspect}\n\n" if response
|
90
|
-
|
91
|
-
unless response
|
92
|
-
@meta.add_warning(['001', url, header])
|
93
|
-
@meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
|
94
|
-
return []
|
95
|
-
end
|
96
|
-
|
97
|
-
@meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
|
98
|
-
@meta.full_response << response.body
|
99
|
-
|
100
|
-
links = process_link_headers(response: response) unless nolinkheaders
|
101
|
-
links
|
102
|
-
end
|
103
|
-
|
104
|
-
def self.process_link_headers(response:)
|
105
|
-
warn "\n\n parsing #{response.headers}\n\n"
|
106
|
-
|
107
|
-
parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
|
108
|
-
parser.extract_and_parse(response: response)
|
109
|
-
factory = parser.factory # LinkHeaders::LinkFactory
|
110
|
-
|
111
|
-
warn "\n\n length bfore #{factory.all_links.length}\n\n"
|
112
|
-
signpostingcheck(factory: factory)
|
113
|
-
warn "\n\n length aftr #{factory.all_links.length}\n\n"
|
114
|
-
warn "\n\n links #{factory.all_links}\n\n"
|
115
|
-
factory.all_links
|
116
|
-
end
|
117
|
-
|
118
|
-
def self.signpostingcheck(factory:)
|
19
|
+
def self.signpostingcheck(factory:, metadata: HarvesterTools::MetadataObject.new)
|
20
|
+
@meta = metadata
|
119
21
|
citeas = Array.new
|
120
22
|
describedby = Array.new
|
121
23
|
item = Array.new
|
@@ -134,13 +36,13 @@ module FspHarvester
|
|
134
36
|
end
|
135
37
|
end
|
136
38
|
|
137
|
-
check_describedby_rules(describedby: describedby)
|
138
|
-
check_item_rules(item: item)
|
39
|
+
check_describedby_rules(describedby: describedby, metadata: @meta)
|
40
|
+
check_item_rules(item: item, metadata: @meta)
|
139
41
|
|
140
42
|
if citeas.length > 1
|
141
43
|
warn "INFO: multiple cite-as links found. Checking for conflicts\n"
|
142
44
|
@meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
|
143
|
-
citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
45
|
+
citeas = check_for_citeas_conflicts(citeas: citeas, metadata: @meta) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
|
144
46
|
end
|
145
47
|
|
146
48
|
unless citeas.length == 1 && describedby.length > 0
|
data/lib/harvester.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#require_relative 'fsp_harvester/version'
|
4
|
+
require 'json/ld'
|
5
|
+
require 'json/ld/preloaded'
|
6
|
+
require 'json'
|
7
|
+
require 'linkheaders/processor'
|
8
|
+
require 'addressable'
|
9
|
+
require 'tempfile'
|
10
|
+
require 'xmlsimple'
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'parseconfig'
|
13
|
+
require 'rest-client'
|
14
|
+
require 'cgi'
|
15
|
+
require 'digest'
|
16
|
+
require 'open3'
|
17
|
+
require 'metainspector'
|
18
|
+
require 'rdf/xsd'
|
19
|
+
require_relative './metadata_object'
|
20
|
+
require_relative './constants'
|
21
|
+
require_relative './web_utils'
|
22
|
+
require_relative './signposting_tests'
|
23
|
+
require_relative './metadata_harvester'
|
24
|
+
require_relative './fsp_harvester'
|
25
|
+
require_relative './harvester_utils'
|
26
|
+
require_relative './harvester_brute'
|
27
|
+
require_relative './external_tools'
|
28
|
+
require_relative './metadata_parser'
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module HarvesterTools
  class Error < StandardError
  end

  # Class-level helpers that turn a GUID into a resolvable URL, fetch it, and
  # collect the links advertised by the response.
  #
  # The methods keep the metadata object they are working on in the class-level
  # instance variable @meta.
  class Utils
    # Resolves a GUID and returns the links discovered while doing so.
    #
    # A fresh HarvesterTools::MetadataObject is created for every call. When the
    # GUID type cannot be determined, warning '006' is recorded and no request
    # is made.
    #
    # NOTE(review): the detected type is not written to @meta.guidtype here;
    # resolve_url defaults it to 'uri' when it is nil - confirm that is intended.
    #
    # @param guid [String] the identifier to resolve
    # @return [Array] two elements: the list of links and the metadata object
    def self.resolve_guid(guid:)
      @meta = HarvesterTools::MetadataObject.new
      @meta.all_uris = [guid]
      type, url = convertToURL(guid: guid)
      links = []
      if type
        links = resolve_url(url: url, metadata: @meta)
        @meta.links = @meta.links | links
      else
        @meta.add_warning(['006', guid, ''])
        @meta.comments << "FATAL: GUID type not recognized.\n"
      end
      [links, @meta]
    end

    # Determines the type of a GUID and the URL at which it can be resolved.
    #
    # @param guid [String] the identifier to convert
    # @return [Array] [type, url], or [nil, nil] when no known pattern matches.
    #   Both handle patterns report the type 'handle'.
    def self.convertToURL(guid:)
      guid_types_by_priority.each do |type, regex|
        next unless regex.match(guid)

        case type
        when 'inchi'
          return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
        when 'handle1', 'handle2'
          return 'handle', "http://hdl.handle.net/#{guid}"
        when 'uri'
          return 'uri', guid
        when 'doi'
          return 'doi', "https://doi.org/#{guid}"
        when 'ark'
          return 'ark', "https://n2t.net/#{guid}"
        end
      end
      [nil, nil]
    end

    # Returns the GUID_TYPES key of the first pattern matching the GUID.
    #
    # @param guid [String] the identifier to classify
    # @return [String, false] the matching key, or false when nothing matches
    def self.typeit(guid:)
      guid_types_by_priority.each do |type, regex|
        return type if regex.match(guid)
      end
      false
    end

    # Fetches a URL and, unless told otherwise, processes its link headers.
    #
    # @param url [String] the URL to fetch
    # @param method [Symbol] HTTP method handed to WebUtils.fspfetch
    # @param nolinkheaders [Boolean] when true, link headers are not processed
    # @param metadata [Object] metadata object that receives warnings, comments
    #   and the response body
    # @param header [Object] request headers handed to WebUtils.fspfetch
    # @return [Array] the links found; empty when the fetch fails or when
    #   nolinkheaders is true (previously nil in the latter case, which made
    #   the return type inconsistent with the failure path)
    def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
      @meta = metadata
      @meta.guidtype = 'uri' if @meta.guidtype.nil?
      warn "\n\n FETCHING #{url} #{header}\n\n"
      response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
      warn "\n\n head #{response.headers.inspect}\n\n" if response

      unless response
        @meta.add_warning(['001', url, header])
        @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
        return []
      end

      @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
      @meta.full_response << response.body

      return [] if nolinkheaders

      process_link_headers(response: response, metadata: @meta)
    end

    # Parses the link headers of a response and runs the signposting checks.
    #
    # @param response [Object] the HTTP response to inspect
    # @param metadata [Object] metadata object; its last entry in all_uris is
    #   used as the default anchor for the parsed links
    # @return [Array] all links held by the parser's factory after the checks
    def self.process_link_headers(response:, metadata:)
      warn "\n\n parsing #{response.headers}\n\n"

      parser = LinkHeaders::Processor.new(default_anchor: metadata.all_uris.last)
      parser.extract_and_parse(response: response)
      factory = parser.factory # LinkHeaders::LinkFactory
      FspHarvester::Utils.signpostingcheck(factory: factory, metadata: metadata)
      factory.all_links
    end

    # GUID_TYPES with the 'ark' entry moved to the front.
    #
    # The generic 'uri' pattern (and 'handle1' for "ark:/NAAN") also matches
    # ARK identifiers, so with first-match-wins iteration in declaration order
    # an ARK would never be recognised as 'ark'. All other entries keep their
    # declared relative order.
    #
    # @return [Hash] type name => pattern, in matching priority
    def self.guid_types_by_priority
      ark_first = GUID_TYPES.select { |type, _regex| type == 'ark' }
      ark_first.merge(GUID_TYPES.reject { |type, _regex| type == 'ark' })
    end
    private_class_method :guid_types_by_priority
  end
end
|