fsp_harvester 0.1.10 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 96405252194bfb8df419e61c17f113648e8bd254c034577107d8be161ed4753d
4
- data.tar.gz: 62117a821bcea28d3ee06ccccf085a3b3150b7f33e1b67594c1d21b5ce23d345
3
+ metadata.gz: '099b2769aca02c9b6fba26583dfbcfc4b60c39798be4adf0a4c71a989af0094c'
4
+ data.tar.gz: 80b8657befce11cdd8d58c420fe8039ba5407b29b6e0fed645325212de95f4d0
5
5
  SHA512:
6
- metadata.gz: d30011cfd562b090d6a08895d0ac004d8f9c4aafbe0dc4ff75be282f896c8ae6b83b5fd826db731fc248486657d66df89fd2500df9f38c1045d6fa201296a46b
7
- data.tar.gz: 3064bf42a8a83ac6a89cae8635eeebcb71d22d1a923672f8379e7122e97eac527af46a710a4714b235a3671c93ea1142b2aa44bb907aa1a072918e43c31373e0
6
+ metadata.gz: 683362c6a0710bf9a0d5420a9ae6fe8372338f9c6132ae5ae56619dce1bfa88df7914008a661cf49685000166a0c4c6b476691acda109326c3955f73b796cc4e
7
+ data.tar.gz: 931972a9f872bcb90e11ed731e4de9b406b9c3ccb1f44e6af250d9cb817616040e86149d34f070fe9ac0c6711f37438922b0efb5e2915ce00e39fa4ce09c030b
data/.rspec_status CHANGED
@@ -1,55 +1,55 @@
1
1
  example_id | status | run_time |
2
2
  ---------------------------------- | ------ | --------------- |
3
- ./spec/cite-as_spec.rb[1:1:1] | passed | 1.05 seconds |
4
- ./spec/cite-as_spec.rb[1:1:2] | passed | 0.98315 seconds |
5
- ./spec/cite-as_spec.rb[1:1:3] | passed | 0.67905 seconds |
6
- ./spec/cite-as_spec.rb[1:1:4] | passed | 1.3 seconds |
7
- ./spec/cite-as_spec.rb[1:1:5] | passed | 2.13 seconds |
8
- ./spec/cite-as_spec.rb[1:1:6] | passed | 1.47 seconds |
9
- ./spec/cite-as_spec.rb[1:1:7] | passed | 2.35 seconds |
10
- ./spec/cite-as_spec.rb[1:1:8] | passed | 1.69 seconds |
11
- ./spec/cite-as_spec.rb[1:1:9] | passed | 2.34 seconds |
12
- ./spec/cite-as_spec.rb[1:1:10] | passed | 1.66 seconds |
13
- ./spec/cite-as_spec.rb[1:1:11] | passed | 2.36 seconds |
14
- ./spec/cite-as_spec.rb[1:1:12] | passed | 1.65 seconds |
15
- ./spec/cite-as_spec.rb[1:1:13] | passed | 2.37 seconds |
16
- ./spec/cite-as_spec.rb[1:1:14] | passed | 1.68 seconds |
17
- ./spec/cite-as_spec.rb[1:1:15] | passed | 1.01 seconds |
18
- ./spec/cite-as_spec.rb[1:1:16] | passed | 1.02 seconds |
19
- ./spec/cite-as_spec.rb[1:1:17] | passed | 1.04 seconds |
20
- ./spec/cite-as_spec.rb[1:1:18] | passed | 0.98558 seconds |
21
- ./spec/cite-as_spec.rb[1:1:19] | passed | 1.36 seconds |
22
- ./spec/cite-as_spec.rb[1:1:20] | passed | 1.37 seconds |
23
- ./spec/cite-as_spec.rb[1:1:21] | passed | 1.71 seconds |
24
- ./spec/cite-as_spec.rb[1:1:22] | passed | 0.93618 seconds |
25
- ./spec/cite-as_spec.rb[1:1:23] | passed | 0.94888 seconds |
26
- ./spec/cite-as_spec.rb[1:1:24] | failed | 1.03 seconds |
27
- ./spec/cite-as_spec.rb[1:1:25] | passed | 0.3531 seconds |
28
- ./spec/describedby_spec.rb[1:1:1] | passed | 2.12 seconds |
29
- ./spec/describedby_spec.rb[1:1:2] | passed | 0.96254 seconds |
30
- ./spec/describedby_spec.rb[1:1:3] | passed | 0.92669 seconds |
31
- ./spec/describedby_spec.rb[1:1:4] | passed | 0.92801 seconds |
32
- ./spec/describedby_spec.rb[1:1:5] | passed | 1 second |
33
- ./spec/describedby_spec.rb[1:1:6] | passed | 0.66763 seconds |
34
- ./spec/describedby_spec.rb[1:1:7] | passed | 0.66021 seconds |
35
- ./spec/describedby_spec.rb[1:1:8] | passed | 1.89 seconds |
36
- ./spec/describedby_spec.rb[1:1:9] | passed | 1.3 seconds |
37
- ./spec/describedby_spec.rb[1:1:10] | passed | 1.7 seconds |
38
- ./spec/describedby_spec.rb[1:1:11] | passed | 2.28 seconds |
39
- ./spec/describedby_spec.rb[1:1:12] | passed | 2.27 seconds |
40
- ./spec/describedby_spec.rb[1:1:13] | passed | 1.39 seconds |
41
- ./spec/describedby_spec.rb[1:1:14] | passed | 1.65 seconds |
42
- ./spec/describedby_spec.rb[1:1:15] | passed | 1.7 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00215 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | failed | 0.00021 seconds |
45
- ./spec/item_spec.rb[1:1:1] | passed | 2.04 seconds |
46
- ./spec/item_spec.rb[1:1:2] | passed | 2 seconds |
47
- ./spec/item_spec.rb[1:1:3] | passed | 0.92924 seconds |
48
- ./spec/item_spec.rb[1:1:4] | passed | 1.36 seconds |
49
- ./spec/item_spec.rb[1:1:5] | passed | 1.71 seconds |
50
- ./spec/item_spec.rb[1:1:6] | passed | 1.68 seconds |
51
- ./spec/item_spec.rb[1:1:7] | passed | 2.37 seconds |
52
- ./spec/item_spec.rb[1:1:8] | passed | 0.34241 seconds |
53
- ./spec/type_spec.rb[1:1:1] | passed | 0.9855 seconds |
54
- ./spec/type_spec.rb[1:1:2] | passed | 0.96202 seconds |
55
- ./spec/type_spec.rb[1:1:3] | passed | 0.96005 seconds |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.66 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.13 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.08 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.68 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.86 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.11 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 3.07 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.13 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.73 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.64 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 3.36 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.26 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.9 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.31 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.47 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.22 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.23 seconds |
20
+ ./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
21
+ ./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.5 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.54 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.25 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.35 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.50811 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 3.45 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.3 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.22 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.15 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 1.12 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.44 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 2.15 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 2.19 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.98 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.87 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
45
+ ./spec/item_spec.rb[1:1:1] | passed | 2.94 seconds |
46
+ ./spec/item_spec.rb[1:1:2] | passed | 3 seconds |
47
+ ./spec/item_spec.rb[1:1:3] | passed | 1.35 seconds |
48
+ ./spec/item_spec.rb[1:1:4] | passed | 1.83 seconds |
49
+ ./spec/item_spec.rb[1:1:5] | passed | 2.26 seconds |
50
+ ./spec/item_spec.rb[1:1:6] | passed | 2.17 seconds |
51
+ ./spec/item_spec.rb[1:1:7] | passed | 2.8 seconds |
52
+ ./spec/item_spec.rb[1:1:8] | passed | 0.52869 seconds |
53
+ ./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
54
+ ./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
55
+ ./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.10)
4
+ fsp_harvester (0.1.13)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.16)
7
+ linkheaders-processor (~> 0.1.17)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -36,7 +36,7 @@ GEM
36
36
  scanf (~> 1.0)
37
37
  sxp (~> 1.2)
38
38
  unicode-types (~> 1.7)
39
- faraday (1.10.0)
39
+ faraday (1.10.1)
40
40
  faraday-em_http (~> 1.0)
41
41
  faraday-em_synchrony (~> 1.0)
42
42
  faraday-excon (~> 1.1)
@@ -56,7 +56,7 @@ GEM
56
56
  faraday-encoding (0.0.5)
57
57
  faraday
58
58
  faraday-excon (1.1.0)
59
- faraday-http-cache (2.4.0)
59
+ faraday-http-cache (2.4.1)
60
60
  faraday (>= 0.8)
61
61
  faraday-httpclient (1.0.1)
62
62
  faraday-multipart (1.0.4)
@@ -82,13 +82,13 @@ GEM
82
82
  concurrent-ruby (~> 1.0)
83
83
  json (2.6.2)
84
84
  json-canonicalization (0.3.0)
85
- json-ld (3.2.1)
85
+ json-ld (3.2.3)
86
86
  htmlentities (~> 4.3)
87
87
  json-canonicalization (~> 0.3)
88
88
  link_header (~> 0.0, >= 0.0.8)
89
89
  multi_json (~> 1.15)
90
90
  rack (~> 2.2)
91
- rdf (~> 3.2)
91
+ rdf (~> 3.2, >= 3.2.9)
92
92
  json-ld-preloaded (3.2.0)
93
93
  json-ld (~> 3.2)
94
94
  rdf (~> 3.2)
@@ -99,34 +99,35 @@ GEM
99
99
  sparql (~> 3.2)
100
100
  sxp (~> 1.2)
101
101
  link_header (0.0.8)
102
- linkeddata (3.2.0)
103
- json-ld (~> 3.2)
102
+ linkeddata (3.2.1)
103
+ json-ld (~> 3.2, >= 3.2.3)
104
104
  json-ld-preloaded (~> 3.2)
105
105
  ld-patch (~> 3.2)
106
- nokogiri (~> 1.12, >= 1.12.5)
107
- rdf (~> 3.2)
108
- rdf-aggregate-repo (~> 3.2)
106
+ nokogiri (~> 1.13, >= 1.13.8)
107
+ rdf (~> 3.2, >= 3.2.9)
108
+ rdf-aggregate-repo (~> 3.2, >= 3.2.1)
109
109
  rdf-hamster-repo (~> 3.2)
110
- rdf-isomorphic (~> 3.2)
110
+ rdf-isomorphic (~> 3.2, >= 3.2.1)
111
111
  rdf-json (~> 3.2)
112
- rdf-microdata (~> 3.2)
113
- rdf-n3 (~> 3.2)
112
+ rdf-microdata (~> 3.2, >= 3.2.1)
113
+ rdf-n3 (~> 3.2, >= 3.2.1)
114
114
  rdf-normalize (~> 0.5)
115
- rdf-ordered-repo (~> 3.2)
115
+ rdf-ordered-repo (~> 3.2, >= 3.2.1)
116
116
  rdf-rdfa (~> 3.2)
117
117
  rdf-rdfxml (~> 3.2)
118
118
  rdf-reasoner (~> 0.8)
119
- rdf-tabular (~> 3.2)
119
+ rdf-tabular (~> 3.2, >= 3.2.1)
120
120
  rdf-trig (~> 3.2)
121
121
  rdf-trix (~> 3.2)
122
- rdf-turtle (~> 3.2)
123
- rdf-vocab (~> 3.2)
124
- rdf-xsd (~> 3.2)
125
- shacl (~> 0.2)
126
- shex (~> 0.7)
127
- sparql (~> 3.2)
128
- sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.16)
122
+ rdf-turtle (~> 3.2, >= 3.2.1)
123
+ rdf-vocab (~> 3.2, >= 3.2.1)
124
+ rdf-xsd (~> 3.2, >= 3.2.1)
125
+ shacl (~> 0.2, >= 0.2.1)
126
+ shex (~> 0.7, >= 0.7.1)
127
+ sparql (~> 3.2, >= 3.2.4)
128
+ sparql-client (~> 3.2, >= 3.2.1)
129
+ yaml-ld (~> 0.0)
130
+ linkheaders-processor (0.1.17)
130
131
  json (~> 2.0)
131
132
  json-ld (~> 3.2)
132
133
  json-ld-preloaded (~> 3.2)
@@ -159,14 +160,16 @@ GEM
159
160
  racc (~> 1.4)
160
161
  parallel (1.22.1)
161
162
  parseconfig (1.1.2)
162
- parser (3.1.2.0)
163
+ parser (3.1.2.1)
163
164
  ast (~> 2.4.1)
165
+ psych (4.0.4)
166
+ stringio
164
167
  public_suffix (4.0.7)
165
168
  racc (1.6.0)
166
169
  rack (2.2.4)
167
170
  rainbow (3.1.1)
168
171
  rake (13.0.6)
169
- rdf (3.2.8)
172
+ rdf (3.2.9)
170
173
  link_header (~> 0.0, >= 0.0.8)
171
174
  rdf-aggregate-repo (3.2.1)
172
175
  rdf (~> 3.2)
@@ -249,17 +252,17 @@ GEM
249
252
  diff-lcs (>= 1.2.0, < 2.0)
250
253
  rspec-support (~> 3.11.0)
251
254
  rspec-support (3.11.0)
252
- rubocop (1.33.0)
255
+ rubocop (1.34.1)
253
256
  json (~> 2.3)
254
257
  parallel (~> 1.10)
255
- parser (>= 3.1.0.0)
258
+ parser (>= 3.1.2.1)
256
259
  rainbow (>= 2.2.2, < 4.0)
257
260
  regexp_parser (>= 1.8, < 3.0)
258
261
  rexml (>= 3.2.5, < 4.0)
259
- rubocop-ast (>= 1.19.1, < 2.0)
262
+ rubocop-ast (>= 1.20.0, < 2.0)
260
263
  ruby-progressbar (~> 1.7)
261
264
  unicode-display_width (>= 1.4.0, < 3.0)
262
- rubocop-ast (1.19.1)
265
+ rubocop-ast (1.21.0)
263
266
  parser (>= 3.1.1.0)
264
267
  ruby-progressbar (1.11.0)
265
268
  ruby2_keywords (0.0.5)
@@ -291,6 +294,7 @@ GEM
291
294
  sparql-client (3.2.1)
292
295
  net-http-persistent (~> 4.0, >= 4.0.1)
293
296
  rdf (~> 3.2, >= 3.2.6)
297
+ stringio (3.0.2)
294
298
  sxp (1.2.2)
295
299
  matrix
296
300
  rdf (~> 3.2)
@@ -303,6 +307,10 @@ GEM
303
307
  unicode-types (1.7.0)
304
308
  xml-simple (1.1.9)
305
309
  rexml
310
+ yaml-ld (0.0.1)
311
+ json-ld (~> 3.2, >= 3.2.2)
312
+ psych (~> 4.0)
313
+ rdf (~> 3.2)
306
314
 
307
315
  PLATFORMS
308
316
  x86_64-linux
data/lib/config.conf ADDED
@@ -0,0 +1,8 @@
1
+ [extruct]
2
+ command="extruct"
3
+
4
+ [rdf]
5
+ command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
6
+
7
+ [tika]
8
+ command="http://tika:9998/meta"
data/lib/constants.rb CHANGED
@@ -69,11 +69,14 @@ SELF_IDENTIFIER_PREDICATES = [
69
69
  'https://schema.org/identifier'
70
70
  ]
71
71
 
72
- GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
73
- 'doi' => Regexp.new(%r{^10.\d{4,9}/[-._;()/:A-Z0-9]+$}i),
74
- 'handle1' => Regexp.new(%r{^[^/]+/[^/]+$}i),
75
- 'handle2' => Regexp.new(%r{^\d{4,5}/[-._;()/:A-Z0-9]+$}i), # legacy style 12345/AGB47A
76
- 'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
72
+ GUID_TYPES = {
73
+ 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
74
+ 'doi' => Regexp.new(%r{^10.\d{4,9}/[-._;()/:A-Z0-9]+$}i),
75
+ 'handle1' => Regexp.new(%r{^[^/]+/[^/]+$}i),
76
+ 'handle2' => Regexp.new(%r{^\d{4,5}/[-._;()/:A-Z0-9]+$}i), # legacy style 12345/AGB47A
77
+ 'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
78
+ 'ark' => Regexp.new(%r{^ark:/[^\s]+$})
79
+ }
77
80
 
78
81
  CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
79
82
  extruct = CONFIG.dig(:extruct, :command)
@@ -1,12 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module FspHarvester
3
+ module HarvesterTools
4
4
  class Error < StandardError
5
5
  end
6
6
 
7
7
  class ExternalTools
8
8
 
9
- def initialize(metadata: FspHarvester::MetadataObject.new)
9
+ def initialize(metadata: HarvesterTools::MetadataObject.new)
10
10
  @meta = metadata
11
11
  end
12
12
 
@@ -25,10 +25,7 @@ module FspHarvester
25
25
  file.rewind
26
26
 
27
27
  @meta.comments << "INFO: The message body is being examined by Distiller\n"
28
- # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
29
- command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
30
- # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
31
- # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
28
+ command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
32
29
  warn "distiller command: #{command}"
33
30
  result, _stderr, _status = Open3.capture3(command)
34
31
  warn ''
@@ -41,12 +38,13 @@ module FspHarvester
41
38
  if result !~ /@context/i # failure returns nil
42
39
  @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
43
40
  @meta.add_warning(['018', '', ''])
41
+ result = "{}"
44
42
  else
45
43
  @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
46
- parse_rdf(result: result, content_type: "application/ld+json")
47
44
  end
48
45
  @@distillerknown[bhash] = true
49
46
  end
47
+ result
50
48
  end
51
49
 
52
50
  def processs_with_extruct(uri:)
@@ -55,6 +53,11 @@ module FspHarvester
55
53
  stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
56
54
  warn "open3 status: #{status} #{stdout}"
57
55
  result = stderr # absurd that the output comes over stderr! LOL!
56
+ jsonld = {}
57
+ microdata = Hash.new
58
+ microformat = Hash.new
59
+ opengraph = Hash.new
60
+ rdfa = Hash.new
58
61
 
59
62
  if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
60
63
  @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
@@ -66,17 +69,16 @@ module FspHarvester
66
69
  elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
67
70
  json = JSON.parse result
68
71
  @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
69
-
70
- parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
71
- @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
72
- @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
73
- @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
74
- parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
75
-
76
- @meta.merge_hash(json.first) if json.first.is_a? Hash
72
+ jsonld = json['json-ld'].to_json if json['json-ld'].any?
73
+ microdata = json['microdata'].first if json['microdata'].any
74
+ microformat = json['microformat'].first if json['microformat'].any?
75
+ opengraph = json['opengraph'].first if json['opengraph'].any?
76
+ rdfa = json['rdfa'].to_json if json['rdfa'].any?
77
+ # @meta.merge_hash(json.first) if json.first.is_a? Hash
77
78
  else
78
79
  @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
79
80
  end
81
+ [jsonld, microdata, microformat, opengraph, rdfa]
80
82
  end
81
83
  end
82
84
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.10"
4
+ VERSION = "0.1.13"
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -1,121 +1,23 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'fsp_harvester/version'
4
- require 'json/ld'
5
- require 'json/ld/preloaded'
6
- require 'json'
7
- require 'linkheaders/processor'
8
- require 'addressable'
9
- require 'tempfile'
10
- require 'xmlsimple'
11
- require 'nokogiri'
12
- require 'parseconfig'
13
- require 'rest-client'
14
- require 'cgi'
15
- require 'digest'
16
- require 'open3'
17
- require 'metainspector'
18
- require 'rdf/xsd'
19
- require_relative './metadata_object'
20
- require_relative './constants'
21
- require_relative './web_utils'
22
- require_relative './signposting_tests'
23
- require_relative './fsp_metadata_harvester'
24
- require_relative './fsp_metadata_parser'
25
-
26
1
 
2
+ require_relative 'harvester'
27
3
  module FspHarvester
28
4
  class Error < StandardError
29
5
  end
30
6
 
31
7
  class Utils
32
- # @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
33
- # @warnings = JSON.parse(File.read("warnings.json"))
34
-
35
-
36
- def self.resolve_guid(guid:)
37
- @meta = FspHarvester::MetadataObject.new
38
- @meta.all_uris = [guid]
39
- type, url = convertToURL(guid: guid)
40
- links = Array.new
41
- if type
42
- links = resolve_url(url: url)
43
- @meta.links << links
44
- else
45
- @meta.add_warning(['006', guid, ''])
46
- @meta.comments << "FATAL: GUID type not recognized.\n"
47
- end
48
- [links, @meta]
49
- end
50
8
 
51
- def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
9
+ def self.gather_metadata_from_describedby_links(links: [], metadata: HarvesterTools::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
52
10
  @meta = metadata
53
11
  db = []
54
12
  links.each do |l|
55
13
  db << l if l.relation == 'describedby'
56
14
  end
57
- FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
15
+ HarvesterTools::MetadataHarvester.extract_metadata_from_links(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
58
16
  @meta
59
17
  end
60
18
 
61
- def self.convertToURL(guid:)
62
- GUID_TYPES.each do |k, regex|
63
- if k == 'inchi' and regex.match(guid)
64
- return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
65
- elsif k == 'handle1' and regex.match(guid)
66
- return 'handle', "http://hdl.handle.net/#{guid}"
67
- elsif k == 'handle2' and regex.match(guid)
68
- return 'handle', "http://hdl.handle.net/#{guid}"
69
- elsif k == 'uri' and regex.match(guid)
70
- return 'uri', guid
71
- elsif k == 'doi' and regex.match(guid)
72
- return 'doi', "https://doi.org/#{guid}"
73
- end
74
- end
75
- [nil, nil]
76
- end
77
-
78
- def self.typeit(guid:)
79
- Utils::GUID_TYPES.each do |type, regex|
80
- return type if regex.match(guid)
81
- end
82
- false
83
- end
84
-
85
- def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
86
- @meta.guidtype = 'uri' if @meta.guidtype.nil?
87
- warn "\n\n FETCHING #{url} #{header}\n\n"
88
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
89
- warn "\n\n head #{response.headers.inspect}\n\n" if response
90
-
91
- unless response
92
- @meta.add_warning(['001', url, header])
93
- @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
94
- return []
95
- end
96
-
97
- @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
98
- @meta.full_response << response.body
99
-
100
- links = process_link_headers(response: response) unless nolinkheaders
101
- links
102
- end
103
-
104
- def self.process_link_headers(response:)
105
- warn "\n\n parsing #{response.headers}\n\n"
106
-
107
- parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
108
- parser.extract_and_parse(response: response)
109
- factory = parser.factory # LinkHeaders::LinkFactory
110
-
111
- warn "\n\n length bfore #{factory.all_links.length}\n\n"
112
- signpostingcheck(factory: factory)
113
- warn "\n\n length aftr #{factory.all_links.length}\n\n"
114
- warn "\n\n links #{factory.all_links}\n\n"
115
- factory.all_links
116
- end
117
-
118
- def self.signpostingcheck(factory:)
19
+ def self.signpostingcheck(factory:, metadata: HarvesterTools::MetadataObject.new)
20
+ @meta = metadata
119
21
  citeas = Array.new
120
22
  describedby = Array.new
121
23
  item = Array.new
@@ -134,13 +36,13 @@ module FspHarvester
134
36
  end
135
37
  end
136
38
 
137
- check_describedby_rules(describedby: describedby)
138
- check_item_rules(item: item)
39
+ check_describedby_rules(describedby: describedby, metadata: @meta)
40
+ check_item_rules(item: item, metadata: @meta)
139
41
 
140
42
  if citeas.length > 1
141
43
  warn "INFO: multiple cite-as links found. Checking for conflicts\n"
142
44
  @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
143
- citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
45
+ citeas = check_for_citeas_conflicts(citeas: citeas, metadata: @meta) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
144
46
  end
145
47
 
146
48
  unless citeas.length == 1 && describedby.length > 0
data/lib/harvester.rb ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ #require_relative 'fsp_harvester/version'
4
+ require 'json/ld'
5
+ require 'json/ld/preloaded'
6
+ require 'json'
7
+ require 'linkheaders/processor'
8
+ require 'addressable'
9
+ require 'tempfile'
10
+ require 'xmlsimple'
11
+ require 'nokogiri'
12
+ require 'parseconfig'
13
+ require 'rest-client'
14
+ require 'cgi'
15
+ require 'digest'
16
+ require 'open3'
17
+ require 'metainspector'
18
+ require 'rdf/xsd'
19
+ require_relative './metadata_object'
20
+ require_relative './constants'
21
+ require_relative './web_utils'
22
+ require_relative './signposting_tests'
23
+ require_relative './metadata_harvester'
24
+ require_relative './fsp_harvester'
25
+ require_relative './harvester_utils'
26
+ require_relative './harvester_brute'
27
+ require_relative './external_tools'
28
+ require_relative './metadata_parser'
@@ -0,0 +1,78 @@
1
+ module HarvesterTools
2
+ class Error < StandardError
3
+ end
4
+
5
+ class Utils
6
+
7
+ def self.resolve_guid(guid:)
8
+ @meta = HarvesterTools::MetadataObject.new
9
+ @meta.all_uris = [guid]
10
+ type, url = convertToURL(guid: guid)
11
+ links = Array.new
12
+ if type
13
+ links = resolve_url(url: url, metadata: @meta)
14
+ @meta.links = @meta.links | links
15
+ else
16
+ @meta.add_warning(['006', guid, ''])
17
+ @meta.comments << "FATAL: GUID type not recognized.\n"
18
+ end
19
+ [links, @meta]
20
+ end
21
+
22
+ def self.convertToURL(guid:)
23
+ GUID_TYPES.each do |k, regex|
24
+ if k == 'inchi' and regex.match(guid)
25
+ return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
26
+ elsif k == 'handle1' and regex.match(guid)
27
+ return 'handle', "http://hdl.handle.net/#{guid}"
28
+ elsif k == 'handle2' and regex.match(guid)
29
+ return 'handle', "http://hdl.handle.net/#{guid}"
30
+ elsif k == 'uri' and regex.match(guid)
31
+ return 'uri', guid
32
+ elsif k == 'doi' and regex.match(guid)
33
+ return 'doi', "https://doi.org/#{guid}"
34
+ elsif k == 'ark' and regex.match(guid)
35
+ return 'ark', "https://n2t.net/#{guid}"
36
+ end
37
+ end
38
+ [nil, nil]
39
+ end
40
+
41
+ def self.typeit(guid:)
42
+ GUID_TYPES.each do |type, regex|
43
+ return type if regex.match(guid)
44
+ end
45
+ false
46
+ end
47
+
48
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
49
+ @meta = metadata
50
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
51
+ warn "\n\n FETCHING #{url} #{header}\n\n"
52
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
53
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
54
+
55
+ unless response
56
+ @meta.add_warning(['001', url, header])
57
+ @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
58
+ return []
59
+ end
60
+
61
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
62
+ @meta.full_response << response.body
63
+
64
+ links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
65
+ links
66
+ end
67
+
68
+ def self.process_link_headers(response:, metadata:)
69
+ warn "\n\n parsing #{response.headers}\n\n"
70
+
71
+ parser = LinkHeaders::Processor.new(default_anchor: metadata.all_uris.last)
72
+ parser.extract_and_parse(response: response)
73
+ factory = parser.factory # LinkHeaders::LinkFactory
74
+ FspHarvester::Utils.signpostingcheck(factory: factory, metadata: metadata)
75
+ factory.all_links
76
+ end
77
+ end
78
+ end