fsp_harvester 0.1.11 → 0.1.14

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 895567e9edd571dbca7dee89a0270d1c14342fed06c3eb81c81e06f3c07ddbed
4
- data.tar.gz: 7eee65295c206d6cee7b4ef28830f64087ba172a294cde7401490bffa20dbe1a
3
+ metadata.gz: e285f00da696d7e39d80df794be9524af6e63ea01deb4e73f6c30b3694c016ff
4
+ data.tar.gz: fb81b5c1c0fac3bb22e078663025855e5accdb355db1811a4687fb1bca54bc61
5
5
  SHA512:
6
- metadata.gz: f0c7727598525cb55b6c2bfaf36d5ce3dda5da6efddf85888328b7c93b874c508989122627e5deaa5101fc0a20279432aa023ecefef112926219f267e3622234
7
- data.tar.gz: 29f834c57ec73e27f988948893dc92fe56550b829585df390a9a1398770845115202289f6f9557c01eb2fc3eec218f863371db60649f6a3fef01da9457c2862e
6
+ metadata.gz: 194132eb78246291a3cb96566ca6a283841a0427afcd6a6abb79c590dbc2c54108e3e8cfef9e4802a77008f1a4c9c94ea7862987e81ce1b4b97cd1fdaf25ca23
7
+ data.tar.gz: 9765647726c2bfcd7e790ba11929d257610672bc92d8d11756824432e90db4c05036b2cfcede1a55da95f1e74b9e87fd078c78284c356897a5bdc0a17593a3a1
data/.rspec_status CHANGED
@@ -1,55 +1,55 @@
1
1
  example_id | status | run_time |
2
2
  ---------------------------------- | ------ | --------------- |
3
- ./spec/cite-as_spec.rb[1:1:1] | passed | 1.61 seconds |
4
- ./spec/cite-as_spec.rb[1:1:2] | passed | 1.18 seconds |
5
- ./spec/cite-as_spec.rb[1:1:3] | passed | 1.02 seconds |
6
- ./spec/cite-as_spec.rb[1:1:4] | passed | 1.6 seconds |
7
- ./spec/cite-as_spec.rb[1:1:5] | passed | 2.78 seconds |
8
- ./spec/cite-as_spec.rb[1:1:6] | passed | 2.09 seconds |
9
- ./spec/cite-as_spec.rb[1:1:7] | passed | 2.98 seconds |
10
- ./spec/cite-as_spec.rb[1:1:8] | passed | 2.2 seconds |
11
- ./spec/cite-as_spec.rb[1:1:9] | passed | 2.87 seconds |
12
- ./spec/cite-as_spec.rb[1:1:10] | passed | 2.18 seconds |
13
- ./spec/cite-as_spec.rb[1:1:11] | passed | 3.16 seconds |
14
- ./spec/cite-as_spec.rb[1:1:12] | passed | 2.36 seconds |
15
- ./spec/cite-as_spec.rb[1:1:13] | passed | 2.89 seconds |
16
- ./spec/cite-as_spec.rb[1:1:14] | passed | 2.13 seconds |
17
- ./spec/cite-as_spec.rb[1:1:15] | passed | 1.18 seconds |
18
- ./spec/cite-as_spec.rb[1:1:16] | passed | 1.3 seconds |
19
- ./spec/cite-as_spec.rb[1:1:17] | passed | 1.17 seconds |
3
+ ./spec/cite-as_spec.rb[1:1:1] | passed | 1.66 seconds |
4
+ ./spec/cite-as_spec.rb[1:1:2] | passed | 1.13 seconds |
5
+ ./spec/cite-as_spec.rb[1:1:3] | passed | 1.08 seconds |
6
+ ./spec/cite-as_spec.rb[1:1:4] | passed | 1.68 seconds |
7
+ ./spec/cite-as_spec.rb[1:1:5] | passed | 2.86 seconds |
8
+ ./spec/cite-as_spec.rb[1:1:6] | passed | 2.11 seconds |
9
+ ./spec/cite-as_spec.rb[1:1:7] | passed | 3.07 seconds |
10
+ ./spec/cite-as_spec.rb[1:1:8] | passed | 2.13 seconds |
11
+ ./spec/cite-as_spec.rb[1:1:9] | passed | 2.73 seconds |
12
+ ./spec/cite-as_spec.rb[1:1:10] | passed | 2.64 seconds |
13
+ ./spec/cite-as_spec.rb[1:1:11] | passed | 3.36 seconds |
14
+ ./spec/cite-as_spec.rb[1:1:12] | passed | 2.26 seconds |
15
+ ./spec/cite-as_spec.rb[1:1:13] | passed | 2.9 seconds |
16
+ ./spec/cite-as_spec.rb[1:1:14] | passed | 2.31 seconds |
17
+ ./spec/cite-as_spec.rb[1:1:15] | passed | 1.47 seconds |
18
+ ./spec/cite-as_spec.rb[1:1:16] | passed | 1.22 seconds |
19
+ ./spec/cite-as_spec.rb[1:1:17] | passed | 1.23 seconds |
20
20
  ./spec/cite-as_spec.rb[1:1:18] | passed | 1.2 seconds |
21
21
  ./spec/cite-as_spec.rb[1:1:19] | passed | 1.71 seconds |
22
- ./spec/cite-as_spec.rb[1:1:20] | passed | 1.69 seconds |
23
- ./spec/cite-as_spec.rb[1:1:21] | passed | 2.22 seconds |
24
- ./spec/cite-as_spec.rb[1:1:22] | passed | 1.09 seconds |
25
- ./spec/cite-as_spec.rb[1:1:23] | passed | 1.17 seconds |
26
- ./spec/cite-as_spec.rb[1:1:24] | failed | 1.2 seconds |
27
- ./spec/cite-as_spec.rb[1:1:25] | passed | 0.48048 seconds |
28
- ./spec/describedby_spec.rb[1:1:1] | passed | 2.12 seconds |
29
- ./spec/describedby_spec.rb[1:1:2] | passed | 0.96254 seconds |
30
- ./spec/describedby_spec.rb[1:1:3] | passed | 0.92669 seconds |
31
- ./spec/describedby_spec.rb[1:1:4] | passed | 0.92801 seconds |
32
- ./spec/describedby_spec.rb[1:1:5] | passed | 1 second |
33
- ./spec/describedby_spec.rb[1:1:6] | passed | 0.66763 seconds |
34
- ./spec/describedby_spec.rb[1:1:7] | passed | 0.66021 seconds |
35
- ./spec/describedby_spec.rb[1:1:8] | passed | 1.89 seconds |
36
- ./spec/describedby_spec.rb[1:1:9] | passed | 1.3 seconds |
37
- ./spec/describedby_spec.rb[1:1:10] | passed | 1.7 seconds |
38
- ./spec/describedby_spec.rb[1:1:11] | passed | 2.28 seconds |
39
- ./spec/describedby_spec.rb[1:1:12] | passed | 2.27 seconds |
40
- ./spec/describedby_spec.rb[1:1:13] | passed | 1.39 seconds |
41
- ./spec/describedby_spec.rb[1:1:14] | passed | 1.65 seconds |
42
- ./spec/describedby_spec.rb[1:1:15] | passed | 1.7 seconds |
43
- ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00215 seconds |
44
- ./spec/fsp_harvester_spec.rb[1:2] | failed | 0.00021 seconds |
45
- ./spec/item_spec.rb[1:1:1] | passed | 2.04 seconds |
46
- ./spec/item_spec.rb[1:1:2] | passed | 2 seconds |
47
- ./spec/item_spec.rb[1:1:3] | passed | 0.92924 seconds |
48
- ./spec/item_spec.rb[1:1:4] | passed | 1.36 seconds |
49
- ./spec/item_spec.rb[1:1:5] | passed | 1.71 seconds |
50
- ./spec/item_spec.rb[1:1:6] | passed | 1.68 seconds |
51
- ./spec/item_spec.rb[1:1:7] | passed | 2.37 seconds |
52
- ./spec/item_spec.rb[1:1:8] | passed | 0.34241 seconds |
53
- ./spec/type_spec.rb[1:1:1] | passed | 0.9855 seconds |
54
- ./spec/type_spec.rb[1:1:2] | passed | 0.96202 seconds |
55
- ./spec/type_spec.rb[1:1:3] | passed | 0.96005 seconds |
22
+ ./spec/cite-as_spec.rb[1:1:20] | passed | 1.66 seconds |
23
+ ./spec/cite-as_spec.rb[1:1:21] | passed | 2.5 seconds |
24
+ ./spec/cite-as_spec.rb[1:1:22] | passed | 1.54 seconds |
25
+ ./spec/cite-as_spec.rb[1:1:23] | passed | 1.25 seconds |
26
+ ./spec/cite-as_spec.rb[1:1:24] | failed | 1.35 seconds |
27
+ ./spec/cite-as_spec.rb[1:1:25] | passed | 0.50811 seconds |
28
+ ./spec/describedby_spec.rb[1:1:1] | passed | 3.45 seconds |
29
+ ./spec/describedby_spec.rb[1:1:2] | passed | 1.3 seconds |
30
+ ./spec/describedby_spec.rb[1:1:3] | passed | 1.22 seconds |
31
+ ./spec/describedby_spec.rb[1:1:4] | passed | 1.22 seconds |
32
+ ./spec/describedby_spec.rb[1:1:5] | passed | 1.15 seconds |
33
+ ./spec/describedby_spec.rb[1:1:6] | passed | 1.04 seconds |
34
+ ./spec/describedby_spec.rb[1:1:7] | passed | 1.12 seconds |
35
+ ./spec/describedby_spec.rb[1:1:8] | passed | 2.44 seconds |
36
+ ./spec/describedby_spec.rb[1:1:9] | passed | 2.15 seconds |
37
+ ./spec/describedby_spec.rb[1:1:10] | passed | 2.19 seconds |
38
+ ./spec/describedby_spec.rb[1:1:11] | passed | 2.98 seconds |
39
+ ./spec/describedby_spec.rb[1:1:12] | passed | 2.87 seconds |
40
+ ./spec/describedby_spec.rb[1:1:13] | passed | 1.74 seconds |
41
+ ./spec/describedby_spec.rb[1:1:14] | passed | 2.27 seconds |
42
+ ./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
43
+ ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
44
+ ./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
45
+ ./spec/item_spec.rb[1:1:1] | passed | 2.94 seconds |
46
+ ./spec/item_spec.rb[1:1:2] | passed | 3 seconds |
47
+ ./spec/item_spec.rb[1:1:3] | passed | 1.35 seconds |
48
+ ./spec/item_spec.rb[1:1:4] | passed | 1.83 seconds |
49
+ ./spec/item_spec.rb[1:1:5] | passed | 2.26 seconds |
50
+ ./spec/item_spec.rb[1:1:6] | passed | 2.17 seconds |
51
+ ./spec/item_spec.rb[1:1:7] | passed | 2.8 seconds |
52
+ ./spec/item_spec.rb[1:1:8] | passed | 0.52869 seconds |
53
+ ./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
54
+ ./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
55
+ ./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.11)
4
+ fsp_harvester (0.1.14)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
- linkheaders-processor (~> 0.1.16)
7
+ linkheaders-processor (~> 0.1.17)
8
8
  metainspector (~> 5.11.2)
9
9
  parseconfig (~> 1.1)
10
10
  rake (~> 13.0)
@@ -56,7 +56,7 @@ GEM
56
56
  faraday-encoding (0.0.5)
57
57
  faraday
58
58
  faraday-excon (1.1.0)
59
- faraday-http-cache (2.4.0)
59
+ faraday-http-cache (2.4.1)
60
60
  faraday (>= 0.8)
61
61
  faraday-httpclient (1.0.1)
62
62
  faraday-multipart (1.0.4)
@@ -99,34 +99,35 @@ GEM
99
99
  sparql (~> 3.2)
100
100
  sxp (~> 1.2)
101
101
  link_header (0.0.8)
102
- linkeddata (3.2.0)
103
- json-ld (~> 3.2)
102
+ linkeddata (3.2.1)
103
+ json-ld (~> 3.2, >= 3.2.3)
104
104
  json-ld-preloaded (~> 3.2)
105
105
  ld-patch (~> 3.2)
106
- nokogiri (~> 1.12, >= 1.12.5)
107
- rdf (~> 3.2)
108
- rdf-aggregate-repo (~> 3.2)
106
+ nokogiri (~> 1.13, >= 1.13.8)
107
+ rdf (~> 3.2, >= 3.2.9)
108
+ rdf-aggregate-repo (~> 3.2, >= 3.2.1)
109
109
  rdf-hamster-repo (~> 3.2)
110
- rdf-isomorphic (~> 3.2)
110
+ rdf-isomorphic (~> 3.2, >= 3.2.1)
111
111
  rdf-json (~> 3.2)
112
- rdf-microdata (~> 3.2)
113
- rdf-n3 (~> 3.2)
112
+ rdf-microdata (~> 3.2, >= 3.2.1)
113
+ rdf-n3 (~> 3.2, >= 3.2.1)
114
114
  rdf-normalize (~> 0.5)
115
- rdf-ordered-repo (~> 3.2)
115
+ rdf-ordered-repo (~> 3.2, >= 3.2.1)
116
116
  rdf-rdfa (~> 3.2)
117
117
  rdf-rdfxml (~> 3.2)
118
118
  rdf-reasoner (~> 0.8)
119
- rdf-tabular (~> 3.2)
119
+ rdf-tabular (~> 3.2, >= 3.2.1)
120
120
  rdf-trig (~> 3.2)
121
121
  rdf-trix (~> 3.2)
122
- rdf-turtle (~> 3.2)
123
- rdf-vocab (~> 3.2)
124
- rdf-xsd (~> 3.2)
125
- shacl (~> 0.2)
126
- shex (~> 0.7)
127
- sparql (~> 3.2)
128
- sparql-client (~> 3.2)
129
- linkheaders-processor (0.1.16)
122
+ rdf-turtle (~> 3.2, >= 3.2.1)
123
+ rdf-vocab (~> 3.2, >= 3.2.1)
124
+ rdf-xsd (~> 3.2, >= 3.2.1)
125
+ shacl (~> 0.2, >= 0.2.1)
126
+ shex (~> 0.7, >= 0.7.1)
127
+ sparql (~> 3.2, >= 3.2.4)
128
+ sparql-client (~> 3.2, >= 3.2.1)
129
+ yaml-ld (~> 0.0)
130
+ linkheaders-processor (0.1.17)
130
131
  json (~> 2.0)
131
132
  json-ld (~> 3.2)
132
133
  json-ld-preloaded (~> 3.2)
@@ -159,8 +160,10 @@ GEM
159
160
  racc (~> 1.4)
160
161
  parallel (1.22.1)
161
162
  parseconfig (1.1.2)
162
- parser (3.1.2.0)
163
+ parser (3.1.2.1)
163
164
  ast (~> 2.4.1)
165
+ psych (4.0.4)
166
+ stringio
164
167
  public_suffix (4.0.7)
165
168
  racc (1.6.0)
166
169
  rack (2.2.4)
@@ -249,17 +252,17 @@ GEM
249
252
  diff-lcs (>= 1.2.0, < 2.0)
250
253
  rspec-support (~> 3.11.0)
251
254
  rspec-support (3.11.0)
252
- rubocop (1.33.0)
255
+ rubocop (1.34.1)
253
256
  json (~> 2.3)
254
257
  parallel (~> 1.10)
255
- parser (>= 3.1.0.0)
258
+ parser (>= 3.1.2.1)
256
259
  rainbow (>= 2.2.2, < 4.0)
257
260
  regexp_parser (>= 1.8, < 3.0)
258
261
  rexml (>= 3.2.5, < 4.0)
259
- rubocop-ast (>= 1.19.1, < 2.0)
262
+ rubocop-ast (>= 1.20.0, < 2.0)
260
263
  ruby-progressbar (~> 1.7)
261
264
  unicode-display_width (>= 1.4.0, < 3.0)
262
- rubocop-ast (1.19.1)
265
+ rubocop-ast (1.21.0)
263
266
  parser (>= 3.1.1.0)
264
267
  ruby-progressbar (1.11.0)
265
268
  ruby2_keywords (0.0.5)
@@ -291,6 +294,7 @@ GEM
291
294
  sparql-client (3.2.1)
292
295
  net-http-persistent (~> 4.0, >= 4.0.1)
293
296
  rdf (~> 3.2, >= 3.2.6)
297
+ stringio (3.0.2)
294
298
  sxp (1.2.2)
295
299
  matrix
296
300
  rdf (~> 3.2)
@@ -303,6 +307,10 @@ GEM
303
307
  unicode-types (1.7.0)
304
308
  xml-simple (1.1.9)
305
309
  rexml
310
+ yaml-ld (0.0.1)
311
+ json-ld (~> 3.2, >= 3.2.2)
312
+ psych (~> 4.0)
313
+ rdf (~> 3.2)
306
314
 
307
315
  PLATFORMS
308
316
  x86_64-linux
data/lib/config.conf ADDED
@@ -0,0 +1,8 @@
1
+ [extruct]
2
+ command="extruct"
3
+
4
+ [rdf]
5
+ command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
6
+
7
+ [tika]
8
+ command="http://tika:9998/meta"
data/lib/constants.rb CHANGED
@@ -69,11 +69,14 @@ SELF_IDENTIFIER_PREDICATES = [
69
69
  'https://schema.org/identifier'
70
70
  ]
71
71
 
72
- GUID_TYPES = { 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
73
- 'doi' => Regexp.new(%r{^10.\d{4,9}/[-._;()/:A-Z0-9]+$}i),
74
- 'handle1' => Regexp.new(%r{^[^/]+/[^/]+$}i),
75
- 'handle2' => Regexp.new(%r{^\d{4,5}/[-._;()/:A-Z0-9]+$}i), # legacy style 12345/AGB47A
76
- 'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}) }
72
+ GUID_TYPES = {
73
+ 'inchi' => Regexp.new(/^\w{14}-\w{10}-\w$/),
74
+ 'doi' => Regexp.new(%r{^10.\d{4,9}/[-._;()/:A-Z0-9]+$}i),
75
+ 'handle1' => Regexp.new(%r{^[^/]+/[^/]+$}i),
76
+ 'handle2' => Regexp.new(%r{^\d{4,5}/[-._;()/:A-Z0-9]+$}i), # legacy style 12345/AGB47A
77
+ 'uri' => Regexp.new(%r{^\w+:/?/?[^\s]+$}),
78
+ 'ark' => Regexp.new(%r{^ark:/[^\s]+$})
79
+ }
77
80
 
78
81
  CONFIG = File.exist?('config.conf') ? ParseConfig.new('config.conf') : {}
79
82
  extruct = CONFIG.dig(:extruct, :command)
@@ -1,12 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module FspHarvester
3
+ module HarvesterTools
4
4
  class Error < StandardError
5
5
  end
6
6
 
7
7
  class ExternalTools
8
8
 
9
- def initialize(metadata: FspHarvester::MetadataObject.new)
9
+ def initialize(metadata: HarvesterTools::MetadataObject.new)
10
10
  @meta = metadata
11
11
  end
12
12
 
@@ -25,10 +25,7 @@ module FspHarvester
25
25
  file.rewind
26
26
 
27
27
  @meta.comments << "INFO: The message body is being examined by Distiller\n"
28
- # command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format turtle #{file.path} 2>/dev/null"
29
- command = "LANG=en_US.UTF-8 #{Utils::RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
30
- # command = "LANG=en_US.UTF-8 /usr/local/bin/ruby #{@rdf_command} serialize --input-format rdfa --output-format jsonld #{file.path}"
31
- # command = "LANG=en_US.UTF-8 /home/osboxes/.rvm/rubies/ruby-2.6.3/bin/ruby /home/osboxes/.rvm/gems/ruby-2.6.3/bin/rdf serialize --output-format jsonld #{file.path}"
28
+ command = "LANG=en_US.UTF-8 #{RDFCommand} serialize --input-format rdfa --output-format jsonld #{file.path}"
32
29
  warn "distiller command: #{command}"
33
30
  result, _stderr, _status = Open3.capture3(command)
34
31
  warn ''
@@ -41,12 +38,13 @@ module FspHarvester
41
38
  if result !~ /@context/i # failure returns nil
42
39
  @meta.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
43
40
  @meta.add_warning(['018', '', ''])
41
+ result = "{}"
44
42
  else
45
43
  @meta.comments << "INFO: The Distiller found parseable data. Parsing as JSON-LD\n"
46
- parse_rdf(result: result, content_type: "application/ld+json")
47
44
  end
48
45
  @@distillerknown[bhash] = true
49
46
  end
47
+ result
50
48
  end
51
49
 
52
50
  def processs_with_extruct(uri:)
@@ -55,6 +53,11 @@ module FspHarvester
55
53
  stdout, stderr, status = Open3.capture3(EXTRUCT_COMMAND + ' ' + uri)
56
54
  warn "open3 status: #{status} #{stdout}"
57
55
  result = stderr # absurd that the output comes over stderr! LOL!
56
+ jsonld = {}
57
+ microdata = Hash.new
58
+ microformat = Hash.new
59
+ opengraph = Hash.new
60
+ rdfa = Hash.new
58
61
 
59
62
  if result.to_s.match(/(Failed\sto\sextract.*?)\n/)
60
63
  @meta.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
@@ -66,17 +69,16 @@ module FspHarvester
66
69
  elsif result.to_s.match(/^\s+?\{/) or result.to_s.match(/^\s+\[/) # this is JSON
67
70
  json = JSON.parse result
68
71
  @meta.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
69
-
70
- parse_rdf(body: json['json-ld'].to_json, content_type: 'application/ld+json') if json['json-ld'].any? # RDF
71
- @meta.merge_hash(json['microdata'].first) if json['microdata'].any?
72
- @meta.merge_hash(json['microformat'].first) if json['microformat'].any?
73
- @meta.merge_hash(json['opengraph'].first) if json['opengraph'].any?
74
- parse_rdf(body: json['rdfa'].to_json, content_type: 'application/ld+json') if json['rdfa'].any? # RDF
75
-
76
- @meta.merge_hash(json.first) if json.first.is_a? Hash
72
+ jsonld = json['json-ld'].to_json if json['json-ld'].any?
73
+ microdata = json['microdata'].first if json['microdata'].any
74
+ microformat = json['microformat'].first if json['microformat'].any?
75
+ opengraph = json['opengraph'].first if json['opengraph'].any?
76
+ rdfa = json['rdfa'].to_json if json['rdfa'].any?
77
+ # @meta.merge_hash(json.first) if json.first.is_a? Hash
77
78
  else
78
79
  @meta.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
79
80
  end
81
+ [jsonld, microdata, microformat, opengraph, rdfa]
80
82
  end
81
83
  end
82
84
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.11"
4
+ VERSION = "0.1.14"
5
5
  end
data/lib/fsp_harvester.rb CHANGED
@@ -1,121 +1,23 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'fsp_harvester/version'
4
- require 'json/ld'
5
- require 'json/ld/preloaded'
6
- require 'json'
7
- require 'linkheaders/processor'
8
- require 'addressable'
9
- require 'tempfile'
10
- require 'xmlsimple'
11
- require 'nokogiri'
12
- require 'parseconfig'
13
- require 'rest-client'
14
- require 'cgi'
15
- require 'digest'
16
- require 'open3'
17
- require 'metainspector'
18
- require 'rdf/xsd'
19
- require_relative './metadata_object'
20
- require_relative './constants'
21
- require_relative './web_utils'
22
- require_relative './signposting_tests'
23
- require_relative './fsp_metadata_harvester'
24
- require_relative './fsp_metadata_parser'
25
-
26
1
 
2
+ require_relative 'harvester'
27
3
  module FspHarvester
28
4
  class Error < StandardError
29
5
  end
30
6
 
31
7
  class Utils
32
- # @@distillerknown = {} # global, hash of sha256 keys of message bodies - have they been seen before t/f
33
- # @warnings = JSON.parse(File.read("warnings.json"))
34
-
35
-
36
- def self.resolve_guid(guid:)
37
- @meta = FspHarvester::MetadataObject.new
38
- @meta.all_uris = [guid]
39
- type, url = convertToURL(guid: guid)
40
- links = Array.new
41
- if type
42
- links = resolve_url(url: url)
43
- @meta.links << links
44
- else
45
- @meta.add_warning(['006', guid, ''])
46
- @meta.comments << "FATAL: GUID type not recognized.\n"
47
- end
48
- [links, @meta]
49
- end
50
8
 
51
- def self.gather_metadata_from_describedby_links(links: [], metadata: FspHarvester::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
9
+ def self.gather_metadata_from_describedby_links(links: [], metadata: HarvesterTools::MetadataObject.new) # meta should have already been created by resolve+guid, but maybe not
52
10
  @meta = metadata
53
11
  db = []
54
12
  links.each do |l|
55
13
  db << l if l.relation == 'describedby'
56
14
  end
57
- FspHarvester::MetadataHarvester.extract_metadata(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
15
+ HarvesterTools::MetadataHarvester.extract_metadata_from_links(links: db, metadata: @meta) # everything is gathered into the @meta metadata object
58
16
  @meta
59
17
  end
60
18
 
61
- def self.convertToURL(guid:)
62
- GUID_TYPES.each do |k, regex|
63
- if k == 'inchi' and regex.match(guid)
64
- return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
65
- elsif k == 'handle1' and regex.match(guid)
66
- return 'handle', "http://hdl.handle.net/#{guid}"
67
- elsif k == 'handle2' and regex.match(guid)
68
- return 'handle', "http://hdl.handle.net/#{guid}"
69
- elsif k == 'uri' and regex.match(guid)
70
- return 'uri', guid
71
- elsif k == 'doi' and regex.match(guid)
72
- return 'doi', "https://doi.org/#{guid}"
73
- end
74
- end
75
- [nil, nil]
76
- end
77
-
78
- def self.typeit(guid:)
79
- Utils::GUID_TYPES.each do |type, regex|
80
- return type if regex.match(guid)
81
- end
82
- false
83
- end
84
-
85
- def self.resolve_url(url:, method: :get, nolinkheaders: false, header: ACCEPT_STAR_HEADER)
86
- @meta.guidtype = 'uri' if @meta.guidtype.nil?
87
- warn "\n\n FETCHING #{url} #{header}\n\n"
88
- response = FspHarvester::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
89
- warn "\n\n head #{response.headers.inspect}\n\n" if response
90
-
91
- unless response
92
- @meta.add_warning(['001', url, header])
93
- @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
94
- return []
95
- end
96
-
97
- @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
98
- @meta.full_response << response.body
99
-
100
- links = process_link_headers(response: response) unless nolinkheaders
101
- links
102
- end
103
-
104
- def self.process_link_headers(response:)
105
- warn "\n\n parsing #{response.headers}\n\n"
106
-
107
- parser = LinkHeaders::Processor.new(default_anchor: @meta.all_uris.last)
108
- parser.extract_and_parse(response: response)
109
- factory = parser.factory # LinkHeaders::LinkFactory
110
-
111
- warn "\n\n length bfore #{factory.all_links.length}\n\n"
112
- signpostingcheck(factory: factory)
113
- warn "\n\n length aftr #{factory.all_links.length}\n\n"
114
- warn "\n\n links #{factory.all_links}\n\n"
115
- factory.all_links
116
- end
117
-
118
- def self.signpostingcheck(factory:)
19
+ def self.signpostingcheck(factory:, metadata: HarvesterTools::MetadataObject.new)
20
+ @meta = metadata
119
21
  citeas = Array.new
120
22
  describedby = Array.new
121
23
  item = Array.new
@@ -134,13 +36,13 @@ module FspHarvester
134
36
  end
135
37
  end
136
38
 
137
- check_describedby_rules(describedby: describedby)
138
- check_item_rules(item: item)
39
+ check_describedby_rules(describedby: describedby, metadata: @meta)
40
+ check_item_rules(item: item, metadata: @meta)
139
41
 
140
42
  if citeas.length > 1
141
43
  warn "INFO: multiple cite-as links found. Checking for conflicts\n"
142
44
  @meta.comments << "INFO: multiple cite-as links found. Checking for conflicts\n"
143
- citeas = check_for_citeas_conflicts(citeas: citeas) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
45
+ citeas = check_for_citeas_conflicts(citeas: citeas, metadata: @meta) # this adds to the metadata objects if there are conflicts, returns the list of unique citeas (SHOULD ONLY BE ONE!)
144
46
  end
145
47
 
146
48
  unless citeas.length == 1 && describedby.length > 0
data/lib/harvester.rb ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ #require_relative 'fsp_harvester/version'
4
+ require 'json/ld'
5
+ require 'json/ld/preloaded'
6
+ require 'json'
7
+ require 'linkheaders/processor'
8
+ require 'addressable'
9
+ require 'tempfile'
10
+ require 'xmlsimple'
11
+ require 'nokogiri'
12
+ require 'parseconfig'
13
+ require 'rest-client'
14
+ require 'cgi'
15
+ require 'digest'
16
+ require 'open3'
17
+ require 'metainspector'
18
+ require 'rdf/xsd'
19
+ require_relative './metadata_object'
20
+ require_relative './constants'
21
+ require_relative './web_utils'
22
+ require_relative './signposting_tests'
23
+ require_relative './metadata_harvester'
24
+ require_relative './fsp_harvester'
25
+ require_relative './harvester_utils'
26
+ require_relative './harvester_brute'
27
+ require_relative './external_tools'
28
+ require_relative './metadata_parser'
@@ -0,0 +1,78 @@
1
+ module HarvesterTools
2
+ class Error < StandardError
3
+ end
4
+
5
+ class Utils
6
+
7
+ def self.resolve_guid(guid:)
8
+ @meta = HarvesterTools::MetadataObject.new
9
+ @meta.all_uris = [guid]
10
+ type, url = convertToURL(guid: guid)
11
+ links = Array.new
12
+ if type
13
+ links = resolve_url(url: url, metadata: @meta)
14
+ @meta.links = @meta.links | links
15
+ else
16
+ @meta.add_warning(['006', guid, ''])
17
+ @meta.comments << "FATAL: GUID type not recognized.\n"
18
+ end
19
+ [links, @meta]
20
+ end
21
+
22
+ def self.convertToURL(guid:)
23
+ GUID_TYPES.each do |k, regex|
24
+ if k == 'inchi' and regex.match(guid)
25
+ return 'inchi', "https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/#{guid}"
26
+ elsif k == 'handle1' and regex.match(guid)
27
+ return 'handle', "http://hdl.handle.net/#{guid}"
28
+ elsif k == 'handle2' and regex.match(guid)
29
+ return 'handle', "http://hdl.handle.net/#{guid}"
30
+ elsif k == 'uri' and regex.match(guid)
31
+ return 'uri', guid
32
+ elsif k == 'doi' and regex.match(guid)
33
+ return 'doi', "https://doi.org/#{guid}"
34
+ elsif k == 'ark' and regex.match(guid)
35
+ return 'ark', "https://n2t.net/#{guid}"
36
+ end
37
+ end
38
+ [nil, nil]
39
+ end
40
+
41
+ def self.typeit(guid:)
42
+ GUID_TYPES.each do |type, regex|
43
+ return type if regex.match(guid)
44
+ end
45
+ false
46
+ end
47
+
48
+ def self.resolve_url(url:, method: :get, nolinkheaders: false, metadata:, header: ACCEPT_STAR_HEADER)
49
+ @meta = metadata
50
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
51
+ warn "\n\n FETCHING #{url} #{header}\n\n"
52
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: header, method: method, meta: @meta)
53
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
54
+
55
+ unless response
56
+ @meta.add_warning(['001', url, header])
57
+ @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{header}.\n"
58
+ return []
59
+ end
60
+
61
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
62
+ @meta.full_response << response.body
63
+
64
+ links = process_link_headers(response: response, metadata: @meta) unless nolinkheaders
65
+ links
66
+ end
67
+
68
+ def self.process_link_headers(response:, metadata:)
69
+ warn "\n\n parsing #{response.headers}\n\n"
70
+
71
+ parser = LinkHeaders::Processor.new(default_anchor: metadata.all_uris.last)
72
+ parser.extract_and_parse(response: response)
73
+ factory = parser.factory # LinkHeaders::LinkFactory
74
+ FspHarvester::Utils.signpostingcheck(factory: factory, metadata: metadata)
75
+ factory.all_links
76
+ end
77
+ end
78
+ end