fsp_harvester 0.1.13 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '099b2769aca02c9b6fba26583dfbcfc4b60c39798be4adf0a4c71a989af0094c'
4
- data.tar.gz: 80b8657befce11cdd8d58c420fe8039ba5407b29b6e0fed645325212de95f4d0
3
+ metadata.gz: aec11fd57963ffb176ddb88338b9e262027c9a7d39364089ae130fb4b628bf5b
4
+ data.tar.gz: f8733a00de5c6c24a622235c18ba0dae208f5bac52d50607480e51fd563678c8
5
5
  SHA512:
6
- metadata.gz: 683362c6a0710bf9a0d5420a9ae6fe8372338f9c6132ae5ae56619dce1bfa88df7914008a661cf49685000166a0c4c6b476691acda109326c3955f73b796cc4e
7
- data.tar.gz: 931972a9f872bcb90e11ed731e4de9b406b9c3ccb1f44e6af250d9cb817616040e86149d34f070fe9ac0c6711f37438922b0efb5e2915ce00e39fa4ce09c030b
6
+ metadata.gz: c484e41aa0305f34d0bf7f82cad60b9b02106ffe80b9371c99e77b199eef9ce52818222368b8b3a3ff73d94dba89b8d7fb815d29c95ca335772946e1e9762849
7
+ data.tar.gz: '09dfdcc12b9176bc88c31a196893ae9ede6c35c2fd59271ca8fa5b1c29f0807ee82c8416ca5f8b7ff75c6caab71648b6d0e15d0976784cb7d34ff8686332be37'
data/.rspec_status CHANGED
@@ -42,14 +42,15 @@ example_id | status | run_time |
42
42
  ./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
43
43
  ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
44
44
  ./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
45
- ./spec/item_spec.rb[1:1:1] | passed | 2.94 seconds |
46
- ./spec/item_spec.rb[1:1:2] | passed | 3 seconds |
47
- ./spec/item_spec.rb[1:1:3] | passed | 1.35 seconds |
48
- ./spec/item_spec.rb[1:1:4] | passed | 1.83 seconds |
49
- ./spec/item_spec.rb[1:1:5] | passed | 2.26 seconds |
50
- ./spec/item_spec.rb[1:1:6] | passed | 2.17 seconds |
51
- ./spec/item_spec.rb[1:1:7] | passed | 2.8 seconds |
52
- ./spec/item_spec.rb[1:1:8] | passed | 0.52869 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 6.87 seconds |
46
+ ./spec/item_spec.rb[1:1:1] | passed | 3.19 seconds |
47
+ ./spec/item_spec.rb[1:1:2] | passed | 2.81 seconds |
48
+ ./spec/item_spec.rb[1:1:3] | passed | 1.27 seconds |
49
+ ./spec/item_spec.rb[1:1:4] | passed | 1.76 seconds |
50
+ ./spec/item_spec.rb[1:1:5] | passed | 2.43 seconds |
51
+ ./spec/item_spec.rb[1:1:6] | passed | 2.23 seconds |
52
+ ./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
53
+ ./spec/item_spec.rb[1:1:8] | passed | 0.52517 seconds |
53
54
  ./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
54
55
  ./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
55
56
  ./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.13)
4
+ fsp_harvester (0.1.16)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
7
  linkheaders-processor (~> 0.1.17)
data/lib/config.conf CHANGED
@@ -2,7 +2,7 @@
2
2
  command="extruct"
3
3
 
4
4
  [rdf]
5
- command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
5
+ command="/usr/local/bundle/bin/rdf"
6
6
 
7
7
  [tika]
8
8
  command="http://tika:9998/meta"
data/lib/fsp_harvester/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.13"
4
+ VERSION = "0.1.16"
5
5
  end
data/lib/harvester_brute.rb ADDED
@@ -0,0 +1,48 @@
1
+ module HarvesterTools
2
+ class Error < StandardError
3
+ end
4
+
5
+ class BruteForce
6
+ def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
7
+ type, url = HarvesterTools::Utils.convertToURL(guid: guid)
8
+ return false unless type
9
+
10
+ do_content_negotiation(url: url, metadata: metadata)
11
+ metadata
12
+ end
13
+
14
+ def self.do_content_negotiation(url:, metadata:)
15
+ response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
16
+ if response
17
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
18
+ end
19
+ response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
20
+ if response
21
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
22
+ response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER) # now do content negotiation on the landing page
23
+ if response
24
+ HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) # extract from landing page
25
+ end
26
+ end
27
+ end
28
+
29
+ def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
30
+ @meta = metadata
31
+ @meta.guidtype = 'uri' if @meta.guidtype.nil?
32
+ warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
33
+ response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: @meta)
34
+ warn "\n\n head #{response.headers.inspect}\n\n" if response
35
+
36
+ unless response
37
+ @meta.add_warning(['001', url, headers])
38
+ @meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
39
+ @meta.full_response << [url, "No response"]
40
+ false
41
+ end
42
+
43
+ @meta.comments << "INFO: following redirection using this header led to the following URL: #{@meta.all_uris.last}. Using the output from this URL for the next few tests..."
44
+ @meta.full_response << [url, response.body]
45
+ response
46
+ end
47
+ end
48
+ end
data/lib/web_utils.rb CHANGED
@@ -18,13 +18,13 @@ module HarvesterTools
18
18
  warn "final URL #{response.request.url}"
19
19
  warn "Response code #{response.code}"
20
20
  if response.code == 203
21
- meta.warnings << ["002", url, headers]
21
+ meta.add_warning(["002", url, headers])
22
22
  meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
23
23
  end
24
24
  response
25
25
  rescue RestClient::ExceptionWithResponse => e
26
26
  warn "EXCEPTION WITH RESPONSE! #{e.response.code} with response #{e.response}\nfailed response headers: #{e.response.headers}"
27
- meta.warnings << ["003", url, headers]
27
+ meta.add_warning(["003", url, headers])
28
28
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
29
29
  if (e.response.code == 500 or e.response.code == 404)
30
30
  return false
@@ -34,13 +34,13 @@ module HarvesterTools
34
34
  # now we are returning the headers and body that were returned
35
35
  rescue RestClient::Exception => e
36
36
  warn "EXCEPTION WITH NO RESPONSE! #{e}"
37
- meta.warnings << ["003", url, headers]
37
+ meta.add_warning(["003", url, headers])
38
38
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
39
39
  false
40
40
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
41
41
  rescue Exception => e
42
42
  warn "EXCEPTION UNKNOWN! #{e}"
43
- meta.warnings << ["003", url, headers]
43
+ meta.add_warning(["003", url, headers])
44
44
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
45
45
  false
46
46
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
@@ -190,6 +190,7 @@ files:
190
190
  - lib/fsp_harvester.rb
191
191
  - lib/fsp_harvester/version.rb
192
192
  - lib/harvester.rb
193
+ - lib/harvester_brute.rb
193
194
  - lib/harvester_utils.rb
194
195
  - lib/metadata_harvester.rb
195
196
  - lib/metadata_object.rb