fsp_harvester 0.1.13 → 0.1.16

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '099b2769aca02c9b6fba26583dfbcfc4b60c39798be4adf0a4c71a989af0094c'
4
- data.tar.gz: 80b8657befce11cdd8d58c420fe8039ba5407b29b6e0fed645325212de95f4d0
3
+ metadata.gz: aec11fd57963ffb176ddb88338b9e262027c9a7d39364089ae130fb4b628bf5b
4
+ data.tar.gz: f8733a00de5c6c24a622235c18ba0dae208f5bac52d50607480e51fd563678c8
5
5
  SHA512:
6
- metadata.gz: 683362c6a0710bf9a0d5420a9ae6fe8372338f9c6132ae5ae56619dce1bfa88df7914008a661cf49685000166a0c4c6b476691acda109326c3955f73b796cc4e
7
- data.tar.gz: 931972a9f872bcb90e11ed731e4de9b406b9c3ccb1f44e6af250d9cb817616040e86149d34f070fe9ac0c6711f37438922b0efb5e2915ce00e39fa4ce09c030b
6
+ metadata.gz: c484e41aa0305f34d0bf7f82cad60b9b02106ffe80b9371c99e77b199eef9ce52818222368b8b3a3ff73d94dba89b8d7fb815d29c95ca335772946e1e9762849
7
+ data.tar.gz: '09dfdcc12b9176bc88c31a196893ae9ede6c35c2fd59271ca8fa5b1c29f0807ee82c8416ca5f8b7ff75c6caab71648b6d0e15d0976784cb7d34ff8686332be37'
data/.rspec_status CHANGED
@@ -42,14 +42,15 @@ example_id | status | run_time |
42
42
  ./spec/describedby_spec.rb[1:1:15] | passed | 2.28 seconds |
43
43
  ./spec/fsp_harvester_spec.rb[1:1] | passed | 0.00058 seconds |
44
44
  ./spec/fsp_harvester_spec.rb[1:2] | failed | 2.92 seconds |
45
- ./spec/item_spec.rb[1:1:1] | passed | 2.94 seconds |
46
- ./spec/item_spec.rb[1:1:2] | passed | 3 seconds |
47
- ./spec/item_spec.rb[1:1:3] | passed | 1.35 seconds |
48
- ./spec/item_spec.rb[1:1:4] | passed | 1.83 seconds |
49
- ./spec/item_spec.rb[1:1:5] | passed | 2.26 seconds |
50
- ./spec/item_spec.rb[1:1:6] | passed | 2.17 seconds |
51
- ./spec/item_spec.rb[1:1:7] | passed | 2.8 seconds |
52
- ./spec/item_spec.rb[1:1:8] | passed | 0.52869 seconds |
45
+ ./spec/fsp_harvester_spec.rb[1:3] | passed | 6.87 seconds |
46
+ ./spec/item_spec.rb[1:1:1] | passed | 3.19 seconds |
47
+ ./spec/item_spec.rb[1:1:2] | passed | 2.81 seconds |
48
+ ./spec/item_spec.rb[1:1:3] | passed | 1.27 seconds |
49
+ ./spec/item_spec.rb[1:1:4] | passed | 1.76 seconds |
50
+ ./spec/item_spec.rb[1:1:5] | passed | 2.43 seconds |
51
+ ./spec/item_spec.rb[1:1:6] | passed | 2.23 seconds |
52
+ ./spec/item_spec.rb[1:1:7] | passed | 2.94 seconds |
53
+ ./spec/item_spec.rb[1:1:8] | passed | 0.52517 seconds |
53
54
  ./spec/type_spec.rb[1:1:1] | passed | 1.35 seconds |
54
55
  ./spec/type_spec.rb[1:1:2] | passed | 1.32 seconds |
55
56
  ./spec/type_spec.rb[1:1:3] | passed | 1.54 seconds |
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fsp_harvester (0.1.13)
4
+ fsp_harvester (0.1.16)
5
5
  json (~> 2.0)
6
6
  linkeddata (~> 3.2)
7
7
  linkheaders-processor (~> 0.1.17)
data/lib/config.conf CHANGED
@@ -2,7 +2,7 @@
2
2
  command="extruct"
3
3
 
4
4
  [rdf]
5
- command="/home/osboxes/.rvm/gems/ruby-3.0.0/bin/rdf"
5
+ command="/usr/local/bundle/bin/rdf"
6
6
 
7
7
  [tika]
8
8
  command="http://tika:9998/meta"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FspHarvester
4
- VERSION = "0.1.13"
4
+ VERSION = "0.1.16"
5
5
  end
@@ -0,0 +1,48 @@
1
module HarvesterTools
  # Namespace-level error class for harvester failures.
  class Error < StandardError
  end

  # "Brute force" harvesting: resolve a GUID to a URL and fetch it with
  # several Accept headers, feeding every successful response body to the
  # metadata extractor.
  #
  # ACCEPT_ALL_HEADER and ACCEPT_STAR_HEADER are expected to be defined
  # elsewhere in the gem.
  class BruteForce
    # Entry point.
    #
    # @param guid the identifier to resolve
    # @param metadata the metadata object that accumulates results, warnings
    #   and comments (a fresh HarvesterTools::MetadataObject by default)
    # @return [false] when the GUID cannot be converted to a URL
    # @return the +metadata+ object otherwise
    def self.begin_brute_force(guid:, metadata: HarvesterTools::MetadataObject.new)
      type, url = HarvesterTools::Utils.convertToURL(guid: guid)
      return false unless type

      do_content_negotiation(url: url, metadata: metadata)
      metadata
    end

    # Fetches +url+ up to three times and extracts metadata from each
    # response that comes back:
    #
    # 1. +url+ with ACCEPT_ALL_HEADER;
    # 2. +url+ with ACCEPT_STAR_HEADER (the landing page);
    # 3. the landing page's final URL, again with ACCEPT_ALL_HEADER.
    #
    # Step 3 only runs when step 2 produced a response. Results are recorded
    # on +metadata+; the return value is not meaningful.
    def self.do_content_negotiation(url:, metadata:)
      response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
      if response
        HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
      end

      response = resolve_url_brute(url: url, metadata: metadata, headers: ACCEPT_STAR_HEADER)
      if response
        # extract from the landing page itself
        HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)

        # now do content negotiation on the landing page's final URL
        response = resolve_url_brute(url: response.request.url, metadata: metadata, headers: ACCEPT_ALL_HEADER)
        if response
          # extract from the content-negotiated representation
          HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
        end
      end
    end

    # Fetches +url+ with the given +headers+ and records the outcome on
    # +metadata+.
    #
    # On failure (falsy response from the fetcher) a '001' warning, a WARN
    # comment and a "No response" entry are recorded and +false+ is returned.
    # On success an INFO comment and the response body are recorded and the
    # response is returned.
    #
    # @param url the URL to fetch
    # @param method the HTTP method passed through to the fetcher
    # @param nolinkheaders accepted for interface compatibility; not used here
    # @param headers the HTTP headers to send
    # @param metadata the metadata object to record on; its +guidtype+ is set
    #   to 'uri' when it is nil
    # @return the response, or +false+ when the URL could not be resolved
    def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
      # Use a local rather than a class-level @meta so that no state is
      # shared between calls.
      meta = metadata
      meta.guidtype = 'uri' if meta.guidtype.nil?
      warn "\n\n BRUTE FETCHING #{url} #{headers}\n\n"
      response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: meta)

      unless response
        meta.add_warning(['001', url, headers])
        meta.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
        meta.full_response << [url, "No response"]
        # Must return here: falling through would call #body on a falsy value.
        return false
      end

      warn "\n\n head #{response.headers.inspect}\n\n"
      meta.comments << "INFO: following redirection using this header led to the following URL: #{meta.all_uris.last}. Using the output from this URL for the next few tests..."
      meta.full_response << [url, response.body]
      response
    end
  end
end
data/lib/web_utils.rb CHANGED
@@ -18,13 +18,13 @@ module HarvesterTools
18
18
  warn "final URL #{response.request.url}"
19
19
  warn "Response code #{response.code}"
20
20
  if response.code == 203
21
- meta.warnings << ["002", url, headers]
21
+ meta.add_warning(["002", url, headers])
22
22
  meta.comments << "WARN: Response is non-authoritative (HTTP response code: #{response.code}). Headers may have been manipulated encountered when trying to resolve #{url}\n"
23
23
  end
24
24
  response
25
25
  rescue RestClient::ExceptionWithResponse => e
26
26
  warn "EXCEPTION WITH RESPONSE! #{e.response.code} with response #{e.response}\nfailed response headers: #{e.response.headers}"
27
- meta.warnings << ["003", url, headers]
27
+ meta.add_warning(["003", url, headers])
28
28
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
29
29
  if (e.response.code == 500 or e.response.code == 404)
30
30
  return false
@@ -34,13 +34,13 @@ module HarvesterTools
34
34
  # now we are returning the headers and body that were returned
35
35
  rescue RestClient::Exception => e
36
36
  warn "EXCEPTION WITH NO RESPONSE! #{e}"
37
- meta.warnings << ["003", url, headers]
37
+ meta.add_warning(["003", url, headers])
38
38
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
39
39
  false
40
40
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
41
41
  rescue Exception => e
42
42
  warn "EXCEPTION UNKNOWN! #{e}"
43
- meta.warnings << ["003", url, headers]
43
+ meta.add_warning(["003", url, headers])
44
44
  meta.comments << "WARN: HTTP error #{e} encountered when trying to resolve #{url}\n"
45
45
  false
46
46
  # now we are returning 'False', and we will check that with an \"if\" statement in our main code
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fsp_harvester
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Wilkinson
@@ -190,6 +190,7 @@ files:
190
190
  - lib/fsp_harvester.rb
191
191
  - lib/fsp_harvester/version.rb
192
192
  - lib/harvester.rb
193
+ - lib/harvester_brute.rb
193
194
  - lib/harvester_utils.rb
194
195
  - lib/metadata_harvester.rb
195
196
  - lib/metadata_object.rb