aranha-parsers 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/parsers/firefox/request_from_firefox.rb +46 -0
- data/lib/aranha/parsers/firefox/request_header_from_firefox.rb +57 -0
- data/lib/aranha/parsers/firefox/uri_from_har.rb +25 -0
- data/lib/aranha/parsers/firefox.rb +11 -0
- data/lib/aranha/parsers/source_address/base.rb +55 -0
- data/lib/aranha/parsers/source_address/file.rb +11 -10
- data/lib/aranha/parsers/source_address/hash_http_base.rb +17 -20
- data/lib/aranha/parsers/source_address/http_get.rb +17 -17
- data/lib/aranha/parsers/source_address.rb +4 -2
- data/lib/aranha/parsers/version.rb +1 -1
- metadata +15 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e9f2435af805f1aaac850b47ade4df067af4517e9a8d44e39cc0aadc12ef8fe
|
4
|
+
data.tar.gz: 6e1146e598ca952c5f15f0d0c83deaec7382b6487996b489727642ec3ba5a553
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72d12c157d957b8238d2c9f8d39a545d06aa974e9ac49e7103b64c69648e946ecd8c439b626f1196b789a8eb30c17bc3fdf0fb46956999ddacdbbf08d37f4de0
|
7
|
+
data.tar.gz: ee5fe7392f18a318a64ad35bd16341118c4254d978288c64a62ddbaa85e6ff7997018311317614e5c48f1c27f159d547c33151f8ae1c5f768b3a191e343a5faa
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Firefox
|
8
|
+
class RequestFromFirefox
|
9
|
+
BASE_URL_SUBPATH = 'base_url'
|
10
|
+
BODY_SUBPATH = 'body'
|
11
|
+
REQUEST_SUBPATH = 'request'
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def from_directory(path)
|
15
|
+
path = path.to_pathname
|
16
|
+
body_path = path.join(BODY_SUBPATH)
|
17
|
+
new(
|
18
|
+
path.join(BASE_URL_SUBPATH).read.strip,
|
19
|
+
::Aranha::Parsers::Firefox::RequestHeaderFromFirefox
|
20
|
+
.from_file(path.join(REQUEST_SUBPATH)),
|
21
|
+
body_path.file? ? body_path.read : nil
|
22
|
+
)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
enable_simple_cache
|
27
|
+
common_constructor :the_base_uri, :header, :body, default: [nil] do
|
28
|
+
self.the_base_uri = the_base_uri.to_uri
|
29
|
+
end
|
30
|
+
|
31
|
+
def to_uri_source
|
32
|
+
{
|
33
|
+
method: header.verb,
|
34
|
+
url: url,
|
35
|
+
headers: header.headers,
|
36
|
+
body: body
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
40
|
+
def url
|
41
|
+
(the_base_uri + header.uri).to_s
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
module Parsers
|
5
|
+
module Firefox
|
6
|
+
class RequestHeaderFromFirefox
|
7
|
+
class << self
|
8
|
+
def from_file(path)
|
9
|
+
new(path.to_pathname.read)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
FIRST_LINE_PARSER = /\A(\S+)\s(\S+)\s(\S+)\z/.to_parser do |m|
|
14
|
+
{ verb: m[1], uri: m[2], version: m[3] }
|
15
|
+
end
|
16
|
+
|
17
|
+
HEADER_LINE_PARSER = /\A([^:]+):\s+(.+)\z/.to_parser do |m|
|
18
|
+
m[1..2]
|
19
|
+
end
|
20
|
+
|
21
|
+
enable_simple_cache
|
22
|
+
|
23
|
+
common_constructor :string
|
24
|
+
|
25
|
+
def to_h
|
26
|
+
%w[verb uri headers].index_with { |m| send(m) }
|
27
|
+
end
|
28
|
+
|
29
|
+
def headers
|
30
|
+
all_except_first_line.map { |line| HEADER_LINE_PARSER.parse!(line) }.to_h # rubocop:disable Style/MapToHash
|
31
|
+
end
|
32
|
+
|
33
|
+
def verb
|
34
|
+
parsed_first_line.fetch(:verb)
|
35
|
+
end
|
36
|
+
|
37
|
+
def uri
|
38
|
+
parsed_first_line.fetch(:uri)
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def all_lines_uncached
|
44
|
+
string.each_line.map(&:strip)
|
45
|
+
end
|
46
|
+
|
47
|
+
def parsed_first_line_uncached
|
48
|
+
FIRST_LINE_PARSER.parse!(all_lines.first)
|
49
|
+
end
|
50
|
+
|
51
|
+
def all_except_first_line
|
52
|
+
all_lines[1..-1] # rubocop:disable Style/SlicingWithRange
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Firefox
|
8
|
+
class UriFromHar
|
9
|
+
class << self
|
10
|
+
def from_file(path)
|
11
|
+
new(::JSON.parse(path.to_pathname.read))
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
common_constructor :data
|
16
|
+
|
17
|
+
def result
|
18
|
+
data.fetch('log').fetch('entries').map { |e| e.fetch('request').fetch('url') }
|
19
|
+
end
|
20
|
+
|
21
|
+
def request_data; end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
class SourceAddress
|
8
|
+
class Base
|
9
|
+
acts_as_abstract
|
10
|
+
common_constructor :source
|
11
|
+
compare_by :source
|
12
|
+
|
13
|
+
# @return [String]
|
14
|
+
def content
|
15
|
+
raise_abstract_method __method__
|
16
|
+
end
|
17
|
+
|
18
|
+
# @return [Addressable::URI]
|
19
|
+
def uri
|
20
|
+
raise_abstract_method __method__
|
21
|
+
end
|
22
|
+
|
23
|
+
# @return [String]
|
24
|
+
def url
|
25
|
+
uri.to_s
|
26
|
+
end
|
27
|
+
|
28
|
+
# @return [Hash]
|
29
|
+
def source_as_hash
|
30
|
+
source_as_hash? ? source.with_indifferent_access : raise('source is not a Hash')
|
31
|
+
end
|
32
|
+
|
33
|
+
# @return [Boolean]
|
34
|
+
def source_as_hash?
|
35
|
+
source.is_a?(::Hash)
|
36
|
+
end
|
37
|
+
|
38
|
+
# @return [Addressable::URI]
|
39
|
+
def source_as_uri
|
40
|
+
source_as_uri? ? source.to_uri : raise('source is not a URI')
|
41
|
+
end
|
42
|
+
|
43
|
+
# @return [Boolean]
|
44
|
+
def source_as_uri?
|
45
|
+
source.to_uri.scheme.present?
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Boolean]
|
49
|
+
def valid?
|
50
|
+
raise_abstract_method __method__
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'aranha/parsers/source_address/http_get'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
4
5
|
|
5
6
|
module Aranha
|
6
7
|
module Parsers
|
@@ -8,23 +9,23 @@ module Aranha
|
|
8
9
|
class File < ::Aranha::Parsers::SourceAddress::HttpGet
|
9
10
|
SCHEME = 'file://'
|
10
11
|
|
11
|
-
class << self
|
12
|
-
def valid_source?(source)
|
13
|
-
source.to_s.start_with?("#{SCHEME}/", '/')
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
12
|
def initialize(source)
|
18
13
|
super(source.to_s.gsub(/\A#{Regexp.quote(SCHEME)}/, ''))
|
19
14
|
end
|
20
15
|
|
21
|
-
def url
|
22
|
-
"#{SCHEME}#{source}"
|
23
|
-
end
|
24
|
-
|
25
16
|
def content
|
26
17
|
::File.read(source)
|
27
18
|
end
|
19
|
+
|
20
|
+
# @return [Addressable::URI]
|
21
|
+
def uri
|
22
|
+
source_as_uri? ? source_as_uri : "#{SCHEME}#{source}".to_uri
|
23
|
+
end
|
24
|
+
|
25
|
+
# @return [Boolean]
|
26
|
+
def valid?
|
27
|
+
source.to_s.start_with?("#{SCHEME}/", '/')
|
28
|
+
end
|
28
29
|
end
|
29
30
|
end
|
30
31
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'aranha/parsers/source_address/fetch_content_error'
|
4
|
-
require 'aranha/parsers/source_address/
|
4
|
+
require 'aranha/parsers/source_address/base'
|
5
5
|
require 'eac_envs/http/error'
|
6
6
|
require 'eac_envs/http/request'
|
7
7
|
require 'eac_ruby_utils/core_ext'
|
@@ -10,16 +10,11 @@ require 'yaml'
|
|
10
10
|
module Aranha
|
11
11
|
module Parsers
|
12
12
|
class SourceAddress
|
13
|
-
class HashHttpBase
|
13
|
+
class HashHttpBase < ::Aranha::Parsers::SourceAddress::Base
|
14
14
|
class << self
|
15
15
|
def http_method
|
16
16
|
const_get 'HTTP_METHOD'
|
17
17
|
end
|
18
|
-
|
19
|
-
def valid_source?(source)
|
20
|
-
source.is_a?(::Hash) &&
|
21
|
-
source.with_indifferent_access[:method].to_s.downcase.strip == http_method.to_s
|
22
|
-
end
|
23
18
|
end
|
24
19
|
|
25
20
|
DEFAULT_BODY = ''
|
@@ -31,11 +26,6 @@ module Aranha
|
|
31
26
|
|
32
27
|
enable_simple_cache
|
33
28
|
|
34
|
-
common_constructor :source do
|
35
|
-
self.source = source.with_indifferent_access
|
36
|
-
end
|
37
|
-
compare_by :source
|
38
|
-
|
39
29
|
def body
|
40
30
|
param(:body, DEFAULT_BODY)
|
41
31
|
end
|
@@ -48,27 +38,34 @@ module Aranha
|
|
48
38
|
param(:headers, DEFAULT_HEADERS)
|
49
39
|
end
|
50
40
|
|
51
|
-
def url
|
52
|
-
source.fetch(:url)
|
53
|
-
end
|
54
|
-
|
55
41
|
def serialize
|
56
|
-
|
42
|
+
source_as_hash.to_yaml
|
57
43
|
end
|
58
44
|
|
59
45
|
def content
|
60
46
|
request = http_request
|
61
|
-
request.response.body_str
|
47
|
+
request.response.body_str!
|
62
48
|
rescue ::EacEnvs::Http::Error => e
|
63
49
|
raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
|
64
50
|
end
|
65
51
|
|
66
52
|
def param(key, default_value)
|
67
|
-
|
53
|
+
source_as_hash[key] || params[key] || default_value
|
68
54
|
end
|
69
55
|
|
70
56
|
def params
|
71
|
-
|
57
|
+
source_as_hash[:params].if_present(DEFAULT_PARAMS)
|
58
|
+
end
|
59
|
+
|
60
|
+
# @return [Addressable::URI]
|
61
|
+
def uri
|
62
|
+
::Addressable::URI.parse(source_as_hash.fetch(:url))
|
63
|
+
end
|
64
|
+
|
65
|
+
# @return [Boolean]
|
66
|
+
def valid?
|
67
|
+
source_as_hash? &&
|
68
|
+
source_as_hash[:method].to_s.downcase.strip == self.class.http_method.to_s
|
72
69
|
end
|
73
70
|
|
74
71
|
private
|
@@ -1,54 +1,54 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable'
|
4
|
-
require 'aranha/parsers/source_address/
|
4
|
+
require 'aranha/parsers/source_address/base'
|
5
5
|
require 'eac_envs/http/error'
|
6
6
|
require 'eac_envs/http/request'
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
module Parsers
|
10
10
|
class SourceAddress
|
11
|
-
class HttpGet
|
11
|
+
class HttpGet < ::Aranha::Parsers::SourceAddress::Base
|
12
12
|
class << self
|
13
13
|
def location_uri(source_uri, location)
|
14
14
|
::Addressable::URI.join(source_uri, location).to_s
|
15
15
|
end
|
16
|
-
|
17
|
-
def valid_source?(source)
|
18
|
-
source.to_s =~ %r{\Ahttps?://}
|
19
|
-
end
|
20
16
|
end
|
21
17
|
|
22
|
-
|
23
|
-
|
24
|
-
def initialize(source)
|
25
|
-
@source = source.to_s
|
26
|
-
end
|
18
|
+
common_constructor :source, super_args: -> { [source.to_s] }
|
27
19
|
|
28
20
|
def ==(other)
|
29
21
|
self.class == other.class && source == other.source
|
30
22
|
end
|
31
23
|
|
32
|
-
def url
|
33
|
-
source
|
34
|
-
end
|
35
|
-
|
36
24
|
def final_url
|
37
25
|
content unless @final_url
|
38
26
|
@final_url
|
39
27
|
end
|
40
28
|
|
29
|
+
# @return [String]
|
30
|
+
# @raise [Aranha::Parsers::SourceAddress::FetchContentError]
|
41
31
|
def content
|
42
32
|
request = ::EacEnvs::Http::Request.new.url(url).retry(true).follow_redirect(true)
|
43
33
|
.header('user-agent', self.class.name)
|
44
|
-
request.response.body_str
|
34
|
+
request.response.body_str!
|
45
35
|
rescue ::EacEnvs::Http::Error => e
|
46
|
-
raise ::Aranha::Parsers::SourceAddress::FetchContentError
|
36
|
+
raise ::Aranha::Parsers::SourceAddress::FetchContentError.new(e.message, request)
|
47
37
|
end
|
48
38
|
|
49
39
|
def serialize
|
50
40
|
url
|
51
41
|
end
|
42
|
+
|
43
|
+
# @return [Addressable::URI]
|
44
|
+
def uri
|
45
|
+
source_as_uri
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Boolean]
|
49
|
+
def valid?
|
50
|
+
source.to_s =~ %r{\Ahttps?://}
|
51
|
+
end
|
52
52
|
end
|
53
53
|
end
|
54
54
|
end
|
@@ -20,8 +20,10 @@ module Aranha
|
|
20
20
|
def detect_sub(source)
|
21
21
|
return source.sub if source.is_a?(self)
|
22
22
|
|
23
|
-
SUBS.each do |
|
24
|
-
|
23
|
+
SUBS.each do |sub_class|
|
24
|
+
sub_class.new(source).then do |sub|
|
25
|
+
return sub if sub.valid?
|
26
|
+
end
|
25
27
|
end
|
26
28
|
raise "No content fetcher found for source \"#{source}\""
|
27
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha-parsers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.23.0
|
4
|
+
version: 0.24.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Esquilo Azul Company
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -50,48 +50,34 @@ dependencies:
|
|
50
50
|
requirements:
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '0.
|
53
|
+
version: '0.6'
|
54
54
|
- - ">="
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: 0.
|
56
|
+
version: 0.6.1
|
57
57
|
type: :runtime
|
58
58
|
prerelease: false
|
59
59
|
version_requirements: !ruby/object:Gem::Requirement
|
60
60
|
requirements:
|
61
61
|
- - "~>"
|
62
62
|
- !ruby/object:Gem::Version
|
63
|
-
version: '0.
|
63
|
+
version: '0.6'
|
64
64
|
- - ">="
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: 0.
|
67
|
-
- !ruby/object:Gem::Dependency
|
68
|
-
name: eac_ruby_gem_support
|
69
|
-
requirement: !ruby/object:Gem::Requirement
|
70
|
-
requirements:
|
71
|
-
- - "~>"
|
72
|
-
- !ruby/object:Gem::Version
|
73
|
-
version: '0.10'
|
74
|
-
type: :runtime
|
75
|
-
prerelease: false
|
76
|
-
version_requirements: !ruby/object:Gem::Requirement
|
77
|
-
requirements:
|
78
|
-
- - "~>"
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
version: '0.10'
|
66
|
+
version: 0.6.1
|
81
67
|
- !ruby/object:Gem::Dependency
|
82
68
|
name: eac_ruby_utils
|
83
69
|
requirement: !ruby/object:Gem::Requirement
|
84
70
|
requirements:
|
85
71
|
- - "~>"
|
86
72
|
- !ruby/object:Gem::Version
|
87
|
-
version: '0.
|
73
|
+
version: '0.123'
|
88
74
|
type: :runtime
|
89
75
|
prerelease: false
|
90
76
|
version_requirements: !ruby/object:Gem::Requirement
|
91
77
|
requirements:
|
92
78
|
- - "~>"
|
93
79
|
- !ruby/object:Gem::Version
|
94
|
-
version: '0.
|
80
|
+
version: '0.123'
|
95
81
|
- !ruby/object:Gem::Dependency
|
96
82
|
name: nokogiri
|
97
83
|
requirement: !ruby/object:Gem::Requirement
|
@@ -132,14 +118,14 @@ dependencies:
|
|
132
118
|
requirements:
|
133
119
|
- - "~>"
|
134
120
|
- !ruby/object:Gem::Version
|
135
|
-
version: '0.
|
121
|
+
version: '0.10'
|
136
122
|
type: :development
|
137
123
|
prerelease: false
|
138
124
|
version_requirements: !ruby/object:Gem::Requirement
|
139
125
|
requirements:
|
140
126
|
- - "~>"
|
141
127
|
- !ruby/object:Gem::Version
|
142
|
-
version: '0.
|
128
|
+
version: '0.10'
|
143
129
|
description:
|
144
130
|
email:
|
145
131
|
executables: []
|
@@ -148,6 +134,10 @@ extra_rdoc_files: []
|
|
148
134
|
files:
|
149
135
|
- lib/aranha/parsers.rb
|
150
136
|
- lib/aranha/parsers/base.rb
|
137
|
+
- lib/aranha/parsers/firefox.rb
|
138
|
+
- lib/aranha/parsers/firefox/request_from_firefox.rb
|
139
|
+
- lib/aranha/parsers/firefox/request_header_from_firefox.rb
|
140
|
+
- lib/aranha/parsers/firefox/uri_from_har.rb
|
151
141
|
- lib/aranha/parsers/html.rb
|
152
142
|
- lib/aranha/parsers/html/base.rb
|
153
143
|
- lib/aranha/parsers/html/item.rb
|
@@ -165,6 +155,7 @@ files:
|
|
165
155
|
- lib/aranha/parsers/patches.rb
|
166
156
|
- lib/aranha/parsers/patches/ofx_parser.rb
|
167
157
|
- lib/aranha/parsers/source_address.rb
|
158
|
+
- lib/aranha/parsers/source_address/base.rb
|
168
159
|
- lib/aranha/parsers/source_address/fetch_content_error.rb
|
169
160
|
- lib/aranha/parsers/source_address/file.rb
|
170
161
|
- lib/aranha/parsers/source_address/hash_http_base.rb
|