aranha-parsers 0.23.0 → 0.24.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/aranha/parsers/firefox/request_from_firefox.rb +46 -0
- data/lib/aranha/parsers/firefox/request_header_from_firefox.rb +57 -0
- data/lib/aranha/parsers/firefox/uri_from_har.rb +25 -0
- data/lib/aranha/parsers/firefox.rb +11 -0
- data/lib/aranha/parsers/source_address/base.rb +55 -0
- data/lib/aranha/parsers/source_address/file.rb +11 -10
- data/lib/aranha/parsers/source_address/hash_http_base.rb +17 -20
- data/lib/aranha/parsers/source_address/http_get.rb +17 -17
- data/lib/aranha/parsers/source_address.rb +4 -2
- data/lib/aranha/parsers/version.rb +1 -1
- metadata +15 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e9f2435af805f1aaac850b47ade4df067af4517e9a8d44e39cc0aadc12ef8fe
|
4
|
+
data.tar.gz: 6e1146e598ca952c5f15f0d0c83deaec7382b6487996b489727642ec3ba5a553
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72d12c157d957b8238d2c9f8d39a545d06aa974e9ac49e7103b64c69648e946ecd8c439b626f1196b789a8eb30c17bc3fdf0fb46956999ddacdbbf08d37f4de0
|
7
|
+
data.tar.gz: ee5fe7392f18a318a64ad35bd16341118c4254d978288c64a62ddbaa85e6ff7997018311317614e5c48f1c27f159d547c33151f8ae1c5f768b3a191e343a5faa
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Firefox
|
8
|
+
class RequestFromFirefox
|
9
|
+
BASE_URL_SUBPATH = 'base_url'
|
10
|
+
BODY_SUBPATH = 'body'
|
11
|
+
REQUEST_SUBPATH = 'request'
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def from_directory(path)
|
15
|
+
path = path.to_pathname
|
16
|
+
body_path = path.join(BODY_SUBPATH)
|
17
|
+
new(
|
18
|
+
path.join(BASE_URL_SUBPATH).read.strip,
|
19
|
+
::Aranha::Parsers::Firefox::RequestHeaderFromFirefox
|
20
|
+
.from_file(path.join(REQUEST_SUBPATH)),
|
21
|
+
body_path.file? ? body_path.read : nil
|
22
|
+
)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
enable_simple_cache
|
27
|
+
common_constructor :the_base_uri, :header, :body, default: [nil] do
|
28
|
+
self.the_base_uri = the_base_uri.to_uri
|
29
|
+
end
|
30
|
+
|
31
|
+
def to_uri_source
|
32
|
+
{
|
33
|
+
method: header.verb,
|
34
|
+
url: url,
|
35
|
+
headers: header.headers,
|
36
|
+
body: body
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
40
|
+
def url
|
41
|
+
(the_base_uri + header.uri).to_s
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
module Parsers
|
5
|
+
module Firefox
|
6
|
+
class RequestHeaderFromFirefox
|
7
|
+
class << self
|
8
|
+
def from_file(path)
|
9
|
+
new(path.to_pathname.read)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
FIRST_LINE_PARSER = /\A(\S+)\s(\S+)\s(\S+)\z/.to_parser do |m|
|
14
|
+
{ verb: m[1], uri: m[2], version: m[3] }
|
15
|
+
end
|
16
|
+
|
17
|
+
HEADER_LINE_PARSER = /\A([^:]+):\s+(.+)\z/.to_parser do |m|
|
18
|
+
m[1..2]
|
19
|
+
end
|
20
|
+
|
21
|
+
enable_simple_cache
|
22
|
+
|
23
|
+
common_constructor :string
|
24
|
+
|
25
|
+
def to_h
|
26
|
+
%w[verb uri headers].index_with { |m| send(m) }
|
27
|
+
end
|
28
|
+
|
29
|
+
def headers
|
30
|
+
all_except_first_line.map { |line| HEADER_LINE_PARSER.parse!(line) }.to_h # rubocop:disable Style/MapToHash
|
31
|
+
end
|
32
|
+
|
33
|
+
def verb
|
34
|
+
parsed_first_line.fetch(:verb)
|
35
|
+
end
|
36
|
+
|
37
|
+
def uri
|
38
|
+
parsed_first_line.fetch(:uri)
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def all_lines_uncached
|
44
|
+
string.each_line.map(&:strip)
|
45
|
+
end
|
46
|
+
|
47
|
+
def parsed_first_line_uncached
|
48
|
+
FIRST_LINE_PARSER.parse!(all_lines.first)
|
49
|
+
end
|
50
|
+
|
51
|
+
def all_except_first_line
|
52
|
+
all_lines[1..-1] # rubocop:disable Style/SlicingWithRange
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Firefox
|
8
|
+
class UriFromHar
|
9
|
+
class << self
|
10
|
+
def from_file(path)
|
11
|
+
new(::JSON.parse(path.to_pathname.read))
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
common_constructor :data
|
16
|
+
|
17
|
+
def result
|
18
|
+
data.fetch('log').fetch('entries').map { |e| e.fetch('request').fetch('url') }
|
19
|
+
end
|
20
|
+
|
21
|
+
def request_data; end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
class SourceAddress
|
8
|
+
class Base
|
9
|
+
acts_as_abstract
|
10
|
+
common_constructor :source
|
11
|
+
compare_by :source
|
12
|
+
|
13
|
+
# @return [String]
|
14
|
+
def content
|
15
|
+
raise_abstract_method __method__
|
16
|
+
end
|
17
|
+
|
18
|
+
# @return [Addressable::URI]
|
19
|
+
def uri
|
20
|
+
raise_abstract_method __method__
|
21
|
+
end
|
22
|
+
|
23
|
+
# @return [String]
|
24
|
+
def url
|
25
|
+
uri.to_s
|
26
|
+
end
|
27
|
+
|
28
|
+
# @return [Hash]
|
29
|
+
def source_as_hash
|
30
|
+
source_as_hash? ? source.with_indifferent_access : raise('source is not a Hash')
|
31
|
+
end
|
32
|
+
|
33
|
+
# @return [Boolean]
|
34
|
+
def source_as_hash?
|
35
|
+
source.is_a?(::Hash)
|
36
|
+
end
|
37
|
+
|
38
|
+
# @return [Addressable::URI]
|
39
|
+
def source_as_uri
|
40
|
+
source_as_uri? ? source.to_uri : raise('source is not a URI')
|
41
|
+
end
|
42
|
+
|
43
|
+
# @return [Boolean]
|
44
|
+
def source_as_uri?
|
45
|
+
source.to_uri.scheme.present?
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Boolean]
|
49
|
+
def valid?
|
50
|
+
raise_abstract_method __method__
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'aranha/parsers/source_address/http_get'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
4
5
|
|
5
6
|
module Aranha
|
6
7
|
module Parsers
|
@@ -8,23 +9,23 @@ module Aranha
|
|
8
9
|
class File < ::Aranha::Parsers::SourceAddress::HttpGet
|
9
10
|
SCHEME = 'file://'
|
10
11
|
|
11
|
-
class << self
|
12
|
-
def valid_source?(source)
|
13
|
-
source.to_s.start_with?("#{SCHEME}/", '/')
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
12
|
def initialize(source)
|
18
13
|
super(source.to_s.gsub(/\A#{Regexp.quote(SCHEME)}/, ''))
|
19
14
|
end
|
20
15
|
|
21
|
-
def url
|
22
|
-
"#{SCHEME}#{source}"
|
23
|
-
end
|
24
|
-
|
25
16
|
def content
|
26
17
|
::File.read(source)
|
27
18
|
end
|
19
|
+
|
20
|
+
# @return [Addressable::URI]
|
21
|
+
def uri
|
22
|
+
source_as_uri? ? source_as_uri : "#{SCHEME}#{source}".to_uri
|
23
|
+
end
|
24
|
+
|
25
|
+
# @return [Boolean]
|
26
|
+
def valid?
|
27
|
+
source.to_s.start_with?("#{SCHEME}/", '/')
|
28
|
+
end
|
28
29
|
end
|
29
30
|
end
|
30
31
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'aranha/parsers/source_address/fetch_content_error'
|
4
|
-
require 'aranha/parsers/source_address/
|
4
|
+
require 'aranha/parsers/source_address/base'
|
5
5
|
require 'eac_envs/http/error'
|
6
6
|
require 'eac_envs/http/request'
|
7
7
|
require 'eac_ruby_utils/core_ext'
|
@@ -10,16 +10,11 @@ require 'yaml'
|
|
10
10
|
module Aranha
|
11
11
|
module Parsers
|
12
12
|
class SourceAddress
|
13
|
-
class HashHttpBase
|
13
|
+
class HashHttpBase < ::Aranha::Parsers::SourceAddress::Base
|
14
14
|
class << self
|
15
15
|
def http_method
|
16
16
|
const_get 'HTTP_METHOD'
|
17
17
|
end
|
18
|
-
|
19
|
-
def valid_source?(source)
|
20
|
-
source.is_a?(::Hash) &&
|
21
|
-
source.with_indifferent_access[:method].to_s.downcase.strip == http_method.to_s
|
22
|
-
end
|
23
18
|
end
|
24
19
|
|
25
20
|
DEFAULT_BODY = ''
|
@@ -31,11 +26,6 @@ module Aranha
|
|
31
26
|
|
32
27
|
enable_simple_cache
|
33
28
|
|
34
|
-
common_constructor :source do
|
35
|
-
self.source = source.with_indifferent_access
|
36
|
-
end
|
37
|
-
compare_by :source
|
38
|
-
|
39
29
|
def body
|
40
30
|
param(:body, DEFAULT_BODY)
|
41
31
|
end
|
@@ -48,27 +38,34 @@ module Aranha
|
|
48
38
|
param(:headers, DEFAULT_HEADERS)
|
49
39
|
end
|
50
40
|
|
51
|
-
def url
|
52
|
-
source.fetch(:url)
|
53
|
-
end
|
54
|
-
|
55
41
|
def serialize
|
56
|
-
|
42
|
+
source_as_hash.to_yaml
|
57
43
|
end
|
58
44
|
|
59
45
|
def content
|
60
46
|
request = http_request
|
61
|
-
request.response.body_str
|
47
|
+
request.response.body_str!
|
62
48
|
rescue ::EacEnvs::Http::Error => e
|
63
49
|
raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
|
64
50
|
end
|
65
51
|
|
66
52
|
def param(key, default_value)
|
67
|
-
|
53
|
+
source_as_hash[key] || params[key] || default_value
|
68
54
|
end
|
69
55
|
|
70
56
|
def params
|
71
|
-
|
57
|
+
source_as_hash[:params].if_present(DEFAULT_PARAMS)
|
58
|
+
end
|
59
|
+
|
60
|
+
# @return [Addressable::URI]
|
61
|
+
def uri
|
62
|
+
::Addressable::URI.parse(source_as_hash.fetch(:url))
|
63
|
+
end
|
64
|
+
|
65
|
+
# @return [Boolean]
|
66
|
+
def valid?
|
67
|
+
source_as_hash? &&
|
68
|
+
source_as_hash[:method].to_s.downcase.strip == self.class.http_method.to_s
|
72
69
|
end
|
73
70
|
|
74
71
|
private
|
@@ -1,54 +1,54 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable'
|
4
|
-
require 'aranha/parsers/source_address/
|
4
|
+
require 'aranha/parsers/source_address/base'
|
5
5
|
require 'eac_envs/http/error'
|
6
6
|
require 'eac_envs/http/request'
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
module Parsers
|
10
10
|
class SourceAddress
|
11
|
-
class HttpGet
|
11
|
+
class HttpGet < ::Aranha::Parsers::SourceAddress::Base
|
12
12
|
class << self
|
13
13
|
def location_uri(source_uri, location)
|
14
14
|
::Addressable::URI.join(source_uri, location).to_s
|
15
15
|
end
|
16
|
-
|
17
|
-
def valid_source?(source)
|
18
|
-
source.to_s =~ %r{\Ahttps?://}
|
19
|
-
end
|
20
16
|
end
|
21
17
|
|
22
|
-
|
23
|
-
|
24
|
-
def initialize(source)
|
25
|
-
@source = source.to_s
|
26
|
-
end
|
18
|
+
common_constructor :source, super_args: -> { [source.to_s] }
|
27
19
|
|
28
20
|
def ==(other)
|
29
21
|
self.class == other.class && source == other.source
|
30
22
|
end
|
31
23
|
|
32
|
-
def url
|
33
|
-
source
|
34
|
-
end
|
35
|
-
|
36
24
|
def final_url
|
37
25
|
content unless @final_url
|
38
26
|
@final_url
|
39
27
|
end
|
40
28
|
|
29
|
+
# @return [String]
|
30
|
+
# @raise [Aranha::Parsers::SourceAddress::FetchContentError]
|
41
31
|
def content
|
42
32
|
request = ::EacEnvs::Http::Request.new.url(url).retry(true).follow_redirect(true)
|
43
33
|
.header('user-agent', self.class.name)
|
44
|
-
request.response.body_str
|
34
|
+
request.response.body_str!
|
45
35
|
rescue ::EacEnvs::Http::Error => e
|
46
|
-
raise ::Aranha::Parsers::SourceAddress::FetchContentError
|
36
|
+
raise ::Aranha::Parsers::SourceAddress::FetchContentError.new(e.message, request)
|
47
37
|
end
|
48
38
|
|
49
39
|
def serialize
|
50
40
|
url
|
51
41
|
end
|
42
|
+
|
43
|
+
# @return [Addressable::URI]
|
44
|
+
def uri
|
45
|
+
source_as_uri
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Boolean]
|
49
|
+
def valid?
|
50
|
+
source.to_s =~ %r{\Ahttps?://}
|
51
|
+
end
|
52
52
|
end
|
53
53
|
end
|
54
54
|
end
|
@@ -20,8 +20,10 @@ module Aranha
|
|
20
20
|
def detect_sub(source)
|
21
21
|
return source.sub if source.is_a?(self)
|
22
22
|
|
23
|
-
SUBS.each do |
|
24
|
-
|
23
|
+
SUBS.each do |sub_class|
|
24
|
+
sub_class.new(source).then do |sub|
|
25
|
+
return sub if sub.valid?
|
26
|
+
end
|
25
27
|
end
|
26
28
|
raise "No content fetcher found for source \"#{source}\""
|
27
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha-parsers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.23.0
|
4
|
+
version: 0.24.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Esquilo Azul Company
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -50,48 +50,34 @@ dependencies:
|
|
50
50
|
requirements:
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '0.
|
53
|
+
version: '0.6'
|
54
54
|
- - ">="
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: 0.
|
56
|
+
version: 0.6.1
|
57
57
|
type: :runtime
|
58
58
|
prerelease: false
|
59
59
|
version_requirements: !ruby/object:Gem::Requirement
|
60
60
|
requirements:
|
61
61
|
- - "~>"
|
62
62
|
- !ruby/object:Gem::Version
|
63
|
-
version: '0.
|
63
|
+
version: '0.6'
|
64
64
|
- - ">="
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: 0.
|
67
|
-
- !ruby/object:Gem::Dependency
|
68
|
-
name: eac_ruby_gem_support
|
69
|
-
requirement: !ruby/object:Gem::Requirement
|
70
|
-
requirements:
|
71
|
-
- - "~>"
|
72
|
-
- !ruby/object:Gem::Version
|
73
|
-
version: '0.10'
|
74
|
-
type: :runtime
|
75
|
-
prerelease: false
|
76
|
-
version_requirements: !ruby/object:Gem::Requirement
|
77
|
-
requirements:
|
78
|
-
- - "~>"
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
version: '0.10'
|
66
|
+
version: 0.6.1
|
81
67
|
- !ruby/object:Gem::Dependency
|
82
68
|
name: eac_ruby_utils
|
83
69
|
requirement: !ruby/object:Gem::Requirement
|
84
70
|
requirements:
|
85
71
|
- - "~>"
|
86
72
|
- !ruby/object:Gem::Version
|
87
|
-
version: '0.
|
73
|
+
version: '0.123'
|
88
74
|
type: :runtime
|
89
75
|
prerelease: false
|
90
76
|
version_requirements: !ruby/object:Gem::Requirement
|
91
77
|
requirements:
|
92
78
|
- - "~>"
|
93
79
|
- !ruby/object:Gem::Version
|
94
|
-
version: '0.
|
80
|
+
version: '0.123'
|
95
81
|
- !ruby/object:Gem::Dependency
|
96
82
|
name: nokogiri
|
97
83
|
requirement: !ruby/object:Gem::Requirement
|
@@ -132,14 +118,14 @@ dependencies:
|
|
132
118
|
requirements:
|
133
119
|
- - "~>"
|
134
120
|
- !ruby/object:Gem::Version
|
135
|
-
version: '0.
|
121
|
+
version: '0.10'
|
136
122
|
type: :development
|
137
123
|
prerelease: false
|
138
124
|
version_requirements: !ruby/object:Gem::Requirement
|
139
125
|
requirements:
|
140
126
|
- - "~>"
|
141
127
|
- !ruby/object:Gem::Version
|
142
|
-
version: '0.
|
128
|
+
version: '0.10'
|
143
129
|
description:
|
144
130
|
email:
|
145
131
|
executables: []
|
@@ -148,6 +134,10 @@ extra_rdoc_files: []
|
|
148
134
|
files:
|
149
135
|
- lib/aranha/parsers.rb
|
150
136
|
- lib/aranha/parsers/base.rb
|
137
|
+
- lib/aranha/parsers/firefox.rb
|
138
|
+
- lib/aranha/parsers/firefox/request_from_firefox.rb
|
139
|
+
- lib/aranha/parsers/firefox/request_header_from_firefox.rb
|
140
|
+
- lib/aranha/parsers/firefox/uri_from_har.rb
|
151
141
|
- lib/aranha/parsers/html.rb
|
152
142
|
- lib/aranha/parsers/html/base.rb
|
153
143
|
- lib/aranha/parsers/html/item.rb
|
@@ -165,6 +155,7 @@ files:
|
|
165
155
|
- lib/aranha/parsers/patches.rb
|
166
156
|
- lib/aranha/parsers/patches/ofx_parser.rb
|
167
157
|
- lib/aranha/parsers/source_address.rb
|
158
|
+
- lib/aranha/parsers/source_address/base.rb
|
168
159
|
- lib/aranha/parsers/source_address/fetch_content_error.rb
|
169
160
|
- lib/aranha/parsers/source_address/file.rb
|
170
161
|
- lib/aranha/parsers/source_address/hash_http_base.rb
|