aranha-parsers 0.23.1 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/aranha/parsers/firefox/request_from_firefox.rb +46 -0
- data/lib/aranha/parsers/firefox/request_header_from_firefox.rb +57 -0
- data/lib/aranha/parsers/firefox/uri_from_har.rb +25 -0
- data/lib/aranha/parsers/firefox.rb +11 -0
- data/lib/aranha/parsers/source_address/base.rb +55 -0
- data/lib/aranha/parsers/source_address/file.rb +11 -10
- data/lib/aranha/parsers/source_address/hash_http_base.rb +16 -19
- data/lib/aranha/parsers/source_address/http_get.rb +13 -15
- data/lib/aranha/parsers/source_address.rb +4 -2
- data/lib/aranha/parsers/version.rb +1 -1
- metadata +13 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e9f2435af805f1aaac850b47ade4df067af4517e9a8d44e39cc0aadc12ef8fe
|
4
|
+
data.tar.gz: 6e1146e598ca952c5f15f0d0c83deaec7382b6487996b489727642ec3ba5a553
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72d12c157d957b8238d2c9f8d39a545d06aa974e9ac49e7103b64c69648e946ecd8c439b626f1196b789a8eb30c17bc3fdf0fb46956999ddacdbbf08d37f4de0
|
7
|
+
data.tar.gz: ee5fe7392f18a318a64ad35bd16341118c4254d978288c64a62ddbaa85e6ff7997018311317614e5c48f1c27f159d547c33151f8ae1c5f768b3a191e343a5faa
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Firefox
|
8
|
+
class RequestFromFirefox
|
9
|
+
BASE_URL_SUBPATH = 'base_url'
|
10
|
+
BODY_SUBPATH = 'body'
|
11
|
+
REQUEST_SUBPATH = 'request'
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def from_directory(path)
|
15
|
+
path = path.to_pathname
|
16
|
+
body_path = path.join(BODY_SUBPATH)
|
17
|
+
new(
|
18
|
+
path.join(BASE_URL_SUBPATH).read.strip,
|
19
|
+
::Aranha::Parsers::Firefox::RequestHeaderFromFirefox
|
20
|
+
.from_file(path.join(REQUEST_SUBPATH)),
|
21
|
+
body_path.file? ? body_path.read : nil
|
22
|
+
)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
enable_simple_cache
|
27
|
+
common_constructor :the_base_uri, :header, :body, default: [nil] do
|
28
|
+
self.the_base_uri = the_base_uri.to_uri
|
29
|
+
end
|
30
|
+
|
31
|
+
def to_uri_source
|
32
|
+
{
|
33
|
+
method: header.verb,
|
34
|
+
url: url,
|
35
|
+
headers: header.headers,
|
36
|
+
body: body
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
40
|
+
def url
|
41
|
+
(the_base_uri + header.uri).to_s
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Aranha
|
4
|
+
module Parsers
|
5
|
+
module Firefox
|
6
|
+
class RequestHeaderFromFirefox
|
7
|
+
class << self
|
8
|
+
def from_file(path)
|
9
|
+
new(path.to_pathname.read)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
FIRST_LINE_PARSER = /\A(\S+)\s(\S+)\s(\S+)\z/.to_parser do |m|
|
14
|
+
{ verb: m[1], uri: m[2], version: m[3] }
|
15
|
+
end
|
16
|
+
|
17
|
+
HEADER_LINE_PARSER = /\A([^:]+):\s+(.+)\z/.to_parser do |m|
|
18
|
+
m[1..2]
|
19
|
+
end
|
20
|
+
|
21
|
+
enable_simple_cache
|
22
|
+
|
23
|
+
common_constructor :string
|
24
|
+
|
25
|
+
def to_h
|
26
|
+
%w[verb uri headers].index_with { |m| send(m) }
|
27
|
+
end
|
28
|
+
|
29
|
+
def headers
|
30
|
+
all_except_first_line.map { |line| HEADER_LINE_PARSER.parse!(line) }.to_h # rubocop:disable Style/MapToHash
|
31
|
+
end
|
32
|
+
|
33
|
+
def verb
|
34
|
+
parsed_first_line.fetch(:verb)
|
35
|
+
end
|
36
|
+
|
37
|
+
def uri
|
38
|
+
parsed_first_line.fetch(:uri)
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def all_lines_uncached
|
44
|
+
string.each_line.map(&:strip)
|
45
|
+
end
|
46
|
+
|
47
|
+
def parsed_first_line_uncached
|
48
|
+
FIRST_LINE_PARSER.parse!(all_lines.first)
|
49
|
+
end
|
50
|
+
|
51
|
+
def all_except_first_line
|
52
|
+
all_lines[1..-1] # rubocop:disable Style/SlicingWithRange
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
module Firefox
|
8
|
+
class UriFromHar
|
9
|
+
class << self
|
10
|
+
def from_file(path)
|
11
|
+
new(::JSON.parse(path.to_pathname.read))
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
common_constructor :data
|
16
|
+
|
17
|
+
def result
|
18
|
+
data.fetch('log').fetch('entries').map { |e| e.fetch('request').fetch('url') }
|
19
|
+
end
|
20
|
+
|
21
|
+
def request_data; end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'eac_ruby_utils/core_ext'
|
4
|
+
|
5
|
+
module Aranha
|
6
|
+
module Parsers
|
7
|
+
class SourceAddress
|
8
|
+
class Base
|
9
|
+
acts_as_abstract
|
10
|
+
common_constructor :source
|
11
|
+
compare_by :source
|
12
|
+
|
13
|
+
# @return [String]
|
14
|
+
def content
|
15
|
+
raise_abstract_method __method__
|
16
|
+
end
|
17
|
+
|
18
|
+
# @return [Addressable::URI]
|
19
|
+
def uri
|
20
|
+
raise_abstract_method __method__
|
21
|
+
end
|
22
|
+
|
23
|
+
# @return [String]
|
24
|
+
def url
|
25
|
+
uri.to_s
|
26
|
+
end
|
27
|
+
|
28
|
+
# @return [Hash]
|
29
|
+
def source_as_hash
|
30
|
+
source_as_hash? ? source.with_indifferent_access : raise('source is not a Hash')
|
31
|
+
end
|
32
|
+
|
33
|
+
# @return [Boolean]
|
34
|
+
def source_as_hash?
|
35
|
+
source.is_a?(::Hash)
|
36
|
+
end
|
37
|
+
|
38
|
+
# @return [Addressable::URI]
|
39
|
+
def source_as_uri
|
40
|
+
source_as_uri? ? source.to_uri : raise('source is not a URI')
|
41
|
+
end
|
42
|
+
|
43
|
+
# @return [Boolean]
|
44
|
+
def source_as_uri?
|
45
|
+
source.to_uri.scheme.present?
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Boolean]
|
49
|
+
def valid?
|
50
|
+
raise_abstract_method __method__
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'aranha/parsers/source_address/http_get'
|
4
|
+
require 'eac_ruby_utils/core_ext'
|
4
5
|
|
5
6
|
module Aranha
|
6
7
|
module Parsers
|
@@ -8,23 +9,23 @@ module Aranha
|
|
8
9
|
class File < ::Aranha::Parsers::SourceAddress::HttpGet
|
9
10
|
SCHEME = 'file://'
|
10
11
|
|
11
|
-
class << self
|
12
|
-
def valid_source?(source)
|
13
|
-
source.to_s.start_with?("#{SCHEME}/", '/')
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
12
|
def initialize(source)
|
18
13
|
super(source.to_s.gsub(/\A#{Regexp.quote(SCHEME)}/, ''))
|
19
14
|
end
|
20
15
|
|
21
|
-
def url
|
22
|
-
"#{SCHEME}#{source}"
|
23
|
-
end
|
24
|
-
|
25
16
|
def content
|
26
17
|
::File.read(source)
|
27
18
|
end
|
19
|
+
|
20
|
+
# @return [Addressable::URI]
|
21
|
+
def uri
|
22
|
+
source_as_uri? ? source_as_uri : "#{SCHEME}#{source}".to_uri
|
23
|
+
end
|
24
|
+
|
25
|
+
# @return [Boolean]
|
26
|
+
def valid?
|
27
|
+
source.to_s.start_with?("#{SCHEME}/", '/')
|
28
|
+
end
|
28
29
|
end
|
29
30
|
end
|
30
31
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'aranha/parsers/source_address/fetch_content_error'
|
4
|
-
require 'aranha/parsers/source_address/
|
4
|
+
require 'aranha/parsers/source_address/base'
|
5
5
|
require 'eac_envs/http/error'
|
6
6
|
require 'eac_envs/http/request'
|
7
7
|
require 'eac_ruby_utils/core_ext'
|
@@ -10,16 +10,11 @@ require 'yaml'
|
|
10
10
|
module Aranha
|
11
11
|
module Parsers
|
12
12
|
class SourceAddress
|
13
|
-
class HashHttpBase
|
13
|
+
class HashHttpBase < ::Aranha::Parsers::SourceAddress::Base
|
14
14
|
class << self
|
15
15
|
def http_method
|
16
16
|
const_get 'HTTP_METHOD'
|
17
17
|
end
|
18
|
-
|
19
|
-
def valid_source?(source)
|
20
|
-
source.is_a?(::Hash) &&
|
21
|
-
source.with_indifferent_access[:method].to_s.downcase.strip == http_method.to_s
|
22
|
-
end
|
23
18
|
end
|
24
19
|
|
25
20
|
DEFAULT_BODY = ''
|
@@ -31,11 +26,6 @@ module Aranha
|
|
31
26
|
|
32
27
|
enable_simple_cache
|
33
28
|
|
34
|
-
common_constructor :source do
|
35
|
-
self.source = source.with_indifferent_access
|
36
|
-
end
|
37
|
-
compare_by :source
|
38
|
-
|
39
29
|
def body
|
40
30
|
param(:body, DEFAULT_BODY)
|
41
31
|
end
|
@@ -48,12 +38,8 @@ module Aranha
|
|
48
38
|
param(:headers, DEFAULT_HEADERS)
|
49
39
|
end
|
50
40
|
|
51
|
-
def url
|
52
|
-
source.fetch(:url)
|
53
|
-
end
|
54
|
-
|
55
41
|
def serialize
|
56
|
-
|
42
|
+
source_as_hash.to_yaml
|
57
43
|
end
|
58
44
|
|
59
45
|
def content
|
@@ -64,11 +50,22 @@ module Aranha
|
|
64
50
|
end
|
65
51
|
|
66
52
|
def param(key, default_value)
|
67
|
-
|
53
|
+
source_as_hash[key] || params[key] || default_value
|
68
54
|
end
|
69
55
|
|
70
56
|
def params
|
71
|
-
|
57
|
+
source_as_hash[:params].if_present(DEFAULT_PARAMS)
|
58
|
+
end
|
59
|
+
|
60
|
+
# @return [Addressable::URI]
|
61
|
+
def uri
|
62
|
+
::Addressable::URI.parse(source_as_hash.fetch(:url))
|
63
|
+
end
|
64
|
+
|
65
|
+
# @return [Boolean]
|
66
|
+
def valid?
|
67
|
+
source_as_hash? &&
|
68
|
+
source_as_hash[:method].to_s.downcase.strip == self.class.http_method.to_s
|
72
69
|
end
|
73
70
|
|
74
71
|
private
|
@@ -1,38 +1,26 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable'
|
4
|
-
require 'aranha/parsers/source_address/
|
4
|
+
require 'aranha/parsers/source_address/base'
|
5
5
|
require 'eac_envs/http/error'
|
6
6
|
require 'eac_envs/http/request'
|
7
7
|
|
8
8
|
module Aranha
|
9
9
|
module Parsers
|
10
10
|
class SourceAddress
|
11
|
-
class HttpGet
|
11
|
+
class HttpGet < ::Aranha::Parsers::SourceAddress::Base
|
12
12
|
class << self
|
13
13
|
def location_uri(source_uri, location)
|
14
14
|
::Addressable::URI.join(source_uri, location).to_s
|
15
15
|
end
|
16
|
-
|
17
|
-
def valid_source?(source)
|
18
|
-
source.to_s =~ %r{\Ahttps?://}
|
19
|
-
end
|
20
16
|
end
|
21
17
|
|
22
|
-
|
23
|
-
|
24
|
-
def initialize(source)
|
25
|
-
@source = source.to_s
|
26
|
-
end
|
18
|
+
common_constructor :source, super_args: -> { [source.to_s] }
|
27
19
|
|
28
20
|
def ==(other)
|
29
21
|
self.class == other.class && source == other.source
|
30
22
|
end
|
31
23
|
|
32
|
-
def url
|
33
|
-
source
|
34
|
-
end
|
35
|
-
|
36
24
|
def final_url
|
37
25
|
content unless @final_url
|
38
26
|
@final_url
|
@@ -51,6 +39,16 @@ module Aranha
|
|
51
39
|
def serialize
|
52
40
|
url
|
53
41
|
end
|
42
|
+
|
43
|
+
# @return [Addressable::URI]
|
44
|
+
def uri
|
45
|
+
source_as_uri
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Boolean]
|
49
|
+
def valid?
|
50
|
+
source.to_s =~ %r{\Ahttps?://}
|
51
|
+
end
|
54
52
|
end
|
55
53
|
end
|
56
54
|
end
|
@@ -20,8 +20,10 @@ module Aranha
|
|
20
20
|
def detect_sub(source)
|
21
21
|
return source.sub if source.is_a?(self)
|
22
22
|
|
23
|
-
SUBS.each do |
|
24
|
-
|
23
|
+
SUBS.each do |sub_class|
|
24
|
+
sub_class.new(source).then do |sub|
|
25
|
+
return sub if sub.valid?
|
26
|
+
end
|
25
27
|
end
|
26
28
|
raise "No content fetcher found for source \"#{source}\""
|
27
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aranha-parsers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.23.1
|
4
|
+
version: 0.24.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Esquilo Azul Company
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -51,6 +51,9 @@ dependencies:
|
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
53
|
version: '0.6'
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: 0.6.1
|
54
57
|
type: :runtime
|
55
58
|
prerelease: false
|
56
59
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -58,6 +61,9 @@ dependencies:
|
|
58
61
|
- - "~>"
|
59
62
|
- !ruby/object:Gem::Version
|
60
63
|
version: '0.6'
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 0.6.1
|
61
67
|
- !ruby/object:Gem::Dependency
|
62
68
|
name: eac_ruby_utils
|
63
69
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,6 +134,10 @@ extra_rdoc_files: []
|
|
128
134
|
files:
|
129
135
|
- lib/aranha/parsers.rb
|
130
136
|
- lib/aranha/parsers/base.rb
|
137
|
+
- lib/aranha/parsers/firefox.rb
|
138
|
+
- lib/aranha/parsers/firefox/request_from_firefox.rb
|
139
|
+
- lib/aranha/parsers/firefox/request_header_from_firefox.rb
|
140
|
+
- lib/aranha/parsers/firefox/uri_from_har.rb
|
131
141
|
- lib/aranha/parsers/html.rb
|
132
142
|
- lib/aranha/parsers/html/base.rb
|
133
143
|
- lib/aranha/parsers/html/item.rb
|
@@ -145,6 +155,7 @@ files:
|
|
145
155
|
- lib/aranha/parsers/patches.rb
|
146
156
|
- lib/aranha/parsers/patches/ofx_parser.rb
|
147
157
|
- lib/aranha/parsers/source_address.rb
|
158
|
+
- lib/aranha/parsers/source_address/base.rb
|
148
159
|
- lib/aranha/parsers/source_address/fetch_content_error.rb
|
149
160
|
- lib/aranha/parsers/source_address/file.rb
|
150
161
|
- lib/aranha/parsers/source_address/hash_http_base.rb
|