aranha-parsers 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 29047ecc46278ee7a2a65dbd78f68e02274cbff544eb40c975468ce0f5e01111
4
- data.tar.gz: b99153904aab532fae9fe1bb2e73b6e21b69a0650bebb80cf8c1e6d20edb8b2c
3
+ metadata.gz: 0e9f2435af805f1aaac850b47ade4df067af4517e9a8d44e39cc0aadc12ef8fe
4
+ data.tar.gz: 6e1146e598ca952c5f15f0d0c83deaec7382b6487996b489727642ec3ba5a553
5
5
  SHA512:
6
- metadata.gz: c5292e264c641a4fd05d44f1799abe47c2a9672de472bcbeca7fe1f9de60b94c4e6ce0b33029736c906f4043df14c4be47a39b98150c68db9a91bfd57d7b1f90
7
- data.tar.gz: f2b9d85ea85c9902fb9ebbcfc2f2cc80fc66711fb93ebc569b9e32e7e5e007ea1917c707211919aa85125970449ebf07472df9cc3efd735288da6a6adbf129e6
6
+ metadata.gz: 72d12c157d957b8238d2c9f8d39a545d06aa974e9ac49e7103b64c69648e946ecd8c439b626f1196b789a8eb30c17bc3fdf0fb46956999ddacdbbf08d37f4de0
7
+ data.tar.gz: ee5fe7392f18a318a64ad35bd16341118c4254d978288c64a62ddbaa85e6ff7997018311317614e5c48f1c27f159d547c33151f8ae1c5f768b3a191e343a5faa
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Firefox
8
+ class RequestFromFirefox
9
+ BASE_URL_SUBPATH = 'base_url'
10
+ BODY_SUBPATH = 'body'
11
+ REQUEST_SUBPATH = 'request'
12
+
13
+ class << self
14
+ def from_directory(path)
15
+ path = path.to_pathname
16
+ body_path = path.join(BODY_SUBPATH)
17
+ new(
18
+ path.join(BASE_URL_SUBPATH).read.strip,
19
+ ::Aranha::Parsers::Firefox::RequestHeaderFromFirefox
20
+ .from_file(path.join(REQUEST_SUBPATH)),
21
+ body_path.file? ? body_path.read : nil
22
+ )
23
+ end
24
+ end
25
+
26
+ enable_simple_cache
27
+ common_constructor :the_base_uri, :header, :body, default: [nil] do
28
+ self.the_base_uri = the_base_uri.to_uri
29
+ end
30
+
31
+ def to_uri_source
32
+ {
33
+ method: header.verb,
34
+ url: url,
35
+ headers: header.headers,
36
+ body: body
37
+ }
38
+ end
39
+
40
+ def url
41
+ (the_base_uri + header.uri).to_s
42
+ end
43
+ end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Aranha
4
+ module Parsers
5
+ module Firefox
6
+ class RequestHeaderFromFirefox
7
+ class << self
8
+ def from_file(path)
9
+ new(path.to_pathname.read)
10
+ end
11
+ end
12
+
13
+ FIRST_LINE_PARSER = /\A(\S+)\s(\S+)\s(\S+)\z/.to_parser do |m|
14
+ { verb: m[1], uri: m[2], version: m[3] }
15
+ end
16
+
17
+ HEADER_LINE_PARSER = /\A([^:]+):\s+(.+)\z/.to_parser do |m|
18
+ m[1..2]
19
+ end
20
+
21
+ enable_simple_cache
22
+
23
+ common_constructor :string
24
+
25
+ def to_h
26
+ %w[verb uri headers].index_with { |m| send(m) }
27
+ end
28
+
29
+ def headers
30
+ all_except_first_line.map { |line| HEADER_LINE_PARSER.parse!(line) }.to_h # rubocop:disable Style/MapToHash
31
+ end
32
+
33
+ def verb
34
+ parsed_first_line.fetch(:verb)
35
+ end
36
+
37
+ def uri
38
+ parsed_first_line.fetch(:uri)
39
+ end
40
+
41
+ private
42
+
43
+ def all_lines_uncached
44
+ string.each_line.map(&:strip)
45
+ end
46
+
47
+ def parsed_first_line_uncached
48
+ FIRST_LINE_PARSER.parse!(all_lines.first)
49
+ end
50
+
51
+ def all_except_first_line
52
+ all_lines[1..-1] # rubocop:disable Style/SlicingWithRange
53
+ end
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Firefox
8
+ class UriFromHar
9
+ class << self
10
+ def from_file(path)
11
+ new(::JSON.parse(path.to_pathname.read))
12
+ end
13
+ end
14
+
15
+ common_constructor :data
16
+
17
+ def result
18
+ data.fetch('log').fetch('entries').map { |e| e.fetch('request').fetch('url') }
19
+ end
20
+
21
+ def request_data; end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/require_sub'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Firefox
8
+ ::EacRubyUtils.require_sub __FILE__
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ class SourceAddress
8
+ class Base
9
+ acts_as_abstract
10
+ common_constructor :source
11
+ compare_by :source
12
+
13
+ # @return [String]
14
+ def content
15
+ raise_abstract_method __method__
16
+ end
17
+
18
+ # @return [Addressable::URI]
19
+ def uri
20
+ raise_abstract_method __method__
21
+ end
22
+
23
+ # @return [String]
24
+ def url
25
+ uri.to_s
26
+ end
27
+
28
+ # @return [Hash]
29
+ def source_as_hash
30
+ source_as_hash? ? source.with_indifferent_access : raise('source is not a Hash')
31
+ end
32
+
33
+ # @return [Boolean]
34
+ def source_as_hash?
35
+ source.is_a?(::Hash)
36
+ end
37
+
38
+ # @return [Addressable::URI]
39
+ def source_as_uri
40
+ source_as_uri? ? source.to_uri : raise('source is not a URI')
41
+ end
42
+
43
+ # @return [Boolean]
44
+ def source_as_uri?
45
+ source.to_uri.scheme.present?
46
+ end
47
+
48
+ # @return [Boolean]
49
+ def valid?
50
+ raise_abstract_method __method__
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/source_address/http_get'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  module Parsers
@@ -8,23 +9,23 @@ module Aranha
8
9
  class File < ::Aranha::Parsers::SourceAddress::HttpGet
9
10
  SCHEME = 'file://'
10
11
 
11
- class << self
12
- def valid_source?(source)
13
- source.to_s.start_with?("#{SCHEME}/", '/')
14
- end
15
- end
16
-
17
12
  def initialize(source)
18
13
  super(source.to_s.gsub(/\A#{Regexp.quote(SCHEME)}/, ''))
19
14
  end
20
15
 
21
- def url
22
- "#{SCHEME}#{source}"
23
- end
24
-
25
16
  def content
26
17
  ::File.read(source)
27
18
  end
19
+
20
+ # @return [Addressable::URI]
21
+ def uri
22
+ source_as_uri? ? source_as_uri : "#{SCHEME}#{source}".to_uri
23
+ end
24
+
25
+ # @return [Boolean]
26
+ def valid?
27
+ source.to_s.start_with?("#{SCHEME}/", '/')
28
+ end
28
29
  end
29
30
  end
30
31
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/source_address/fetch_content_error'
4
- require 'aranha/parsers/source_address/hash_http_base'
4
+ require 'aranha/parsers/source_address/base'
5
5
  require 'eac_envs/http/error'
6
6
  require 'eac_envs/http/request'
7
7
  require 'eac_ruby_utils/core_ext'
@@ -10,16 +10,11 @@ require 'yaml'
10
10
  module Aranha
11
11
  module Parsers
12
12
  class SourceAddress
13
- class HashHttpBase
13
+ class HashHttpBase < ::Aranha::Parsers::SourceAddress::Base
14
14
  class << self
15
15
  def http_method
16
16
  const_get 'HTTP_METHOD'
17
17
  end
18
-
19
- def valid_source?(source)
20
- source.is_a?(::Hash) &&
21
- source.with_indifferent_access[:method].to_s.downcase.strip == http_method.to_s
22
- end
23
18
  end
24
19
 
25
20
  DEFAULT_BODY = ''
@@ -31,11 +26,6 @@ module Aranha
31
26
 
32
27
  enable_simple_cache
33
28
 
34
- common_constructor :source do
35
- self.source = source.with_indifferent_access
36
- end
37
- compare_by :source
38
-
39
29
  def body
40
30
  param(:body, DEFAULT_BODY)
41
31
  end
@@ -48,27 +38,34 @@ module Aranha
48
38
  param(:headers, DEFAULT_HEADERS)
49
39
  end
50
40
 
51
- def url
52
- source.fetch(:url)
53
- end
54
-
55
41
  def serialize
56
- source.to_yaml
42
+ source_as_hash.to_yaml
57
43
  end
58
44
 
59
45
  def content
60
46
  request = http_request
61
- request.response.body_str
47
+ request.response.body_str!
62
48
  rescue ::EacEnvs::Http::Error => e
63
49
  raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
64
50
  end
65
51
 
66
52
  def param(key, default_value)
67
- source[key] || params[key] || default_value
53
+ source_as_hash[key] || params[key] || default_value
68
54
  end
69
55
 
70
56
  def params
71
- source[:params].if_present(DEFAULT_PARAMS)
57
+ source_as_hash[:params].if_present(DEFAULT_PARAMS)
58
+ end
59
+
60
+ # @return [Addressable::URI]
61
+ def uri
62
+ ::Addressable::URI.parse(source_as_hash.fetch(:url))
63
+ end
64
+
65
+ # @return [Boolean]
66
+ def valid?
67
+ source_as_hash? &&
68
+ source_as_hash[:method].to_s.downcase.strip == self.class.http_method.to_s
72
69
  end
73
70
 
74
71
  private
@@ -1,54 +1,54 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable'
4
- require 'aranha/parsers/source_address/fetch_content_error'
4
+ require 'aranha/parsers/source_address/base'
5
5
  require 'eac_envs/http/error'
6
6
  require 'eac_envs/http/request'
7
7
 
8
8
  module Aranha
9
9
  module Parsers
10
10
  class SourceAddress
11
- class HttpGet
11
+ class HttpGet < ::Aranha::Parsers::SourceAddress::Base
12
12
  class << self
13
13
  def location_uri(source_uri, location)
14
14
  ::Addressable::URI.join(source_uri, location).to_s
15
15
  end
16
-
17
- def valid_source?(source)
18
- source.to_s =~ %r{\Ahttps?://}
19
- end
20
16
  end
21
17
 
22
- attr_reader :source
23
-
24
- def initialize(source)
25
- @source = source.to_s
26
- end
18
+ common_constructor :source, super_args: -> { [source.to_s] }
27
19
 
28
20
  def ==(other)
29
21
  self.class == other.class && source == other.source
30
22
  end
31
23
 
32
- def url
33
- source
34
- end
35
-
36
24
  def final_url
37
25
  content unless @final_url
38
26
  @final_url
39
27
  end
40
28
 
29
+ # @return [String]
30
+ # @raise [Aranha::Parsers::SourceAddress::FetchContentError]
41
31
  def content
42
32
  request = ::EacEnvs::Http::Request.new.url(url).retry(true).follow_redirect(true)
43
33
  .header('user-agent', self.class.name)
44
- request.response.body_str
34
+ request.response.body_str!
45
35
  rescue ::EacEnvs::Http::Error => e
46
- raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
36
+ raise ::Aranha::Parsers::SourceAddress::FetchContentError.new(e.message, request)
47
37
  end
48
38
 
49
39
  def serialize
50
40
  url
51
41
  end
42
+
43
+ # @return [Addressable::URI]
44
+ def uri
45
+ source_as_uri
46
+ end
47
+
48
+ # @return [Boolean]
49
+ def valid?
50
+ source.to_s =~ %r{\Ahttps?://}
51
+ end
52
52
  end
53
53
  end
54
54
  end
@@ -20,8 +20,10 @@ module Aranha
20
20
  def detect_sub(source)
21
21
  return source.sub if source.is_a?(self)
22
22
 
23
- SUBS.each do |sub|
24
- return sub.new(source) if sub.valid_source?(source)
23
+ SUBS.each do |sub_class|
24
+ sub_class.new(source).then do |sub|
25
+ return sub if sub.valid?
26
+ end
25
27
  end
26
28
  raise "No content fetcher found for source \"#{source}\""
27
29
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Aranha
4
4
  module Parsers
5
- VERSION = '0.23.0'
5
+ VERSION = '0.24.0'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.23.0
4
+ version: 0.24.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-09-05 00:00:00.000000000 Z
11
+ date: 2024-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -50,48 +50,34 @@ dependencies:
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '0.5'
53
+ version: '0.6'
54
54
  - - ">="
55
55
  - !ruby/object:Gem::Version
56
- version: 0.5.1
56
+ version: 0.6.1
57
57
  type: :runtime
58
58
  prerelease: false
59
59
  version_requirements: !ruby/object:Gem::Requirement
60
60
  requirements:
61
61
  - - "~>"
62
62
  - !ruby/object:Gem::Version
63
- version: '0.5'
63
+ version: '0.6'
64
64
  - - ">="
65
65
  - !ruby/object:Gem::Version
66
- version: 0.5.1
67
- - !ruby/object:Gem::Dependency
68
- name: eac_ruby_gem_support
69
- requirement: !ruby/object:Gem::Requirement
70
- requirements:
71
- - - "~>"
72
- - !ruby/object:Gem::Version
73
- version: '0.10'
74
- type: :runtime
75
- prerelease: false
76
- version_requirements: !ruby/object:Gem::Requirement
77
- requirements:
78
- - - "~>"
79
- - !ruby/object:Gem::Version
80
- version: '0.10'
66
+ version: 0.6.1
81
67
  - !ruby/object:Gem::Dependency
82
68
  name: eac_ruby_utils
83
69
  requirement: !ruby/object:Gem::Requirement
84
70
  requirements:
85
71
  - - "~>"
86
72
  - !ruby/object:Gem::Version
87
- version: '0.122'
73
+ version: '0.123'
88
74
  type: :runtime
89
75
  prerelease: false
90
76
  version_requirements: !ruby/object:Gem::Requirement
91
77
  requirements:
92
78
  - - "~>"
93
79
  - !ruby/object:Gem::Version
94
- version: '0.122'
80
+ version: '0.123'
95
81
  - !ruby/object:Gem::Dependency
96
82
  name: nokogiri
97
83
  requirement: !ruby/object:Gem::Requirement
@@ -132,14 +118,14 @@ dependencies:
132
118
  requirements:
133
119
  - - "~>"
134
120
  - !ruby/object:Gem::Version
135
- version: '0.9'
121
+ version: '0.10'
136
122
  type: :development
137
123
  prerelease: false
138
124
  version_requirements: !ruby/object:Gem::Requirement
139
125
  requirements:
140
126
  - - "~>"
141
127
  - !ruby/object:Gem::Version
142
- version: '0.9'
128
+ version: '0.10'
143
129
  description:
144
130
  email:
145
131
  executables: []
@@ -148,6 +134,10 @@ extra_rdoc_files: []
148
134
  files:
149
135
  - lib/aranha/parsers.rb
150
136
  - lib/aranha/parsers/base.rb
137
+ - lib/aranha/parsers/firefox.rb
138
+ - lib/aranha/parsers/firefox/request_from_firefox.rb
139
+ - lib/aranha/parsers/firefox/request_header_from_firefox.rb
140
+ - lib/aranha/parsers/firefox/uri_from_har.rb
151
141
  - lib/aranha/parsers/html.rb
152
142
  - lib/aranha/parsers/html/base.rb
153
143
  - lib/aranha/parsers/html/item.rb
@@ -165,6 +155,7 @@ files:
165
155
  - lib/aranha/parsers/patches.rb
166
156
  - lib/aranha/parsers/patches/ofx_parser.rb
167
157
  - lib/aranha/parsers/source_address.rb
158
+ - lib/aranha/parsers/source_address/base.rb
168
159
  - lib/aranha/parsers/source_address/fetch_content_error.rb
169
160
  - lib/aranha/parsers/source_address/file.rb
170
161
  - lib/aranha/parsers/source_address/hash_http_base.rb