aranha-parsers 0.23.0 → 0.24.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 29047ecc46278ee7a2a65dbd78f68e02274cbff544eb40c975468ce0f5e01111
4
- data.tar.gz: b99153904aab532fae9fe1bb2e73b6e21b69a0650bebb80cf8c1e6d20edb8b2c
3
+ metadata.gz: 0e9f2435af805f1aaac850b47ade4df067af4517e9a8d44e39cc0aadc12ef8fe
4
+ data.tar.gz: 6e1146e598ca952c5f15f0d0c83deaec7382b6487996b489727642ec3ba5a553
5
5
  SHA512:
6
- metadata.gz: c5292e264c641a4fd05d44f1799abe47c2a9672de472bcbeca7fe1f9de60b94c4e6ce0b33029736c906f4043df14c4be47a39b98150c68db9a91bfd57d7b1f90
7
- data.tar.gz: f2b9d85ea85c9902fb9ebbcfc2f2cc80fc66711fb93ebc569b9e32e7e5e007ea1917c707211919aa85125970449ebf07472df9cc3efd735288da6a6adbf129e6
6
+ metadata.gz: 72d12c157d957b8238d2c9f8d39a545d06aa974e9ac49e7103b64c69648e946ecd8c439b626f1196b789a8eb30c17bc3fdf0fb46956999ddacdbbf08d37f4de0
7
+ data.tar.gz: ee5fe7392f18a318a64ad35bd16341118c4254d978288c64a62ddbaa85e6ff7997018311317614e5c48f1c27f159d547c33151f8ae1c5f768b3a191e343a5faa
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Firefox
8
+ class RequestFromFirefox
9
+ BASE_URL_SUBPATH = 'base_url'
10
+ BODY_SUBPATH = 'body'
11
+ REQUEST_SUBPATH = 'request'
12
+
13
+ class << self
14
+ def from_directory(path)
15
+ path = path.to_pathname
16
+ body_path = path.join(BODY_SUBPATH)
17
+ new(
18
+ path.join(BASE_URL_SUBPATH).read.strip,
19
+ ::Aranha::Parsers::Firefox::RequestHeaderFromFirefox
20
+ .from_file(path.join(REQUEST_SUBPATH)),
21
+ body_path.file? ? body_path.read : nil
22
+ )
23
+ end
24
+ end
25
+
26
+ enable_simple_cache
27
+ common_constructor :the_base_uri, :header, :body, default: [nil] do
28
+ self.the_base_uri = the_base_uri.to_uri
29
+ end
30
+
31
+ def to_uri_source
32
+ {
33
+ method: header.verb,
34
+ url: url,
35
+ headers: header.headers,
36
+ body: body
37
+ }
38
+ end
39
+
40
+ def url
41
+ (the_base_uri + header.uri).to_s
42
+ end
43
+ end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Aranha
4
+ module Parsers
5
+ module Firefox
6
+ class RequestHeaderFromFirefox
7
+ class << self
8
+ def from_file(path)
9
+ new(path.to_pathname.read)
10
+ end
11
+ end
12
+
13
+ FIRST_LINE_PARSER = /\A(\S+)\s(\S+)\s(\S+)\z/.to_parser do |m|
14
+ { verb: m[1], uri: m[2], version: m[3] }
15
+ end
16
+
17
+ HEADER_LINE_PARSER = /\A([^:]+):\s+(.+)\z/.to_parser do |m|
18
+ m[1..2]
19
+ end
20
+
21
+ enable_simple_cache
22
+
23
+ common_constructor :string
24
+
25
+ def to_h
26
+ %w[verb uri headers].index_with { |m| send(m) }
27
+ end
28
+
29
+ def headers
30
+ all_except_first_line.map { |line| HEADER_LINE_PARSER.parse!(line) }.to_h # rubocop:disable Style/MapToHash
31
+ end
32
+
33
+ def verb
34
+ parsed_first_line.fetch(:verb)
35
+ end
36
+
37
+ def uri
38
+ parsed_first_line.fetch(:uri)
39
+ end
40
+
41
+ private
42
+
43
+ def all_lines_uncached
44
+ string.each_line.map(&:strip)
45
+ end
46
+
47
+ def parsed_first_line_uncached
48
+ FIRST_LINE_PARSER.parse!(all_lines.first)
49
+ end
50
+
51
+ def all_except_first_line
52
+ all_lines[1..-1] # rubocop:disable Style/SlicingWithRange
53
+ end
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Firefox
8
+ class UriFromHar
9
+ class << self
10
+ def from_file(path)
11
+ new(::JSON.parse(path.to_pathname.read))
12
+ end
13
+ end
14
+
15
+ common_constructor :data
16
+
17
+ def result
18
+ data.fetch('log').fetch('entries').map { |e| e.fetch('request').fetch('url') }
19
+ end
20
+
21
+ def request_data; end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/require_sub'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ module Firefox
8
+ ::EacRubyUtils.require_sub __FILE__
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'eac_ruby_utils/core_ext'
4
+
5
+ module Aranha
6
+ module Parsers
7
+ class SourceAddress
8
+ class Base
9
+ acts_as_abstract
10
+ common_constructor :source
11
+ compare_by :source
12
+
13
+ # @return [String]
14
+ def content
15
+ raise_abstract_method __method__
16
+ end
17
+
18
+ # @return [Addressable::URI]
19
+ def uri
20
+ raise_abstract_method __method__
21
+ end
22
+
23
+ # @return [String]
24
+ def url
25
+ uri.to_s
26
+ end
27
+
28
+ # @return [Hash]
29
+ def source_as_hash
30
+ source_as_hash? ? source.with_indifferent_access : raise('source is not a Hash')
31
+ end
32
+
33
+ # @return [Boolean]
34
+ def source_as_hash?
35
+ source.is_a?(::Hash)
36
+ end
37
+
38
+ # @return [Addressable::URI]
39
+ def source_as_uri
40
+ source_as_uri? ? source.to_uri : raise('source is not a URI')
41
+ end
42
+
43
+ # @return [Boolean]
44
+ def source_as_uri?
45
+ source.to_uri.scheme.present?
46
+ end
47
+
48
+ # @return [Boolean]
49
+ def valid?
50
+ raise_abstract_method __method__
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/source_address/http_get'
4
+ require 'eac_ruby_utils/core_ext'
4
5
 
5
6
  module Aranha
6
7
  module Parsers
@@ -8,23 +9,23 @@ module Aranha
8
9
  class File < ::Aranha::Parsers::SourceAddress::HttpGet
9
10
  SCHEME = 'file://'
10
11
 
11
- class << self
12
- def valid_source?(source)
13
- source.to_s.start_with?("#{SCHEME}/", '/')
14
- end
15
- end
16
-
17
12
  def initialize(source)
18
13
  super(source.to_s.gsub(/\A#{Regexp.quote(SCHEME)}/, ''))
19
14
  end
20
15
 
21
- def url
22
- "#{SCHEME}#{source}"
23
- end
24
-
25
16
  def content
26
17
  ::File.read(source)
27
18
  end
19
+
20
+ # @return [Addressable::URI]
21
+ def uri
22
+ source_as_uri? ? source_as_uri : "#{SCHEME}#{source}".to_uri
23
+ end
24
+
25
+ # @return [Boolean]
26
+ def valid?
27
+ source.to_s.start_with?("#{SCHEME}/", '/')
28
+ end
28
29
  end
29
30
  end
30
31
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'aranha/parsers/source_address/fetch_content_error'
4
- require 'aranha/parsers/source_address/hash_http_base'
4
+ require 'aranha/parsers/source_address/base'
5
5
  require 'eac_envs/http/error'
6
6
  require 'eac_envs/http/request'
7
7
  require 'eac_ruby_utils/core_ext'
@@ -10,16 +10,11 @@ require 'yaml'
10
10
  module Aranha
11
11
  module Parsers
12
12
  class SourceAddress
13
- class HashHttpBase
13
+ class HashHttpBase < ::Aranha::Parsers::SourceAddress::Base
14
14
  class << self
15
15
  def http_method
16
16
  const_get 'HTTP_METHOD'
17
17
  end
18
-
19
- def valid_source?(source)
20
- source.is_a?(::Hash) &&
21
- source.with_indifferent_access[:method].to_s.downcase.strip == http_method.to_s
22
- end
23
18
  end
24
19
 
25
20
  DEFAULT_BODY = ''
@@ -31,11 +26,6 @@ module Aranha
31
26
 
32
27
  enable_simple_cache
33
28
 
34
- common_constructor :source do
35
- self.source = source.with_indifferent_access
36
- end
37
- compare_by :source
38
-
39
29
  def body
40
30
  param(:body, DEFAULT_BODY)
41
31
  end
@@ -48,27 +38,34 @@ module Aranha
48
38
  param(:headers, DEFAULT_HEADERS)
49
39
  end
50
40
 
51
- def url
52
- source.fetch(:url)
53
- end
54
-
55
41
  def serialize
56
- source.to_yaml
42
+ source_as_hash.to_yaml
57
43
  end
58
44
 
59
45
  def content
60
46
  request = http_request
61
- request.response.body_str
47
+ request.response.body_str!
62
48
  rescue ::EacEnvs::Http::Error => e
63
49
  raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
64
50
  end
65
51
 
66
52
  def param(key, default_value)
67
- source[key] || params[key] || default_value
53
+ source_as_hash[key] || params[key] || default_value
68
54
  end
69
55
 
70
56
  def params
71
- source[:params].if_present(DEFAULT_PARAMS)
57
+ source_as_hash[:params].if_present(DEFAULT_PARAMS)
58
+ end
59
+
60
+ # @return [Addressable::URI]
61
+ def uri
62
+ ::Addressable::URI.parse(source_as_hash.fetch(:url))
63
+ end
64
+
65
+ # @return [Boolean]
66
+ def valid?
67
+ source_as_hash? &&
68
+ source_as_hash[:method].to_s.downcase.strip == self.class.http_method.to_s
72
69
  end
73
70
 
74
71
  private
@@ -1,54 +1,54 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable'
4
- require 'aranha/parsers/source_address/fetch_content_error'
4
+ require 'aranha/parsers/source_address/base'
5
5
  require 'eac_envs/http/error'
6
6
  require 'eac_envs/http/request'
7
7
 
8
8
  module Aranha
9
9
  module Parsers
10
10
  class SourceAddress
11
- class HttpGet
11
+ class HttpGet < ::Aranha::Parsers::SourceAddress::Base
12
12
  class << self
13
13
  def location_uri(source_uri, location)
14
14
  ::Addressable::URI.join(source_uri, location).to_s
15
15
  end
16
-
17
- def valid_source?(source)
18
- source.to_s =~ %r{\Ahttps?://}
19
- end
20
16
  end
21
17
 
22
- attr_reader :source
23
-
24
- def initialize(source)
25
- @source = source.to_s
26
- end
18
+ common_constructor :source, super_args: -> { [source.to_s] }
27
19
 
28
20
  def ==(other)
29
21
  self.class == other.class && source == other.source
30
22
  end
31
23
 
32
- def url
33
- source
34
- end
35
-
36
24
  def final_url
37
25
  content unless @final_url
38
26
  @final_url
39
27
  end
40
28
 
29
+ # @return [String]
30
+ # @raise [Aranha::Parsers::SourceAddress::FetchContentError]
41
31
  def content
42
32
  request = ::EacEnvs::Http::Request.new.url(url).retry(true).follow_redirect(true)
43
33
  .header('user-agent', self.class.name)
44
- request.response.body_str
34
+ request.response.body_str!
45
35
  rescue ::EacEnvs::Http::Error => e
46
- raise ::Aranha::Parsers::SourceAddress::FetchContentError, e.message, request
36
+ raise ::Aranha::Parsers::SourceAddress::FetchContentError.new(e.message, request)
47
37
  end
48
38
 
49
39
  def serialize
50
40
  url
51
41
  end
42
+
43
+ # @return [Addressable::URI]
44
+ def uri
45
+ source_as_uri
46
+ end
47
+
48
+ # @return [Boolean]
49
+ def valid?
50
+ source.to_s =~ %r{\Ahttps?://}
51
+ end
52
52
  end
53
53
  end
54
54
  end
@@ -20,8 +20,10 @@ module Aranha
20
20
  def detect_sub(source)
21
21
  return source.sub if source.is_a?(self)
22
22
 
23
- SUBS.each do |sub|
24
- return sub.new(source) if sub.valid_source?(source)
23
+ SUBS.each do |sub_class|
24
+ sub_class.new(source).then do |sub|
25
+ return sub if sub.valid?
26
+ end
25
27
  end
26
28
  raise "No content fetcher found for source \"#{source}\""
27
29
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Aranha
4
4
  module Parsers
5
- VERSION = '0.23.0'
5
+ VERSION = '0.24.0'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha-parsers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.23.0
4
+ version: 0.24.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esquilo Azul Company
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-09-05 00:00:00.000000000 Z
11
+ date: 2024-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -50,48 +50,34 @@ dependencies:
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '0.5'
53
+ version: '0.6'
54
54
  - - ">="
55
55
  - !ruby/object:Gem::Version
56
- version: 0.5.1
56
+ version: 0.6.1
57
57
  type: :runtime
58
58
  prerelease: false
59
59
  version_requirements: !ruby/object:Gem::Requirement
60
60
  requirements:
61
61
  - - "~>"
62
62
  - !ruby/object:Gem::Version
63
- version: '0.5'
63
+ version: '0.6'
64
64
  - - ">="
65
65
  - !ruby/object:Gem::Version
66
- version: 0.5.1
67
- - !ruby/object:Gem::Dependency
68
- name: eac_ruby_gem_support
69
- requirement: !ruby/object:Gem::Requirement
70
- requirements:
71
- - - "~>"
72
- - !ruby/object:Gem::Version
73
- version: '0.10'
74
- type: :runtime
75
- prerelease: false
76
- version_requirements: !ruby/object:Gem::Requirement
77
- requirements:
78
- - - "~>"
79
- - !ruby/object:Gem::Version
80
- version: '0.10'
66
+ version: 0.6.1
81
67
  - !ruby/object:Gem::Dependency
82
68
  name: eac_ruby_utils
83
69
  requirement: !ruby/object:Gem::Requirement
84
70
  requirements:
85
71
  - - "~>"
86
72
  - !ruby/object:Gem::Version
87
- version: '0.122'
73
+ version: '0.123'
88
74
  type: :runtime
89
75
  prerelease: false
90
76
  version_requirements: !ruby/object:Gem::Requirement
91
77
  requirements:
92
78
  - - "~>"
93
79
  - !ruby/object:Gem::Version
94
- version: '0.122'
80
+ version: '0.123'
95
81
  - !ruby/object:Gem::Dependency
96
82
  name: nokogiri
97
83
  requirement: !ruby/object:Gem::Requirement
@@ -132,14 +118,14 @@ dependencies:
132
118
  requirements:
133
119
  - - "~>"
134
120
  - !ruby/object:Gem::Version
135
- version: '0.9'
121
+ version: '0.10'
136
122
  type: :development
137
123
  prerelease: false
138
124
  version_requirements: !ruby/object:Gem::Requirement
139
125
  requirements:
140
126
  - - "~>"
141
127
  - !ruby/object:Gem::Version
142
- version: '0.9'
128
+ version: '0.10'
143
129
  description:
144
130
  email:
145
131
  executables: []
@@ -148,6 +134,10 @@ extra_rdoc_files: []
148
134
  files:
149
135
  - lib/aranha/parsers.rb
150
136
  - lib/aranha/parsers/base.rb
137
+ - lib/aranha/parsers/firefox.rb
138
+ - lib/aranha/parsers/firefox/request_from_firefox.rb
139
+ - lib/aranha/parsers/firefox/request_header_from_firefox.rb
140
+ - lib/aranha/parsers/firefox/uri_from_har.rb
151
141
  - lib/aranha/parsers/html.rb
152
142
  - lib/aranha/parsers/html/base.rb
153
143
  - lib/aranha/parsers/html/item.rb
@@ -165,6 +155,7 @@ files:
165
155
  - lib/aranha/parsers/patches.rb
166
156
  - lib/aranha/parsers/patches/ofx_parser.rb
167
157
  - lib/aranha/parsers/source_address.rb
158
+ - lib/aranha/parsers/source_address/base.rb
168
159
  - lib/aranha/parsers/source_address/fetch_content_error.rb
169
160
  - lib/aranha/parsers/source_address/file.rb
170
161
  - lib/aranha/parsers/source_address/hash_http_base.rb