coelacanth 0.1.6 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +28 -30
- data/README.md +3 -2
- data/compose.yml +0 -1
- data/config/coelacanth.yml +0 -1
- data/lib/coelacanth/client.rb +14 -56
- data/lib/coelacanth/dom.rb +12 -0
- data/lib/coelacanth/redirect.rb +36 -0
- data/lib/coelacanth/validator.rb +15 -0
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +7 -4
- metadata +5 -3
- data/CHANGELOG.md +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 506450022b367ef2795c9d67d66548b1893757dd740512286fb90f9b952f21d1
|
4
|
+
data.tar.gz: df860c4216846549ac335b15dbef4cd9f99aa068a464a47b58ba973ba9d3b771
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f4f483791381bf5672c74c382c73129703983c936720184cdd870db211cb740bb5ff6685460c6152213eea8dfcce85c6666ffa189d23e3b58e92c183658b486e
|
7
|
+
data.tar.gz: 577737cc29a81134a288bb3bee66a220d46f80749a9824ff5b487b62dfe97a8f1209124266715c58d9021e7e88d47e634f675462e9d27e62bd8e3e93c532e31d
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
coelacanth (0.1.6)
|
4
|
+
coelacanth (0.2.2)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
@@ -11,59 +11,57 @@ GEM
|
|
11
11
|
ansi (1.5.0)
|
12
12
|
ast (2.4.2)
|
13
13
|
concurrent-ruby (1.3.4)
|
14
|
-
diff-lcs (1.
|
14
|
+
diff-lcs (1.5.1)
|
15
15
|
ferrum (0.15)
|
16
16
|
addressable (~> 2.5)
|
17
17
|
concurrent-ruby (~> 1.1)
|
18
18
|
webrick (~> 1.7)
|
19
19
|
websocket-driver (~> 0.7)
|
20
|
-
json (2.7.
|
20
|
+
json (2.7.2)
|
21
21
|
language_server-protocol (3.17.0.3)
|
22
22
|
oga (3.4)
|
23
23
|
ast
|
24
24
|
ruby-ll (~> 2.1)
|
25
|
-
parallel (1.
|
26
|
-
parser (3.3.0
|
25
|
+
parallel (1.26.3)
|
26
|
+
parser (3.3.5.0)
|
27
27
|
ast (~> 2.4.1)
|
28
28
|
racc
|
29
29
|
public_suffix (6.0.1)
|
30
|
-
racc (1.
|
30
|
+
racc (1.8.1)
|
31
31
|
rainbow (3.1.1)
|
32
|
-
rake (13.1
|
33
|
-
regexp_parser (2.9.
|
34
|
-
|
35
|
-
|
36
|
-
rspec-
|
37
|
-
rspec-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
rspec-expectations (3.10.1)
|
32
|
+
rake (13.2.1)
|
33
|
+
regexp_parser (2.9.2)
|
34
|
+
rspec (3.13.0)
|
35
|
+
rspec-core (~> 3.13.0)
|
36
|
+
rspec-expectations (~> 3.13.0)
|
37
|
+
rspec-mocks (~> 3.13.0)
|
38
|
+
rspec-core (3.13.2)
|
39
|
+
rspec-support (~> 3.13.0)
|
40
|
+
rspec-expectations (3.13.3)
|
42
41
|
diff-lcs (>= 1.2.0, < 2.0)
|
43
|
-
rspec-support (~> 3.
|
44
|
-
rspec-mocks (3.
|
42
|
+
rspec-support (~> 3.13.0)
|
43
|
+
rspec-mocks (3.13.2)
|
45
44
|
diff-lcs (>= 1.2.0, < 2.0)
|
46
|
-
rspec-support (~> 3.
|
47
|
-
rspec-support (3.
|
48
|
-
rubocop (1.
|
45
|
+
rspec-support (~> 3.13.0)
|
46
|
+
rspec-support (3.13.1)
|
47
|
+
rubocop (1.67.0)
|
49
48
|
json (~> 2.3)
|
50
49
|
language_server-protocol (>= 3.17.0)
|
51
50
|
parallel (~> 1.10)
|
52
51
|
parser (>= 3.3.0.2)
|
53
52
|
rainbow (>= 2.2.2, < 4.0)
|
54
|
-
regexp_parser (>=
|
55
|
-
|
56
|
-
rubocop-ast (>= 1.31.1, < 2.0)
|
53
|
+
regexp_parser (>= 2.4, < 3.0)
|
54
|
+
rubocop-ast (>= 1.32.2, < 2.0)
|
57
55
|
ruby-progressbar (~> 1.7)
|
58
56
|
unicode-display_width (>= 2.4.0, < 3.0)
|
59
|
-
rubocop-ast (1.
|
60
|
-
parser (>= 3.3.0
|
57
|
+
rubocop-ast (1.32.3)
|
58
|
+
parser (>= 3.3.1.0)
|
61
59
|
ruby-ll (2.1.3)
|
62
60
|
ansi
|
63
61
|
ast
|
64
62
|
ruby-progressbar (1.13.0)
|
65
|
-
unicode-display_width (2.
|
66
|
-
webrick (1.8.
|
63
|
+
unicode-display_width (2.6.0)
|
64
|
+
webrick (1.8.2)
|
67
65
|
websocket-driver (0.7.6)
|
68
66
|
websocket-extensions (>= 0.1.0)
|
69
67
|
websocket-extensions (0.1.5)
|
@@ -76,9 +74,9 @@ DEPENDENCIES
|
|
76
74
|
coelacanth!
|
77
75
|
ferrum (~> 0.15)
|
78
76
|
oga (~> 3.4)
|
79
|
-
rake (~> 13.
|
77
|
+
rake (~> 13.2)
|
80
78
|
rspec (~> 3.0)
|
81
79
|
rubocop (~> 1.21)
|
82
80
|
|
83
81
|
BUNDLED WITH
|
84
|
-
2.5.
|
82
|
+
2.5.17
|
data/README.md
CHANGED
@@ -50,11 +50,12 @@ Then, you can easily parse and extract information from a web page like this:
|
|
50
50
|
|
51
51
|
```ruby
|
52
52
|
url = "https://example.com"
|
53
|
-
stats = Coelacanth.analyze(url)
|
53
|
+
stats = Coelacanth.analyze(url)
|
54
54
|
```
|
55
55
|
|
56
56
|
## Features
|
57
|
-
-
|
57
|
+
- Get dom by oga
|
58
|
+
- Get screenshot
|
58
59
|
|
59
60
|
## Commit Message Guidelines
|
60
61
|
|
data/compose.yml
CHANGED
data/config/coelacanth.yml
CHANGED
data/lib/coelacanth/client.rb
CHANGED
@@ -1,76 +1,34 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require "ferrum"
|
4
|
-
require "oga"
|
5
4
|
|
6
5
|
module Coelacanth
|
7
6
|
# Coelacanth::Client
|
8
7
|
class Client
|
9
|
-
def initialize(url
|
8
|
+
def initialize(url)
|
9
|
+
@validator = Validator.new
|
10
|
+
raise URI::InvalidURIError unless @validator.valid_url?(url)
|
10
11
|
@config = Coelacanth.config
|
11
|
-
|
12
|
-
end
|
13
|
-
|
14
|
-
def valid_url?(url = nil)
|
15
|
-
@url = url if url
|
16
|
-
uri = URI.parse(@url)
|
17
|
-
uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
18
|
-
rescue URI::InvalidURIError
|
19
|
-
false
|
20
|
-
end
|
21
|
-
|
22
|
-
def resolve_redirect(url = nil, limit = 10)
|
23
|
-
@url = url if url && valid_url?(url)
|
24
|
-
raise Coelacanth::DeepRedirectError, "Too many redirect" if limit.zero?
|
25
|
-
raise Coelacanth::RedirectError, "Url or location is nil" if @url.nil?
|
26
|
-
|
27
|
-
get_response(@url)
|
28
|
-
handle_response(@origin_response, limit)
|
29
|
-
end
|
30
|
-
|
31
|
-
def oga(url = nil)
|
32
|
-
@url = url if url && valid_url?(url)
|
33
|
-
Oga.parse_xml(get_response(@url))
|
12
|
+
remote_client.goto(url)
|
34
13
|
end
|
35
14
|
|
36
15
|
def get_response(url = nil)
|
37
|
-
@url = url if url && valid_url?(url)
|
38
|
-
if @config.read("use_remote_client")
|
39
|
-
response_by_remote_client
|
40
|
-
else
|
41
|
-
response_by_net_http
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
private
|
46
|
-
|
47
|
-
def handle_response(response, limit)
|
48
|
-
codes = Net::HTTPResponse::CODE_CLASS_TO_OBJ.invert
|
49
|
-
case @status_code.to_s
|
50
|
-
when /^#{codes[Net::HTTPSuccess]}\d\d$/
|
51
|
-
@url
|
52
|
-
when /^#{codes[Net::HTTPRedirection]}\d\d$/
|
53
|
-
@url = response["location"]
|
54
|
-
resolve_redirect(response["location"], limit - 1)
|
55
|
-
else
|
56
|
-
raise Coelacanth::RedirectError
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
def response_by_remote_client
|
61
|
-
remote_client.goto(@url)
|
62
16
|
@status_code = remote_client.network.status
|
63
17
|
@origin_response = remote_client
|
64
|
-
remote_client.body
|
18
|
+
body = remote_client.body
|
19
|
+
page.network.wait_for_idle! # might raise an error
|
20
|
+
body
|
65
21
|
end
|
66
22
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
23
|
+
def get_screenshot
|
24
|
+
tempfile = Tempfile.new
|
25
|
+
remote_client.screenshot(path: tempfile.path, format: "png")
|
26
|
+
page.network.wait_for_idle! # might raise an error
|
27
|
+
File.read(tempfile.path)
|
72
28
|
end
|
73
29
|
|
30
|
+
private
|
31
|
+
|
74
32
|
def remote_client
|
75
33
|
if @remote_client.nil?
|
76
34
|
headers = @config.read("remote_client.headers")
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ferrum"
|
4
|
+
require "oga"
|
5
|
+
|
6
|
+
module Coelacanth
|
7
|
+
# Coelacanth::Redirect
|
8
|
+
class Redirect
|
9
|
+
def resolve_redirect(url, limit = 10)
|
10
|
+
@url = url if url && Validator.new.valid_url?(url)
|
11
|
+
raise Coelacanth::DeepRedirectError, "Too many redirect" if limit.zero?
|
12
|
+
raise Coelacanth::RedirectError, "Url or location is nil" if @url.nil?
|
13
|
+
|
14
|
+
response = Net::HTTP.get_response(URI.parse(@url))
|
15
|
+
@status_code = response.code
|
16
|
+
@origin_response = response
|
17
|
+
|
18
|
+
handle_response(@origin_response, limit)
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def handle_response(response, limit)
|
24
|
+
codes = Net::HTTPResponse::CODE_CLASS_TO_OBJ.invert
|
25
|
+
case @status_code.to_s
|
26
|
+
when /^#{codes[Net::HTTPSuccess]}\d\d$/
|
27
|
+
@url
|
28
|
+
when /^#{codes[Net::HTTPRedirection]}\d\d$/
|
29
|
+
@url = response["location"]
|
30
|
+
resolve_redirect(response["location"], limit - 1)
|
31
|
+
else
|
32
|
+
raise Coelacanth::RedirectError
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ferrum"
|
4
|
+
|
5
|
+
module Coelacanth
|
6
|
+
# Coelacanth::Validator
|
7
|
+
class Validator
|
8
|
+
def valid_url?(url)
|
9
|
+
uri = URI.parse(url)
|
10
|
+
uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
11
|
+
rescue URI::InvalidURIError
|
12
|
+
false
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/coelacanth/version.rb
CHANGED
data/lib/coelacanth.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require "net/http"
|
4
|
-
require_relative "coelacanth/version"
|
5
4
|
require_relative "coelacanth/configure"
|
6
5
|
require_relative "coelacanth/client"
|
6
|
+
require_relative "coelacanth/dom"
|
7
|
+
require_relative "coelacanth/redirect"
|
8
|
+
require_relative "coelacanth/validator"
|
9
|
+
require_relative "coelacanth/version"
|
7
10
|
|
8
11
|
# Coelacanth
|
9
12
|
module Coelacanth
|
@@ -13,10 +16,10 @@ module Coelacanth
|
|
13
16
|
|
14
17
|
def self.analyze(url)
|
15
18
|
@client = Client.new(url)
|
16
|
-
|
19
|
+
regular_url = Redirect.new.resolve_redirect(url)
|
17
20
|
{
|
18
|
-
|
19
|
-
|
21
|
+
dom: Dom.new.oga(regular_url),
|
22
|
+
screenshot: @client.get_screenshot,
|
20
23
|
}
|
21
24
|
end
|
22
25
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: coelacanth
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yusuke
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-23 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |
|
14
14
|
coelacanth is a gem that allows you to easily parse and analyze web pages,
|
@@ -21,7 +21,6 @@ extra_rdoc_files: []
|
|
21
21
|
files:
|
22
22
|
- ".rspec"
|
23
23
|
- ".rubocop.yml"
|
24
|
-
- CHANGELOG.md
|
25
24
|
- CODE_OF_CONDUCT.md
|
26
25
|
- Dockerfile
|
27
26
|
- Gemfile
|
@@ -34,6 +33,9 @@ files:
|
|
34
33
|
- lib/coelacanth.rb
|
35
34
|
- lib/coelacanth/client.rb
|
36
35
|
- lib/coelacanth/configure.rb
|
36
|
+
- lib/coelacanth/dom.rb
|
37
|
+
- lib/coelacanth/redirect.rb
|
38
|
+
- lib/coelacanth/validator.rb
|
37
39
|
- lib/coelacanth/version.rb
|
38
40
|
homepage: https://github.com/slidict/coelacanth
|
39
41
|
licenses:
|