url_to_markdown 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/url_to_markdown/cloudflare/client.rb +31 -12
- data/lib/url_to_markdown/cloudflare/processor.rb +2 -2
- data/lib/url_to_markdown/configuration.rb +2 -2
- data/lib/url_to_markdown/errors.rb +2 -0
- data/lib/url_to_markdown/version.rb +1 -1
- data/lib/url_to_markdown.rb +3 -3
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9717cbf5f44fb3b6c4c52d9e394aaeb6877dfb0908b39d1344d48c42d96a40d8
|
|
4
|
+
data.tar.gz: d66e680ab993c01eb41ac2acf8a6aa2f3b613fdaf0699867343cfa61b9888900
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9e55e675ab867a696d5efd23a0abaf3a8c614fa72634f934f298aa2f574fbe33c9195a90ad835aa7959eab37934fcaa71c7fbc90d3a996681dc1d2d566516ee4
|
|
7
|
+
data.tar.gz: b436fa839abbd87e20ec51400e5852fcd9acef09f2b6e4f692012af451133fc81c3b0b65cb0272e1f0a080afaa1131d4c986172f0ff2bc042a44abdf3646bd34
|
data/lib/url_to_markdown/cloudflare/client.rb
CHANGED
|
@@ -18,20 +18,21 @@ class UrlToMarkdown
|
|
|
18
18
|
end
|
|
19
19
|
|
|
20
20
|
def markdown(url: nil, html: nil, wait_for_selector: nil, wait_for_timeout_in_milliseconds: nil, cache_ttl: nil,
|
|
21
|
-
|
|
21
|
+
scripts: nil, set_extra_http_headers: nil)
|
|
22
22
|
validate_payload!(url: url, html: html)
|
|
23
23
|
|
|
24
24
|
response = connection.post("accounts/#{@account_id}/browser-rendering/markdown") do |request|
|
|
25
25
|
request.headers["Authorization"] = "Bearer #{@token}"
|
|
26
26
|
request.headers["Content-Type"] = "application/json"
|
|
27
27
|
request.options.timeout = @timeout
|
|
28
|
+
request.params["cacheTTL"] = cache_ttl if cache_ttl
|
|
28
29
|
request.body = JSON.generate(build_payload(
|
|
29
30
|
url: url,
|
|
30
31
|
html: html,
|
|
31
32
|
wait_for_selector: wait_for_selector,
|
|
32
33
|
wait_for_timeout_in_milliseconds: wait_for_timeout_in_milliseconds,
|
|
33
|
-
|
|
34
|
-
|
|
34
|
+
scripts: scripts,
|
|
35
|
+
set_extra_http_headers: set_extra_http_headers
|
|
35
36
|
))
|
|
36
37
|
end
|
|
37
38
|
|
|
@@ -44,6 +45,16 @@ class UrlToMarkdown
|
|
|
44
45
|
UrlToMarkdown::Result.failure(UrlToMarkdown::NetworkError.new(e))
|
|
45
46
|
end
|
|
46
47
|
|
|
48
|
+
SECURITY_CHECKPOINT_PATTERNS = [
|
|
49
|
+
"vercel.link/security-checkpoint", # Vercel Firewall
|
|
50
|
+
"Vercel Security Checkpoint",
|
|
51
|
+
"Just a moment", # Cloudflare challenge
|
|
52
|
+
"Checking your browser", # Cloudflare challenge
|
|
53
|
+
"Enable JavaScript and cookies", # Cloudflare challenge
|
|
54
|
+
"cf-browser-verification", # Cloudflare legacy
|
|
55
|
+
"DDoS protection by" # Generic DDoS protection
|
|
56
|
+
].freeze
|
|
57
|
+
|
|
47
58
|
private
|
|
48
59
|
|
|
49
60
|
def validate_credentials!
|
|
@@ -58,20 +69,23 @@ class UrlToMarkdown
|
|
|
58
69
|
raise UrlToMarkdown::ValidationError.new(nil, "Provide a URL or HTML")
|
|
59
70
|
end
|
|
60
71
|
|
|
61
|
-
def build_payload(url:, html:, wait_for_selector:, wait_for_timeout_in_milliseconds:,
|
|
72
|
+
def build_payload(url:, html:, wait_for_selector:, wait_for_timeout_in_milliseconds:, scripts:, set_extra_http_headers:)
|
|
62
73
|
payload = {}
|
|
63
74
|
payload[:url] = url if url
|
|
64
75
|
payload[:html] = html if html
|
|
65
|
-
payload[:
|
|
66
|
-
if wait_for_timeout_in_milliseconds
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
end
|
|
70
|
-
payload[:cache_ttl] = cache_ttl if cache_ttl
|
|
71
|
-
payload[:actions] = actions if actions&.any?
|
|
76
|
+
payload[:waitForSelector] = { selector: wait_for_selector } if wait_for_selector
|
|
77
|
+
payload[:waitForTimeout] = wait_for_timeout_in_milliseconds if wait_for_timeout_in_milliseconds
|
|
78
|
+
payload[:addScriptTag] = Array(scripts).map { { content: it } } if scripts&.any?
|
|
79
|
+
payload[:setExtraHTTPHeaders] = set_extra_http_headers if set_extra_http_headers
|
|
72
80
|
payload
|
|
73
81
|
end
|
|
74
82
|
|
|
83
|
+
def security_checkpoint?(content)
|
|
84
|
+
return false unless content.is_a?(String)
|
|
85
|
+
|
|
86
|
+
SECURITY_CHECKPOINT_PATTERNS.any? { |pattern| content.include?(pattern) }
|
|
87
|
+
end
|
|
88
|
+
|
|
75
89
|
def connection
|
|
76
90
|
@connection ||= Faraday.new(url: BASE_URL)
|
|
77
91
|
end
|
|
@@ -84,7 +98,12 @@ class UrlToMarkdown
|
|
|
84
98
|
when 200..299
|
|
85
99
|
data = JSON.parse(body)
|
|
86
100
|
if data.key?("result")
|
|
87
|
-
|
|
101
|
+
content = data["result"]
|
|
102
|
+
if security_checkpoint?(content)
|
|
103
|
+
UrlToMarkdown::Result.failure(UrlToMarkdown::SecurityCheckpointError.new(nil, "Blocked by security checkpoint"))
|
|
104
|
+
else
|
|
105
|
+
UrlToMarkdown::Result.success(content)
|
|
106
|
+
end
|
|
88
107
|
else
|
|
89
108
|
UrlToMarkdown::Result.failure(UrlToMarkdown::MissingResultKeyInResponse.new(status, body))
|
|
90
109
|
end
|
data/lib/url_to_markdown/configuration.rb
CHANGED
|
@@ -5,7 +5,7 @@ require "logger"
|
|
|
5
5
|
class UrlToMarkdown
|
|
6
6
|
class Configuration
|
|
7
7
|
attr_accessor :cloudflare_api_token, :cloudflare_account_id, :cloudflare_timeout_ms, :cloudflare_cache_ttl,
|
|
8
|
-
:logger, :default_processor, :
|
|
8
|
+
:logger, :default_processor, :default_scripts
|
|
9
9
|
|
|
10
10
|
def initialize
|
|
11
11
|
@cloudflare_api_token = ENV.fetch("CLOUDFLARE_API_TOKEN", nil)
|
|
@@ -14,7 +14,7 @@ class UrlToMarkdown
|
|
|
14
14
|
@cloudflare_cache_ttl = 5
|
|
15
15
|
@logger = Logger.new($stdout)
|
|
16
16
|
@default_processor = UrlToMarkdown::Cloudflare::Processor
|
|
17
|
-
@
|
|
17
|
+
@default_scripts = nil
|
|
18
18
|
end
|
|
19
19
|
|
|
20
20
|
def cloudflare_api_token!
|
data/lib/url_to_markdown.rb
CHANGED
|
@@ -8,12 +8,12 @@ loader.inflector.inflect("errors" => "Error", "pstore" => "PStore")
|
|
|
8
8
|
loader.setup
|
|
9
9
|
|
|
10
10
|
class UrlToMarkdown
|
|
11
|
-
def initialize(url:, processor: nil, logger: nil, cache_store: nil,
|
|
11
|
+
def initialize(url:, processor: nil, logger: nil, cache_store: nil, scripts: nil)
|
|
12
12
|
@url = url
|
|
13
13
|
@processor_class = processor || self.class.configuration.default_processor
|
|
14
14
|
@logger = logger || self.class.configuration.logger
|
|
15
15
|
@cache_store = cache_store
|
|
16
|
-
@
|
|
16
|
+
@scripts = scripts || self.class.configuration.default_scripts
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def convert
|
|
@@ -28,7 +28,7 @@ class UrlToMarkdown
|
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
processor = @processor_class.new(logger: @logger, cache_store: @cache_store)
|
|
31
|
-
result = processor.convert(@url,
|
|
31
|
+
result = processor.convert(@url, scripts: @scripts)
|
|
32
32
|
|
|
33
33
|
@cache_store.store!(@url, result.payload) if @cache_store && result.respond_to?(:success?) && result.success?
|
|
34
34
|
|