url_to_markdown 0.2.1 → 0.2.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4ee86bddc38ccddc190d68aef6b7e29c0aea48d61810efad1e5b9e793dca374d
4
- data.tar.gz: aebb67cdd1aff8e9156fd616964b54118d5cec0e91762b51422ada379cfd4b76
3
+ metadata.gz: 9717cbf5f44fb3b6c4c52d9e394aaeb6877dfb0908b39d1344d48c42d96a40d8
4
+ data.tar.gz: d66e680ab993c01eb41ac2acf8a6aa2f3b613fdaf0699867343cfa61b9888900
5
5
  SHA512:
6
- metadata.gz: 90f9c1161a277a3d6fa9e92574e8410ade755ba00b0e71867674e5ae5a3f2f8ce3245af4a68c6682d6dd8b4eba3fb734caa5f74f133f2cea5408cc2566de102f
7
- data.tar.gz: b187a9e3ba8f0e1eb5f6f1007021df9b9c2ecbd800276e6a9b495e358aff131574f64753ed544516796d5fbc449c7c145b0b5d3360cec8b24a5288f95c20aed3
6
+ metadata.gz: 9e55e675ab867a696d5efd23a0abaf3a8c614fa72634f934f298aa2f574fbe33c9195a90ad835aa7959eab37934fcaa71c7fbc90d3a996681dc1d2d566516ee4
7
+ data.tar.gz: b436fa839abbd87e20ec51400e5852fcd9acef09f2b6e4f692012af451133fc81c3b0b65cb0272e1f0a080afaa1131d4c986172f0ff2bc042a44abdf3646bd34
@@ -18,20 +18,21 @@ class UrlToMarkdown
18
18
  end
19
19
 
20
20
  def markdown(url: nil, html: nil, wait_for_selector: nil, wait_for_timeout_in_milliseconds: nil, cache_ttl: nil,
21
- actions: nil)
21
+ scripts: nil, set_extra_http_headers: nil)
22
22
  validate_payload!(url: url, html: html)
23
23
 
24
24
  response = connection.post("accounts/#{@account_id}/browser-rendering/markdown") do |request|
25
25
  request.headers["Authorization"] = "Bearer #{@token}"
26
26
  request.headers["Content-Type"] = "application/json"
27
27
  request.options.timeout = @timeout
28
+ request.params["cacheTTL"] = cache_ttl if cache_ttl
28
29
  request.body = JSON.generate(build_payload(
29
30
  url: url,
30
31
  html: html,
31
32
  wait_for_selector: wait_for_selector,
32
33
  wait_for_timeout_in_milliseconds: wait_for_timeout_in_milliseconds,
33
- cache_ttl: cache_ttl,
34
- actions: actions
34
+ scripts: scripts,
35
+ set_extra_http_headers: set_extra_http_headers
35
36
  ))
36
37
  end
37
38
 
@@ -44,6 +45,16 @@ class UrlToMarkdown
44
45
  UrlToMarkdown::Result.failure(UrlToMarkdown::NetworkError.new(e))
45
46
  end
46
47
 
48
+ SECURITY_CHECKPOINT_PATTERNS = [
49
+ "vercel.link/security-checkpoint", # Vercel Firewall
50
+ "Vercel Security Checkpoint",
51
+ "Just a moment", # Cloudflare challenge
52
+ "Checking your browser", # Cloudflare challenge
53
+ "Enable JavaScript and cookies", # Cloudflare challenge
54
+ "cf-browser-verification", # Cloudflare legacy
55
+ "DDoS protection by" # Generic DDoS protection
56
+ ].freeze
57
+
47
58
  private
48
59
 
49
60
  def validate_credentials!
@@ -58,20 +69,23 @@ class UrlToMarkdown
58
69
  raise UrlToMarkdown::ValidationError.new(nil, "Provide a URL or HTML")
59
70
  end
60
71
 
61
- def build_payload(url:, html:, wait_for_selector:, wait_for_timeout_in_milliseconds:, cache_ttl:, actions: nil)
72
+ def build_payload(url:, html:, wait_for_selector:, wait_for_timeout_in_milliseconds:, scripts:, set_extra_http_headers:)
62
73
  payload = {}
63
74
  payload[:url] = url if url
64
75
  payload[:html] = html if html
65
- payload[:wait_for_selector] = wait_for_selector if wait_for_selector
66
- if wait_for_timeout_in_milliseconds
67
- payload[:wait_for_timeout_in_milliseconds] =
68
- wait_for_timeout_in_milliseconds
69
- end
70
- payload[:cache_ttl] = cache_ttl if cache_ttl
71
- payload[:actions] = actions if actions&.any?
76
+ payload[:waitForSelector] = { selector: wait_for_selector } if wait_for_selector
77
+ payload[:waitForTimeout] = wait_for_timeout_in_milliseconds if wait_for_timeout_in_milliseconds
78
+ payload[:addScriptTag] = Array(scripts).map { { content: it } } if scripts&.any?
79
+ payload[:setExtraHTTPHeaders] = set_extra_http_headers if set_extra_http_headers
72
80
  payload
73
81
  end
74
82
 
83
+ def security_checkpoint?(content)
84
+ return false unless content.is_a?(String)
85
+
86
+ SECURITY_CHECKPOINT_PATTERNS.any? { |pattern| content.include?(pattern) }
87
+ end
88
+
75
89
  def connection
76
90
  @connection ||= Faraday.new(url: BASE_URL)
77
91
  end
@@ -84,7 +98,12 @@ class UrlToMarkdown
84
98
  when 200..299
85
99
  data = JSON.parse(body)
86
100
  if data.key?("result")
87
- UrlToMarkdown::Result.success(data["result"])
101
+ content = data["result"]
102
+ if security_checkpoint?(content)
103
+ UrlToMarkdown::Result.failure(UrlToMarkdown::SecurityCheckpointError.new(nil, "Blocked by security checkpoint"))
104
+ else
105
+ UrlToMarkdown::Result.success(content)
106
+ end
88
107
  else
89
108
  UrlToMarkdown::Result.failure(UrlToMarkdown::MissingResultKeyInResponse.new(status, body))
90
109
  end
@@ -15,9 +15,9 @@ class UrlToMarkdown
15
15
  )
16
16
  end
17
17
 
18
- def convert(url, actions: nil)
18
+ def convert(url, scripts: nil)
19
19
  validate_url!(url)
20
- @client.markdown(url: url, actions: actions)
20
+ @client.markdown(url: url, scripts: scripts)
21
21
  end
22
22
 
23
23
  private
@@ -5,7 +5,7 @@ require "logger"
5
5
  class UrlToMarkdown
6
6
  class Configuration
7
7
  attr_accessor :cloudflare_api_token, :cloudflare_account_id, :cloudflare_timeout_ms, :cloudflare_cache_ttl,
8
- :logger, :default_processor, :default_actions
8
+ :logger, :default_processor, :default_scripts
9
9
 
10
10
  def initialize
11
11
  @cloudflare_api_token = ENV.fetch("CLOUDFLARE_API_TOKEN", nil)
@@ -14,7 +14,7 @@ class UrlToMarkdown
14
14
  @cloudflare_cache_ttl = 5
15
15
  @logger = Logger.new($stdout)
16
16
  @default_processor = UrlToMarkdown::Cloudflare::Processor
17
- @default_actions = nil
17
+ @default_scripts = nil
18
18
  end
19
19
 
20
20
  def cloudflare_api_token!
@@ -42,6 +42,8 @@ class UrlToMarkdown
42
42
  class ServerError < ApiError; end
43
43
  class MissingResultKeyInResponse < ApiError; end
44
44
 
45
+ class SecurityCheckpointError < Error; end
46
+
45
47
  class ValidationError < Error; end
46
48
  class InvalidUrlError < ValidationError; end
47
49
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class UrlToMarkdown
4
- VERSION = "0.2.1"
4
+ VERSION = "0.2.2"
5
5
  end
@@ -8,12 +8,12 @@ loader.inflector.inflect("errors" => "Error", "pstore" => "PStore")
8
8
  loader.setup
9
9
 
10
10
  class UrlToMarkdown
11
- def initialize(url:, processor: nil, logger: nil, cache_store: nil, actions: nil)
11
+ def initialize(url:, processor: nil, logger: nil, cache_store: nil, scripts: nil)
12
12
  @url = url
13
13
  @processor_class = processor || self.class.configuration.default_processor
14
14
  @logger = logger || self.class.configuration.logger
15
15
  @cache_store = cache_store
16
- @actions = actions || self.class.configuration.default_actions
16
+ @scripts = scripts || self.class.configuration.default_scripts
17
17
  end
18
18
 
19
19
  def convert
@@ -28,7 +28,7 @@ class UrlToMarkdown
28
28
  end
29
29
 
30
30
  processor = @processor_class.new(logger: @logger, cache_store: @cache_store)
31
- result = processor.convert(@url, actions: @actions)
31
+ result = processor.convert(@url, scripts: @scripts)
32
32
 
33
33
  @cache_store.store!(@url, result.payload) if @cache_store && result.respond_to?(:success?) && result.success?
34
34
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_to_markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucian Ghinda