url_to_markdown 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +206 -0
- data/lib/url_to_markdown/cache_store/pstore.rb +56 -0
- data/lib/url_to_markdown/cache_store.rb +25 -0
- data/lib/url_to_markdown/cloudflare/client.rb +105 -0
- data/lib/url_to_markdown/cloudflare/processor.rb +35 -0
- data/lib/url_to_markdown/configuration.rb +31 -0
- data/lib/url_to_markdown/errors.rb +51 -0
- data/lib/url_to_markdown/processor.rb +14 -0
- data/lib/url_to_markdown/result.rb +23 -0
- data/lib/url_to_markdown/version.rb +5 -0
- data/lib/url_to_markdown.rb +56 -0
- metadata +109 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 2183cbb77e1eb9a58965d80e977b81698d81d5dc48ccb433d286e02dbcfa53e6
|
|
4
|
+
data.tar.gz: 30e5a87e074c00f456d67258b2ddbeed332dc6b2670e49915e356729560c0948
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 66d0679ca3291058214204f6ac251898e7cc9fe348f3aa0ac5bf3167bb07d582180b49602191c6e023e62dfb8cc5395f8db6cdede0433fec289b926e46154e1c
|
|
7
|
+
data.tar.gz: 8e30c81de02f1360ab9e143aa21d947a0212c14ac1b0d05554e44c975a9b087d7083ebb13915390848b7530a3072eaf9f7b36747bfc6275f7703ae0a2da5f381
|
data/README.md
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
# UrlToMarkdown
|
|
2
|
+
|
|
3
|
+
Convert URLs (or raw HTML) into Markdown using a configurable processor.
|
|
4
|
+
Cloudflare Browser Rendering is the default processor, but you can plug in your own.
|
|
5
|
+
|
|
6
|
+
- Uses Cloudflare's rendering service for JavaScript-heavy pages by default.
|
|
7
|
+
- Returns results as `SimpleResult::Success` / `SimpleResult::Failure`.
|
|
8
|
+
- Supports optional caching via `UrlToMarkdown::CacheStore::PStore`.
|
|
9
|
+
|
|
10
|
+
> Cloudflare Browser Rendering docs: https://workers.cloudflare.com/product/browser-rendering/
|
|
11
|
+
|
|
12
|
+
## Requirements
|
|
13
|
+
|
|
14
|
+
- Ruby 3.4.4 or newer
|
|
15
|
+
- Cloudflare API token and account ID
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
bundle install
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Configuration
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
require 'url_to_markdown'
|
|
27
|
+
|
|
28
|
+
UrlToMarkdown.configure do |config|
|
|
29
|
+
config.cloudflare_api_token = ENV.fetch('CLOUDFLARE_API_TOKEN')
|
|
30
|
+
config.cloudflare_account_id = ENV.fetch('CLOUDFLARE_ACCOUNT_ID')
|
|
31
|
+
config.cloudflare_timeout_ms = 30_000
|
|
32
|
+
config.cloudflare_cache_ttl = 5
|
|
33
|
+
config.logger = Logger.new($stdout)
|
|
34
|
+
end
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Basic Usage
|
|
38
|
+
|
|
39
|
+
```ruby
|
|
40
|
+
result = UrlToMarkdown.convert('https://example.com')
|
|
41
|
+
|
|
42
|
+
if result.success?
|
|
43
|
+
puts result.payload
|
|
44
|
+
else
|
|
45
|
+
warn result.error.message
|
|
46
|
+
end
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Using a Cache Store
|
|
50
|
+
|
|
51
|
+
`UrlToMarkdown` checks the cache before calling the processor. If `cache_store.exists?(url)` returns
|
|
52
|
+
true, it returns the cached content from `cache_store.find_by(url)`. When the processor succeeds,
|
|
53
|
+
it writes the markdown with `cache_store.store!(url, result.payload)`.
|
|
54
|
+
|
|
55
|
+
```ruby
|
|
56
|
+
cache = UrlToMarkdown::CacheStore::PStore.new(path: '/tmp/url_to_markdown.pstore')
|
|
57
|
+
converter = UrlToMarkdown.new(url: 'https://example.com', cache_store: cache)
|
|
58
|
+
|
|
59
|
+
result = converter.convert
|
|
60
|
+
puts result.payload if result.success?
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Implementing Your Own Cache Store
|
|
64
|
+
|
|
65
|
+
Implement the `UrlToMarkdown::CacheStore` interface with these methods:
|
|
66
|
+
|
|
67
|
+
- `exists?(key)` → boolean
|
|
68
|
+
- `find_by(key)` → cached value (raise `CacheReadError` if missing)
|
|
69
|
+
- `store!(key, value)`
|
|
70
|
+
- `invalidate!(key)`
|
|
71
|
+
- `clear!`
|
|
72
|
+
|
|
73
|
+
Example in-memory cache:
|
|
74
|
+
|
|
75
|
+
```ruby
|
|
76
|
+
class MemoryCache < UrlToMarkdown::CacheStore
|
|
77
|
+
def initialize
|
|
78
|
+
@store = {}
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def exists?(key)
|
|
82
|
+
@store.key?(key)
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def find_by(key)
|
|
86
|
+
@store.fetch(key) do
|
|
87
|
+
raise UrlToMarkdown::CacheReadError.new(nil, 'Cache miss')
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def store!(key, value)
|
|
92
|
+
@store[key] = value
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def invalidate!(key)
|
|
96
|
+
@store.delete(key)
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def clear!
|
|
100
|
+
@store.clear
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Custom Processor Options
|
|
106
|
+
|
|
107
|
+
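Use the Cloudflare processor directly when you want to pass an explicit configuration or logger. For advanced options such as HTML input or waiting for dynamic content, call `UrlToMarkdown::Cloudflare::Client#markdown` as shown in the sections below.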
|
|
108
|
+
|
|
109
|
+
```ruby
|
|
110
|
+
processor = UrlToMarkdown::Cloudflare::Processor.new(
|
|
111
|
+
config: UrlToMarkdown.configuration,
|
|
112
|
+
logger: Logger.new($stdout)
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
result = processor.convert('https://example.com')
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Implementing Your Own Processor
|
|
119
|
+
|
|
120
|
+
Custom processors must implement `convert(url)` and return a `SimpleResult::Success` or
|
|
121
|
+
`SimpleResult::Failure`. You can also accept a `logger:` and `cache_store:` in `initialize` to match
|
|
122
|
+
the base processor signature.
|
|
123
|
+
|
|
124
|
+
```ruby
|
|
125
|
+
class StaticProcessor < UrlToMarkdown::Processor
|
|
126
|
+
def initialize(logger: nil, cache_store: nil)
|
|
127
|
+
super
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
def convert(url)
|
|
131
|
+
markdown = "# Offline content for #{url}"
|
|
132
|
+
UrlToMarkdown::Result.success(markdown)
|
|
133
|
+
rescue StandardError => e
|
|
134
|
+
UrlToMarkdown::Result.failure(UrlToMarkdown::Error.new(e))
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
UrlToMarkdown.configure do |config|
|
|
139
|
+
config.default_processor = StaticProcessor
|
|
140
|
+
end
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Rendering HTML Instead of a URL
|
|
144
|
+
|
|
145
|
+
```ruby
|
|
146
|
+
client = UrlToMarkdown::Cloudflare::Client.new(
|
|
147
|
+
token: ENV.fetch('CLOUDFLARE_API_TOKEN'),
|
|
148
|
+
account_id: ENV.fetch('CLOUDFLARE_ACCOUNT_ID')
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
result = client.markdown(html: '<h1>Hello</h1>')
|
|
152
|
+
puts result.payload if result.success?
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Waiting for Dynamic Content
|
|
156
|
+
|
|
157
|
+
```ruby
|
|
158
|
+
client = UrlToMarkdown::Cloudflare::Client.new(
|
|
159
|
+
token: ENV.fetch('CLOUDFLARE_API_TOKEN'),
|
|
160
|
+
account_id: ENV.fetch('CLOUDFLARE_ACCOUNT_ID')
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
result = client.markdown(
|
|
164
|
+
url: 'https://spa-example.com',
|
|
165
|
+
wait_for_selector: '#main-content',
|
|
166
|
+
wait_for_timeout_in_milliseconds: 10_000
|
|
167
|
+
)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Error Handling
|
|
171
|
+
|
|
172
|
+
Failures return a `SimpleResult::Failure` with a rich error type. Common errors include:
|
|
173
|
+
|
|
174
|
+
- `UrlToMarkdown::MissingCredentialsError`
|
|
175
|
+
- `UrlToMarkdown::AuthenticationError`
|
|
176
|
+
- `UrlToMarkdown::RateLimitError`
|
|
177
|
+
- `UrlToMarkdown::ServerError`
|
|
178
|
+
- `UrlToMarkdown::InvalidUrlError`
|
|
179
|
+
|
|
180
|
+
```ruby
|
|
181
|
+
result = UrlToMarkdown.convert('https://example.com')
|
|
182
|
+
|
|
183
|
+
result.on_error do |error|
|
|
184
|
+
warn "Conversion failed: #{error.class} - #{error.message}"
|
|
185
|
+
end
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Development
|
|
189
|
+
|
|
190
|
+
### Tests
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
bundle exec rake test
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
### Sorbet
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
bundle exec srb tc
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### RuboCop
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
bundle exec rubocop
|
|
206
|
+
```
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "pstore"
|
|
5
|
+
|
|
6
|
+
class UrlToMarkdown
  class CacheStore
    # File-backed cache store built on Ruby's PStore.
    #
    # Each operation runs in its own PStore transaction. PStore failures are
    # re-raised as UrlToMarkdown::CacheReadError (reads) or
    # UrlToMarkdown::CacheWriteError (writes) with the PStore error attached.
    class PStore < UrlToMarkdown::CacheStore
      # @param path [String] location of the PStore file
      #
      # The parent directory is created when it does not exist yet:
      # ::PStore.new raises PStore::Error for a missing directory, which made
      # the default relative path unusable unless ./tmp was already present.
      def initialize(path: "tmp/url_to_markdown.pstore")
        @path = path
        FileUtils.mkdir_p(File.dirname(path))
        @store = ::PStore.new(path)
      end

      # @param key [Object] cache key
      # @return [Boolean] true when a non-nil value is stored under +key+
      # @raise [UrlToMarkdown::CacheReadError] when PStore fails
      def exists?(key)
        @store.transaction(true) do
          !@store[key].nil?
        end
      rescue ::PStore::Error => e
        raise UrlToMarkdown::CacheReadError, e
      end

      # @param key [Object] cache key
      # @return [Object] the value stored under +key+
      # @raise [UrlToMarkdown::CacheReadError] on a miss ("Cache miss") or
      #   when PStore fails
      def find_by(key)
        @store.transaction(true) do
          value = @store[key]
          raise UrlToMarkdown::CacheReadError.new(nil, "Cache miss") if value.nil?

          value
        end
      rescue ::PStore::Error => e
        raise UrlToMarkdown::CacheReadError, e
      end

      # Stores +value+ under +key+.
      #
      # @raise [UrlToMarkdown::CacheWriteError] when PStore fails
      def store!(key, value)
        @store.transaction do
          @store[key] = value
        end
      rescue ::PStore::Error => e
        raise UrlToMarkdown::CacheWriteError, e
      end

      # Removes the entry stored under +key+, if any.
      #
      # @raise [UrlToMarkdown::CacheWriteError] when PStore fails
      def invalidate!(key)
        @store.transaction do
          @store.delete(key)
        end
      rescue ::PStore::Error => e
        raise UrlToMarkdown::CacheWriteError, e
      end

      # Drops every entry by deleting the backing file.
      #
      # @raise [UrlToMarkdown::CacheWriteError] when the file cannot be removed
      def clear!
        FileUtils.rm_f(@path)
      rescue StandardError => e
        raise UrlToMarkdown::CacheWriteError, e
      end
    end
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class UrlToMarkdown
  # Abstract cache-store interface. Every method raises NotImplementedError
  # here; concrete stores subclass this and override all five.
  class CacheStore
    # Should report whether a value is cached under the key.
    def exists?(_key) = raise(NotImplementedError, "Implement in subclass")

    # Should return the value cached under the key.
    def find_by(_key) = raise(NotImplementedError, "Implement in subclass")

    # Should cache the value under the key.
    def store!(_key, _value) = raise(NotImplementedError, "Implement in subclass")

    # Should drop the entry cached under the key.
    def invalidate!(_key) = raise(NotImplementedError, "Implement in subclass")

    # Should drop every cached entry.
    def clear! = raise(NotImplementedError, "Implement in subclass")
  end
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
class UrlToMarkdown
  module Cloudflare
    # HTTP client for the Cloudflare Browser Rendering markdown endpoint.
    #
    # HTTP and network failures are returned as failure results. Missing
    # credentials (at construction) and a request with neither URL nor HTML
    # raise instead.
    class Client
      BASE_URL = "https://api.cloudflare.com/client/v4"
      # Request timeout, in milliseconds, used when the caller supplies none.
      DEFAULT_ACTION_TIMEOUT = 30_000

      # @param token [String] Cloudflare API token, sent as a Bearer token
      # @param account_id [String] Cloudflare account ID, used in the request path
      # @param action_timeout_in_milliseconds [Integer, nil] request timeout;
      #   defaults to DEFAULT_ACTION_TIMEOUT
      # @raise [UrlToMarkdown::MissingCredentialsError] when token or
      #   account_id is nil or empty
      def initialize(token:, account_id:, action_timeout_in_milliseconds: nil)
        @token = token
        @account_id = account_id
        # Faraday timeouts are expressed in seconds.
        @timeout = (action_timeout_in_milliseconds || DEFAULT_ACTION_TIMEOUT) / 1000.0

        validate_credentials!
      end

      # Posts a markdown request and maps the HTTP response to a result.
      #
      # @param url [String, nil] page to render
      # @param html [String, nil] raw HTML to convert
      # @param wait_for_selector [String, nil] forwarded in the request body when given
      # @param wait_for_timeout_in_milliseconds [Integer, nil] forwarded when given
      # @param cache_ttl [Integer, nil] forwarded when given
      # @return [Object] success result carrying the response's "result" value,
      #   or a failure result carrying a UrlToMarkdown error
      # @raise [UrlToMarkdown::ValidationError] when neither url nor html is given
      def markdown(url: nil, html: nil, wait_for_selector: nil, wait_for_timeout_in_milliseconds: nil, cache_ttl: nil)
        validate_payload!(url: url, html: html)

        response = connection.post("accounts/#{@account_id}/browser-rendering/markdown") do |request|
          request.headers["Authorization"] = "Bearer #{@token}"
          request.headers["Content-Type"] = "application/json"
          request.options.timeout = @timeout
          request.body = JSON.generate(build_payload(
            url: url,
            html: html,
            wait_for_selector: wait_for_selector,
            wait_for_timeout_in_milliseconds: wait_for_timeout_in_milliseconds,
            cache_ttl: cache_ttl
          ))
        end

        handle_response(response)
      rescue Faraday::TimeoutError => e
        UrlToMarkdown::Result.failure(UrlToMarkdown::TimeoutError.new(e))
      rescue Faraday::ConnectionFailed => e
        UrlToMarkdown::Result.failure(UrlToMarkdown::ConnectionError.new(e))
      rescue Faraday::Error => e
        UrlToMarkdown::Result.failure(UrlToMarkdown::NetworkError.new(e))
      end

      private

      # Empty strings count as missing, matching the checks in
      # UrlToMarkdown::Configuration#cloudflare_api_token! and
      # #cloudflare_account_id!.
      def validate_credentials!
        return if credential_present?(@token) && credential_present?(@account_id)

        raise UrlToMarkdown::MissingCredentialsError.new(nil, "Missing Cloudflare credentials")
      end

      # A credential is present when it is truthy and not an empty string.
      def credential_present?(value)
        value && !value.to_s.empty?
      end

      def validate_payload!(url:, html:)
        return if url || html

        raise UrlToMarkdown::ValidationError.new(nil, "Provide a URL or HTML")
      end

      # Builds the request body, omitting every option that was not supplied.
      def build_payload(url:, html:, wait_for_selector:, wait_for_timeout_in_milliseconds:, cache_ttl:)
        payload = {}
        payload[:url] = url if url
        payload[:html] = html if html
        payload[:wait_for_selector] = wait_for_selector if wait_for_selector
        if wait_for_timeout_in_milliseconds
          payload[:wait_for_timeout_in_milliseconds] =
            wait_for_timeout_in_milliseconds
        end
        payload[:cache_ttl] = cache_ttl if cache_ttl
        payload
      end

      def connection
        @connection ||= Faraday.new(url: BASE_URL)
      end

      # Maps the HTTP status to a success result or a typed failure result.
      def handle_response(response)
        status = response.status
        body = response.body.to_s

        case status
        when 200..299
          data = JSON.parse(body)
          # Valid JSON that is not an object (e.g. `[]` or `null`) has no
          # "result" key either; report it instead of calling key? on it.
          if data.is_a?(Hash) && data.key?("result")
            UrlToMarkdown::Result.success(data["result"])
          else
            UrlToMarkdown::Result.failure(UrlToMarkdown::MissingResultKeyInResponse.new(status, body))
          end
        when 401
          UrlToMarkdown::Result.failure(UrlToMarkdown::AuthenticationError.new(status, body))
        when 404
          UrlToMarkdown::Result.failure(UrlToMarkdown::NotFoundError.new(status, body))
        when 429
          retry_after = response.headers["Retry-After"]
          UrlToMarkdown::Result.failure(UrlToMarkdown::RateLimitError.new(status, body, retry_after: retry_after))
        when 500..599
          UrlToMarkdown::Result.failure(UrlToMarkdown::ServerError.new(status, body))
        else
          UrlToMarkdown::Result.failure(UrlToMarkdown::ApiError.new(status, body))
        end
      rescue JSON::ParserError
        UrlToMarkdown::Result.failure(UrlToMarkdown::ApiError.new(status, body))
      end
    end
  end
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
class UrlToMarkdown
  module Cloudflare
    # Processor that converts a URL to Markdown through
    # UrlToMarkdown::Cloudflare::Client, using credentials and the timeout
    # taken from the given configuration.
    class Processor < UrlToMarkdown::Processor
      # @param config [UrlToMarkdown::Configuration] source of the API token,
      #   account ID and timeout
      # @param logger [Object, nil] passed to the base processor
      # @param cache_store [Object, nil] passed to the base processor
      # @raise [UrlToMarkdown::MissingCredentialsError] when the configuration
      #   has no API token or account ID
      def initialize(config: UrlToMarkdown.configuration, logger: nil, cache_store: nil)
        super(logger:, cache_store:)
        @config = config
        @client = build_client
      end

      # Validates the URL, then asks the client for its Markdown.
      #
      # @param url [String] absolute HTTP(S) URL
      # @return [Object] the result returned by the client
      # @raise [UrlToMarkdown::InvalidUrlError] when the URL is not an HTTP(S)
      #   URL with a host
      def convert(url)
        validate_url!(url)
        @client.markdown(url:)
      end

      private

      # Creates the Cloudflare client from the stored configuration.
      def build_client
        UrlToMarkdown::Cloudflare::Client.new(
          token: @config.cloudflare_api_token!,
          account_id: @config.cloudflare_account_id!,
          action_timeout_in_milliseconds: @config.cloudflare_timeout_ms
        )
      end

      # Accepts only URLs that parse as URI::HTTP (which includes HTTPS) and
      # carry a host; everything else is reported as InvalidUrlError.
      def validate_url!(url)
        parsed = URI.parse(url)
        acceptable = parsed.is_a?(URI::HTTP) && !parsed.host.nil?
        raise invalid_url_error unless acceptable
      rescue URI::InvalidURIError
        raise invalid_url_error
      end

      def invalid_url_error
        UrlToMarkdown::InvalidUrlError.new(nil, "Invalid URL")
      end
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "logger"
|
|
4
|
+
|
|
5
|
+
class UrlToMarkdown
  # Settings shared by the gem: Cloudflare credentials, timeout, cache TTL,
  # logger and the processor class used when none is passed explicitly.
  class Configuration
    attr_accessor :cloudflare_api_token, :cloudflare_account_id, :cloudflare_timeout_ms, :cloudflare_cache_ttl,
                  :logger, :default_processor

    # Credentials default to the CLOUDFLARE_API_TOKEN and CLOUDFLARE_ACCOUNT_ID
    # environment variables (nil when unset); the logger writes to $stdout.
    def initialize
      @default_processor = UrlToMarkdown::Cloudflare::Processor
      @logger = Logger.new($stdout)
      @cloudflare_cache_ttl = 5
      @cloudflare_timeout_ms = 30_000
      @cloudflare_account_id = ENV.fetch("CLOUDFLARE_ACCOUNT_ID", nil)
      @cloudflare_api_token = ENV.fetch("CLOUDFLARE_API_TOKEN", nil)
    end

    # @return [String] the configured API token
    # @raise [UrlToMarkdown::MissingCredentialsError] when it is nil or empty
    def cloudflare_api_token!
      require_credential(cloudflare_api_token, "Missing Cloudflare API token")
    end

    # @return [String] the configured account ID
    # @raise [UrlToMarkdown::MissingCredentialsError] when it is nil or empty
    def cloudflare_account_id!
      require_credential(cloudflare_account_id, "Missing Cloudflare account ID")
    end

    private

    # Returns the value unless it is nil/false or empty, in which case the
    # given message is raised as MissingCredentialsError.
    def require_credential(value, missing_message)
      raise UrlToMarkdown::MissingCredentialsError.new(nil, missing_message) if !value || value.empty?

      value
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class UrlToMarkdown
  # Base class of every error the gem raises or returns in a failure result.
  class Error < StandardError
    # @return [Exception, nil] the lower-level error this one wraps, if any
    attr_reader :original_error

    # @param original_error [Exception, String, nil] error being wrapped. A
    #   String is treated as the message, so the idiomatic
    #   `raise UrlToMarkdown::SomeError, "text"` works; it used to fail with
    #   NoMethodError because String has no #message.
    # @param message [String, nil] explicit message; defaults to the wrapped
    #   error's message, then to the class name
    def initialize(original_error = nil, message = nil)
      if message.nil? && original_error.is_a?(String)
        message = original_error
        original_error = nil
      end

      @original_error = original_error
      super(message || (original_error.message if original_error.respond_to?(:message)))
    end
  end

  class ConfigurationError < Error; end
  class MissingCredentialsError < ConfigurationError; end

  class NetworkError < Error; end
  class TimeoutError < NetworkError; end
  class ConnectionError < NetworkError; end

  # Error built from an HTTP response of the remote API.
  class ApiError < Error
    # @return [Integer, nil] HTTP status of the response
    # @return [String, nil] raw response body
    attr_reader :status_code, :response_body

    # @param status_code [Integer, nil] HTTP status
    # @param response_body [String, nil] raw response body
    # @param message [String, nil] explicit message; defaults to
    #   "HTTP <status>" when an integer status is known, so that
    #   `error.message` carries more than the class name
    def initialize(status_code = nil, response_body = nil, message: nil)
      @status_code = status_code
      @response_body = response_body
      message = "HTTP #{status_code}" if message.nil? && status_code.is_a?(Integer)
      super(nil, message)
    end
  end

  class AuthenticationError < ApiError; end

  # API error for rate-limited requests.
  class RateLimitError < ApiError
    # @return [String, nil] value of the response's Retry-After header
    attr_reader :retry_after

    def initialize(status_code = nil, response_body = nil, retry_after: nil, message: nil)
      @retry_after = retry_after
      super(status_code, response_body, message: message)
    end
  end

  class NotFoundError < ApiError; end
  class ServerError < ApiError; end
  class MissingResultKeyInResponse < ApiError; end

  class ValidationError < Error; end
  class InvalidUrlError < ValidationError; end

  class CacheError < Error; end
  class CacheReadError < CacheError; end
  class CacheWriteError < CacheError; end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class UrlToMarkdown
  # Base class for processors. It keeps the logger and cache store it is
  # given; #convert must be supplied by subclasses.
  class Processor
    # @param logger [Object, nil] kept in @logger
    # @param cache_store [Object, nil] kept in @cache_store
    def initialize(logger: nil, cache_store: nil)
      @logger, @cache_store = logger, cache_store
    end

    # @raise [NotImplementedError] always, until a subclass overrides it
    def convert(_url) = raise(NotImplementedError, "Implement in subclass")
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class UrlToMarkdown
  # Builds SimpleResult values. The SimpleResult.success / .failure factory
  # methods are used when SimpleResult responds to them; otherwise the
  # Success / Failure classes are instantiated directly.
  module Result
    # @param value [Object] payload of the successful result
    # @return [Object] a SimpleResult success
    def self.success(value)
      return SimpleResult.success(value) if SimpleResult.respond_to?(:success)

      SimpleResult::Success.new(payload: value)
    end

    # @param error [Object] error carried by the failed result
    # @return [Object] a SimpleResult failure
    def self.failure(error)
      return SimpleResult.failure(error) if SimpleResult.respond_to?(:failure)

      SimpleResult::Failure.new(error: error)
    end
  end
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "zeitwerk"
|
|
4
|
+
require "simple_result"
|
|
5
|
+
|
|
6
|
+
loader = Zeitwerk::Loader.for_gem
|
|
7
|
+
loader.inflector.inflect("errors" => "Error", "pstore" => "PStore")
|
|
8
|
+
loader.setup
|
|
9
|
+
|
|
10
|
+
# Converts a URL to Markdown through a processor class, optionally reading
# from and writing to a cache store keyed by the URL.
class UrlToMarkdown
  # @param url [String] URL to convert; also used as the cache key
  # @param processor [Class, nil] processor class, instantiated with
  #   `logger:` and `cache_store:`; defaults to the configured default_processor
  # @param logger [Object, nil] defaults to the configured logger
  # @param cache_store [Object, nil] optional cache store
  def initialize(url:, processor: nil, logger: nil, cache_store: nil)
    @url = url
    @processor_class = processor || self.class.configuration.default_processor
    @logger = logger || self.class.configuration.logger
    @cache_store = cache_store
  end

  # Returns the cached content when the cache store has the URL; otherwise
  # runs the processor and caches a successful result.
  #
  # Any StandardError raised on the way is returned as a failure result,
  # wrapped in UrlToMarkdown::Error unless it already is one.
  #
  # @return [Object] success or failure result
  def convert
    @logger.info("UrlToMarkdown: converting #{@url}")

    result = read_cache || run_processor

    @logger.info("UrlToMarkdown: completed #{@url}")
    result
  rescue StandardError => e
    wrapped = e.is_a?(UrlToMarkdown::Error) ? e : UrlToMarkdown::Error.new(e)
    UrlToMarkdown::Result.failure(wrapped)
  end

  class << self
    # @return [UrlToMarkdown::Configuration] the memoized configuration
    def configuration
      @configuration ||= UrlToMarkdown::Configuration.new
    end

    # Yields the configuration for modification.
    def configure
      yield(configuration)
    end

    # Shortcut for `new(url: url, **options).convert`.
    def convert(url, **)
      new(url: url, **).convert
    end
  end

  private

  # Returns a success result built from the cache, or nil when there is no
  # cache store or no entry for the URL.
  def read_cache
    return unless @cache_store&.exists?(@url)

    cached = @cache_store.find_by(@url)
    result = UrlToMarkdown::Result.success(cached)
    @logger.info("UrlToMarkdown: cache hit for #{@url}")
    result
  end

  # Instantiates the processor, converts the URL and caches a successful result.
  def run_processor
    processor = @processor_class.new(logger: @logger, cache_store: @cache_store)
    result = processor.convert(@url)
    write_cache(result)
    result
  end

  # Caching is best-effort: a failing cache write is logged and must not turn
  # an already successful conversion into a failure result.
  def write_cache(result)
    return unless @cache_store && result.respond_to?(:success?) && result.success?

    payload = result.payload
    begin
      @cache_store.store!(@url, payload)
    rescue StandardError => e
      @logger.warn("UrlToMarkdown: cache write failed for #{@url}: #{e.message}")
    end
  end
end
|
|
55
|
+
|
|
56
|
+
loader.eager_load
|
metadata
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: url_to_markdown
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Lucian Ghinda
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: faraday
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '2.0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '2.0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: pstore
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0.1'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0.1'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: simple-result
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '0.3'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '0.3'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: zeitwerk
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '2.6'
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '2.6'
|
|
68
|
+
email:
|
|
69
|
+
- lucian@shortruby.com
|
|
70
|
+
executables: []
|
|
71
|
+
extensions: []
|
|
72
|
+
extra_rdoc_files: []
|
|
73
|
+
files:
|
|
74
|
+
- README.md
|
|
75
|
+
- lib/url_to_markdown.rb
|
|
76
|
+
- lib/url_to_markdown/cache_store.rb
|
|
77
|
+
- lib/url_to_markdown/cache_store/pstore.rb
|
|
78
|
+
- lib/url_to_markdown/cloudflare/client.rb
|
|
79
|
+
- lib/url_to_markdown/cloudflare/processor.rb
|
|
80
|
+
- lib/url_to_markdown/configuration.rb
|
|
81
|
+
- lib/url_to_markdown/errors.rb
|
|
82
|
+
- lib/url_to_markdown/processor.rb
|
|
83
|
+
- lib/url_to_markdown/result.rb
|
|
84
|
+
- lib/url_to_markdown/version.rb
|
|
85
|
+
homepage: https://github.com/lucianghinda/url_to_markdown
|
|
86
|
+
licenses:
|
|
87
|
+
- Apache-2.0
|
|
88
|
+
metadata:
|
|
89
|
+
rubygems_mfa_required: 'true'
|
|
90
|
+
homepage_uri: https://github.com/lucianghinda/url_to_markdown
|
|
91
|
+
source_code_uri: https://github.com/lucianghinda/url_to_markdown
|
|
92
|
+
rdoc_options: []
|
|
93
|
+
require_paths:
|
|
94
|
+
- lib
|
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
96
|
+
requirements:
|
|
97
|
+
- - ">="
|
|
98
|
+
- !ruby/object:Gem::Version
|
|
99
|
+
version: 3.4.4
|
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
101
|
+
requirements:
|
|
102
|
+
- - ">="
|
|
103
|
+
- !ruby/object:Gem::Version
|
|
104
|
+
version: '0'
|
|
105
|
+
requirements: []
|
|
106
|
+
rubygems_version: 4.0.3
|
|
107
|
+
specification_version: 4
|
|
108
|
+
summary: Convert URLs to Markdown via Cloudflare Browser Rendering
|
|
109
|
+
test_files: []
|