crawlscope 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/README.md +32 -0
- data/lib/crawlscope/cli.rb +16 -0
- data/lib/crawlscope/configuration.rb +10 -1
- data/lib/crawlscope/context.rb +1 -1
- data/lib/crawlscope/crawl.rb +72 -14
- data/lib/crawlscope/crawler.rb +3 -17
- data/lib/crawlscope/document_text.rb +7 -2
- data/lib/crawlscope/fetch_executor/async.rb +32 -0
- data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
- data/lib/crawlscope/fetch_executor.rb +43 -0
- data/lib/crawlscope/http.rb +7 -1
- data/lib/crawlscope/reporter.rb +123 -14
- data/lib/crawlscope/result.rb +1 -1
- data/lib/crawlscope/rules/content_quality.rb +1 -1
- data/lib/crawlscope/rules/indexability.rb +28 -6
- data/lib/crawlscope/rules/links.rb +80 -16
- data/lib/crawlscope/rules/uniqueness.rb +23 -4
- data/lib/crawlscope/sitemap.rb +30 -11
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +1 -1
- data/test/crawlscope/cli_test.rb +28 -2
- data/test/crawlscope/configuration_test.rb +21 -0
- data/test/crawlscope/content_quality_rule_test.rb +18 -0
- data/test/crawlscope/crawl_test.rb +142 -4
- data/test/crawlscope/crawler_test.rb +61 -0
- data/test/crawlscope/fetch_executor_test.rb +44 -0
- data/test/crawlscope/links_rule_test.rb +101 -0
- data/test/crawlscope/reporter_test.rb +136 -11
- data/test/crawlscope/result_test.rb +35 -0
- data/test/crawlscope/sitemap_test.rb +52 -0
- data/test/performance/async_fetch_benchmark.rb +127 -0
- data/test/performance/fetch_executor_matrix.rb +162 -0
- data/test/performance/sitemap_expansion_benchmark.rb +121 -0
- metadata +38 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cb1ed58c5dc558d7d7efcb357870fc4764a1d6d5caeb5ddc30e466334c986421
|
|
4
|
+
data.tar.gz: 9e90845271e781a0586c30c5c3f2c770b4a0c837474d78e8a19afa89c5b2fb6d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cd645a628045089499e213491a08157e5268b2238007c86e20eeff996dcb0246037915dce0019554995ade10c57eae31ffc79d1b256a123160f728a7f6e74722
|
|
7
|
+
data.tar.gz: d12723425911cc2c6184f6f2f31f8d0dbe6fde8bc021c3c45c0b5490e500906be6f49da74c3457e04778bb6ed0c50438489fd1875191090de702cf6d0ed494f0
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,33 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.6.0] - 2026-06-01
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- add bounded async crawl execution
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
### Changed
|
|
19
|
+
|
|
20
|
+
- default HTTP crawling to async
|
|
21
|
+
|
|
22
|
+
- update Ruby CI matrix
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
### Fixed
|
|
28
|
+
|
|
29
|
+
- respect noindex targets in sitemap link audit
|
|
30
|
+
|
|
31
|
+
- improve validation report readability
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
|
|
8
35
|
## [0.5.0] - 2026-05-31
|
|
9
36
|
|
|
10
37
|
|
data/README.md
CHANGED
|
@@ -32,6 +32,8 @@ The default rule set includes:
|
|
|
32
32
|
|
|
33
33
|
## Installation
|
|
34
34
|
|
|
35
|
+
Crawlscope requires Ruby 3.3 or newer.
|
|
36
|
+
|
|
35
37
|
Add this line to your application's Gemfile:
|
|
36
38
|
|
|
37
39
|
```ruby
|
|
@@ -87,6 +89,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site
|
|
|
87
89
|
|
|
88
90
|
Child sitemap indexes are supported automatically.
|
|
89
91
|
|
|
92
|
+
Validation output is grouped for terminal scanning:
|
|
93
|
+
|
|
94
|
+
```text
|
|
95
|
+
Crawlscope validation
|
|
96
|
+
Base URL: https://example.com
|
|
97
|
+
Sitemap: https://example.com/sitemap.xml
|
|
98
|
+
URLs: 24
|
|
99
|
+
Pages: 24
|
|
100
|
+
Status: FAILED
|
|
101
|
+
Issues: 3 3 warnings
|
|
102
|
+
|
|
103
|
+
Summary:
|
|
104
|
+
links 2
|
|
105
|
+
metadata 1
|
|
106
|
+
|
|
107
|
+
links / low_dofollow_inlinks: 2
|
|
108
|
+
- /pricing inbound 1/2 sources: /
|
|
109
|
+
- /features inbound 1/2 sources: /
|
|
110
|
+
|
|
111
|
+
metadata / missing_title: 1
|
|
112
|
+
- /draft missing <title>
|
|
113
|
+
```
|
|
114
|
+
|
|
90
115
|
## Ruby Usage
|
|
91
116
|
|
|
92
117
|
```ruby
|
|
@@ -143,6 +168,7 @@ Available environment overrides:
|
|
|
143
168
|
- `TIMEOUT=30`
|
|
144
169
|
- `NETWORK_IDLE_TIMEOUT=10`
|
|
145
170
|
- `CONCURRENCY=5`
|
|
171
|
+
- `FETCH_EXECUTOR=threaded` or `FETCH_EXECUTOR=async`
|
|
146
172
|
|
|
147
173
|
Available tasks:
|
|
148
174
|
|
|
@@ -173,6 +199,12 @@ bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
|
|
|
173
199
|
Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
|
|
174
200
|
task-argument form above instead.
|
|
175
201
|
|
|
202
|
+
`FETCH_EXECUTOR=async` is the default for HTTP crawling. It uses Ruby's fiber
|
|
203
|
+
scheduler and Async::HTTP through Faraday, preserving the same `CONCURRENCY`
|
|
204
|
+
bound. Use `FETCH_EXECUTOR=threaded` or `--fetch-executor threaded` for the
|
|
205
|
+
thread-pool executor. Browser rendering uses the threaded executor by default
|
|
206
|
+
because async fetch execution is only supported with HTTP rendering.
|
|
207
|
+
|
|
176
208
|
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
|
|
177
209
|
|
|
178
210
|
### Structured Data URL Audit
|
data/lib/crawlscope/cli.rb
CHANGED
|
@@ -134,6 +134,8 @@ module Crawlscope
|
|
|
134
134
|
|
|
135
135
|
configure_renderer(resolved_renderer)
|
|
136
136
|
@configuration.concurrency = resolved_concurrency
|
|
137
|
+
fetch_executor_configured = !normalized_string(ENV["FETCH_EXECUTOR"]).nil?
|
|
138
|
+
@configuration.fetch_executor = resolved_fetch_executor
|
|
137
139
|
@configuration.network_idle_timeout_seconds = resolved_integer("NETWORK_IDLE_TIMEOUT", default: @configuration.network_idle_timeout_seconds, minimum: 1)
|
|
138
140
|
@configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)
|
|
139
141
|
|
|
@@ -167,9 +169,15 @@ module Crawlscope
|
|
|
167
169
|
opts.on("--concurrency COUNT", Integer, "Set crawl concurrency") do |value|
|
|
168
170
|
@configuration.concurrency = integer_option(value, minimum: 1, name: "concurrency")
|
|
169
171
|
end
|
|
172
|
+
|
|
173
|
+
opts.on("--fetch-executor NAME", "Use threaded or async fetch execution") do |value|
|
|
174
|
+
fetch_executor_configured = true
|
|
175
|
+
@configuration.fetch_executor = value
|
|
176
|
+
end
|
|
170
177
|
end
|
|
171
178
|
|
|
172
179
|
parser.parse!(@argv)
|
|
180
|
+
@configuration.fetch_executor = :threaded if @configuration.renderer == :browser && !fetch_executor_configured
|
|
173
181
|
|
|
174
182
|
result = task.validate(
|
|
175
183
|
base_url: options[:url],
|
|
@@ -221,6 +229,14 @@ module Crawlscope
|
|
|
221
229
|
end
|
|
222
230
|
end
|
|
223
231
|
|
|
232
|
+
def resolved_fetch_executor
|
|
233
|
+
configured_executor = normalized_string(ENV["FETCH_EXECUTOR"])
|
|
234
|
+
return configured_executor if configured_executor
|
|
235
|
+
return :threaded if @configuration.renderer == :browser
|
|
236
|
+
|
|
237
|
+
@configuration.fetch_executor
|
|
238
|
+
end
|
|
239
|
+
|
|
224
240
|
def resolved_integer(name, default:, minimum:)
|
|
225
241
|
raw_value = normalized_string(ENV[name])
|
|
226
242
|
return default if raw_value.nil?
|
|
@@ -7,10 +7,11 @@ module Crawlscope
|
|
|
7
7
|
DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
|
|
8
8
|
DEFAULT_BROWSER_SCROLL_PAGE = true
|
|
9
9
|
DEFAULT_CONCURRENCY = 10
|
|
10
|
+
DEFAULT_FETCH_EXECUTOR = :async
|
|
10
11
|
RENDERERS = %i[http browser].freeze
|
|
11
12
|
DEFAULT_TIMEOUT_SECONDS = 20
|
|
12
13
|
|
|
13
|
-
attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
|
|
14
|
+
attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
|
|
14
15
|
|
|
15
16
|
def allowed_statuses
|
|
16
17
|
value = resolve(@allowed_statuses)
|
|
@@ -30,6 +31,13 @@ module Crawlscope
|
|
|
30
31
|
positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency")
|
|
31
32
|
end
|
|
32
33
|
|
|
34
|
+
def fetch_executor
|
|
35
|
+
value = resolve(@fetch_executor)
|
|
36
|
+
default = (renderer == :browser) ? :threaded : DEFAULT_FETCH_EXECUTOR
|
|
37
|
+
|
|
38
|
+
FetchExecutor.normalize(value.nil? ? default : value)
|
|
39
|
+
end
|
|
40
|
+
|
|
33
41
|
def browser_concurrency
|
|
34
42
|
value = concurrency
|
|
35
43
|
default_value = DEFAULT_BROWSER_CONCURRENCY
|
|
@@ -83,6 +91,7 @@ module Crawlscope
|
|
|
83
91
|
sitemap_path: sitemap_path,
|
|
84
92
|
browser_factory: browser_factory,
|
|
85
93
|
concurrency: concurrency,
|
|
94
|
+
fetch_executor: fetch_executor,
|
|
86
95
|
network_idle_timeout_seconds: network_idle_timeout_seconds,
|
|
87
96
|
renderer: renderer,
|
|
88
97
|
timeout_seconds: timeout_seconds,
|
data/lib/crawlscope/context.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Crawlscope
|
|
4
|
-
Context = Data.define(:allowed_statuses, :base_url, :resolve_target, :schema_registry) do
|
|
4
|
+
Context = Data.define(:allowed_statuses, :base_url, :concurrency, :fetch_executor, :resolve_target, :resolve_targets, :schema_registry) do
|
|
5
5
|
def fetch(name)
|
|
6
6
|
public_send(name)
|
|
7
7
|
end
|
data/lib/crawlscope/crawl.rb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
module Crawlscope
|
|
4
4
|
class Crawl
|
|
5
|
-
def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
|
|
5
|
+
def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: nil, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
|
|
6
6
|
@base_url = base_url
|
|
7
7
|
@sitemap_path = sitemap_path
|
|
8
8
|
@rules = Array(rules)
|
|
@@ -11,16 +11,19 @@ module Crawlscope
|
|
|
11
11
|
@concurrency = concurrency
|
|
12
12
|
@network_idle_timeout_seconds = network_idle_timeout_seconds
|
|
13
13
|
@renderer = renderer.to_sym
|
|
14
|
+
@fetch_executor = fetch_executor || default_fetch_executor
|
|
14
15
|
@scroll_page = scroll_page
|
|
15
16
|
@timeout_seconds = timeout_seconds
|
|
16
17
|
@allowed_statuses = allowed_statuses
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
def call
|
|
21
|
+
validate_fetch_executor!
|
|
22
|
+
|
|
20
23
|
urls = sitemap_urls
|
|
21
24
|
|
|
22
25
|
@page_fetcher = page
|
|
23
|
-
pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency).call(urls)
|
|
26
|
+
pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency, fetch_executor: @fetch_executor).call(urls)
|
|
24
27
|
issues = IssueCollection.new
|
|
25
28
|
|
|
26
29
|
collect(pages, issues)
|
|
@@ -41,7 +44,13 @@ module Crawlscope
|
|
|
41
44
|
private
|
|
42
45
|
|
|
43
46
|
def sitemap_urls
|
|
44
|
-
urls = Sitemap.new(
|
|
47
|
+
urls = Sitemap.new(
|
|
48
|
+
path: @sitemap_path,
|
|
49
|
+
adapter: http_adapter,
|
|
50
|
+
concurrency: @concurrency,
|
|
51
|
+
fetch_executor: @fetch_executor,
|
|
52
|
+
timeout_seconds: @timeout_seconds
|
|
53
|
+
).urls(base_url: @base_url)
|
|
45
54
|
raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
|
|
46
55
|
|
|
47
56
|
urls
|
|
@@ -62,15 +71,35 @@ module Crawlscope
|
|
|
62
71
|
if @renderer == :browser
|
|
63
72
|
(@browser_factory || method(:browser)).call
|
|
64
73
|
else
|
|
65
|
-
Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds)
|
|
74
|
+
Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds, adapter: http_adapter)
|
|
66
75
|
end
|
|
67
76
|
end
|
|
68
77
|
|
|
78
|
+
def http_adapter
|
|
79
|
+
return unless FetchExecutor.normalize(@fetch_executor) == :async
|
|
80
|
+
|
|
81
|
+
require "async/http/faraday"
|
|
82
|
+
:async_http
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def validate_fetch_executor!
|
|
86
|
+
return unless @renderer == :browser && FetchExecutor.normalize(@fetch_executor) == :async
|
|
87
|
+
|
|
88
|
+
raise ConfigurationError, "Async fetch execution is only supported with http rendering"
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def default_fetch_executor
|
|
92
|
+
(@renderer == :browser) ? :threaded : Configuration::DEFAULT_FETCH_EXECUTOR
|
|
93
|
+
end
|
|
94
|
+
|
|
69
95
|
def context
|
|
70
96
|
Context.new(
|
|
71
97
|
allowed_statuses: @allowed_statuses,
|
|
72
98
|
base_url: @base_url,
|
|
99
|
+
concurrency: @concurrency,
|
|
100
|
+
fetch_executor: @fetch_executor,
|
|
73
101
|
resolve_target: method(:resolve),
|
|
102
|
+
resolve_targets: method(:resolve_all),
|
|
74
103
|
schema_registry: @schema_registry
|
|
75
104
|
)
|
|
76
105
|
end
|
|
@@ -93,11 +122,15 @@ module Crawlscope
|
|
|
93
122
|
@targets = {}
|
|
94
123
|
|
|
95
124
|
pages.each do |page|
|
|
96
|
-
|
|
97
|
-
@pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
|
|
125
|
+
cache_page(page)
|
|
98
126
|
end
|
|
99
127
|
end
|
|
100
128
|
|
|
129
|
+
def cache_page(page)
|
|
130
|
+
@pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
|
|
131
|
+
@pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
|
|
132
|
+
end
|
|
133
|
+
|
|
101
134
|
def scan(urls, pages, issues)
|
|
102
135
|
@rules.each do |rule|
|
|
103
136
|
rule.call(urls: urls, pages: pages, issues: issues, context: context)
|
|
@@ -105,17 +138,40 @@ module Crawlscope
|
|
|
105
138
|
end
|
|
106
139
|
|
|
107
140
|
def resolve(target_url)
|
|
108
|
-
|
|
109
|
-
|
|
141
|
+
resolve_all([target_url]).fetch(target_url)
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
def resolve_all(target_urls)
|
|
145
|
+
normalized_by_url = Array(target_urls).to_h do |target_url|
|
|
146
|
+
[target_url, Url.normalize(target_url, base_url: @base_url)]
|
|
147
|
+
end
|
|
148
|
+
normalized_urls = normalized_by_url.values.compact.uniq
|
|
149
|
+
missing_urls = []
|
|
150
|
+
|
|
151
|
+
normalized_urls.each do |normalized_url|
|
|
152
|
+
next if @targets.key?(normalized_url)
|
|
153
|
+
|
|
154
|
+
resolved = resolved_page(normalized_url)
|
|
155
|
+
if resolved
|
|
156
|
+
@targets[normalized_url] = resolved
|
|
157
|
+
else
|
|
158
|
+
missing_urls << normalized_url
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
fetched_pages(missing_urls).each do |page|
|
|
163
|
+
normalized_url = Url.normalize(page.url, base_url: @base_url)
|
|
164
|
+
cache_page(page)
|
|
165
|
+
@targets[normalized_url] = resolution(page, normalized_url, crawled: false)
|
|
166
|
+
end
|
|
110
167
|
|
|
111
|
-
@targets[normalized_url]
|
|
168
|
+
normalized_by_url.to_h { |target_url, normalized_url| [target_url, @targets[normalized_url]] }
|
|
112
169
|
end
|
|
113
170
|
|
|
114
|
-
def
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
resolution(page, normalized_url, crawled: false)
|
|
171
|
+
def fetched_pages(normalized_urls)
|
|
172
|
+
return [] if normalized_urls.empty?
|
|
173
|
+
|
|
174
|
+
Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency, fetch_executor: @fetch_executor).call(normalized_urls)
|
|
119
175
|
end
|
|
120
176
|
|
|
121
177
|
def resolved_page(normalized_url)
|
|
@@ -126,8 +182,10 @@ module Crawlscope
|
|
|
126
182
|
def resolution(page, normalized_url, crawled:)
|
|
127
183
|
{
|
|
128
184
|
crawled: crawled,
|
|
185
|
+
doc: page.doc,
|
|
129
186
|
error: page.error,
|
|
130
187
|
final_url: page.normalized_final_url || normalized_url,
|
|
188
|
+
headers: page.headers,
|
|
131
189
|
html: page.html?,
|
|
132
190
|
status: page.status
|
|
133
191
|
}
|
data/lib/crawlscope/crawler.rb
CHANGED
|
@@ -1,28 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "concurrent"
|
|
4
|
-
|
|
5
3
|
module Crawlscope
|
|
6
4
|
class Crawler
|
|
7
|
-
def initialize(page_fetcher:, concurrency:)
|
|
5
|
+
def initialize(page_fetcher:, concurrency:, fetch_executor: :threaded)
|
|
8
6
|
@page_fetcher = page_fetcher
|
|
9
|
-
@
|
|
7
|
+
@fetch_executor = FetchExecutor.build(name: fetch_executor, concurrency: concurrency)
|
|
10
8
|
end
|
|
11
9
|
|
|
12
10
|
def call(urls)
|
|
13
|
-
|
|
14
|
-
pool = Concurrent::FixedThreadPool.new(@concurrency)
|
|
15
|
-
|
|
16
|
-
urls.each do |url|
|
|
17
|
-
pool.post do
|
|
18
|
-
pages << fetch(url)
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
pool.shutdown
|
|
23
|
-
pool.wait_for_termination
|
|
24
|
-
|
|
25
|
-
pages.to_a
|
|
11
|
+
@fetch_executor.call(urls) { |url| fetch(url) }
|
|
26
12
|
end
|
|
27
13
|
|
|
28
14
|
private
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
module Crawlscope
|
|
4
4
|
module DocumentText
|
|
5
5
|
REMOVED_SELECTORS = "script, style, noscript, template, svg"
|
|
6
|
+
CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form"
|
|
6
7
|
TOKEN_PATTERN = /[[:alnum:]]+/
|
|
7
8
|
|
|
8
9
|
module_function
|
|
@@ -15,6 +16,10 @@ module Crawlscope
|
|
|
15
16
|
root_for(doc, selector: selector)&.to_html.to_s
|
|
16
17
|
end
|
|
17
18
|
|
|
19
|
+
def content_ratio_html_for(doc, selector: "main")
|
|
20
|
+
root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s
|
|
21
|
+
end
|
|
22
|
+
|
|
18
23
|
def text_for(doc, selector: "main")
|
|
19
24
|
normalize(root_for(doc, selector: selector)&.text)
|
|
20
25
|
end
|
|
@@ -27,11 +32,11 @@ module Crawlscope
|
|
|
27
32
|
text.to_s.gsub(/\s+/, " ").strip
|
|
28
33
|
end
|
|
29
34
|
|
|
30
|
-
def root_for(doc, selector:)
|
|
35
|
+
def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
|
|
31
36
|
return unless doc
|
|
32
37
|
|
|
33
38
|
copy = doc.dup
|
|
34
|
-
copy.css(
|
|
39
|
+
copy.css(removed_selectors).remove
|
|
35
40
|
|
|
36
41
|
root = selector.to_s.empty? ? nil : copy.at_css(selector)
|
|
37
42
|
root || copy.at_css("body") || copy
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "async"
|
|
4
|
+
require "async/semaphore"
|
|
5
|
+
|
|
6
|
+
module Crawlscope
|
|
7
|
+
module FetchExecutor
|
|
8
|
+
class Async
|
|
9
|
+
def initialize(concurrency:)
|
|
10
|
+
@concurrency = concurrency
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def call(items)
|
|
14
|
+
indexed_items = Array(items).each_with_index.to_a
|
|
15
|
+
results = Array.new(indexed_items.size)
|
|
16
|
+
|
|
17
|
+
Sync do |parent|
|
|
18
|
+
semaphore = ::Async::Semaphore.new(@concurrency)
|
|
19
|
+
tasks = indexed_items.map do |item, index|
|
|
20
|
+
semaphore.async(parent: parent) do
|
|
21
|
+
results[index] = yield(item)
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
tasks.each(&:wait)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
results
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "concurrent"
|
|
4
|
+
|
|
5
|
+
module Crawlscope
|
|
6
|
+
module FetchExecutor
|
|
7
|
+
class Threaded
|
|
8
|
+
def initialize(concurrency:)
|
|
9
|
+
@concurrency = concurrency
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def call(items)
|
|
13
|
+
indexed_items = Array(items).each_with_index.to_a
|
|
14
|
+
results = Array.new(indexed_items.size)
|
|
15
|
+
mutex = Mutex.new
|
|
16
|
+
pool = Concurrent::FixedThreadPool.new(@concurrency)
|
|
17
|
+
|
|
18
|
+
indexed_items.each do |item, index|
|
|
19
|
+
pool.post do
|
|
20
|
+
result = yield(item)
|
|
21
|
+
mutex.synchronize { results[index] = result }
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
pool.shutdown
|
|
26
|
+
pool.wait_for_termination
|
|
27
|
+
|
|
28
|
+
results
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Crawlscope
|
|
4
|
+
module FetchExecutor
|
|
5
|
+
NAMES = %i[threaded async].freeze
|
|
6
|
+
|
|
7
|
+
module_function
|
|
8
|
+
|
|
9
|
+
def build(name:, concurrency:)
|
|
10
|
+
return name if name.respond_to?(:call)
|
|
11
|
+
|
|
12
|
+
case normalized_name(name)
|
|
13
|
+
when :threaded
|
|
14
|
+
Threaded.new(concurrency: concurrency)
|
|
15
|
+
when :async
|
|
16
|
+
Async.new(concurrency: concurrency)
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def map(name:, concurrency:, items:, &block)
|
|
21
|
+
items = Array(items)
|
|
22
|
+
return items.map(&block) if items.size < 2 || concurrency.to_i <= 1
|
|
23
|
+
|
|
24
|
+
build(name: name, concurrency: concurrency).call(items, &block)
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def normalize(name)
|
|
28
|
+
return name if name.respond_to?(:call)
|
|
29
|
+
|
|
30
|
+
normalized_name(name)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def normalized_name(name)
|
|
34
|
+
normalized = name.to_s.strip
|
|
35
|
+
normalized = "threaded" if normalized.empty?
|
|
36
|
+
|
|
37
|
+
value = normalized.to_sym
|
|
38
|
+
return value if NAMES.include?(value)
|
|
39
|
+
|
|
40
|
+
raise ConfigurationError, "Crawlscope fetch_executor must be threaded or async"
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
data/lib/crawlscope/http.rb
CHANGED
|
@@ -10,13 +10,18 @@ module Crawlscope
|
|
|
10
10
|
MAX_REDIRECTS = 5
|
|
11
11
|
USER_AGENT = "Mozilla/5.0 (compatible; Crawlscope/1.0)"
|
|
12
12
|
|
|
13
|
-
def initialize(base_url:, timeout_seconds:)
|
|
13
|
+
def initialize(base_url:, timeout_seconds:, adapter: nil)
|
|
14
14
|
@base_url = base_url
|
|
15
15
|
@timeout_seconds = timeout_seconds
|
|
16
|
+
@adapter = adapter
|
|
16
17
|
@connections_by_thread = Concurrent::Map.new
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
def close
|
|
21
|
+
@connections_by_thread.each_value do |connection|
|
|
22
|
+
connection.close if connection.respond_to?(:close)
|
|
23
|
+
end
|
|
24
|
+
|
|
20
25
|
@connections_by_thread.clear
|
|
21
26
|
end
|
|
22
27
|
|
|
@@ -65,6 +70,7 @@ module Crawlscope
|
|
|
65
70
|
faraday.response :follow_redirects, limit: MAX_REDIRECTS
|
|
66
71
|
faraday.options.timeout = @timeout_seconds
|
|
67
72
|
faraday.options.open_timeout = @timeout_seconds
|
|
73
|
+
faraday.adapter @adapter if @adapter
|
|
68
74
|
end
|
|
69
75
|
end
|
|
70
76
|
end
|