crawlscope 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/README.md +32 -0
  4. data/lib/crawlscope/cli.rb +16 -0
  5. data/lib/crawlscope/configuration.rb +10 -1
  6. data/lib/crawlscope/context.rb +1 -1
  7. data/lib/crawlscope/crawl.rb +72 -14
  8. data/lib/crawlscope/crawler.rb +3 -17
  9. data/lib/crawlscope/document_text.rb +7 -2
  10. data/lib/crawlscope/fetch_executor/async.rb +32 -0
  11. data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
  12. data/lib/crawlscope/fetch_executor.rb +43 -0
  13. data/lib/crawlscope/http.rb +7 -1
  14. data/lib/crawlscope/reporter.rb +123 -14
  15. data/lib/crawlscope/result.rb +1 -1
  16. data/lib/crawlscope/rules/content_quality.rb +1 -1
  17. data/lib/crawlscope/rules/indexability.rb +28 -6
  18. data/lib/crawlscope/rules/links.rb +80 -16
  19. data/lib/crawlscope/rules/uniqueness.rb +23 -4
  20. data/lib/crawlscope/sitemap.rb +30 -11
  21. data/lib/crawlscope/version.rb +1 -1
  22. data/lib/tasks/crawlscope_tasks.rake +1 -1
  23. data/test/crawlscope/cli_test.rb +28 -2
  24. data/test/crawlscope/configuration_test.rb +21 -0
  25. data/test/crawlscope/content_quality_rule_test.rb +18 -0
  26. data/test/crawlscope/crawl_test.rb +142 -4
  27. data/test/crawlscope/crawler_test.rb +61 -0
  28. data/test/crawlscope/fetch_executor_test.rb +44 -0
  29. data/test/crawlscope/links_rule_test.rb +101 -0
  30. data/test/crawlscope/reporter_test.rb +136 -11
  31. data/test/crawlscope/result_test.rb +35 -0
  32. data/test/crawlscope/sitemap_test.rb +52 -0
  33. data/test/performance/async_fetch_benchmark.rb +127 -0
  34. data/test/performance/fetch_executor_matrix.rb +162 -0
  35. data/test/performance/sitemap_expansion_benchmark.rb +121 -0
  36. metadata +38 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7d9e56ae9a55e3c4bb6b079585b8a302edcc1bfad9110938c9421c5224bf27f9
4
- data.tar.gz: ab1908aa4a1bef4c15f055800ca9862ba973c9257f39020309de1f5554923cca
3
+ metadata.gz: cb1ed58c5dc558d7d7efcb357870fc4764a1d6d5caeb5ddc30e466334c986421
4
+ data.tar.gz: 9e90845271e781a0586c30c5c3f2c770b4a0c837474d78e8a19afa89c5b2fb6d
5
5
  SHA512:
6
- metadata.gz: 8981de1e7bc19737df3048b1e19f28d585f22eed8f2b32ea4eea473ba377d3a261e08df8165114c8d967b7ec7d14a48c47a6b83cfe14261f6c83b56b39134766
7
- data.tar.gz: 3df1e21bf74c12e994c0a932f9c581e4a6e12b55dd14975e43c5eca073bcc8eb4a49337d3c3c2e52137c4436fb5ddf52accd6fc11954171d77e75ce8f75e69a5
6
+ metadata.gz: cd645a628045089499e213491a08157e5268b2238007c86e20eeff996dcb0246037915dce0019554995ade10c57eae31ffc79d1b256a123160f728a7f6e74722
7
+ data.tar.gz: d12723425911cc2c6184f6f2f31f8d0dbe6fde8bc021c3c45c0b5490e500906be6f49da74c3457e04778bb6ed0c50438489fd1875191090de702cf6d0ed494f0
data/CHANGELOG.md CHANGED
@@ -5,6 +5,33 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.6.0] - 2026-06-01
9
+
10
+
11
+ ### Added
12
+
13
+ - add bounded async crawl execution
14
+
15
+
16
+
17
+
18
+ ### Changed
19
+
20
+ - default HTTP crawling to async
21
+
22
+ - update Ruby CI matrix
23
+
24
+
25
+
26
+
27
+ ### Fixed
28
+
29
+ - respect noindex targets in sitemap link audit
30
+
31
+ - improve validation report readability
32
+
33
+
34
+
8
35
  ## [0.5.0] - 2026-05-31
9
36
 
10
37
 
data/README.md CHANGED
@@ -32,6 +32,8 @@ The default rule set includes:
32
32
 
33
33
  ## Installation
34
34
 
35
+ Crawlscope requires Ruby 3.3 or newer.
36
+
35
37
  Add this line to your application's Gemfile:
36
38
 
37
39
  ```ruby
@@ -87,6 +89,29 @@ crawlscope validate --url https://example.com --sitemap https://example.com/site
87
89
 
88
90
  Child sitemap indexes are supported automatically.
89
91
 
92
+ Validation output is grouped for terminal scanning:
93
+
94
+ ```text
95
+ Crawlscope validation
96
+ Base URL: https://example.com
97
+ Sitemap: https://example.com/sitemap.xml
98
+ URLs: 24
99
+ Pages: 24
100
+ Status: FAILED
101
+ Issues: 3 3 warnings
102
+
103
+ Summary:
104
+ links 2
105
+ metadata 1
106
+
107
+ links / low_dofollow_inlinks: 2
108
+ - /pricing inbound 1/2 sources: /
109
+ - /features inbound 1/2 sources: /
110
+
111
+ metadata / missing_title: 1
112
+ - /draft missing <title>
113
+ ```
114
+
90
115
  ## Ruby Usage
91
116
 
92
117
  ```ruby
@@ -143,6 +168,7 @@ Available environment overrides:
143
168
  - `TIMEOUT=30`
144
169
  - `NETWORK_IDLE_TIMEOUT=10`
145
170
  - `CONCURRENCY=5`
171
+ - `FETCH_EXECUTOR=threaded` or `FETCH_EXECUTOR=async`
146
172
 
147
173
  Available tasks:
148
174
 
@@ -173,6 +199,12 @@ bundle exec rake 'crawlscope:validate:ldjson[https://example.com/article]'
173
199
  Plain `rake` does not pass `--url` style flags to tasks. Use `URL=...` or the
174
200
  task-argument form above instead.
175
201
 
202
+ `FETCH_EXECUTOR=async` is the default for HTTP crawling. It uses Ruby's fiber
203
+ scheduler and Async::HTTP through Faraday, preserving the same `CONCURRENCY`
204
+ bound. Use `FETCH_EXECUTOR=threaded` or `--fetch-executor threaded` for the
205
+ thread-pool executor. Browser rendering uses the threaded executor by default
206
+ because async fetch execution is only supported with HTTP rendering.
207
+
176
208
  `crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.
177
209
 
178
210
  ### Structured Data URL Audit
@@ -134,6 +134,8 @@ module Crawlscope
134
134
 
135
135
  configure_renderer(resolved_renderer)
136
136
  @configuration.concurrency = resolved_concurrency
137
+ fetch_executor_configured = !normalized_string(ENV["FETCH_EXECUTOR"]).nil?
138
+ @configuration.fetch_executor = resolved_fetch_executor
137
139
  @configuration.network_idle_timeout_seconds = resolved_integer("NETWORK_IDLE_TIMEOUT", default: @configuration.network_idle_timeout_seconds, minimum: 1)
138
140
  @configuration.timeout_seconds = resolved_integer("TIMEOUT", default: @configuration.timeout_seconds, minimum: 1)
139
141
 
@@ -167,9 +169,15 @@ module Crawlscope
167
169
  opts.on("--concurrency COUNT", Integer, "Set crawl concurrency") do |value|
168
170
  @configuration.concurrency = integer_option(value, minimum: 1, name: "concurrency")
169
171
  end
172
+
173
+ opts.on("--fetch-executor NAME", "Use threaded or async fetch execution") do |value|
174
+ fetch_executor_configured = true
175
+ @configuration.fetch_executor = value
176
+ end
170
177
  end
171
178
 
172
179
  parser.parse!(@argv)
180
+ @configuration.fetch_executor = :threaded if @configuration.renderer == :browser && !fetch_executor_configured
173
181
 
174
182
  result = task.validate(
175
183
  base_url: options[:url],
@@ -221,6 +229,14 @@ module Crawlscope
221
229
  end
222
230
  end
223
231
 
232
+ def resolved_fetch_executor
233
+ configured_executor = normalized_string(ENV["FETCH_EXECUTOR"])
234
+ return configured_executor if configured_executor
235
+ return :threaded if @configuration.renderer == :browser
236
+
237
+ @configuration.fetch_executor
238
+ end
239
+
224
240
  def resolved_integer(name, default:, minimum:)
225
241
  raw_value = normalized_string(ENV[name])
226
242
  return default if raw_value.nil?
@@ -7,10 +7,11 @@ module Crawlscope
7
7
  DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS = 5
8
8
  DEFAULT_BROWSER_SCROLL_PAGE = true
9
9
  DEFAULT_CONCURRENCY = 10
10
+ DEFAULT_FETCH_EXECUTOR = :async
10
11
  RENDERERS = %i[http browser].freeze
11
12
  DEFAULT_TIMEOUT_SECONDS = 20
12
13
 
13
- attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
14
+ attr_writer :allowed_statuses, :base_url, :browser_factory, :concurrency, :fetch_executor, :network_idle_timeout_seconds, :output, :renderer, :rule_registry, :schema_registry, :scroll_page, :site_name, :sitemap_path, :timeout_seconds
14
15
 
15
16
  def allowed_statuses
16
17
  value = resolve(@allowed_statuses)
@@ -30,6 +31,13 @@ module Crawlscope
30
31
  positive_integer(value, default: DEFAULT_CONCURRENCY, name: "concurrency")
31
32
  end
32
33
 
34
+ def fetch_executor
35
+ value = resolve(@fetch_executor)
36
+ default = (renderer == :browser) ? :threaded : DEFAULT_FETCH_EXECUTOR
37
+
38
+ FetchExecutor.normalize(value.nil? ? default : value)
39
+ end
40
+
33
41
  def browser_concurrency
34
42
  value = concurrency
35
43
  default_value = DEFAULT_BROWSER_CONCURRENCY
@@ -83,6 +91,7 @@ module Crawlscope
83
91
  sitemap_path: sitemap_path,
84
92
  browser_factory: browser_factory,
85
93
  concurrency: concurrency,
94
+ fetch_executor: fetch_executor,
86
95
  network_idle_timeout_seconds: network_idle_timeout_seconds,
87
96
  renderer: renderer,
88
97
  timeout_seconds: timeout_seconds,
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Crawlscope
4
- Context = Data.define(:allowed_statuses, :base_url, :resolve_target, :schema_registry) do
4
+ Context = Data.define(:allowed_statuses, :base_url, :concurrency, :fetch_executor, :resolve_target, :resolve_targets, :schema_registry) do
5
5
  def fetch(name)
6
6
  public_send(name)
7
7
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Crawlscope
4
4
  class Crawl
5
- def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
5
+ def initialize(base_url:, sitemap_path:, rules:, schema_registry:, browser_factory: nil, concurrency: Configuration::DEFAULT_CONCURRENCY, fetch_executor: nil, network_idle_timeout_seconds: Configuration::DEFAULT_BROWSER_NETWORK_IDLE_TIMEOUT_SECONDS, renderer: :http, scroll_page: Configuration::DEFAULT_BROWSER_SCROLL_PAGE, timeout_seconds: Configuration::DEFAULT_TIMEOUT_SECONDS, allowed_statuses: Configuration::DEFAULT_ALLOWED_STATUSES)
6
6
  @base_url = base_url
7
7
  @sitemap_path = sitemap_path
8
8
  @rules = Array(rules)
@@ -11,16 +11,19 @@ module Crawlscope
11
11
  @concurrency = concurrency
12
12
  @network_idle_timeout_seconds = network_idle_timeout_seconds
13
13
  @renderer = renderer.to_sym
14
+ @fetch_executor = fetch_executor || default_fetch_executor
14
15
  @scroll_page = scroll_page
15
16
  @timeout_seconds = timeout_seconds
16
17
  @allowed_statuses = allowed_statuses
17
18
  end
18
19
 
19
20
  def call
21
+ validate_fetch_executor!
22
+
20
23
  urls = sitemap_urls
21
24
 
22
25
  @page_fetcher = page
23
- pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency).call(urls)
26
+ pages = Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency, fetch_executor: @fetch_executor).call(urls)
24
27
  issues = IssueCollection.new
25
28
 
26
29
  collect(pages, issues)
@@ -41,7 +44,13 @@ module Crawlscope
41
44
  private
42
45
 
43
46
  def sitemap_urls
44
- urls = Sitemap.new(path: @sitemap_path).urls(base_url: @base_url)
47
+ urls = Sitemap.new(
48
+ path: @sitemap_path,
49
+ adapter: http_adapter,
50
+ concurrency: @concurrency,
51
+ fetch_executor: @fetch_executor,
52
+ timeout_seconds: @timeout_seconds
53
+ ).urls(base_url: @base_url)
45
54
  raise ValidationError, "No URLs found in sitemap: #{@sitemap_path}" if urls.empty?
46
55
 
47
56
  urls
@@ -62,15 +71,35 @@ module Crawlscope
62
71
  if @renderer == :browser
63
72
  (@browser_factory || method(:browser)).call
64
73
  else
65
- Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds)
74
+ Http.new(base_url: @base_url, timeout_seconds: @timeout_seconds, adapter: http_adapter)
66
75
  end
67
76
  end
68
77
 
78
+ def http_adapter
79
+ return unless FetchExecutor.normalize(@fetch_executor) == :async
80
+
81
+ require "async/http/faraday"
82
+ :async_http
83
+ end
84
+
85
+ def validate_fetch_executor!
86
+ return unless @renderer == :browser && FetchExecutor.normalize(@fetch_executor) == :async
87
+
88
+ raise ConfigurationError, "Async fetch execution is only supported with http rendering"
89
+ end
90
+
91
+ def default_fetch_executor
92
+ (@renderer == :browser) ? :threaded : Configuration::DEFAULT_FETCH_EXECUTOR
93
+ end
94
+
69
95
  def context
70
96
  Context.new(
71
97
  allowed_statuses: @allowed_statuses,
72
98
  base_url: @base_url,
99
+ concurrency: @concurrency,
100
+ fetch_executor: @fetch_executor,
73
101
  resolve_target: method(:resolve),
102
+ resolve_targets: method(:resolve_all),
74
103
  schema_registry: @schema_registry
75
104
  )
76
105
  end
@@ -93,11 +122,15 @@ module Crawlscope
93
122
  @targets = {}
94
123
 
95
124
  pages.each do |page|
96
- @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
97
- @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
125
+ cache_page(page)
98
126
  end
99
127
  end
100
128
 
129
+ def cache_page(page)
130
+ @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
131
+ @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
132
+ end
133
+
101
134
  def scan(urls, pages, issues)
102
135
  @rules.each do |rule|
103
136
  rule.call(urls: urls, pages: pages, issues: issues, context: context)
@@ -105,17 +138,40 @@ module Crawlscope
105
138
  end
106
139
 
107
140
  def resolve(target_url)
108
- normalized_url = Url.normalize(target_url, base_url: @base_url)
109
- return @targets[normalized_url] if @targets.key?(normalized_url)
141
+ resolve_all([target_url]).fetch(target_url)
142
+ end
143
+
144
+ def resolve_all(target_urls)
145
+ normalized_by_url = Array(target_urls).to_h do |target_url|
146
+ [target_url, Url.normalize(target_url, base_url: @base_url)]
147
+ end
148
+ normalized_urls = normalized_by_url.values.compact.uniq
149
+ missing_urls = []
150
+
151
+ normalized_urls.each do |normalized_url|
152
+ next if @targets.key?(normalized_url)
153
+
154
+ resolved = resolved_page(normalized_url)
155
+ if resolved
156
+ @targets[normalized_url] = resolved
157
+ else
158
+ missing_urls << normalized_url
159
+ end
160
+ end
161
+
162
+ fetched_pages(missing_urls).each do |page|
163
+ normalized_url = Url.normalize(page.url, base_url: @base_url)
164
+ cache_page(page)
165
+ @targets[normalized_url] = resolution(page, normalized_url, crawled: false)
166
+ end
110
167
 
111
- @targets[normalized_url] = resolved_page(normalized_url) || fetched_page(normalized_url)
168
+ normalized_by_url.to_h { |target_url, normalized_url| [target_url, @targets[normalized_url]] }
112
169
  end
113
170
 
114
- def fetched_page(normalized_url)
115
- page = @page_fetcher.fetch(normalized_url)
116
- @pages[page.normalized_url] = page unless page.normalized_url.to_s.empty?
117
- @pages[page.normalized_final_url] = page unless page.normalized_final_url.to_s.empty?
118
- resolution(page, normalized_url, crawled: false)
171
+ def fetched_pages(normalized_urls)
172
+ return [] if normalized_urls.empty?
173
+
174
+ Crawler.new(page_fetcher: @page_fetcher, concurrency: @concurrency, fetch_executor: @fetch_executor).call(normalized_urls)
119
175
  end
120
176
 
121
177
  def resolved_page(normalized_url)
@@ -126,8 +182,10 @@ module Crawlscope
126
182
  def resolution(page, normalized_url, crawled:)
127
183
  {
128
184
  crawled: crawled,
185
+ doc: page.doc,
129
186
  error: page.error,
130
187
  final_url: page.normalized_final_url || normalized_url,
188
+ headers: page.headers,
131
189
  html: page.html?,
132
190
  status: page.status
133
191
  }
@@ -1,28 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "concurrent"
4
-
5
3
  module Crawlscope
6
4
  class Crawler
7
- def initialize(page_fetcher:, concurrency:)
5
+ def initialize(page_fetcher:, concurrency:, fetch_executor: :threaded)
8
6
  @page_fetcher = page_fetcher
9
- @concurrency = concurrency
7
+ @fetch_executor = FetchExecutor.build(name: fetch_executor, concurrency: concurrency)
10
8
  end
11
9
 
12
10
  def call(urls)
13
- pages = Concurrent::Array.new
14
- pool = Concurrent::FixedThreadPool.new(@concurrency)
15
-
16
- urls.each do |url|
17
- pool.post do
18
- pages << fetch(url)
19
- end
20
- end
21
-
22
- pool.shutdown
23
- pool.wait_for_termination
24
-
25
- pages.to_a
11
+ @fetch_executor.call(urls) { |url| fetch(url) }
26
12
  end
27
13
 
28
14
  private
@@ -3,6 +3,7 @@
3
3
  module Crawlscope
4
4
  module DocumentText
5
5
  REMOVED_SELECTORS = "script, style, noscript, template, svg"
6
+ CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form"
6
7
  TOKEN_PATTERN = /[[:alnum:]]+/
7
8
 
8
9
  module_function
@@ -15,6 +16,10 @@ module Crawlscope
15
16
  root_for(doc, selector: selector)&.to_html.to_s
16
17
  end
17
18
 
19
+ def content_ratio_html_for(doc, selector: "main")
20
+ root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)&.to_html.to_s
21
+ end
22
+
18
23
  def text_for(doc, selector: "main")
19
24
  normalize(root_for(doc, selector: selector)&.text)
20
25
  end
@@ -27,11 +32,11 @@ module Crawlscope
27
32
  text.to_s.gsub(/\s+/, " ").strip
28
33
  end
29
34
 
30
- def root_for(doc, selector:)
35
+ def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
31
36
  return unless doc
32
37
 
33
38
  copy = doc.dup
34
- copy.css(REMOVED_SELECTORS).remove
39
+ copy.css(removed_selectors).remove
35
40
 
36
41
  root = selector.to_s.empty? ? nil : copy.at_css(selector)
37
42
  root || copy.at_css("body") || copy
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "async"
4
+ require "async/semaphore"
5
+
6
+ module Crawlscope
7
+ module FetchExecutor
8
+ class Async
9
+ def initialize(concurrency:)
10
+ @concurrency = concurrency
11
+ end
12
+
13
+ def call(items)
14
+ indexed_items = Array(items).each_with_index.to_a
15
+ results = Array.new(indexed_items.size)
16
+
17
+ Sync do |parent|
18
+ semaphore = ::Async::Semaphore.new(@concurrency)
19
+ tasks = indexed_items.map do |item, index|
20
+ semaphore.async(parent: parent) do
21
+ results[index] = yield(item)
22
+ end
23
+ end
24
+
25
+ tasks.each(&:wait)
26
+ end
27
+
28
+ results
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "concurrent"
4
+
5
+ module Crawlscope
6
+ module FetchExecutor
7
+ class Threaded
8
+ def initialize(concurrency:)
9
+ @concurrency = concurrency
10
+ end
11
+
12
+ def call(items)
13
+ indexed_items = Array(items).each_with_index.to_a
14
+ results = Array.new(indexed_items.size)
15
+ mutex = Mutex.new
16
+ pool = Concurrent::FixedThreadPool.new(@concurrency)
17
+
18
+ indexed_items.each do |item, index|
19
+ pool.post do
20
+ result = yield(item)
21
+ mutex.synchronize { results[index] = result }
22
+ end
23
+ end
24
+
25
+ pool.shutdown
26
+ pool.wait_for_termination
27
+
28
+ results
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Crawlscope
4
+ module FetchExecutor
5
+ NAMES = %i[threaded async].freeze
6
+
7
+ module_function
8
+
9
+ def build(name:, concurrency:)
10
+ return name if name.respond_to?(:call)
11
+
12
+ case normalized_name(name)
13
+ when :threaded
14
+ Threaded.new(concurrency: concurrency)
15
+ when :async
16
+ Async.new(concurrency: concurrency)
17
+ end
18
+ end
19
+
20
+ def map(name:, concurrency:, items:, &block)
21
+ items = Array(items)
22
+ return items.map(&block) if items.size < 2 || concurrency.to_i <= 1
23
+
24
+ build(name: name, concurrency: concurrency).call(items, &block)
25
+ end
26
+
27
+ def normalize(name)
28
+ return name if name.respond_to?(:call)
29
+
30
+ normalized_name(name)
31
+ end
32
+
33
+ def normalized_name(name)
34
+ normalized = name.to_s.strip
35
+ normalized = "threaded" if normalized.empty?
36
+
37
+ value = normalized.to_sym
38
+ return value if NAMES.include?(value)
39
+
40
+ raise ConfigurationError, "Crawlscope fetch_executor must be threaded or async"
41
+ end
42
+ end
43
+ end
@@ -10,13 +10,18 @@ module Crawlscope
10
10
  MAX_REDIRECTS = 5
11
11
  USER_AGENT = "Mozilla/5.0 (compatible; Crawlscope/1.0)"
12
12
 
13
- def initialize(base_url:, timeout_seconds:)
13
+ def initialize(base_url:, timeout_seconds:, adapter: nil)
14
14
  @base_url = base_url
15
15
  @timeout_seconds = timeout_seconds
16
+ @adapter = adapter
16
17
  @connections_by_thread = Concurrent::Map.new
17
18
  end
18
19
 
19
20
  def close
21
+ @connections_by_thread.each_value do |connection|
22
+ connection.close if connection.respond_to?(:close)
23
+ end
24
+
20
25
  @connections_by_thread.clear
21
26
  end
22
27
 
@@ -65,6 +70,7 @@ module Crawlscope
65
70
  faraday.response :follow_redirects, limit: MAX_REDIRECTS
66
71
  faraday.options.timeout = @timeout_seconds
67
72
  faraday.options.open_timeout = @timeout_seconds
73
+ faraday.adapter @adapter if @adapter
68
74
  end
69
75
  end
70
76
  end