crawlscope 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/README.md +32 -0
- data/lib/crawlscope/cli.rb +16 -0
- data/lib/crawlscope/configuration.rb +10 -1
- data/lib/crawlscope/context.rb +1 -1
- data/lib/crawlscope/crawl.rb +72 -14
- data/lib/crawlscope/crawler.rb +3 -17
- data/lib/crawlscope/document_text.rb +7 -2
- data/lib/crawlscope/fetch_executor/async.rb +32 -0
- data/lib/crawlscope/fetch_executor/threaded.rb +32 -0
- data/lib/crawlscope/fetch_executor.rb +43 -0
- data/lib/crawlscope/http.rb +7 -1
- data/lib/crawlscope/reporter.rb +123 -14
- data/lib/crawlscope/result.rb +1 -1
- data/lib/crawlscope/rules/content_quality.rb +1 -1
- data/lib/crawlscope/rules/indexability.rb +28 -6
- data/lib/crawlscope/rules/links.rb +80 -16
- data/lib/crawlscope/rules/uniqueness.rb +23 -4
- data/lib/crawlscope/sitemap.rb +30 -11
- data/lib/crawlscope/version.rb +1 -1
- data/lib/tasks/crawlscope_tasks.rake +1 -1
- data/test/crawlscope/cli_test.rb +28 -2
- data/test/crawlscope/configuration_test.rb +21 -0
- data/test/crawlscope/content_quality_rule_test.rb +18 -0
- data/test/crawlscope/crawl_test.rb +142 -4
- data/test/crawlscope/crawler_test.rb +61 -0
- data/test/crawlscope/fetch_executor_test.rb +44 -0
- data/test/crawlscope/links_rule_test.rb +101 -0
- data/test/crawlscope/reporter_test.rb +136 -11
- data/test/crawlscope/result_test.rb +35 -0
- data/test/crawlscope/sitemap_test.rb +52 -0
- data/test/performance/async_fetch_benchmark.rb +127 -0
- data/test/performance/fetch_executor_matrix.rb +162 -0
- data/test/performance/sitemap_expansion_benchmark.rb +121 -0
- metadata +38 -2
|
@@ -3,6 +3,19 @@
|
|
|
3
3
|
require "test_helper"
|
|
4
4
|
|
|
5
5
|
class CrawlscopeSitemapTest < Minitest::Test
|
|
6
|
+
class RecordingExecutor
|
|
7
|
+
attr_reader :batches
|
|
8
|
+
|
|
9
|
+
def initialize
|
|
10
|
+
@batches = []
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def call(items)
|
|
14
|
+
@batches << items
|
|
15
|
+
items.map { |item| yield(item) }
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
6
19
|
def test_parses_remote_sitemap_urlset
|
|
7
20
|
stub_request(:get, "https://www.example.com/sitemap.xml")
|
|
8
21
|
.to_return(
|
|
@@ -127,4 +140,43 @@ class CrawlscopeSitemapTest < Minitest::Test
|
|
|
127
140
|
assert_equal ["http://localhost:3000/features/reviews"], parser.urls(base_url: "http://localhost:3000")
|
|
128
141
|
end
|
|
129
142
|
end
|
|
143
|
+
|
|
144
|
+
def test_child_sitemaps_are_collected_through_the_fetch_executor
|
|
145
|
+
Dir.mktmpdir do |dir|
|
|
146
|
+
File.write(
|
|
147
|
+
File.join(dir, "sitemap.xml"),
|
|
148
|
+
<<~XML
|
|
149
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
150
|
+
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
151
|
+
<sitemap><loc>first.xml</loc></sitemap>
|
|
152
|
+
<sitemap><loc>second.xml</loc></sitemap>
|
|
153
|
+
</sitemapindex>
|
|
154
|
+
XML
|
|
155
|
+
)
|
|
156
|
+
File.write(
|
|
157
|
+
File.join(dir, "first.xml"),
|
|
158
|
+
<<~XML
|
|
159
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
160
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
161
|
+
<url><loc>http://localhost:3000/first</loc></url>
|
|
162
|
+
</urlset>
|
|
163
|
+
XML
|
|
164
|
+
)
|
|
165
|
+
File.write(
|
|
166
|
+
File.join(dir, "second.xml"),
|
|
167
|
+
<<~XML
|
|
168
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
169
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
170
|
+
<url><loc>http://localhost:3000/second</loc></url>
|
|
171
|
+
</urlset>
|
|
172
|
+
XML
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
executor = RecordingExecutor.new
|
|
176
|
+
parser = Crawlscope::Sitemap.new(path: File.join(dir, "sitemap.xml"), fetch_executor: executor)
|
|
177
|
+
|
|
178
|
+
assert_equal ["http://localhost:3000/first", "http://localhost:3000/second"], parser.urls(base_url: "http://localhost:3000")
|
|
179
|
+
assert_equal [[File.join(dir, "first.xml"), File.join(dir, "second.xml")]], executor.batches
|
|
180
|
+
end
|
|
181
|
+
end
|
|
130
182
|
end
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
$LOAD_PATH.unshift File.expand_path("../../lib", __dir__)
|
|
4
|
+
|
|
5
|
+
require "bundler/setup"
|
|
6
|
+
require "crawlscope"
|
|
7
|
+
require "json"
|
|
8
|
+
require "socket"
|
|
9
|
+
require "time"
|
|
10
|
+
|
|
11
|
+
class DelayedHttpServer
|
|
12
|
+
attr_reader :base_url
|
|
13
|
+
|
|
14
|
+
def initialize(page_count:, delay_seconds:)
|
|
15
|
+
@page_count = page_count
|
|
16
|
+
@delay_seconds = delay_seconds
|
|
17
|
+
@server = TCPServer.new("127.0.0.1", 0)
|
|
18
|
+
@base_url = "http://127.0.0.1:#{@server.addr[1]}"
|
|
19
|
+
@threads = []
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def start
|
|
23
|
+
@thread = Thread.new do
|
|
24
|
+
loop do
|
|
25
|
+
socket = @server.accept
|
|
26
|
+
@threads << Thread.new(socket) { |client| respond(client) }
|
|
27
|
+
rescue IOError
|
|
28
|
+
break
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def stop
|
|
34
|
+
@server.close
|
|
35
|
+
@thread&.join
|
|
36
|
+
@threads.each(&:join)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
def respond(socket)
|
|
42
|
+
request_line = socket.gets.to_s
|
|
43
|
+
path = request_line.split[1].to_s
|
|
44
|
+
read_headers(socket)
|
|
45
|
+
|
|
46
|
+
if path == "/sitemap.xml"
|
|
47
|
+
write_response(socket, sitemap_xml, content_type: "application/xml")
|
|
48
|
+
else
|
|
49
|
+
sleep @delay_seconds
|
|
50
|
+
write_response(socket, page_html(path), content_type: "text/html")
|
|
51
|
+
end
|
|
52
|
+
ensure
|
|
53
|
+
socket.close
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def read_headers(socket)
|
|
57
|
+
loop do
|
|
58
|
+
line = socket.gets
|
|
59
|
+
break if line.nil? || line == "\r\n"
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def sitemap_xml
|
|
64
|
+
urls = (1..@page_count).map do |index|
|
|
65
|
+
"<url><loc>#{@base_url}/pages/#{index}</loc></url>"
|
|
66
|
+
end.join
|
|
67
|
+
|
|
68
|
+
%(<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">#{urls}</urlset>)
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def page_html(path)
|
|
72
|
+
<<~HTML
|
|
73
|
+
<html>
|
|
74
|
+
<head><title>#{path}</title></head>
|
|
75
|
+
<body><main><h1>#{path}</h1><p>#{path} benchmark page</p></main></body>
|
|
76
|
+
</html>
|
|
77
|
+
HTML
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def write_response(socket, body, content_type:)
|
|
81
|
+
socket.write "HTTP/1.1 200 OK\r\n"
|
|
82
|
+
socket.write "Content-Type: #{content_type}\r\n"
|
|
83
|
+
socket.write "Content-Length: #{body.bytesize}\r\n"
|
|
84
|
+
socket.write "Connection: close\r\n"
|
|
85
|
+
socket.write "\r\n"
|
|
86
|
+
socket.write body
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def measure(name, base_url:, concurrency:, fetch_executor:)
|
|
91
|
+
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
92
|
+
|
|
93
|
+
result = Crawlscope::Crawl.new(
|
|
94
|
+
base_url: base_url,
|
|
95
|
+
sitemap_path: "#{base_url}/sitemap.xml",
|
|
96
|
+
rules: [],
|
|
97
|
+
schema_registry: Crawlscope::SchemaRegistry.default,
|
|
98
|
+
concurrency: concurrency,
|
|
99
|
+
fetch_executor: fetch_executor
|
|
100
|
+
).call
|
|
101
|
+
|
|
102
|
+
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
|
|
103
|
+
[name, {seconds: elapsed.round(3), pages: result.pages.size, issues: result.issues.size}]
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
server = DelayedHttpServer.new(page_count: 24, delay_seconds: 0.08)
|
|
107
|
+
server.start
|
|
108
|
+
|
|
109
|
+
begin
|
|
110
|
+
results = {}
|
|
111
|
+
[
|
|
112
|
+
measure("threaded_concurrency_1", base_url: server.base_url, concurrency: 1, fetch_executor: :threaded),
|
|
113
|
+
measure("threaded_concurrency_8", base_url: server.base_url, concurrency: 8, fetch_executor: :threaded),
|
|
114
|
+
measure("async_concurrency_8", base_url: server.base_url, concurrency: 8, fetch_executor: :async)
|
|
115
|
+
].each { |name, result| results[name] = result }
|
|
116
|
+
|
|
117
|
+
sequential = results.fetch("threaded_concurrency_1").fetch(:seconds)
|
|
118
|
+
threaded = results.fetch("threaded_concurrency_8").fetch(:seconds)
|
|
119
|
+
async = results.fetch("async_concurrency_8").fetch(:seconds)
|
|
120
|
+
|
|
121
|
+
abort "async benchmark failed: async was not meaningfully faster than sequential" unless async < sequential * 0.6
|
|
122
|
+
abort "async benchmark failed: async was more than 2x slower than threaded" if async > threaded * 2.0
|
|
123
|
+
|
|
124
|
+
puts JSON.pretty_generate(results)
|
|
125
|
+
ensure
|
|
126
|
+
server.stop
|
|
127
|
+
end
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
$LOAD_PATH.unshift File.expand_path("../../lib", __dir__)
|
|
4
|
+
|
|
5
|
+
require "bundler/setup"
|
|
6
|
+
require "crawlscope"
|
|
7
|
+
require "json"
|
|
8
|
+
require "socket"
|
|
9
|
+
|
|
10
|
+
class MatrixHttpServer
|
|
11
|
+
attr_reader :base_url
|
|
12
|
+
|
|
13
|
+
def initialize(page_count:, delay_seconds:, link_targets: false)
|
|
14
|
+
@page_count = page_count
|
|
15
|
+
@delay_seconds = delay_seconds
|
|
16
|
+
@link_targets = link_targets
|
|
17
|
+
@server = TCPServer.new("127.0.0.1", 0)
|
|
18
|
+
@base_url = "http://127.0.0.1:#{@server.addr[1]}"
|
|
19
|
+
@threads = []
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def start
|
|
23
|
+
@thread = Thread.new do
|
|
24
|
+
loop do
|
|
25
|
+
socket = @server.accept
|
|
26
|
+
@threads << Thread.new(socket) { |client| respond(client) }
|
|
27
|
+
rescue IOError
|
|
28
|
+
break
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def stop
|
|
34
|
+
@server.close
|
|
35
|
+
@thread&.join
|
|
36
|
+
@threads.each(&:join)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
def respond(socket)
|
|
42
|
+
request_line = socket.gets.to_s
|
|
43
|
+
path = request_line.split[1].to_s
|
|
44
|
+
read_headers(socket)
|
|
45
|
+
|
|
46
|
+
if path == "/sitemap.xml"
|
|
47
|
+
write_response(socket, sitemap_xml, content_type: "application/xml")
|
|
48
|
+
else
|
|
49
|
+
sleep @delay_seconds
|
|
50
|
+
write_response(socket, page_html(path), content_type: "text/html")
|
|
51
|
+
end
|
|
52
|
+
ensure
|
|
53
|
+
socket.close
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def read_headers(socket)
|
|
57
|
+
loop do
|
|
58
|
+
line = socket.gets
|
|
59
|
+
break if line.nil? || line == "\r\n"
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def sitemap_xml
|
|
64
|
+
paths = @link_targets ? ["/seed"] : (1..@page_count).map { |index| "/pages/#{index}" }
|
|
65
|
+
urls = paths.map { |path| "<url><loc>#{@base_url}#{path}</loc></url>" }.join
|
|
66
|
+
|
|
67
|
+
%(<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">#{urls}</urlset>)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def page_html(path)
|
|
71
|
+
links = if @link_targets && path == "/seed"
|
|
72
|
+
(1..@page_count).map { |index| %(<a href="/targets/#{index}">Target #{index}</a>) }.join
|
|
73
|
+
else
|
|
74
|
+
""
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
<<~HTML
|
|
78
|
+
<html>
|
|
79
|
+
<head>
|
|
80
|
+
<title>#{path}</title>
|
|
81
|
+
<meta name="robots" content="noindex">
|
|
82
|
+
</head>
|
|
83
|
+
<body>
|
|
84
|
+
<main><h1>#{path}</h1><p>#{path} benchmark page</p>#{links}</main>
|
|
85
|
+
</body>
|
|
86
|
+
</html>
|
|
87
|
+
HTML
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def write_response(socket, body, content_type:)
|
|
91
|
+
socket.write "HTTP/1.1 200 OK\r\n"
|
|
92
|
+
socket.write "Content-Type: #{content_type}\r\n"
|
|
93
|
+
socket.write "Content-Length: #{body.bytesize}\r\n"
|
|
94
|
+
socket.write "Connection: close\r\n"
|
|
95
|
+
socket.write "\r\n"
|
|
96
|
+
socket.write body
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
def measure(base_url:, concurrency:, fetch_executor:, rules:)
|
|
101
|
+
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
102
|
+
|
|
103
|
+
result = Crawlscope::Crawl.new(
|
|
104
|
+
base_url: base_url,
|
|
105
|
+
sitemap_path: "#{base_url}/sitemap.xml",
|
|
106
|
+
rules: rules,
|
|
107
|
+
schema_registry: Crawlscope::SchemaRegistry.default,
|
|
108
|
+
concurrency: concurrency,
|
|
109
|
+
fetch_executor: fetch_executor
|
|
110
|
+
).call
|
|
111
|
+
|
|
112
|
+
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
|
|
113
|
+
{seconds: elapsed, pages: result.pages.size, issues: result.issues.size}
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
def median(values)
|
|
117
|
+
sorted = values.sort
|
|
118
|
+
sorted[sorted.length / 2]
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def run_case(name:, page_count:, delay_seconds:, concurrency:, link_targets: false)
|
|
122
|
+
server = MatrixHttpServer.new(page_count: page_count, delay_seconds: delay_seconds, link_targets: link_targets)
|
|
123
|
+
server.start
|
|
124
|
+
|
|
125
|
+
rules = link_targets ? [Crawlscope::Rules::Links.new] : []
|
|
126
|
+
threaded = []
|
|
127
|
+
async = []
|
|
128
|
+
|
|
129
|
+
3.times do
|
|
130
|
+
threaded << measure(base_url: server.base_url, concurrency: concurrency, fetch_executor: :threaded, rules: rules)
|
|
131
|
+
async << measure(base_url: server.base_url, concurrency: concurrency, fetch_executor: :async, rules: rules)
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
threaded_seconds = median(threaded.map { |result| result.fetch(:seconds) })
|
|
135
|
+
async_seconds = median(async.map { |result| result.fetch(:seconds) })
|
|
136
|
+
|
|
137
|
+
{
|
|
138
|
+
name: name,
|
|
139
|
+
page_count: page_count,
|
|
140
|
+
delay_seconds: delay_seconds,
|
|
141
|
+
concurrency: concurrency,
|
|
142
|
+
link_targets: link_targets,
|
|
143
|
+
threaded_seconds: threaded_seconds.round(3),
|
|
144
|
+
async_seconds: async_seconds.round(3),
|
|
145
|
+
async_vs_threaded: (threaded_seconds / async_seconds).round(2),
|
|
146
|
+
pages: async.first.fetch(:pages),
|
|
147
|
+
issues: async.first.fetch(:issues)
|
|
148
|
+
}
|
|
149
|
+
ensure
|
|
150
|
+
server&.stop
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
cases = [
|
|
154
|
+
{name: "direct_pages_c8", page_count: 48, delay_seconds: 0.02, concurrency: 8},
|
|
155
|
+
{name: "direct_pages_c16", page_count: 48, delay_seconds: 0.02, concurrency: 16},
|
|
156
|
+
{name: "slow_direct_pages_c8", page_count: 48, delay_seconds: 0.08, concurrency: 8},
|
|
157
|
+
{name: "slow_direct_pages_c16", page_count: 48, delay_seconds: 0.08, concurrency: 16},
|
|
158
|
+
{name: "link_targets_c8", page_count: 48, delay_seconds: 0.02, concurrency: 8, link_targets: true},
|
|
159
|
+
{name: "slow_link_targets_c8", page_count: 48, delay_seconds: 0.08, concurrency: 8, link_targets: true}
|
|
160
|
+
]
|
|
161
|
+
|
|
162
|
+
puts JSON.pretty_generate(cases.map { |attributes| run_case(**attributes) })
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
$LOAD_PATH.unshift File.expand_path("../../lib", __dir__)
|
|
4
|
+
|
|
5
|
+
require "bundler/setup"
|
|
6
|
+
require "crawlscope"
|
|
7
|
+
require "json"
|
|
8
|
+
require "socket"
|
|
9
|
+
|
|
10
|
+
class DelayedSitemapServer
|
|
11
|
+
attr_reader :base_url
|
|
12
|
+
|
|
13
|
+
def initialize(child_count:, delay_seconds:)
|
|
14
|
+
@child_count = child_count
|
|
15
|
+
@delay_seconds = delay_seconds
|
|
16
|
+
@server = TCPServer.new("127.0.0.1", 0)
|
|
17
|
+
@base_url = "http://127.0.0.1:#{@server.addr[1]}"
|
|
18
|
+
@threads = []
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def start
|
|
22
|
+
@thread = Thread.new do
|
|
23
|
+
loop do
|
|
24
|
+
socket = @server.accept
|
|
25
|
+
@threads << Thread.new(socket) { |client| respond(client) }
|
|
26
|
+
rescue IOError
|
|
27
|
+
break
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def stop
|
|
33
|
+
@server.close
|
|
34
|
+
@thread&.join
|
|
35
|
+
@threads.each(&:join)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
private
|
|
39
|
+
|
|
40
|
+
def respond(socket)
|
|
41
|
+
request_line = socket.gets.to_s
|
|
42
|
+
path = request_line.split[1].to_s
|
|
43
|
+
read_headers(socket)
|
|
44
|
+
|
|
45
|
+
if path == "/sitemap.xml"
|
|
46
|
+
write_response(socket, sitemap_index, content_type: "application/xml")
|
|
47
|
+
else
|
|
48
|
+
sleep @delay_seconds
|
|
49
|
+
write_response(socket, child_sitemap(path), content_type: "application/xml")
|
|
50
|
+
end
|
|
51
|
+
ensure
|
|
52
|
+
socket.close
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def read_headers(socket)
|
|
56
|
+
loop do
|
|
57
|
+
line = socket.gets
|
|
58
|
+
break if line.nil? || line == "\r\n"
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def sitemap_index
|
|
63
|
+
children = (1..@child_count).map do |index|
|
|
64
|
+
"<sitemap><loc>#{@base_url}/sitemaps/#{index}.xml</loc></sitemap>"
|
|
65
|
+
end.join
|
|
66
|
+
|
|
67
|
+
%(<?xml version="1.0" encoding="UTF-8"?><sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">#{children}</sitemapindex>)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def child_sitemap(path)
|
|
71
|
+
index = File.basename(path, ".xml")
|
|
72
|
+
|
|
73
|
+
%(<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>#{@base_url}/pages/#{index}</loc></url></urlset>)
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
def write_response(socket, body, content_type:)
|
|
77
|
+
socket.write "HTTP/1.1 200 OK\r\n"
|
|
78
|
+
socket.write "Content-Type: #{content_type}\r\n"
|
|
79
|
+
socket.write "Content-Length: #{body.bytesize}\r\n"
|
|
80
|
+
socket.write "Connection: close\r\n"
|
|
81
|
+
socket.write "\r\n"
|
|
82
|
+
socket.write body
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def measure(name, base_url:, concurrency:, fetch_executor:)
|
|
87
|
+
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
88
|
+
|
|
89
|
+
urls = Crawlscope::Sitemap.new(
|
|
90
|
+
path: "#{base_url}/sitemap.xml",
|
|
91
|
+
concurrency: concurrency,
|
|
92
|
+
fetch_executor: fetch_executor,
|
|
93
|
+
timeout_seconds: 5
|
|
94
|
+
).urls(base_url: base_url)
|
|
95
|
+
|
|
96
|
+
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
|
|
97
|
+
[name, {seconds: elapsed.round(3), urls: urls.size}]
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
server = DelayedSitemapServer.new(child_count: 24, delay_seconds: 0.08)
|
|
101
|
+
server.start
|
|
102
|
+
|
|
103
|
+
begin
|
|
104
|
+
results = {}
|
|
105
|
+
[
|
|
106
|
+
measure("threaded_concurrency_1", base_url: server.base_url, concurrency: 1, fetch_executor: :threaded),
|
|
107
|
+
measure("threaded_concurrency_8", base_url: server.base_url, concurrency: 8, fetch_executor: :threaded),
|
|
108
|
+
measure("async_concurrency_8", base_url: server.base_url, concurrency: 8, fetch_executor: :async)
|
|
109
|
+
].each { |name, result| results[name] = result }
|
|
110
|
+
|
|
111
|
+
sequential = results.fetch("threaded_concurrency_1").fetch(:seconds)
|
|
112
|
+
threaded = results.fetch("threaded_concurrency_8").fetch(:seconds)
|
|
113
|
+
async = results.fetch("async_concurrency_8").fetch(:seconds)
|
|
114
|
+
|
|
115
|
+
abort "sitemap benchmark failed: threaded parallelism was not at least 2x faster" unless threaded < sequential * 0.5
|
|
116
|
+
abort "sitemap benchmark failed: async parallelism was not at least 2x faster" unless async < sequential * 0.5
|
|
117
|
+
|
|
118
|
+
puts JSON.pretty_generate(results)
|
|
119
|
+
ensure
|
|
120
|
+
server.stop
|
|
121
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: crawlscope
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.0
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Paulo Fidalgo
|
|
@@ -24,6 +24,34 @@ dependencies:
|
|
|
24
24
|
- - ">="
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '1.3'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: async
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '2.0'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '2.0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: async-http-faraday
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0.22'
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0.22'
|
|
27
55
|
- !ruby/object:Gem::Dependency
|
|
28
56
|
name: faraday
|
|
29
57
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -200,6 +228,9 @@ files:
|
|
|
200
228
|
- lib/crawlscope/crawl.rb
|
|
201
229
|
- lib/crawlscope/crawler.rb
|
|
202
230
|
- lib/crawlscope/document_text.rb
|
|
231
|
+
- lib/crawlscope/fetch_executor.rb
|
|
232
|
+
- lib/crawlscope/fetch_executor/async.rb
|
|
233
|
+
- lib/crawlscope/fetch_executor/threaded.rb
|
|
203
234
|
- lib/crawlscope/http.rb
|
|
204
235
|
- lib/crawlscope/issue.rb
|
|
205
236
|
- lib/crawlscope/issue_collection.rb
|
|
@@ -234,6 +265,7 @@ files:
|
|
|
234
265
|
- test/crawlscope/content_quality_rule_test.rb
|
|
235
266
|
- test/crawlscope/crawl_test.rb
|
|
236
267
|
- test/crawlscope/crawler_test.rb
|
|
268
|
+
- test/crawlscope/fetch_executor_test.rb
|
|
237
269
|
- test/crawlscope/http_test.rb
|
|
238
270
|
- test/crawlscope/indexability_rule_test.rb
|
|
239
271
|
- test/crawlscope/links_rule_test.rb
|
|
@@ -241,6 +273,7 @@ files:
|
|
|
241
273
|
- test/crawlscope/metadata_rule_test.rb
|
|
242
274
|
- test/crawlscope/rake_tasks_test.rb
|
|
243
275
|
- test/crawlscope/reporter_test.rb
|
|
276
|
+
- test/crawlscope/result_test.rb
|
|
244
277
|
- test/crawlscope/rule_registry_test.rb
|
|
245
278
|
- test/crawlscope/run_test.rb
|
|
246
279
|
- test/crawlscope/schema_registry_test.rb
|
|
@@ -253,6 +286,9 @@ files:
|
|
|
253
286
|
- test/crawlscope/structured_data_writer_test.rb
|
|
254
287
|
- test/crawlscope/uniqueness_rule_test.rb
|
|
255
288
|
- test/crawlscope/url_test.rb
|
|
289
|
+
- test/performance/async_fetch_benchmark.rb
|
|
290
|
+
- test/performance/fetch_executor_matrix.rb
|
|
291
|
+
- test/performance/sitemap_expansion_benchmark.rb
|
|
256
292
|
- test/release_task_test.rb
|
|
257
293
|
- test/test_helper.rb
|
|
258
294
|
homepage: https://www.ethos-link.com/opensource/crawlscope
|
|
@@ -275,7 +311,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
275
311
|
requirements:
|
|
276
312
|
- - ">="
|
|
277
313
|
- !ruby/object:Gem::Version
|
|
278
|
-
version: 3.
|
|
314
|
+
version: 3.3.0
|
|
279
315
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
280
316
|
requirements:
|
|
281
317
|
- - ">="
|