sunscraper 1.1.0.beta3 → 1.2.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/common/common.pro +13 -0
- data/ext/common/libsunscraper_common.a +0 -0
- data/ext/common/sunscraperproxy.cpp +11 -0
- data/ext/{standalone → common}/sunscraperproxy.h +5 -5
- data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
- data/ext/{embed → common}/sunscraperwebpage.h +0 -0
- data/ext/common/sunscraperworker.cpp +124 -0
- data/ext/common/sunscraperworker.h +50 -0
- data/ext/embed/embed.pro +14 -12
- data/ext/embed/sunscraperexternal.cpp +17 -16
- data/ext/embed/sunscraperinterface.cpp +206 -0
- data/ext/embed/sunscraperinterface.h +66 -0
- data/ext/embed/sunscraperlibrary.cpp +2 -12
- data/ext/embed/sunscraperlibrary.h +0 -1
- data/ext/embed/sunscraperthread.cpp +49 -0
- data/ext/embed/sunscraperthread.h +24 -0
- data/ext/extconf.rb +5 -3
- data/ext/standalone/standalone.pro +12 -6
- data/ext/standalone/sunscrapermain.cpp +13 -3
- data/ext/standalone/sunscraperrpc.cpp +76 -88
- data/ext/standalone/sunscraperrpc.h +19 -22
- data/ext/standalone/sunscraperrpcserver.cpp +26 -0
- data/ext/standalone/sunscraperrpcserver.h +24 -0
- data/ext/sunscraper-ext.pro +1 -1
- data/lib/sunscraper.rb +14 -14
- data/lib/sunscraper/library.rb +9 -9
- data/lib/sunscraper/standalone.rb +53 -107
- data/spec/sunscraper_spec.rb +86 -44
- data/sunscraper.gemspec +1 -1
- metadata +19 -17
- data/ext/embed/sunscraper.cpp +0 -92
- data/ext/embed/sunscraper.h +0 -47
- data/ext/embed/sunscraperproxy.cpp +0 -14
- data/ext/embed/sunscraperproxy.h +0 -24
- data/ext/embed/sunscraperworker.cpp +0 -163
- data/ext/embed/sunscraperworker.h +0 -58
- data/ext/standalone/sunscraperproxy.cpp +0 -14
- data/ext/standalone/sunscraperworker.cpp +0 -60
- data/ext/standalone/sunscraperworker.h +0 -34
@@ -7,7 +7,6 @@
|
|
7
7
|
|
8
8
|
class SunscraperWorker;
|
9
9
|
class QLocalSocket;
|
10
|
-
class QTimer;
|
11
10
|
|
12
11
|
class SunscraperRPC : public QObject
|
13
12
|
{
|
@@ -18,47 +17,45 @@ class SunscraperRPC : public QObject
|
|
18
17
|
StateData,
|
19
18
|
};
|
20
19
|
|
21
|
-
struct Header {
|
22
|
-
quint32 queryId;
|
23
|
-
quint32 requestType;
|
24
|
-
quint32 dataLength;
|
25
|
-
};
|
26
|
-
|
27
20
|
enum Request {
|
28
|
-
|
29
|
-
|
21
|
+
RPC_LOAD_URL = 1,
|
22
|
+
RPC_LOAD_HTML = 2,
|
30
23
|
RPC_WAIT = 3,
|
31
24
|
RPC_FETCH = 4,
|
32
|
-
RPC_DISCARD = 5,
|
33
25
|
};
|
34
26
|
|
35
27
|
public:
|
36
|
-
SunscraperRPC(
|
28
|
+
SunscraperRPC(QLocalSocket *socket);
|
37
29
|
~SunscraperRPC();
|
38
30
|
|
31
|
+
signals:
|
32
|
+
void disconnected();
|
33
|
+
|
39
34
|
private slots:
|
40
35
|
void onInputReadable();
|
41
36
|
void onInputDisconnected();
|
42
|
-
|
43
|
-
void
|
37
|
+
|
38
|
+
void onFinish(unsigned queryId);
|
39
|
+
void onTimeout(unsigned queryId);
|
40
|
+
void onFetchDone(unsigned queryId, QString data);
|
44
41
|
|
45
42
|
private:
|
43
|
+
static unsigned m_nextQueryId;
|
44
|
+
static SunscraperWorker *m_worker;
|
45
|
+
|
46
|
+
unsigned m_queryId;
|
46
47
|
QLocalSocket *m_socket;
|
47
48
|
|
48
|
-
State
|
49
|
-
|
49
|
+
State m_state;
|
50
|
+
unsigned m_pendingRequest, m_pendingDataLength;
|
50
51
|
QByteArray m_buffer;
|
51
52
|
|
52
|
-
|
53
|
-
|
54
|
-
QList<unsigned> m_waitQueue;
|
55
|
-
QMap<unsigned, QTimer*> m_timers;
|
56
|
-
QMap<unsigned, QString> m_results;
|
53
|
+
bool m_result;
|
57
54
|
|
58
55
|
SunscraperRPC();
|
59
56
|
|
60
|
-
void processRequest(
|
61
|
-
void sendReply(
|
57
|
+
void processRequest(unsigned requestType, QByteArray data);
|
58
|
+
void sendReply(QByteArray data);
|
62
59
|
};
|
63
60
|
|
64
61
|
#endif // SUNSCRAPERRPC_H
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#include <QLocalServer>
|
2
|
+
#include "sunscraperrpcserver.h"
|
3
|
+
#include "sunscraperrpc.h"
|
4
|
+
|
5
|
+
SunscraperRPCServer::SunscraperRPCServer(QObject *parent) :
|
6
|
+
QObject(parent)
|
7
|
+
{
|
8
|
+
m_localServer = new QLocalServer();
|
9
|
+
|
10
|
+
connect(m_localServer, SIGNAL(newConnection()), this, SLOT(onNewConnection()));
|
11
|
+
}
|
12
|
+
|
13
|
+
bool SunscraperRPCServer::listen(QString socketPath)
|
14
|
+
{
|
15
|
+
return m_localServer->listen(socketPath);
|
16
|
+
}
|
17
|
+
|
18
|
+
void SunscraperRPCServer::onNewConnection()
|
19
|
+
{
|
20
|
+
while(m_localServer->hasPendingConnections()) {
|
21
|
+
QLocalSocket *socket = m_localServer->nextPendingConnection();
|
22
|
+
|
23
|
+
SunscraperRPC *rpc = new SunscraperRPC(socket);
|
24
|
+
connect(rpc, SIGNAL(disconnected()), rpc, SLOT(deleteLater()));
|
25
|
+
}
|
26
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#ifndef SUNSCRAPERRPCSERVER_H
|
2
|
+
#define SUNSCRAPERRPCSERVER_H
|
3
|
+
|
4
|
+
#include <QObject>
|
5
|
+
|
6
|
+
class QLocalServer;
|
7
|
+
|
8
|
+
class SunscraperRPCServer : public QObject
|
9
|
+
{
|
10
|
+
Q_OBJECT
|
11
|
+
|
12
|
+
public:
|
13
|
+
SunscraperRPCServer(QObject *parent = 0);
|
14
|
+
|
15
|
+
bool listen(QString socketPath);
|
16
|
+
|
17
|
+
private slots:
|
18
|
+
void onNewConnection();
|
19
|
+
|
20
|
+
private:
|
21
|
+
QLocalServer *m_localServer;
|
22
|
+
};
|
23
|
+
|
24
|
+
#endif
|
data/ext/sunscraper-ext.pro
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
TEMPLATE = subdirs
|
2
|
-
SUBDIRS = embed standalone
|
2
|
+
SUBDIRS = common embed standalone
|
data/lib/sunscraper.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
if
|
2
|
-
raise RuntimeError, "Sunscraper does not work on Ruby
|
1
|
+
if RUBY_VERSION =~ /^1.8/
|
2
|
+
raise RuntimeError, "Sunscraper does not work on Ruby 1.8."
|
3
3
|
end
|
4
4
|
|
5
5
|
# Sunscraper loads an HTML page in a headless browser and waits for `Sunscraper.finish()`
|
@@ -27,9 +27,9 @@ module Sunscraper
|
|
27
27
|
# If your application depends on base URL being available, use {scrape_url}.
|
28
28
|
#
|
29
29
|
# @param [Integer] timeout timeout in milliseconds
|
30
|
-
def scrape_html(html, timeout=5000)
|
30
|
+
def scrape_html(html, url="about:blank", timeout=5000)
|
31
31
|
scrape(timeout) do |worker, context|
|
32
|
-
worker.load_html context, html
|
32
|
+
worker.load_html context, html, url
|
33
33
|
end
|
34
34
|
end
|
35
35
|
|
@@ -47,19 +47,19 @@ module Sunscraper
|
|
47
47
|
def scrape(timeout)
|
48
48
|
worker = load_worker
|
49
49
|
|
50
|
-
|
51
|
-
|
52
|
-
worker.wait(context, timeout)
|
50
|
+
begin
|
51
|
+
context = worker.create
|
53
52
|
|
54
|
-
|
53
|
+
yield worker, context
|
55
54
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
55
|
+
if worker.wait(context, timeout)
|
56
|
+
worker.fetch(context)
|
57
|
+
else
|
58
|
+
raise ScrapeTimeout, "Sunscraper has timed out waiting for the callback"
|
59
|
+
end
|
60
|
+
ensure
|
61
|
+
worker.finalize(context) if context
|
60
62
|
end
|
61
|
-
ensure
|
62
|
-
worker.discard(context) if context
|
63
63
|
end
|
64
64
|
|
65
65
|
def load_worker
|
data/lib/sunscraper/library.rb
CHANGED
@@ -19,23 +19,23 @@ module Sunscraper
|
|
19
19
|
ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
|
20
20
|
'ext', 'embed', "libsunscraper.#{extension}")
|
21
21
|
|
22
|
-
attach_function 'create', :sunscraper_create, [],
|
23
|
-
attach_function 'load_html', :sunscraper_load_html, [:
|
24
|
-
attach_function 'load_url', :sunscraper_load_url, [:
|
25
|
-
attach_function 'fetch', :sunscraper_fetch, [:
|
26
|
-
attach_function '
|
22
|
+
attach_function 'create', :sunscraper_create, [], :uint
|
23
|
+
attach_function 'load_html', :sunscraper_load_html, [:uint, :string, :string], :void
|
24
|
+
attach_function 'load_url', :sunscraper_load_url, [:uint, :string], :void
|
25
|
+
attach_function 'fetch', :sunscraper_fetch, [:uint], :string
|
26
|
+
attach_function 'finalize', :sunscraper_finalize, [:uint], :void
|
27
27
|
|
28
28
|
if RUBY_ENGINE == 'ruby'
|
29
29
|
# MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
|
30
|
-
attach_function 'wait', :sunscraper_wait, [:
|
30
|
+
attach_function 'wait', :sunscraper_wait, [:uint, :uint], :bool, :blocking => true
|
31
31
|
else
|
32
32
|
# Rubinius has neither a GVL nor options in attach_function.
|
33
33
|
# Same for JRuby.
|
34
|
-
attach_function 'wait', :sunscraper_wait, [:
|
34
|
+
attach_function 'wait', :sunscraper_wait, [:uint, :uint], :bool
|
35
35
|
end
|
36
36
|
|
37
|
-
attach_function '
|
37
|
+
attach_function 'quit', :sunscraper_quit, [], :void
|
38
38
|
|
39
|
-
at_exit {
|
39
|
+
at_exit { quit }
|
40
40
|
end
|
41
41
|
end
|
@@ -3,113 +3,65 @@ require 'socket'
|
|
3
3
|
# @private
|
4
4
|
module Sunscraper
|
5
5
|
module Standalone
|
6
|
-
@
|
6
|
+
@rpc_mutex = Mutex.new
|
7
|
+
@rpc_socket = nil
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
@rpc_results = {}
|
11
|
-
@rpc_thread = nil
|
12
|
-
|
13
|
-
RPC_LOAD_HTML = 1
|
14
|
-
RPC_LOAD_URL = 2
|
9
|
+
RPC_LOAD_URL = 1
|
10
|
+
RPC_LOAD_HTML = 2
|
15
11
|
RPC_WAIT = 3
|
16
12
|
RPC_FETCH = 4
|
17
|
-
RPC_DISCARD = 5
|
18
13
|
|
19
14
|
class << self
|
20
|
-
attr_reader :rpc_mutex, :rpc_waiters, :rpc_results
|
21
|
-
|
22
15
|
def create
|
23
|
-
|
24
|
-
@last_query_id += 1
|
25
|
-
@last_query_id
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
def load_html(query_id, html)
|
30
|
-
perform_rpc query_id,
|
31
|
-
request: RPC_LOAD_HTML,
|
32
|
-
data: html
|
16
|
+
connect_to_worker
|
33
17
|
end
|
34
18
|
|
35
|
-
def load_url(
|
36
|
-
perform_rpc
|
19
|
+
def load_url(socket, url)
|
20
|
+
perform_rpc socket,
|
37
21
|
request: RPC_LOAD_URL,
|
38
22
|
data: url
|
39
23
|
end
|
40
24
|
|
41
|
-
def
|
42
|
-
|
25
|
+
def load_html(socket, html, baseUrl)
|
26
|
+
html, baseUrl = [html, baseUrl].map(&:to_s)
|
27
|
+
perform_rpc socket,
|
28
|
+
request: RPC_LOAD_HTML,
|
29
|
+
data: [html.length, html, baseUrl.length, baseUrl].pack("Na*Na*")
|
30
|
+
end
|
31
|
+
|
32
|
+
def wait(socket, timeout)
|
33
|
+
result = perform_rpc socket,
|
43
34
|
request: RPC_WAIT,
|
44
35
|
data: [timeout].pack("N"),
|
45
36
|
want_result: true
|
37
|
+
code, = result.unpack("N")
|
38
|
+
|
39
|
+
code == 1 # true
|
46
40
|
end
|
47
41
|
|
48
|
-
def fetch(
|
49
|
-
perform_rpc
|
42
|
+
def fetch(socket)
|
43
|
+
perform_rpc socket,
|
50
44
|
request: RPC_FETCH,
|
51
45
|
want_result: true
|
52
46
|
end
|
53
47
|
|
54
|
-
def
|
55
|
-
|
56
|
-
request: RPC_DISCARD
|
48
|
+
def finalize(socket)
|
49
|
+
socket.close
|
57
50
|
end
|
58
51
|
|
59
52
|
private
|
60
53
|
|
61
|
-
def perform_rpc(
|
54
|
+
def perform_rpc(socket, options={})
|
62
55
|
data = options[:data] || ""
|
63
|
-
|
64
|
-
|
65
|
-
@rpc_mutex.synchronize do
|
66
|
-
if @rpc_thread.nil?
|
67
|
-
@rpc_thread = Standalone::Thread.new(::Thread.current)
|
68
|
-
|
69
|
-
# Some fucko decided not to put any semaphores in Ruby,
|
70
|
-
# _and_ restrict Mutexes to be unlocked only from the thread
|
71
|
-
# which has locked them.
|
72
|
-
#
|
73
|
-
# Please, kill yourself if you're reading this.
|
74
|
-
::Thread.stop
|
75
|
-
end
|
76
|
-
|
77
|
-
@rpc_thread.perform(query_id, options[:request], data)
|
78
|
-
|
79
|
-
if block
|
80
|
-
@rpc_waiters[query_id] = Thread.current
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
if block
|
85
|
-
Thread.stop
|
86
|
-
@rpc_results[query_id]
|
87
|
-
end
|
88
|
-
ensure
|
89
|
-
if block
|
90
|
-
@rpc_waiters.delete query_id
|
91
|
-
@rpc_results.delete query_id
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
class Thread < ::Thread
|
97
|
-
def initialize(creator)
|
98
|
-
@creator = creator
|
56
|
+
socket.write([options[:request], data.length, data].pack("NNa*"))
|
99
57
|
|
100
|
-
|
101
|
-
|
102
|
-
|
58
|
+
if options[:want_result]
|
59
|
+
data_length, = socket.read(4).unpack("N")
|
60
|
+
socket.read(data_length)
|
103
61
|
end
|
104
62
|
end
|
105
63
|
|
106
|
-
def
|
107
|
-
@socket.write([query_id, request, data.length, data].pack("NNNa*"))
|
108
|
-
end
|
109
|
-
|
110
|
-
private
|
111
|
-
|
112
|
-
def work
|
64
|
+
def spawn_worker
|
113
65
|
if ::Sunscraper.os_x?
|
114
66
|
# Fuck you, OS X.
|
115
67
|
suffix = ".app/Contents/MacOS/sunscraper"
|
@@ -121,47 +73,41 @@ module Sunscraper
|
|
121
73
|
'ext', 'standalone', "sunscraper#{suffix}")
|
122
74
|
|
123
75
|
server_path = "/tmp/sunscraper.#{Process.pid}.sock"
|
124
|
-
|
76
|
+
File.unlink server_path if File.exists? server_path
|
125
77
|
|
126
78
|
if Kernel.respond_to? :spawn
|
127
|
-
|
79
|
+
@rpc_pid = Kernel.spawn "#{executable} #{server_path}"
|
128
80
|
else
|
129
81
|
# rbx does not have Kernel.spawn (yet). Sigh...
|
130
|
-
|
82
|
+
@rpc_pid = fork { exec executable, server_path }
|
131
83
|
end
|
132
84
|
|
133
|
-
|
134
|
-
|
135
|
-
|
85
|
+
# Sigh again. Probably no other way.
|
86
|
+
loop do
|
87
|
+
if File.exists? server_path
|
88
|
+
@rpc_socket = server_path
|
89
|
+
break
|
90
|
+
elsif Process.wait(@rpc_pid, Process::WNOHANG)
|
91
|
+
raise RuntimeError, "Cannot start Sunscraper process"
|
92
|
+
end
|
136
93
|
|
137
|
-
|
138
|
-
|
94
|
+
sleep 0.1
|
95
|
+
end
|
139
96
|
|
140
|
-
|
141
|
-
@creator.wakeup
|
97
|
+
Process.detach @rpc_pid
|
142
98
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
data = @socket.read(data_length) if data_length > 0
|
147
|
-
|
148
|
-
@parent.rpc_mutex.synchronize do
|
149
|
-
if !@parent.rpc_waiters.include?(query_id)
|
150
|
-
$stderr.puts "Sunscraper/standalone: no waiter for #{query_id}"
|
151
|
-
else
|
152
|
-
@parent.rpc_results[query_id] = data
|
153
|
-
@parent.rpc_waiters[query_id].wakeup
|
154
|
-
end
|
155
|
-
end
|
99
|
+
at_exit do
|
100
|
+
Process.kill "KILL", @rpc_pid
|
101
|
+
File.unlink @rpc_socket
|
156
102
|
end
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
103
|
+
end
|
104
|
+
|
105
|
+
def connect_to_worker
|
106
|
+
@rpc_mutex.synchronize do
|
107
|
+
spawn_worker if @rpc_socket.nil?
|
161
108
|
end
|
162
|
-
|
163
|
-
@
|
164
|
-
Process.kill pid
|
109
|
+
|
110
|
+
UNIXSocket.new(@rpc_socket)
|
165
111
|
end
|
166
112
|
end
|
167
113
|
end
|
data/spec/sunscraper_spec.rb
CHANGED
@@ -1,15 +1,12 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
-
|
3
2
|
require 'webrick'
|
4
3
|
|
5
|
-
|
4
|
+
HTML_TEMPLATE = <<HTML
|
6
5
|
<html>
|
7
6
|
<head>
|
8
7
|
<script type="text/javascript">
|
9
8
|
document.addEventListener("DOMContentLoaded", function() {
|
10
|
-
|
11
|
-
("!skrow tI").split("").reverse().join("");
|
12
|
-
Sunscraper.finish();
|
9
|
+
%code%
|
13
10
|
}, true);
|
14
11
|
</script>
|
15
12
|
</head>
|
@@ -19,73 +16,118 @@ HTML = <<HTML
|
|
19
16
|
</html>
|
20
17
|
HTML
|
21
18
|
|
22
|
-
|
19
|
+
HTML_FUGA = HTML_TEMPLATE.sub("%code%", <<CODE)
|
20
|
+
document.getElementById('fuga').textContent =
|
21
|
+
("!skrow tI").split("").reverse().join("");
|
22
|
+
Sunscraper.finish();
|
23
|
+
CODE
|
24
|
+
|
25
|
+
HTML_BASEURL = HTML_TEMPLATE.sub("%code%", <<CODE)
|
26
|
+
var xhr = new XMLHttpRequest();
|
27
|
+
xhr.onreadystatechange = function() {
|
28
|
+
if(xhr.readyState > 3) {
|
29
|
+
document.getElementById('fuga').textContent = xhr.responseText;
|
30
|
+
Sunscraper.finish();
|
31
|
+
}
|
32
|
+
};
|
33
|
+
xhr.open('GET', '/comicstrip', 1);
|
34
|
+
xhr.send();
|
35
|
+
CODE
|
36
|
+
|
37
|
+
HTML_USERAGENT = HTML_TEMPLATE.sub("%code%", <<CODE)
|
38
|
+
document.getElementById('fuga').textContent =
|
39
|
+
window.navigator.userAgent;
|
40
|
+
Sunscraper.finish();
|
41
|
+
CODE
|
42
|
+
|
43
|
+
HTML_LOCALSTORAGE = HTML_TEMPLATE.sub("%code%", <<CODE)
|
44
|
+
window.localStorage.setItem("key", ["O", "K"].join(""))
|
45
|
+
document.getElementById('fuga').textContent =
|
46
|
+
window.localStorage.getItem("key");
|
47
|
+
Sunscraper.finish();
|
48
|
+
CODE
|
23
49
|
|
24
|
-
def with_webserver
|
25
|
-
|
50
|
+
def with_webserver(html)
|
51
|
+
port = 45555
|
52
|
+
server = WEBrick::HTTPServer.new :Port => port, :Logger => WEBrick::Log.new('/dev/null'), :AccessLog => []
|
26
53
|
server.mount_proc '/' do |req, res|
|
27
|
-
res.body =
|
54
|
+
res.body = html
|
28
55
|
end
|
29
|
-
|
56
|
+
server.mount_proc '/comicstrip' do |req, res|
|
57
|
+
res.body = 'Go Get a Roomie!'
|
58
|
+
end
|
59
|
+
thread = Thread.new { server.start }
|
30
60
|
|
31
|
-
yield
|
61
|
+
yield "http://127.0.0.1:#{port}/"
|
32
62
|
ensure
|
33
|
-
server.shutdown
|
63
|
+
server.shutdown
|
64
|
+
thread.join
|
34
65
|
end
|
35
66
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
67
|
+
define_tests = lambda do |klass, worker|
|
68
|
+
describe klass do
|
69
|
+
before(:all) do
|
70
|
+
Sunscraper.worker = worker
|
71
|
+
end
|
41
72
|
|
42
|
-
|
43
|
-
|
44
|
-
describe "Sunscraper::Library" do
|
45
|
-
before do
|
46
|
-
Sunscraper.worker = :embed
|
73
|
+
after(:all) do
|
74
|
+
sleep(5) # let threads rest in peace
|
47
75
|
end
|
48
76
|
|
49
77
|
it "can scrape an HTML provided as a string" do
|
50
|
-
Sunscraper.scrape_html(
|
78
|
+
Sunscraper.scrape_html(HTML_FUGA).should include('It works!')
|
51
79
|
end
|
52
80
|
|
53
81
|
it "can scrape an URL" do
|
54
|
-
with_webserver do |
|
55
|
-
Sunscraper.scrape_url(
|
82
|
+
with_webserver(HTML_FUGA) do |url|
|
83
|
+
Sunscraper.scrape_url(url).should include('It works!')
|
56
84
|
end
|
57
85
|
end
|
58
86
|
|
59
87
|
it "should time out if callback is not called" do
|
60
|
-
lambda { Sunscraper.scrape_html("<!-- nothing. at least no callbacks -->",
|
88
|
+
lambda { Sunscraper.scrape_html("<!-- nothing. at least no callbacks -->", "about:blank", 500) }.
|
61
89
|
should raise_exception(Sunscraper::ScrapeTimeout)
|
62
90
|
end
|
63
|
-
end
|
64
|
-
end
|
65
91
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
# work even on jruby master (as of Mar 09, 2012).
|
71
|
-
describe "Sunscraper::Standalone" do
|
72
|
-
before do
|
73
|
-
Sunscraper.worker = :standalone
|
92
|
+
it "respects baseUrl parameter" do
|
93
|
+
with_webserver("<!-- nothing -->") do |url|
|
94
|
+
Sunscraper.scrape_html(HTML_BASEURL, url).should include('Go Get a Roomie')
|
95
|
+
end
|
74
96
|
end
|
75
97
|
|
76
|
-
it "
|
77
|
-
Sunscraper.scrape_html(
|
98
|
+
it "should identify itself as Sunscraper" do
|
99
|
+
Sunscraper.scrape_html(HTML_USERAGENT).should include("Sunscraper")
|
78
100
|
end
|
79
101
|
|
80
|
-
it "
|
81
|
-
with_webserver do |
|
82
|
-
Sunscraper.scrape_url(
|
102
|
+
it "should work with window.localStorage through webserver" do
|
103
|
+
with_webserver(HTML_LOCALSTORAGE) do |url|
|
104
|
+
Sunscraper.scrape_url(url).should include("OK")
|
83
105
|
end
|
84
106
|
end
|
85
107
|
|
86
|
-
it "should
|
87
|
-
|
88
|
-
|
108
|
+
it "should withstand a lot of concurrent threads" do
|
109
|
+
500.times.map {
|
110
|
+
Thread.new {
|
111
|
+
Sunscraper.scrape_html(HTML_FUGA)
|
112
|
+
}
|
113
|
+
}.each(&:join).
|
114
|
+
map(&:value).
|
115
|
+
each { |result|
|
116
|
+
result.should include('It works!')
|
117
|
+
}
|
89
118
|
end
|
90
119
|
end
|
120
|
+
end
|
121
|
+
|
122
|
+
unless Sunscraper.os_x?
|
123
|
+
# This part currently crashes on OS X (and will forever).
|
124
|
+
define_tests.("Sunscraper-Embed", :embed)
|
125
|
+
end
|
126
|
+
|
127
|
+
if !(RUBY_ENGINE =~ /rbx/ || RUBY_ENGINE =~ /jruby/) ||
|
128
|
+
ENV['EXPERIMENTAL'] == 'true'
|
129
|
+
# This part currently crashes Rubinius (as of Mar 09, 2012),
|
130
|
+
# and crashes jruby < 1.7.0, and uses Unix sockets which don't
|
131
|
+
# work even on jruby master (as of Mar 09, 2012).
|
132
|
+
define_tests.("Sunscraper-Standalone", :standalone)
|
91
133
|
end
|