sunscraper 1.1.0.beta3 → 1.2.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/common/common.pro +13 -0
- data/ext/common/libsunscraper_common.a +0 -0
- data/ext/common/sunscraperproxy.cpp +11 -0
- data/ext/{standalone → common}/sunscraperproxy.h +5 -5
- data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
- data/ext/{embed → common}/sunscraperwebpage.h +0 -0
- data/ext/common/sunscraperworker.cpp +124 -0
- data/ext/common/sunscraperworker.h +50 -0
- data/ext/embed/embed.pro +14 -12
- data/ext/embed/sunscraperexternal.cpp +17 -16
- data/ext/embed/sunscraperinterface.cpp +206 -0
- data/ext/embed/sunscraperinterface.h +66 -0
- data/ext/embed/sunscraperlibrary.cpp +2 -12
- data/ext/embed/sunscraperlibrary.h +0 -1
- data/ext/embed/sunscraperthread.cpp +49 -0
- data/ext/embed/sunscraperthread.h +24 -0
- data/ext/extconf.rb +5 -3
- data/ext/standalone/standalone.pro +12 -6
- data/ext/standalone/sunscrapermain.cpp +13 -3
- data/ext/standalone/sunscraperrpc.cpp +76 -88
- data/ext/standalone/sunscraperrpc.h +19 -22
- data/ext/standalone/sunscraperrpcserver.cpp +26 -0
- data/ext/standalone/sunscraperrpcserver.h +24 -0
- data/ext/sunscraper-ext.pro +1 -1
- data/lib/sunscraper.rb +14 -14
- data/lib/sunscraper/library.rb +9 -9
- data/lib/sunscraper/standalone.rb +53 -107
- data/spec/sunscraper_spec.rb +86 -44
- data/sunscraper.gemspec +1 -1
- metadata +19 -17
- data/ext/embed/sunscraper.cpp +0 -92
- data/ext/embed/sunscraper.h +0 -47
- data/ext/embed/sunscraperproxy.cpp +0 -14
- data/ext/embed/sunscraperproxy.h +0 -24
- data/ext/embed/sunscraperworker.cpp +0 -163
- data/ext/embed/sunscraperworker.h +0 -58
- data/ext/standalone/sunscraperproxy.cpp +0 -14
- data/ext/standalone/sunscraperworker.cpp +0 -60
- data/ext/standalone/sunscraperworker.h +0 -34
@@ -7,7 +7,6 @@
|
|
7
7
|
|
8
8
|
class SunscraperWorker;
|
9
9
|
class QLocalSocket;
|
10
|
-
class QTimer;
|
11
10
|
|
12
11
|
class SunscraperRPC : public QObject
|
13
12
|
{
|
@@ -18,47 +17,45 @@ class SunscraperRPC : public QObject
|
|
18
17
|
StateData,
|
19
18
|
};
|
20
19
|
|
21
|
-
struct Header {
|
22
|
-
quint32 queryId;
|
23
|
-
quint32 requestType;
|
24
|
-
quint32 dataLength;
|
25
|
-
};
|
26
|
-
|
27
20
|
enum Request {
|
28
|
-
|
29
|
-
|
21
|
+
RPC_LOAD_URL = 1,
|
22
|
+
RPC_LOAD_HTML = 2,
|
30
23
|
RPC_WAIT = 3,
|
31
24
|
RPC_FETCH = 4,
|
32
|
-
RPC_DISCARD = 5,
|
33
25
|
};
|
34
26
|
|
35
27
|
public:
|
36
|
-
SunscraperRPC(
|
28
|
+
SunscraperRPC(QLocalSocket *socket);
|
37
29
|
~SunscraperRPC();
|
38
30
|
|
31
|
+
signals:
|
32
|
+
void disconnected();
|
33
|
+
|
39
34
|
private slots:
|
40
35
|
void onInputReadable();
|
41
36
|
void onInputDisconnected();
|
42
|
-
|
43
|
-
void
|
37
|
+
|
38
|
+
void onFinish(unsigned queryId);
|
39
|
+
void onTimeout(unsigned queryId);
|
40
|
+
void onFetchDone(unsigned queryId, QString data);
|
44
41
|
|
45
42
|
private:
|
43
|
+
static unsigned m_nextQueryId;
|
44
|
+
static SunscraperWorker *m_worker;
|
45
|
+
|
46
|
+
unsigned m_queryId;
|
46
47
|
QLocalSocket *m_socket;
|
47
48
|
|
48
|
-
State
|
49
|
-
|
49
|
+
State m_state;
|
50
|
+
unsigned m_pendingRequest, m_pendingDataLength;
|
50
51
|
QByteArray m_buffer;
|
51
52
|
|
52
|
-
|
53
|
-
|
54
|
-
QList<unsigned> m_waitQueue;
|
55
|
-
QMap<unsigned, QTimer*> m_timers;
|
56
|
-
QMap<unsigned, QString> m_results;
|
53
|
+
bool m_result;
|
57
54
|
|
58
55
|
SunscraperRPC();
|
59
56
|
|
60
|
-
void processRequest(
|
61
|
-
void sendReply(
|
57
|
+
void processRequest(unsigned requestType, QByteArray data);
|
58
|
+
void sendReply(QByteArray data);
|
62
59
|
};
|
63
60
|
|
64
61
|
#endif // SUNSCRAPERRPC_H
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#include <QLocalServer>
|
2
|
+
#include "sunscraperrpcserver.h"
|
3
|
+
#include "sunscraperrpc.h"
|
4
|
+
|
5
|
+
SunscraperRPCServer::SunscraperRPCServer(QObject *parent) :
|
6
|
+
QObject(parent)
|
7
|
+
{
|
8
|
+
m_localServer = new QLocalServer();
|
9
|
+
|
10
|
+
connect(m_localServer, SIGNAL(newConnection()), this, SLOT(onNewConnection()));
|
11
|
+
}
|
12
|
+
|
13
|
+
bool SunscraperRPCServer::listen(QString socketPath)
|
14
|
+
{
|
15
|
+
return m_localServer->listen(socketPath);
|
16
|
+
}
|
17
|
+
|
18
|
+
void SunscraperRPCServer::onNewConnection()
|
19
|
+
{
|
20
|
+
while(m_localServer->hasPendingConnections()) {
|
21
|
+
QLocalSocket *socket = m_localServer->nextPendingConnection();
|
22
|
+
|
23
|
+
SunscraperRPC *rpc = new SunscraperRPC(socket);
|
24
|
+
connect(rpc, SIGNAL(disconnected()), rpc, SLOT(deleteLater()));
|
25
|
+
}
|
26
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#ifndef SUNSCRAPERRPCSERVER_H
|
2
|
+
#define SUNSCRAPERRPCSERVER_H
|
3
|
+
|
4
|
+
#include <QObject>
|
5
|
+
|
6
|
+
class QLocalServer;
|
7
|
+
|
8
|
+
class SunscraperRPCServer : public QObject
|
9
|
+
{
|
10
|
+
Q_OBJECT
|
11
|
+
|
12
|
+
public:
|
13
|
+
SunscraperRPCServer(QObject *parent = 0);
|
14
|
+
|
15
|
+
bool listen(QString socketPath);
|
16
|
+
|
17
|
+
private slots:
|
18
|
+
void onNewConnection();
|
19
|
+
|
20
|
+
private:
|
21
|
+
QLocalServer *m_localServer;
|
22
|
+
};
|
23
|
+
|
24
|
+
#endif
|
data/ext/sunscraper-ext.pro
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
TEMPLATE = subdirs
|
2
|
-
SUBDIRS = embed standalone
|
2
|
+
SUBDIRS = common embed standalone
|
data/lib/sunscraper.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
if
|
2
|
-
raise RuntimeError, "Sunscraper does not work on Ruby
|
1
|
+
if RUBY_VERSION =~ /^1.8/
|
2
|
+
raise RuntimeError, "Sunscraper does not work on Ruby 1.8."
|
3
3
|
end
|
4
4
|
|
5
5
|
# Sunscraper loads an HTML page in a headless browser and waits for `Sunscraper.finish()`
|
@@ -27,9 +27,9 @@ module Sunscraper
|
|
27
27
|
# If your application depends on base URL being available, use {scrape_url}.
|
28
28
|
#
|
29
29
|
# @param [Integer] timeout timeout in milliseconds
|
30
|
-
def scrape_html(html, timeout=5000)
|
30
|
+
def scrape_html(html, url="about:blank", timeout=5000)
|
31
31
|
scrape(timeout) do |worker, context|
|
32
|
-
worker.load_html context, html
|
32
|
+
worker.load_html context, html, url
|
33
33
|
end
|
34
34
|
end
|
35
35
|
|
@@ -47,19 +47,19 @@ module Sunscraper
|
|
47
47
|
def scrape(timeout)
|
48
48
|
worker = load_worker
|
49
49
|
|
50
|
-
|
51
|
-
|
52
|
-
worker.wait(context, timeout)
|
50
|
+
begin
|
51
|
+
context = worker.create
|
53
52
|
|
54
|
-
|
53
|
+
yield worker, context
|
55
54
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
55
|
+
if worker.wait(context, timeout)
|
56
|
+
worker.fetch(context)
|
57
|
+
else
|
58
|
+
raise ScrapeTimeout, "Sunscraper has timed out waiting for the callback"
|
59
|
+
end
|
60
|
+
ensure
|
61
|
+
worker.finalize(context) if context
|
60
62
|
end
|
61
|
-
ensure
|
62
|
-
worker.discard(context) if context
|
63
63
|
end
|
64
64
|
|
65
65
|
def load_worker
|
data/lib/sunscraper/library.rb
CHANGED
@@ -19,23 +19,23 @@ module Sunscraper
|
|
19
19
|
ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
|
20
20
|
'ext', 'embed', "libsunscraper.#{extension}")
|
21
21
|
|
22
|
-
attach_function 'create', :sunscraper_create, [],
|
23
|
-
attach_function 'load_html', :sunscraper_load_html, [:
|
24
|
-
attach_function 'load_url', :sunscraper_load_url, [:
|
25
|
-
attach_function 'fetch', :sunscraper_fetch, [:
|
26
|
-
attach_function '
|
22
|
+
attach_function 'create', :sunscraper_create, [], :uint
|
23
|
+
attach_function 'load_html', :sunscraper_load_html, [:uint, :string, :string], :void
|
24
|
+
attach_function 'load_url', :sunscraper_load_url, [:uint, :string], :void
|
25
|
+
attach_function 'fetch', :sunscraper_fetch, [:uint], :string
|
26
|
+
attach_function 'finalize', :sunscraper_finalize, [:uint], :void
|
27
27
|
|
28
28
|
if RUBY_ENGINE == 'ruby'
|
29
29
|
# MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
|
30
|
-
attach_function 'wait', :sunscraper_wait, [:
|
30
|
+
attach_function 'wait', :sunscraper_wait, [:uint, :uint], :bool, :blocking => true
|
31
31
|
else
|
32
32
|
# Rubinius does not have GVL neither it has options in attach_function.
|
33
33
|
# Same for JRuby.
|
34
|
-
attach_function 'wait', :sunscraper_wait, [:
|
34
|
+
attach_function 'wait', :sunscraper_wait, [:uint, :uint], :bool
|
35
35
|
end
|
36
36
|
|
37
|
-
attach_function '
|
37
|
+
attach_function 'quit', :sunscraper_quit, [], :void
|
38
38
|
|
39
|
-
at_exit {
|
39
|
+
at_exit { quit }
|
40
40
|
end
|
41
41
|
end
|
@@ -3,113 +3,65 @@ require 'socket'
|
|
3
3
|
# @private
|
4
4
|
module Sunscraper
|
5
5
|
module Standalone
|
6
|
-
@
|
6
|
+
@rpc_mutex = Mutex.new
|
7
|
+
@rpc_socket = nil
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
@rpc_results = {}
|
11
|
-
@rpc_thread = nil
|
12
|
-
|
13
|
-
RPC_LOAD_HTML = 1
|
14
|
-
RPC_LOAD_URL = 2
|
9
|
+
RPC_LOAD_URL = 1
|
10
|
+
RPC_LOAD_HTML = 2
|
15
11
|
RPC_WAIT = 3
|
16
12
|
RPC_FETCH = 4
|
17
|
-
RPC_DISCARD = 5
|
18
13
|
|
19
14
|
class << self
|
20
|
-
attr_reader :rpc_mutex, :rpc_waiters, :rpc_results
|
21
|
-
|
22
15
|
def create
|
23
|
-
|
24
|
-
@last_query_id += 1
|
25
|
-
@last_query_id
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
def load_html(query_id, html)
|
30
|
-
perform_rpc query_id,
|
31
|
-
request: RPC_LOAD_HTML,
|
32
|
-
data: html
|
16
|
+
connect_to_worker
|
33
17
|
end
|
34
18
|
|
35
|
-
def load_url(
|
36
|
-
perform_rpc
|
19
|
+
def load_url(socket, url)
|
20
|
+
perform_rpc socket,
|
37
21
|
request: RPC_LOAD_URL,
|
38
22
|
data: url
|
39
23
|
end
|
40
24
|
|
41
|
-
def
|
42
|
-
|
25
|
+
def load_html(socket, html, baseUrl)
|
26
|
+
html, baseUrl = [html, baseUrl].map(&:to_s)
|
27
|
+
perform_rpc socket,
|
28
|
+
request: RPC_LOAD_HTML,
|
29
|
+
data: [html.length, html, baseUrl.length, baseUrl].pack("Na*Na*")
|
30
|
+
end
|
31
|
+
|
32
|
+
def wait(socket, timeout)
|
33
|
+
result = perform_rpc socket,
|
43
34
|
request: RPC_WAIT,
|
44
35
|
data: [timeout].pack("N"),
|
45
36
|
want_result: true
|
37
|
+
code, = result.unpack("N")
|
38
|
+
|
39
|
+
code == 1 # true
|
46
40
|
end
|
47
41
|
|
48
|
-
def fetch(
|
49
|
-
perform_rpc
|
42
|
+
def fetch(socket)
|
43
|
+
perform_rpc socket,
|
50
44
|
request: RPC_FETCH,
|
51
45
|
want_result: true
|
52
46
|
end
|
53
47
|
|
54
|
-
def
|
55
|
-
|
56
|
-
request: RPC_DISCARD
|
48
|
+
def finalize(socket)
|
49
|
+
socket.close
|
57
50
|
end
|
58
51
|
|
59
52
|
private
|
60
53
|
|
61
|
-
def perform_rpc(
|
54
|
+
def perform_rpc(socket, options={})
|
62
55
|
data = options[:data] || ""
|
63
|
-
|
64
|
-
|
65
|
-
@rpc_mutex.synchronize do
|
66
|
-
if @rpc_thread.nil?
|
67
|
-
@rpc_thread = Standalone::Thread.new(::Thread.current)
|
68
|
-
|
69
|
-
# Some fucko decided not to put any semaphores in Ruby,
|
70
|
-
# _and_ restrict Mutexes to be unlocked only from the thread
|
71
|
-
# which has locked them.
|
72
|
-
#
|
73
|
-
# Please, kill yourself if you're reading this.
|
74
|
-
::Thread.stop
|
75
|
-
end
|
76
|
-
|
77
|
-
@rpc_thread.perform(query_id, options[:request], data)
|
78
|
-
|
79
|
-
if block
|
80
|
-
@rpc_waiters[query_id] = Thread.current
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
if block
|
85
|
-
Thread.stop
|
86
|
-
@rpc_results[query_id]
|
87
|
-
end
|
88
|
-
ensure
|
89
|
-
if block
|
90
|
-
@rpc_waiters.delete query_id
|
91
|
-
@rpc_results.delete query_id
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
class Thread < ::Thread
|
97
|
-
def initialize(creator)
|
98
|
-
@creator = creator
|
56
|
+
socket.write([options[:request], data.length, data].pack("NNa*"))
|
99
57
|
|
100
|
-
|
101
|
-
|
102
|
-
|
58
|
+
if options[:want_result]
|
59
|
+
data_length, = socket.read(4).unpack("N")
|
60
|
+
socket.read(data_length)
|
103
61
|
end
|
104
62
|
end
|
105
63
|
|
106
|
-
def
|
107
|
-
@socket.write([query_id, request, data.length, data].pack("NNNa*"))
|
108
|
-
end
|
109
|
-
|
110
|
-
private
|
111
|
-
|
112
|
-
def work
|
64
|
+
def spawn_worker
|
113
65
|
if ::Sunscraper.os_x?
|
114
66
|
# Fuck you, OS X.
|
115
67
|
suffix = ".app/Contents/MacOS/sunscraper"
|
@@ -121,47 +73,41 @@ module Sunscraper
|
|
121
73
|
'ext', 'standalone', "sunscraper#{suffix}")
|
122
74
|
|
123
75
|
server_path = "/tmp/sunscraper.#{Process.pid}.sock"
|
124
|
-
|
76
|
+
File.unlink server_path if File.exists? server_path
|
125
77
|
|
126
78
|
if Kernel.respond_to? :spawn
|
127
|
-
|
79
|
+
@rpc_pid = Kernel.spawn "#{executable} #{server_path}"
|
128
80
|
else
|
129
81
|
# rbx does not have Kernel.spawn (yet). Sigh...
|
130
|
-
|
82
|
+
@rpc_pid = fork { exec executable, server_path }
|
131
83
|
end
|
132
84
|
|
133
|
-
|
134
|
-
|
135
|
-
|
85
|
+
# Sigh again. Probably no other way.
|
86
|
+
loop do
|
87
|
+
if File.exists? server_path
|
88
|
+
@rpc_socket = server_path
|
89
|
+
break
|
90
|
+
elsif Process.wait(@rpc_pid, Process::WNOHANG)
|
91
|
+
raise RuntimeError, "Cannot start Sunscraper process"
|
92
|
+
end
|
136
93
|
|
137
|
-
|
138
|
-
|
94
|
+
sleep 0.1
|
95
|
+
end
|
139
96
|
|
140
|
-
|
141
|
-
@creator.wakeup
|
97
|
+
Process.detach @rpc_pid
|
142
98
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
data = @socket.read(data_length) if data_length > 0
|
147
|
-
|
148
|
-
@parent.rpc_mutex.synchronize do
|
149
|
-
if !@parent.rpc_waiters.include?(query_id)
|
150
|
-
$stderr.puts "Sunscraper/standalone: no waiter for #{query_id}"
|
151
|
-
else
|
152
|
-
@parent.rpc_results[query_id] = data
|
153
|
-
@parent.rpc_waiters[query_id].wakeup
|
154
|
-
end
|
155
|
-
end
|
99
|
+
at_exit do
|
100
|
+
Process.kill "KILL", @rpc_pid
|
101
|
+
File.unlink @rpc_socket
|
156
102
|
end
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
103
|
+
end
|
104
|
+
|
105
|
+
def connect_to_worker
|
106
|
+
@rpc_mutex.synchronize do
|
107
|
+
spawn_worker if @rpc_socket.nil?
|
161
108
|
end
|
162
|
-
|
163
|
-
@
|
164
|
-
Process.kill pid
|
109
|
+
|
110
|
+
UNIXSocket.new(@rpc_socket)
|
165
111
|
end
|
166
112
|
end
|
167
113
|
end
|
data/spec/sunscraper_spec.rb
CHANGED
@@ -1,15 +1,12 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
-
|
3
2
|
require 'webrick'
|
4
3
|
|
5
|
-
|
4
|
+
HTML_TEMPLATE = <<HTML
|
6
5
|
<html>
|
7
6
|
<head>
|
8
7
|
<script type="text/javascript">
|
9
8
|
document.addEventListener("DOMContentLoaded", function() {
|
10
|
-
|
11
|
-
("!skrow tI").split("").reverse().join("");
|
12
|
-
Sunscraper.finish();
|
9
|
+
%code%
|
13
10
|
}, true);
|
14
11
|
</script>
|
15
12
|
</head>
|
@@ -19,73 +16,118 @@ HTML = <<HTML
|
|
19
16
|
</html>
|
20
17
|
HTML
|
21
18
|
|
22
|
-
|
19
|
+
HTML_FUGA = HTML_TEMPLATE.sub("%code%", <<CODE)
|
20
|
+
document.getElementById('fuga').textContent =
|
21
|
+
("!skrow tI").split("").reverse().join("");
|
22
|
+
Sunscraper.finish();
|
23
|
+
CODE
|
24
|
+
|
25
|
+
HTML_BASEURL = HTML_TEMPLATE.sub("%code%", <<CODE)
|
26
|
+
var xhr = new XMLHttpRequest();
|
27
|
+
xhr.onreadystatechange = function() {
|
28
|
+
if(xhr.readyState > 3) {
|
29
|
+
document.getElementById('fuga').textContent = xhr.responseText;
|
30
|
+
Sunscraper.finish();
|
31
|
+
}
|
32
|
+
};
|
33
|
+
xhr.open('GET', '/comicstrip', 1);
|
34
|
+
xhr.send();
|
35
|
+
CODE
|
36
|
+
|
37
|
+
HTML_USERAGENT = HTML_TEMPLATE.sub("%code%", <<CODE)
|
38
|
+
document.getElementById('fuga').textContent =
|
39
|
+
window.navigator.userAgent;
|
40
|
+
Sunscraper.finish();
|
41
|
+
CODE
|
42
|
+
|
43
|
+
HTML_LOCALSTORAGE = HTML_TEMPLATE.sub("%code%", <<CODE)
|
44
|
+
window.localStorage.setItem("key", ["O", "K"].join(""))
|
45
|
+
document.getElementById('fuga').textContent =
|
46
|
+
window.localStorage.getItem("key");
|
47
|
+
Sunscraper.finish();
|
48
|
+
CODE
|
23
49
|
|
24
|
-
def with_webserver
|
25
|
-
|
50
|
+
def with_webserver(html)
|
51
|
+
port = 45555
|
52
|
+
server = WEBrick::HTTPServer.new :Port => port, :Logger => WEBrick::Log.new('/dev/null'), :AccessLog => []
|
26
53
|
server.mount_proc '/' do |req, res|
|
27
|
-
res.body =
|
54
|
+
res.body = html
|
28
55
|
end
|
29
|
-
|
56
|
+
server.mount_proc '/comicstrip' do |req, res|
|
57
|
+
res.body = 'Go Get a Roomie!'
|
58
|
+
end
|
59
|
+
thread = Thread.new { server.start }
|
30
60
|
|
31
|
-
yield
|
61
|
+
yield "http://127.0.0.1:#{port}/"
|
32
62
|
ensure
|
33
|
-
server.shutdown
|
63
|
+
server.shutdown
|
64
|
+
thread.join
|
34
65
|
end
|
35
66
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
67
|
+
define_tests = lambda do |klass, worker|
|
68
|
+
describe klass do
|
69
|
+
before(:all) do
|
70
|
+
Sunscraper.worker = worker
|
71
|
+
end
|
41
72
|
|
42
|
-
|
43
|
-
|
44
|
-
describe "Sunscraper::Library" do
|
45
|
-
before do
|
46
|
-
Sunscraper.worker = :embed
|
73
|
+
after(:all) do
|
74
|
+
sleep(5) # let threads rest in peace
|
47
75
|
end
|
48
76
|
|
49
77
|
it "can scrape an HTML provided as a string" do
|
50
|
-
Sunscraper.scrape_html(
|
78
|
+
Sunscraper.scrape_html(HTML_FUGA).should include('It works!')
|
51
79
|
end
|
52
80
|
|
53
81
|
it "can scrape an URL" do
|
54
|
-
with_webserver do |
|
55
|
-
Sunscraper.scrape_url(
|
82
|
+
with_webserver(HTML_FUGA) do |url|
|
83
|
+
Sunscraper.scrape_url(url).should include('It works!')
|
56
84
|
end
|
57
85
|
end
|
58
86
|
|
59
87
|
it "should time out if callback is not called" do
|
60
|
-
lambda { Sunscraper.scrape_html("<!-- nothing. at least no callbacks -->",
|
88
|
+
lambda { Sunscraper.scrape_html("<!-- nothing. at least no callbacks -->", "about:blank", 500) }.
|
61
89
|
should raise_exception(Sunscraper::ScrapeTimeout)
|
62
90
|
end
|
63
|
-
end
|
64
|
-
end
|
65
91
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
# work even on jruby master (as of Mar 09, 2012).
|
71
|
-
describe "Sunscraper::Standalone" do
|
72
|
-
before do
|
73
|
-
Sunscraper.worker = :standalone
|
92
|
+
it "respects baseUrl parameter" do
|
93
|
+
with_webserver("<!-- nothing -->") do |url|
|
94
|
+
Sunscraper.scrape_html(HTML_BASEURL, url).should include('Go Get a Roomie')
|
95
|
+
end
|
74
96
|
end
|
75
97
|
|
76
|
-
it "
|
77
|
-
Sunscraper.scrape_html(
|
98
|
+
it "should identify itself as Sunscraper" do
|
99
|
+
Sunscraper.scrape_html(HTML_USERAGENT).should include("Sunscraper")
|
78
100
|
end
|
79
101
|
|
80
|
-
it "
|
81
|
-
with_webserver do |
|
82
|
-
Sunscraper.scrape_url(
|
102
|
+
it "should work with window.localStorage through webserver" do
|
103
|
+
with_webserver(HTML_LOCALSTORAGE) do |url|
|
104
|
+
Sunscraper.scrape_url(url).should include("OK")
|
83
105
|
end
|
84
106
|
end
|
85
107
|
|
86
|
-
it "should
|
87
|
-
|
88
|
-
|
108
|
+
it "should withstand a lot of concurrent threads" do
|
109
|
+
500.times.map {
|
110
|
+
Thread.new {
|
111
|
+
Sunscraper.scrape_html(HTML_FUGA)
|
112
|
+
}
|
113
|
+
}.each(&:join).
|
114
|
+
map(&:value).
|
115
|
+
each { |result|
|
116
|
+
result.should include('It works!')
|
117
|
+
}
|
89
118
|
end
|
90
119
|
end
|
120
|
+
end
|
121
|
+
|
122
|
+
unless Sunscraper.os_x?
|
123
|
+
# This part currently crashes on OS X (and will forever).
|
124
|
+
define_tests.("Sunscraper-Embed", :embed)
|
125
|
+
end
|
126
|
+
|
127
|
+
if !(RUBY_ENGINE =~ /rbx/ || RUBY_ENGINE =~ /jruby/) ||
|
128
|
+
ENV['EXPERIMENTAL'] == 'true'
|
129
|
+
# This part currently crashes Rubinius (as of Mar 09, 2012),
|
130
|
+
# and crashes jruby < 1.7.0, and uses Unix sockets which don't
|
131
|
+
# work even on jruby master (as of Mar 09, 2012).
|
132
|
+
define_tests.("Sunscraper-Standalone", :standalone)
|
91
133
|
end
|