sunscraper 1.0.0 → 1.1.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +28 -0
- data/README.md +42 -20
- data/ext/.gitignore +5 -1
- data/ext/embed/sunscraper.cpp +92 -0
- data/ext/{sunscraper.h → embed/sunscraper.h} +9 -7
- data/ext/{sunscraper.pro → embed/sunscraper.pro} +2 -0
- data/ext/embed/sunscraperexternal.cpp +39 -0
- data/ext/{sunscraperlibrary.cpp → embed/sunscraperlibrary.cpp} +4 -9
- data/ext/{sunscraperlibrary.h → embed/sunscraperlibrary.h} +1 -5
- data/ext/embed/sunscraperproxy.cpp +14 -0
- data/ext/{sunscraperproxy.h → embed/sunscraperproxy.h} +3 -3
- data/ext/embed/sunscraperthread.cpp +148 -0
- data/ext/embed/sunscraperthread.h +54 -0
- data/ext/extconf.rb +13 -3
- data/ext/standalone/sunscraper.pro +13 -0
- data/ext/standalone/sunscrapermain.cpp +13 -0
- data/ext/{sunscraperproxy.cpp → standalone/sunscraperproxy.cpp} +2 -2
- data/ext/standalone/sunscraperproxy.h +24 -0
- data/ext/standalone/sunscraperrpc.cpp +183 -0
- data/ext/standalone/sunscraperrpc.h +64 -0
- data/ext/{sunscraperthread.cpp → standalone/sunscraperworker.cpp} +9 -18
- data/ext/{sunscraperthread.h → standalone/sunscraperworker.h} +8 -8
- data/lib/sunscraper/library.rb +33 -29
- data/lib/sunscraper/standalone.rb +168 -0
- data/lib/sunscraper.rb +48 -11
- data/spec/sunscraper_spec.rb +59 -13
- data/sunscraper.gemspec +2 -2
- metadata +58 -75
- data/ext/Makefile +0 -270
- data/ext/sunscraper.cpp +0 -86
- data/ext/sunscraperexternal.cpp +0 -33
@@ -0,0 +1,183 @@
|
|
1
|
+
#include <QLocalSocket>
|
2
|
+
#include <QTimer>
|
3
|
+
#include <QDataStream>
|
4
|
+
#include <QApplication>
|
5
|
+
#include <QtDebug>
|
6
|
+
#include <arpa/inet.h>
|
7
|
+
#include "sunscraperrpc.h"
|
8
|
+
#include "sunscraperworker.h"
|
9
|
+
|
10
|
+
/*
 * Opens the local socket named by socketPath and creates the page worker.
 *
 * The parser starts in StateHeader, i.e. the first bytes received are
 * interpreted as a Header. Both the socket and the worker are created with
 * this object as their QObject parent.
 */
SunscraperRPC::SunscraperRPC(QString socketPath) :
    m_state(StateHeader)
{
    /* Transport: incoming bytes and disconnects are routed to our slots. */
    QLocalSocket *socket = new QLocalSocket(this);
    m_socket = socket;
    socket->connectToServer(socketPath);
    connect(socket, SIGNAL(readyRead()), this, SLOT(onInputReadable()));
    connect(socket, SIGNAL(disconnected()), this, SLOT(onInputDisconnected()));

    /* Worker: finished(queryId, result) is delivered to onPageRendered(). */
    SunscraperWorker *worker = new SunscraperWorker(this);
    m_worker = worker;
    connect(worker, SIGNAL(finished(uint,QString)), this, SLOT(onPageRendered(uint,QString)));
}
|
21
|
+
|
22
|
+
/*
 * Destroys the page worker explicitly, before the QObject base class
 * destructor gets to clean up the remaining children of this object.
 */
SunscraperRPC::~SunscraperRPC()
{
    SunscraperWorker *worker = m_worker;
    delete worker;
}
|
26
|
+
|
27
|
+
void SunscraperRPC::onInputReadable()
|
28
|
+
{
|
29
|
+
m_buffer += m_socket->readAll();
|
30
|
+
|
31
|
+
bool moreData = true;
|
32
|
+
while(moreData) {
|
33
|
+
switch(m_state) {
|
34
|
+
case StateHeader:
|
35
|
+
if((unsigned) m_buffer.length() >= sizeof(Header)) {
|
36
|
+
memcpy((void*) &m_pendingHeader, m_buffer.constData(), sizeof(Header));
|
37
|
+
m_buffer.remove(0, sizeof(Header));
|
38
|
+
m_state = StateData;
|
39
|
+
} else {
|
40
|
+
moreData = false;
|
41
|
+
}
|
42
|
+
|
43
|
+
break;
|
44
|
+
|
45
|
+
case StateData:
|
46
|
+
unsigned length = ntohl(m_pendingHeader.dataLength);
|
47
|
+
|
48
|
+
if((unsigned) m_buffer.length() >= length) {
|
49
|
+
QByteArray data = m_buffer.left(length);
|
50
|
+
m_buffer.remove(0, length);
|
51
|
+
processRequest(m_pendingHeader, data);
|
52
|
+
m_state = StateHeader;
|
53
|
+
} else {
|
54
|
+
moreData = false;
|
55
|
+
}
|
56
|
+
|
57
|
+
break;
|
58
|
+
}
|
59
|
+
}
|
60
|
+
}
|
61
|
+
|
62
|
+
void SunscraperRPC::onInputDisconnected()
|
63
|
+
{
|
64
|
+
/* Magic value. */
|
65
|
+
QApplication::exit(42);
|
66
|
+
}
|
67
|
+
|
68
|
+
/*
 * Dispatches one decoded request frame.
 *
 * header - the frame header, still in network byte order.
 * data   - the frame payload; its meaning depends on the request type:
 *            RPC_LOAD_HTML  payload handed to SunscraperWorker::loadHtml()
 *            RPC_LOAD_URL   payload handed to SunscraperWorker::loadUrl()
 *            RPC_WAIT       timeout in milliseconds, read with QDataStream
 *            RPC_FETCH      ignored
 *            RPC_DISCARD    ignored
 *
 * RPC_WAIT is answered immediately when a result is already stored;
 * otherwise the query is queued and a single-shot timer is armed.
 * RPC_FETCH answers with the stored result or the "!SUNSCRAPER_TIMEOUT"
 * marker. RPC_DISCARD drops all state kept for the query.
 *
 * NOTE(review): the payload of RPC_LOAD_HTML/RPC_LOAD_URL is converted from
 * QByteArray to QString implicitly, and results are sent back with
 * toLocal8Bit(); confirm that both ends agree on the text encoding.
 */
void SunscraperRPC::processRequest(Header header, QByteArray data)
{
    const unsigned queryId     = ntohl(header.queryId);
    const unsigned requestType = ntohl(header.requestType);

    switch(requestType) {
    case RPC_LOAD_HTML: {
        m_worker->loadHtml(queryId, data);

        break;
    }

    case RPC_LOAD_URL: {
        m_worker->loadUrl(queryId, data);

        break;
    }

    case RPC_WAIT: {
        if(m_results.contains(queryId)) {
            /* Value-initialized so that no indeterminate field is copied. */
            Header reply = Header();
            reply.queryId     = htonl(queryId);
            reply.requestType = htonl(RPC_WAIT);

            sendReply(reply, QByteArray());
        } else {
            Q_ASSERT(!m_waitQueue.contains(queryId));
            Q_ASSERT(!m_timers.contains(queryId));

            /*
             * Q_ASSERT compiles to nothing in release builds, so a repeated
             * RPC_WAIT must not queue the id twice or leak the old timer.
             */
            if(!m_waitQueue.contains(queryId))
                m_waitQueue.append(queryId);

            /* take() yields a null pointer when no timer was registered. */
            delete m_timers.take(queryId);

            unsigned timeout = 0;

            QDataStream stream(data);
            stream >> timeout;

            QTimer *timer = new QTimer(this);
            timer->setInterval(timeout);
            timer->setSingleShot(true);
            /* Connect before starting so the slot is in place first. */
            connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
            timer->start();

            m_timers[queryId] = timer;
        }

        break;
    }

    case RPC_FETCH: {
        Header reply = Header();
        reply.queryId     = htonl(queryId);
        reply.requestType = htonl(RPC_FETCH);

        if(m_results.contains(queryId)) {
            sendReply(reply, m_results[queryId].toLocal8Bit());
        } else {
            sendReply(reply, "!SUNSCRAPER_TIMEOUT");
        }

        break;
    }

    case RPC_DISCARD: {
        m_results.remove(queryId);
        m_waitQueue.removeAll(queryId);

        if(m_timers.contains(queryId)) {
            QTimer *timer = m_timers[queryId];
            delete timer;

            m_timers.remove(queryId);
        }

        m_worker->finalize(queryId);

        break;
    }

    default: {
        /* Previously dropped without a trace; make protocol errors visible. */
        qWarning("Sunscraper: ignoring unknown RPC request type %u (query %u)",
                 requestType, queryId);

        break;
    }
    }
}
|
149
|
+
|
150
|
+
void SunscraperRPC::onPageRendered(unsigned queryId, QString data)
|
151
|
+
{
|
152
|
+
m_results[queryId] = data;
|
153
|
+
|
154
|
+
if(m_waitQueue.contains(queryId)) {
|
155
|
+
Header reply;
|
156
|
+
reply.queryId = htonl(queryId);
|
157
|
+
reply.requestType = htonl(RPC_WAIT);
|
158
|
+
|
159
|
+
sendReply(reply, QByteArray());
|
160
|
+
}
|
161
|
+
}
|
162
|
+
|
163
|
+
void SunscraperRPC::onTimeout()
|
164
|
+
{
|
165
|
+
QTimer *timer = static_cast<QTimer*>(QObject::sender());
|
166
|
+
unsigned queryId = m_timers.key(timer);
|
167
|
+
|
168
|
+
Header reply;
|
169
|
+
reply.queryId = htonl(queryId);
|
170
|
+
reply.requestType = htonl(RPC_WAIT);
|
171
|
+
|
172
|
+
sendReply(reply, QByteArray());
|
173
|
+
}
|
174
|
+
|
175
|
+
/*
 * Writes one reply frame to the socket: the header followed by the payload.
 *
 * header - queryId and requestType must already be in network byte order;
 *          dataLength is filled in here from the payload size.
 * data   - the payload; may be empty.
 */
void SunscraperRPC::sendReply(Header header, QByteArray data)
{
    header.dataLength = htonl(data.length());

    /* Build the whole frame first so it goes out in a single write(). */
    QByteArray frame((const char*) &header, sizeof(Header));
    frame += data;

    m_socket->write(frame);
}
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#ifndef SUNSCRAPERRPC_H
|
2
|
+
#define SUNSCRAPERRPC_H
|
3
|
+
|
4
|
+
#include <QObject>
|
5
|
+
#include <QVector>
|
6
|
+
#include <QMap>
|
7
|
+
|
8
|
+
class SunscraperWorker;
|
9
|
+
class QLocalSocket;
|
10
|
+
class QTimer;
|
11
|
+
|
12
|
+
class SunscraperRPC : public QObject
|
13
|
+
{
|
14
|
+
Q_OBJECT
|
15
|
+
|
16
|
+
enum State {
|
17
|
+
StateHeader = 0,
|
18
|
+
StateData,
|
19
|
+
};
|
20
|
+
|
21
|
+
struct Header {
|
22
|
+
quint32 queryId;
|
23
|
+
quint32 requestType;
|
24
|
+
quint32 dataLength;
|
25
|
+
};
|
26
|
+
|
27
|
+
enum Request {
|
28
|
+
RPC_LOAD_HTML = 1,
|
29
|
+
RPC_LOAD_URL = 2,
|
30
|
+
RPC_WAIT = 3,
|
31
|
+
RPC_FETCH = 4,
|
32
|
+
RPC_DISCARD = 5,
|
33
|
+
};
|
34
|
+
|
35
|
+
public:
|
36
|
+
SunscraperRPC(QString socketPath);
|
37
|
+
~SunscraperRPC();
|
38
|
+
|
39
|
+
private slots:
|
40
|
+
void onInputReadable();
|
41
|
+
void onInputDisconnected();
|
42
|
+
void onPageRendered(unsigned queryId, QString data);
|
43
|
+
void onTimeout();
|
44
|
+
|
45
|
+
private:
|
46
|
+
QLocalSocket *m_socket;
|
47
|
+
|
48
|
+
State m_state;
|
49
|
+
Header m_pendingHeader;
|
50
|
+
QByteArray m_buffer;
|
51
|
+
|
52
|
+
SunscraperWorker *m_worker;
|
53
|
+
|
54
|
+
QList<unsigned> m_waitQueue;
|
55
|
+
QMap<unsigned, QTimer*> m_timers;
|
56
|
+
QMap<unsigned, QString> m_results;
|
57
|
+
|
58
|
+
SunscraperRPC();
|
59
|
+
|
60
|
+
void processRequest(Header header, QByteArray data);
|
61
|
+
void sendReply(Header header, QByteArray data);
|
62
|
+
};
|
63
|
+
|
64
|
+
#endif // SUNSCRAPERRPC_H
|
@@ -1,37 +1,28 @@
|
|
1
1
|
#include <QApplication>
|
2
2
|
#include <QWebPage>
|
3
3
|
#include <QWebFrame>
|
4
|
-
#include "
|
4
|
+
#include "sunscraperworker.h"
|
5
5
|
#include "sunscraperproxy.h"
|
6
|
+
#include <QtDebug>
|
6
7
|
|
7
|
-
|
8
|
+
SunscraperWorker::SunscraperWorker(QObject *parent) :
|
9
|
+
QObject(parent)
|
8
10
|
{
|
9
11
|
}
|
10
12
|
|
11
|
-
void
|
12
|
-
{
|
13
|
-
static int argc;
|
14
|
-
static char **argv = {NULL};
|
15
|
-
|
16
|
-
QApplication app(argc, argv);
|
17
|
-
app.exec();
|
18
|
-
|
19
|
-
qFatal("Sunscraper apartment thread event loop should never end");
|
20
|
-
}
|
21
|
-
|
22
|
-
void SunscraperThread::loadHtml(unsigned queryId, QString html)
|
13
|
+
void SunscraperWorker::loadHtml(unsigned queryId, QString html)
|
23
14
|
{
|
24
15
|
QWebPage *webPage = initializeWebPage(queryId);
|
25
16
|
webPage->mainFrame()->setHtml(html);
|
26
17
|
}
|
27
18
|
|
28
|
-
void
|
19
|
+
void SunscraperWorker::loadUrl(unsigned queryId, QString url)
|
29
20
|
{
|
30
21
|
QWebPage *webPage = initializeWebPage(queryId);
|
31
22
|
webPage->mainFrame()->load(url);
|
32
23
|
}
|
33
24
|
|
34
|
-
void
|
25
|
+
void SunscraperWorker::finalize(unsigned queryId)
|
35
26
|
{
|
36
27
|
Q_ASSERT(_webPages[queryId] != NULL);
|
37
28
|
|
@@ -39,7 +30,7 @@ void SunscraperThread::finalize(unsigned queryId)
|
|
39
30
|
_webPages.remove(queryId);
|
40
31
|
}
|
41
32
|
|
42
|
-
QWebPage *
|
33
|
+
QWebPage *SunscraperWorker::initializeWebPage(unsigned queryId)
|
43
34
|
{
|
44
35
|
Q_ASSERT(_webPages[queryId] == NULL);
|
45
36
|
|
@@ -52,7 +43,7 @@ QWebPage *SunscraperThread::initializeWebPage(unsigned queryId)
|
|
52
43
|
return webPage;
|
53
44
|
}
|
54
45
|
|
55
|
-
void
|
46
|
+
void SunscraperWorker::attachAPI()
|
56
47
|
{
|
57
48
|
QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
|
58
49
|
QWebPage *page = origin->page();
|
@@ -1,18 +1,18 @@
|
|
1
|
-
#ifndef
|
2
|
-
#define
|
1
|
+
#ifndef SUNSCRAPERWORKER_H
|
2
|
+
#define SUNSCRAPERWORKER_H
|
3
3
|
|
4
|
-
#include <
|
4
|
+
#include <QObject>
|
5
|
+
#include <QMutex>
|
5
6
|
#include <QMap>
|
6
7
|
|
7
8
|
class QWebPage;
|
8
9
|
|
9
|
-
class
|
10
|
+
class SunscraperWorker : public QObject
|
10
11
|
{
|
11
12
|
Q_OBJECT
|
12
|
-
public:
|
13
|
-
SunscraperThread();
|
14
13
|
|
15
|
-
|
14
|
+
public:
|
15
|
+
SunscraperWorker(QObject *parent = 0);
|
16
16
|
|
17
17
|
signals:
|
18
18
|
void finished(unsigned queryId, QString result);
|
@@ -31,4 +31,4 @@ private:
|
|
31
31
|
QWebPage *initializeWebPage(unsigned queryId);
|
32
32
|
};
|
33
33
|
|
34
|
-
#endif //
|
34
|
+
#endif // SUNSCRAPERWORKER_H
|
data/lib/sunscraper/library.rb
CHANGED
@@ -1,37 +1,41 @@
|
|
1
|
-
if
|
2
|
-
raise RuntimeError, "Sunscraper does not work on
|
1
|
+
if RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
|
2
|
+
raise RuntimeError, "Sunscraper/embed does not work on OS X. Use Sunscraper/standalone."
|
3
3
|
end
|
4
4
|
|
5
5
|
require 'ffi'
|
6
|
+
require 'rbconfig'
|
6
7
|
|
7
8
|
# @private
|
8
|
-
module Sunscraper
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
9
|
+
module Sunscraper
|
10
|
+
module Library
|
11
|
+
extend FFI::Library
|
12
|
+
|
13
|
+
if Gem.win_platform?
|
14
|
+
extension = 'dll'
|
15
|
+
else
|
16
|
+
extension = 'so'
|
17
|
+
end
|
18
|
+
|
19
|
+
ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
|
20
|
+
'ext', 'embed', "libsunscraper.#{extension}")
|
21
|
+
|
22
|
+
attach_function 'create', :sunscraper_create, [], :pointer
|
23
|
+
attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
|
24
|
+
attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
|
25
|
+
attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
|
26
|
+
attach_function 'discard', :sunscraper_discard, [:pointer], :void
|
27
|
+
|
28
|
+
if RUBY_ENGINE == 'ruby'
|
29
|
+
# MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
|
30
|
+
attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
|
31
|
+
else
|
32
|
+
# Rubinius does not have GVL neither it has options in attach_function.
|
33
|
+
# Same for JRuby.
|
34
|
+
attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
|
35
|
+
end
|
36
|
+
|
37
|
+
attach_function 'finalize', :sunscraper_finalize, [], :void
|
19
38
|
|
20
|
-
|
21
|
-
'ext', "libsunscraper.#{extension}")
|
22
|
-
|
23
|
-
attach_function 'create', :sunscraper_create, [], :pointer
|
24
|
-
attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
|
25
|
-
attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
|
26
|
-
attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
|
27
|
-
attach_function 'discard', :sunscraper_discard, [:pointer], :void
|
28
|
-
|
29
|
-
if RUBY_ENGINE == 'ruby'
|
30
|
-
# MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
|
31
|
-
attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
|
32
|
-
else
|
33
|
-
# Rubinius does not have GVL neither it has options in attach_function.
|
34
|
-
# Same for JRuby.
|
35
|
-
attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
|
39
|
+
at_exit { finalize }
|
36
40
|
end
|
37
41
|
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
require 'socket'
|
2
|
+
|
3
|
+
# @private
|
4
|
+
module Sunscraper
  # Worker backend that talks to an external `sunscraper` executable over a
  # UNIX domain socket.
  #
  # Every message in either direction is three 32-bit big-endian integers
  # (query id, request type, payload size in bytes) followed by the payload.
  module Standalone
    @last_query_id = 0

    @rpc_mutex   = Mutex.new
    @rpc_waiters = {}    # query id => thread blocked on a reply
    @rpc_results = {}    # query id => payload of the reply
    @rpc_thread  = nil   # lazily started Standalone::Thread

    # Request types sent in the second header field.
    RPC_LOAD_HTML = 1
    RPC_LOAD_URL  = 2
    RPC_WAIT      = 3
    RPC_FETCH     = 4
    RPC_DISCARD   = 5

    class << self
      attr_reader :rpc_mutex, :rpc_waiters, :rpc_results

      # Allocates a new query id.
      #
      # @return [Integer]
      def create
        @rpc_mutex.synchronize do
          @last_query_id += 1
          @last_query_id
        end
      end

      # Asks the worker to load the given HTML for +query_id+.
      def load_html(query_id, html)
        perform_rpc query_id,
          request: RPC_LOAD_HTML,
          data:    html
      end

      # Asks the worker to load the given URL for +query_id+.
      def load_url(query_id, url)
        perform_rpc query_id,
          request: RPC_LOAD_URL,
          data:    url
      end

      # Blocks until the worker answers the wait request.
      #
      # @param [Integer] timeout sent to the worker as a 32-bit big-endian integer
      def wait(query_id, timeout)
        perform_rpc query_id,
          request: RPC_WAIT,
          data:    [timeout].pack("N"),
          want_result: true
      end

      # Blocks until the worker answers and returns the reply payload.
      def fetch(query_id)
        perform_rpc query_id,
          request: RPC_FETCH,
          want_result: true
      end

      # Tells the worker to drop everything it keeps for +query_id+.
      def discard(query_id)
        perform_rpc query_id,
          request: RPC_DISCARD
      end

      private

      # Sends one request and, when +:want_result+ is set, blocks the calling
      # thread until the RPC thread has stored the reply.
      #
      # @option options [Integer] :request     one of the RPC_* constants
      # @option options [String]  :data        payload; defaults to ""
      # @option options [Boolean] :want_result wait for and return the reply
      def perform_rpc(query_id, options={})
        data  = options[:data] || ""
        block = options[:want_result]

        @rpc_mutex.synchronize do
          if @rpc_thread.nil?
            @rpc_thread = Standalone::Thread.new(::Thread.current)

            # A Mutex can only be unlocked by the thread that locked it, so it
            # cannot be used to signal "the socket is ready" from the RPC
            # thread. Park here instead; the RPC thread calls #wakeup on us
            # once the worker process has connected.
            ::Thread.stop
          end

          @rpc_thread.perform(query_id, options[:request], data)

          if block
            @rpc_waiters[query_id] = ::Thread.current
          end
        end

        if block
          # NOTE(review): a reply that arrives between leaving the mutex and
          # this call would wake the thread before it sleeps; confirm whether
          # that lost wakeup can happen in practice.
          ::Thread.stop
          @rpc_results[query_id]
        end
      ensure
        if block
          @rpc_waiters.delete query_id
          @rpc_results.delete query_id
        end
      end
    end

    # Background thread that owns the worker process and its socket.
    class Thread < ::Thread
      # @param [::Thread] creator thread to wake up once the socket is connected
      def initialize(creator)
        @creator = creator

        super do
          @parent = Sunscraper::Standalone
          work
        end
      end

      # Writes one request frame to the worker.
      def perform(query_id, request, data)
        # The size field counts bytes, so use bytesize: String#length counts
        # characters and is too small for multibyte text.
        @socket.write([query_id, request, data.bytesize, data].pack("NNNa*"))
      end

      private

      # Starts the worker process, accepts its connection and then dispatches
      # replies to the waiting threads until the connection fails.
      def work
        if ::Sunscraper.os_x?
          # On OS X the executable lives inside an application bundle.
          suffix = ".app/Contents/MacOS/sunscraper"
        else
          suffix = RbConfig::CONFIG["EXEEXT"]
        end

        executable = File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
                               'ext', 'standalone', "sunscraper#{suffix}")

        server_path = "/tmp/sunscraper.#{Process.pid}.sock"
        server = UNIXServer.new(server_path)

        if Kernel.respond_to? :spawn
          # Separate arguments bypass the shell, so paths containing spaces
          # or shell metacharacters are passed through intact.
          pid = Kernel.spawn executable, server_path
        else
          # rbx does not have Kernel.spawn (yet).
          pid = fork { exec executable, server_path }
        end

        Process.detach pid

        @socket = server.accept

        # The listening socket is needed for a single connection only.
        server.close
        File.unlink server_path

        # Releases the thread parked in perform_rpc.
        @creator.wakeup

        loop do
          header = @socket.read(4 * 3)
          if header.nil? || header.bytesize < 4 * 3
            raise EOFError, "worker process closed the connection"
          end

          query_id, request, data_length = header.unpack("NNN")
          data = data_length > 0 ? @socket.read(data_length) : nil

          @parent.rpc_mutex.synchronize do
            if !@parent.rpc_waiters.include?(query_id)
              $stderr.puts "Sunscraper/standalone: no waiter for #{query_id}"
            else
              @parent.rpc_results[query_id] = data
              @parent.rpc_waiters[query_id].wakeup
            end
          end
        end
      rescue Exception => e
        $stderr.puts "Sunscraper error: #{e.class}: #{e.message}"
        e.backtrace.each do |line|
          $stderr.puts "  #{line}"
        end
      ensure
        # Setup may have failed before the socket or the process existed.
        @socket.close if @socket

        if pid
          begin
            # Process.kill takes the signal first, then the pid.
            Process.kill 'TERM', pid
          rescue Errno::ESRCH
            # The worker has already exited.
          end
        end
      end
    end
  end
end
|
data/lib/sunscraper.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
if !defined?(RUBY_ENGINE) && RUBY_VERSION =~ /^1.8/
|
2
|
+
raise RuntimeError, "Sunscraper does not work on Ruby MRI 1.8.x."
|
3
|
+
end
|
2
4
|
|
3
5
|
# Sunscraper loads an HTML page in a headless browser and waits for `Sunscraper.finish()`
|
4
6
|
# method to be called. It blocks the calling thread, but is threadsafe, does
|
@@ -8,13 +10,26 @@ module Sunscraper
|
|
8
10
|
class ScrapeTimeout < StandardError; end
|
9
11
|
|
10
12
|
class << self
|
13
|
+
def os_x?
|
14
|
+
RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
|
15
|
+
end
|
16
|
+
|
17
|
+
attr_reader :worker
|
18
|
+
def worker=(worker_type)
|
19
|
+
if [:embed, :standalone].include?(worker_type)
|
20
|
+
@worker = worker_type
|
21
|
+
else
|
22
|
+
raise RuntimeError, "Invalid Sunscraper worker type: #{worker_type.inspect}"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
11
26
|
# Scrape an inline HTML. The content is loaded without a particular base URL.
|
12
27
|
# If your application depends on base URL being available, use {scrape_url}.
|
13
28
|
#
|
14
29
|
# @param [Integer] timeout timeout in milliseconds
|
15
30
|
def scrape_html(html, timeout=5000)
|
16
|
-
scrape(timeout) do |context|
|
17
|
-
|
31
|
+
scrape(timeout) do |worker, context|
|
32
|
+
worker.load_html context, html
|
18
33
|
end
|
19
34
|
end
|
20
35
|
|
@@ -22,21 +37,21 @@ module Sunscraper
|
|
22
37
|
#
|
23
38
|
# @param [Integer] timeout timeout in milliseconds
|
24
39
|
def scrape_url(url, timeout=5000)
|
25
|
-
scrape(timeout) do |context|
|
26
|
-
|
40
|
+
scrape(timeout) do |worker, context|
|
41
|
+
worker.load_url context, url
|
27
42
|
end
|
28
43
|
end
|
29
44
|
|
30
45
|
private
|
31
46
|
|
32
47
|
def scrape(timeout)
|
33
|
-
|
34
|
-
|
35
|
-
yield context
|
48
|
+
worker = load_worker
|
36
49
|
|
37
|
-
|
50
|
+
context = worker.create
|
51
|
+
yield worker, context
|
52
|
+
worker.wait(context, timeout)
|
38
53
|
|
39
|
-
data =
|
54
|
+
data = worker.fetch(context)
|
40
55
|
|
41
56
|
if data == "!SUNSCRAPER_TIMEOUT"
|
42
57
|
raise ScrapeTimeout, "Sunscraper has timed out waiting for the callback"
|
@@ -44,7 +59,29 @@ module Sunscraper
|
|
44
59
|
data
|
45
60
|
end
|
46
61
|
ensure
|
47
|
-
|
62
|
+
worker.discard(context) if context
|
63
|
+
end
|
64
|
+
|
65
|
+
def load_worker
|
66
|
+
case @worker
|
67
|
+
when :standalone
|
68
|
+
require 'sunscraper/standalone'
|
69
|
+
|
70
|
+
Sunscraper::Standalone
|
71
|
+
|
72
|
+
when :embed
|
73
|
+
require 'sunscraper/library'
|
74
|
+
|
75
|
+
Sunscraper::Library
|
76
|
+
end
|
48
77
|
end
|
49
78
|
end
|
50
79
|
end
|
80
|
+
|
81
|
+
if Sunscraper.os_x?
|
82
|
+
# OS X is braindead
|
83
|
+
Sunscraper.worker = :standalone
|
84
|
+
else
|
85
|
+
# ... even Win32 is better.
|
86
|
+
Sunscraper.worker = :embed
|
87
|
+
end
|