sunscraper 1.0.0 → 1.1.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +28 -0
- data/README.md +42 -20
- data/ext/.gitignore +5 -1
- data/ext/embed/sunscraper.cpp +92 -0
- data/ext/{sunscraper.h → embed/sunscraper.h} +9 -7
- data/ext/{sunscraper.pro → embed/sunscraper.pro} +2 -0
- data/ext/embed/sunscraperexternal.cpp +39 -0
- data/ext/{sunscraperlibrary.cpp → embed/sunscraperlibrary.cpp} +4 -9
- data/ext/{sunscraperlibrary.h → embed/sunscraperlibrary.h} +1 -5
- data/ext/embed/sunscraperproxy.cpp +14 -0
- data/ext/{sunscraperproxy.h → embed/sunscraperproxy.h} +3 -3
- data/ext/embed/sunscraperthread.cpp +148 -0
- data/ext/embed/sunscraperthread.h +54 -0
- data/ext/extconf.rb +13 -3
- data/ext/standalone/sunscraper.pro +13 -0
- data/ext/standalone/sunscrapermain.cpp +13 -0
- data/ext/{sunscraperproxy.cpp → standalone/sunscraperproxy.cpp} +2 -2
- data/ext/standalone/sunscraperproxy.h +24 -0
- data/ext/standalone/sunscraperrpc.cpp +183 -0
- data/ext/standalone/sunscraperrpc.h +64 -0
- data/ext/{sunscraperthread.cpp → standalone/sunscraperworker.cpp} +9 -18
- data/ext/{sunscraperthread.h → standalone/sunscraperworker.h} +8 -8
- data/lib/sunscraper/library.rb +33 -29
- data/lib/sunscraper/standalone.rb +168 -0
- data/lib/sunscraper.rb +48 -11
- data/spec/sunscraper_spec.rb +59 -13
- data/sunscraper.gemspec +2 -2
- metadata +58 -75
- data/ext/Makefile +0 -270
- data/ext/sunscraper.cpp +0 -86
- data/ext/sunscraperexternal.cpp +0 -33
@@ -0,0 +1,183 @@
|
|
1
|
+
#include <QLocalSocket>
|
2
|
+
#include <QTimer>
|
3
|
+
#include <QDataStream>
|
4
|
+
#include <QApplication>
|
5
|
+
#include <QtDebug>
|
6
|
+
#include <arpa/inet.h>
|
7
|
+
#include "sunscraperrpc.h"
|
8
|
+
#include "sunscraperworker.h"
|
9
|
+
|
10
|
+
/*
 * Opens the local socket at socketPath and creates the page worker.
 *
 * Incoming bytes are routed to onInputReadable(), a dropped connection to
 * onInputDisconnected(), and the worker's finished() signal to
 * onPageRendered().
 */
SunscraperRPC::SunscraperRPC(QString socketPath) :
    m_socket(new QLocalSocket(this)),
    m_state(StateHeader)
{
    m_socket->connectToServer(socketPath);

    connect(m_socket, SIGNAL(readyRead()),
            this,     SLOT(onInputReadable()));
    connect(m_socket, SIGNAL(disconnected()),
            this,     SLOT(onInputDisconnected()));

    m_worker = new SunscraperWorker(this);
    connect(m_worker, SIGNAL(finished(uint,QString)),
            this,     SLOT(onPageRendered(uint,QString)));
}
|
21
|
+
|
22
|
+
/*
 * Destroys the worker explicitly. The worker was created with this object
 * as its QObject parent; the socket and timers are left to their parent.
 */
SunscraperRPC::~SunscraperRPC()
{
    SunscraperWorker *const worker = m_worker;
    delete worker;
}
|
26
|
+
|
27
|
+
void SunscraperRPC::onInputReadable()
|
28
|
+
{
|
29
|
+
m_buffer += m_socket->readAll();
|
30
|
+
|
31
|
+
bool moreData = true;
|
32
|
+
while(moreData) {
|
33
|
+
switch(m_state) {
|
34
|
+
case StateHeader:
|
35
|
+
if((unsigned) m_buffer.length() >= sizeof(Header)) {
|
36
|
+
memcpy((void*) &m_pendingHeader, m_buffer.constData(), sizeof(Header));
|
37
|
+
m_buffer.remove(0, sizeof(Header));
|
38
|
+
m_state = StateData;
|
39
|
+
} else {
|
40
|
+
moreData = false;
|
41
|
+
}
|
42
|
+
|
43
|
+
break;
|
44
|
+
|
45
|
+
case StateData:
|
46
|
+
unsigned length = ntohl(m_pendingHeader.dataLength);
|
47
|
+
|
48
|
+
if((unsigned) m_buffer.length() >= length) {
|
49
|
+
QByteArray data = m_buffer.left(length);
|
50
|
+
m_buffer.remove(0, length);
|
51
|
+
processRequest(m_pendingHeader, data);
|
52
|
+
m_state = StateHeader;
|
53
|
+
} else {
|
54
|
+
moreData = false;
|
55
|
+
}
|
56
|
+
|
57
|
+
break;
|
58
|
+
}
|
59
|
+
}
|
60
|
+
}
|
61
|
+
|
62
|
+
void SunscraperRPC::onInputDisconnected()
|
63
|
+
{
|
64
|
+
/* Magic value. */
|
65
|
+
QApplication::exit(42);
|
66
|
+
}
|
67
|
+
|
68
|
+
/*
 * Executes one decoded request.
 *
 * header - message header as received; its fields are converted with ntohl.
 * data   - payload bytes that followed the header.
 *
 * RPC_LOAD_HTML / RPC_LOAD_URL hand the payload to the worker.
 * RPC_WAIT    replies at once when a result is already stored; otherwise
 *             the query is queued and a single-shot timer is armed with the
 *             timeout read from the payload through QDataStream.
 * RPC_FETCH   replies with the stored result, or with the
 *             "!SUNSCRAPER_TIMEOUT" marker when there is none.
 * RPC_DISCARD drops every piece of state kept for the query.
 * Any other request type is reported with qWarning() and ignored.
 */
void SunscraperRPC::processRequest(Header header, QByteArray data)
{
    const unsigned queryId     = ntohl(header.queryId);
    const unsigned requestType = ntohl(header.requestType);

    switch(requestType) {
    case RPC_LOAD_HTML: {
        m_worker->loadHtml(queryId, data);

        break;
    }

    case RPC_LOAD_URL: {
        m_worker->loadUrl(queryId, data);

        break;
    }

    case RPC_WAIT: {
        if(m_results.contains(queryId)) {
            Header reply;
            reply.queryId     = htonl(queryId);
            reply.requestType = htonl(RPC_WAIT);

            sendReply(reply, QByteArray());

            break;
        }

        Q_ASSERT(!m_waitQueue.contains(queryId));
        Q_ASSERT(!m_timers.contains(queryId));

        /* With assertions compiled out a repeated wait must not queue the
         * query twice or orphan a timer that is still running. */
        if(!m_waitQueue.contains(queryId))
            m_waitQueue.append(queryId);
        delete m_timers.take(queryId); /* take() yields 0 when absent */

        quint32 timeout = 0;

        QDataStream stream(data);
        stream >> timeout;

        QTimer *timer = new QTimer(this);
        timer->setInterval(timeout);
        timer->setSingleShot(true);
        /* Connect before starting so the timeout can never be missed. */
        connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
        timer->start();

        m_timers[queryId] = timer;

        break;
    }

    case RPC_FETCH: {
        Header reply;
        reply.queryId     = htonl(queryId);
        reply.requestType = htonl(RPC_FETCH);

        if(m_results.contains(queryId)) {
            sendReply(reply, m_results[queryId].toLocal8Bit());
        } else {
            sendReply(reply, "!SUNSCRAPER_TIMEOUT");
        }

        break;
    }

    case RPC_DISCARD: {
        m_results.remove(queryId);
        m_waitQueue.removeAll(queryId);

        /* Removes the map entry and destroys the timer, if there is one. */
        delete m_timers.take(queryId);

        m_worker->finalize(queryId);

        break;
    }

    default: {
        /* Previously dropped without a trace, which made protocol
         * mismatches between the two processes hard to diagnose. */
        qWarning() << "Sunscraper: ignoring unknown request type" << requestType
                   << "for query" << queryId;

        break;
    }
    }
}
|
149
|
+
|
150
|
+
void SunscraperRPC::onPageRendered(unsigned queryId, QString data)
|
151
|
+
{
|
152
|
+
m_results[queryId] = data;
|
153
|
+
|
154
|
+
if(m_waitQueue.contains(queryId)) {
|
155
|
+
Header reply;
|
156
|
+
reply.queryId = htonl(queryId);
|
157
|
+
reply.requestType = htonl(RPC_WAIT);
|
158
|
+
|
159
|
+
sendReply(reply, QByteArray());
|
160
|
+
}
|
161
|
+
}
|
162
|
+
|
163
|
+
void SunscraperRPC::onTimeout()
|
164
|
+
{
|
165
|
+
QTimer *timer = static_cast<QTimer*>(QObject::sender());
|
166
|
+
unsigned queryId = m_timers.key(timer);
|
167
|
+
|
168
|
+
Header reply;
|
169
|
+
reply.queryId = htonl(queryId);
|
170
|
+
reply.requestType = htonl(RPC_WAIT);
|
171
|
+
|
172
|
+
sendReply(reply, QByteArray());
|
173
|
+
}
|
174
|
+
|
175
|
+
/*
 * Writes one message to the socket: the header followed by data.
 *
 * header - queryId and requestType must already be in network byte order;
 *          dataLength is filled in here from data.length().
 * data   - payload bytes, possibly empty.
 */
void SunscraperRPC::sendReply(Header header, QByteArray data)
{
    header.dataLength = htonl(data.length());

    QByteArray packet((const char*) &header, sizeof(Header));
    packet += data;

    m_socket->write(packet);
}
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#ifndef SUNSCRAPERRPC_H
|
2
|
+
#define SUNSCRAPERRPC_H
|
3
|
+
|
4
|
+
#include <QObject>
|
5
|
+
#include <QVector>
|
6
|
+
#include <QMap>
|
7
|
+
|
8
|
+
class SunscraperWorker;
|
9
|
+
class QLocalSocket;
|
10
|
+
class QTimer;
|
11
|
+
|
12
|
+
class SunscraperRPC : public QObject
|
13
|
+
{
|
14
|
+
Q_OBJECT
|
15
|
+
|
16
|
+
enum State {
|
17
|
+
StateHeader = 0,
|
18
|
+
StateData,
|
19
|
+
};
|
20
|
+
|
21
|
+
struct Header {
|
22
|
+
quint32 queryId;
|
23
|
+
quint32 requestType;
|
24
|
+
quint32 dataLength;
|
25
|
+
};
|
26
|
+
|
27
|
+
enum Request {
|
28
|
+
RPC_LOAD_HTML = 1,
|
29
|
+
RPC_LOAD_URL = 2,
|
30
|
+
RPC_WAIT = 3,
|
31
|
+
RPC_FETCH = 4,
|
32
|
+
RPC_DISCARD = 5,
|
33
|
+
};
|
34
|
+
|
35
|
+
public:
|
36
|
+
SunscraperRPC(QString socketPath);
|
37
|
+
~SunscraperRPC();
|
38
|
+
|
39
|
+
private slots:
|
40
|
+
void onInputReadable();
|
41
|
+
void onInputDisconnected();
|
42
|
+
void onPageRendered(unsigned queryId, QString data);
|
43
|
+
void onTimeout();
|
44
|
+
|
45
|
+
private:
|
46
|
+
QLocalSocket *m_socket;
|
47
|
+
|
48
|
+
State m_state;
|
49
|
+
Header m_pendingHeader;
|
50
|
+
QByteArray m_buffer;
|
51
|
+
|
52
|
+
SunscraperWorker *m_worker;
|
53
|
+
|
54
|
+
QList<unsigned> m_waitQueue;
|
55
|
+
QMap<unsigned, QTimer*> m_timers;
|
56
|
+
QMap<unsigned, QString> m_results;
|
57
|
+
|
58
|
+
SunscraperRPC();
|
59
|
+
|
60
|
+
void processRequest(Header header, QByteArray data);
|
61
|
+
void sendReply(Header header, QByteArray data);
|
62
|
+
};
|
63
|
+
|
64
|
+
#endif // SUNSCRAPERRPC_H
|
@@ -1,37 +1,28 @@
|
|
1
1
|
#include <QApplication>
|
2
2
|
#include <QWebPage>
|
3
3
|
#include <QWebFrame>
|
4
|
-
#include "
|
4
|
+
#include "sunscraperworker.h"
|
5
5
|
#include "sunscraperproxy.h"
|
6
|
+
#include <QtDebug>
|
6
7
|
|
7
|
-
|
8
|
+
/* Creates a worker and registers it as a QObject child of parent. */
SunscraperWorker::SunscraperWorker(QObject *parent)
    : QObject(parent)
{
    /* nothing else to set up */
}
|
10
12
|
|
11
|
-
void
|
12
|
-
{
|
13
|
-
static int argc;
|
14
|
-
static char **argv = {NULL};
|
15
|
-
|
16
|
-
QApplication app(argc, argv);
|
17
|
-
app.exec();
|
18
|
-
|
19
|
-
qFatal("Sunscraper apartment thread event loop should never end");
|
20
|
-
}
|
21
|
-
|
22
|
-
void SunscraperThread::loadHtml(unsigned queryId, QString html)
|
13
|
+
void SunscraperWorker::loadHtml(unsigned queryId, QString html)
|
23
14
|
{
|
24
15
|
QWebPage *webPage = initializeWebPage(queryId);
|
25
16
|
webPage->mainFrame()->setHtml(html);
|
26
17
|
}
|
27
18
|
|
28
|
-
void
|
19
|
+
void SunscraperWorker::loadUrl(unsigned queryId, QString url)
|
29
20
|
{
|
30
21
|
QWebPage *webPage = initializeWebPage(queryId);
|
31
22
|
webPage->mainFrame()->load(url);
|
32
23
|
}
|
33
24
|
|
34
|
-
void
|
25
|
+
void SunscraperWorker::finalize(unsigned queryId)
|
35
26
|
{
|
36
27
|
Q_ASSERT(_webPages[queryId] != NULL);
|
37
28
|
|
@@ -39,7 +30,7 @@ void SunscraperThread::finalize(unsigned queryId)
|
|
39
30
|
_webPages.remove(queryId);
|
40
31
|
}
|
41
32
|
|
42
|
-
QWebPage *
|
33
|
+
QWebPage *SunscraperWorker::initializeWebPage(unsigned queryId)
|
43
34
|
{
|
44
35
|
Q_ASSERT(_webPages[queryId] == NULL);
|
45
36
|
|
@@ -52,7 +43,7 @@ QWebPage *SunscraperThread::initializeWebPage(unsigned queryId)
|
|
52
43
|
return webPage;
|
53
44
|
}
|
54
45
|
|
55
|
-
void
|
46
|
+
void SunscraperWorker::attachAPI()
|
56
47
|
{
|
57
48
|
QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
|
58
49
|
QWebPage *page = origin->page();
|
@@ -1,18 +1,18 @@
|
|
1
|
-
#ifndef
|
2
|
-
#define
|
1
|
+
#ifndef SUNSCRAPERWORKER_H
|
2
|
+
#define SUNSCRAPERWORKER_H
|
3
3
|
|
4
|
-
#include <
|
4
|
+
#include <QObject>
|
5
|
+
#include <QMutex>
|
5
6
|
#include <QMap>
|
6
7
|
|
7
8
|
class QWebPage;
|
8
9
|
|
9
|
-
class
|
10
|
+
class SunscraperWorker : public QObject
|
10
11
|
{
|
11
12
|
Q_OBJECT
|
12
|
-
public:
|
13
|
-
SunscraperThread();
|
14
13
|
|
15
|
-
|
14
|
+
public:
|
15
|
+
SunscraperWorker(QObject *parent = 0);
|
16
16
|
|
17
17
|
signals:
|
18
18
|
void finished(unsigned queryId, QString result);
|
@@ -31,4 +31,4 @@ private:
|
|
31
31
|
QWebPage *initializeWebPage(unsigned queryId);
|
32
32
|
};
|
33
33
|
|
34
|
-
#endif //
|
34
|
+
#endif // SUNSCRAPERWORKER_H
|
data/lib/sunscraper/library.rb
CHANGED
@@ -1,37 +1,41 @@
|
|
1
|
-
if
|
2
|
-
raise RuntimeError, "Sunscraper does not work on
|
1
|
+
# RbConfig is consulted by the platform check below, so it has to be loaded
# first; previously it was required only after its first use.
require 'rbconfig'

# Refuse to load the embedded worker on OS X and point the user at the
# standalone worker instead.
if RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
  raise RuntimeError, "Sunscraper/embed does not work on OS X. Use Sunscraper/standalone."
end

require 'ffi'
|
6
7
|
|
7
8
|
# @private
|
8
|
-
module Sunscraper
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
9
|
+
module Sunscraper
|
10
|
+
module Library
|
11
|
+
extend FFI::Library
|
12
|
+
|
13
|
+
if Gem.win_platform?
|
14
|
+
extension = 'dll'
|
15
|
+
else
|
16
|
+
extension = 'so'
|
17
|
+
end
|
18
|
+
|
19
|
+
ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
|
20
|
+
'ext', 'embed', "libsunscraper.#{extension}")
|
21
|
+
|
22
|
+
attach_function 'create', :sunscraper_create, [], :pointer
|
23
|
+
attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
|
24
|
+
attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
|
25
|
+
attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
|
26
|
+
attach_function 'discard', :sunscraper_discard, [:pointer], :void
|
27
|
+
|
28
|
+
if RUBY_ENGINE == 'ruby'
|
29
|
+
# MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
|
30
|
+
attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
|
31
|
+
else
|
32
|
+
# Rubinius does not have GVL neither it has options in attach_function.
|
33
|
+
# Same for JRuby.
|
34
|
+
attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
|
35
|
+
end
|
36
|
+
|
37
|
+
attach_function 'finalize', :sunscraper_finalize, [], :void
|
19
38
|
|
20
|
-
|
21
|
-
'ext', "libsunscraper.#{extension}")
|
22
|
-
|
23
|
-
attach_function 'create', :sunscraper_create, [], :pointer
|
24
|
-
attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
|
25
|
-
attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
|
26
|
-
attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
|
27
|
-
attach_function 'discard', :sunscraper_discard, [:pointer], :void
|
28
|
-
|
29
|
-
if RUBY_ENGINE == 'ruby'
|
30
|
-
# MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
|
31
|
-
attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
|
32
|
-
else
|
33
|
-
# Rubinius does not have GVL neither it has options in attach_function.
|
34
|
-
# Same for JRuby.
|
35
|
-
attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
|
39
|
+
at_exit { finalize }
|
36
40
|
end
|
37
41
|
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
require 'socket'
|
2
|
+
|
3
|
+
# @private
|
4
|
+
module Sunscraper
  # Worker backend that talks to a separate `sunscraper` executable over a
  # UNIX socket. Every message is three 32-bit big-endian integers (query
  # id, request type, payload length in bytes) followed by the payload.
  module Standalone
    # ConditionVariable lives in 'thread' on Ruby 1.9.
    require 'thread'

    @last_query_id = 0

    @rpc_mutex   = Mutex.new
    # Signalled by the worker thread whenever shared state changes (startup
    # finished, a reply was stored, the worker terminated). Always used
    # together with @rpc_mutex.
    @rpc_cond    = ConditionVariable.new
    @rpc_waiters = {}
    @rpc_results = {}
    @rpc_thread  = nil

    RPC_LOAD_HTML = 1
    RPC_LOAD_URL  = 2
    RPC_WAIT      = 3
    RPC_FETCH     = 4
    RPC_DISCARD   = 5

    class << self
      attr_reader :rpc_mutex, :rpc_cond, :rpc_waiters, :rpc_results

      # Allocates a new query id; ids are handed out sequentially.
      #
      # @return [Integer]
      def create
        @rpc_mutex.synchronize do
          @last_query_id += 1
          @last_query_id
        end
      end

      # Asks the worker to render inline HTML for the query.
      def load_html(query_id, html)
        perform_rpc query_id,
          request: RPC_LOAD_HTML,
          data:    html
      end

      # Asks the worker to load and render a URL for the query.
      def load_url(query_id, url)
        perform_rpc query_id,
          request: RPC_LOAD_URL,
          data:    url
      end

      # Blocks until the worker answers the wait request for the query.
      #
      # @param [Integer] timeout sent to the worker as a 32-bit big-endian
      #   integer
      def wait(query_id, timeout)
        perform_rpc query_id,
          request:     RPC_WAIT,
          data:        [timeout].pack("N"),
          want_result: true
      end

      # Blocks until the worker answers the fetch request.
      #
      # @return [String, nil] the reply payload; nil when it was empty
      def fetch(query_id)
        perform_rpc query_id,
          request:     RPC_FETCH,
          want_result: true
      end

      # Tells the worker to drop everything it holds for the query.
      def discard(query_id)
        perform_rpc query_id,
          request: RPC_DISCARD
      end

      private

      # Sends one request, starting the worker thread on first use, and
      # optionally blocks for the reply.
      #
      # @param [Hash] options :request (type), :data (payload, default ""),
      #   :want_result (block until a reply for query_id arrives)
      # @return [String, nil] reply payload when :want_result is set
      # @raise [RuntimeError] if the worker is not running
      def perform_rpc(query_id, options={})
        data        = options[:data] || ""
        want_result = options[:want_result]

        @rpc_mutex.synchronize do
          begin
            @rpc_thread ||= Standalone::Thread.new(::Thread.current)

            # Waiting on the condition variable releases the mutex, so the
            # worker can report in. Unlike Thread.stop/wakeup a notification
            # cannot be lost, and a failed startup releases us as well.
            @rpc_cond.wait(@rpc_mutex) until @rpc_thread.started?

            # Register before sending: the worker discards replies nobody
            # is registered for.
            @rpc_waiters[query_id] = ::Thread.current if want_result

            @rpc_thread.perform(query_id, options[:request], data)

            if want_result
              until @rpc_results.key?(query_id)
                unless @rpc_thread.running?
                  raise RuntimeError, "Sunscraper/standalone: worker terminated before replying"
                end

                @rpc_cond.wait(@rpc_mutex)
              end

              @rpc_results[query_id]
            end
          ensure
            if want_result
              @rpc_waiters.delete query_id
              @rpc_results.delete query_id
            end
          end
        end
      end
    end

    # Thread that launches the worker executable, owns the socket connected
    # to it and dispatches replies to the callers blocked in perform_rpc.
    class Thread < ::Thread
      # @param creator thread that created the worker; kept for
      #   compatibility, startup is announced through the condition variable
      def initialize(creator=nil)
        @creator  = creator
        @parent   = Sunscraper::Standalone
        @socket   = nil
        @started  = false
        @finished = false

        super do
          work
        end
      end

      # True once startup is over, whether it succeeded or not.
      def started?
        @started
      end

      # True while the connection to the worker process is usable.
      def running?
        @started && !@finished && !@socket.nil?
      end

      # Writes one request to the worker process.
      #
      # @raise [RuntimeError] if the worker is not running
      def perform(query_id, request, data)
        unless running?
          raise RuntimeError, "Sunscraper/standalone: worker is not running"
        end

        # The length field counts bytes: String#length counts characters
        # and would be too small for multibyte payloads.
        @socket.write([query_id, request, data.bytesize, data].pack("NNNa*"))
      end

      private

      def work
        require 'rbconfig'

        pid = nil

        if ::Sunscraper.os_x?
          # On OS X the executable sits inside an application bundle.
          suffix = ".app/Contents/MacOS/sunscraper"
        else
          suffix = RbConfig::CONFIG["EXEEXT"]
        end

        executable = File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
                               'ext', 'standalone', "sunscraper#{suffix}")

        server_path = "/tmp/sunscraper.#{Process.pid}.sock"
        server = UNIXServer.new(server_path)

        begin
          if Kernel.respond_to? :spawn
            # Separate arguments bypass the shell, so paths containing
            # spaces or shell metacharacters are passed through intact.
            pid = Kernel.spawn executable, server_path
          else
            # rbx does not have Kernel.spawn (yet).
            pid = fork { exec executable, server_path }
          end

          Process.detach pid

          @socket = server.accept
        ensure
          server.close
          File.unlink server_path if File.exist?(server_path)
        end

        announce_startup

        loop do
          header = @socket.read(4 * 3)
          if header.nil? || header.bytesize < 4 * 3
            raise EOFError, "worker process closed the connection"
          end

          query_id, request, data_length = header.unpack("NNN")
          data = @socket.read(data_length) if data_length > 0

          @parent.rpc_mutex.synchronize do
            if !@parent.rpc_waiters.include?(query_id)
              $stderr.puts "Sunscraper/standalone: no waiter for #{query_id}"
            else
              @parent.rpc_results[query_id] = data
              @parent.rpc_cond.broadcast
            end
          end
        end
      rescue Exception => e
        $stderr.puts "Sunscraper error: #{e.class}: #{e.message}"
        e.backtrace.each do |line|
          $stderr.puts "  #{line}"
        end
      ensure
        @socket.close if @socket && !@socket.closed?
        stop_worker_process pid

        # Release every caller still blocked in perform_rpc, including one
        # waiting for a startup that never completed.
        @parent.rpc_mutex.synchronize do
          @started  = true
          @finished = true
          @parent.rpc_cond.broadcast
        end
      end

      # Marks startup as complete and wakes callers waiting for it.
      def announce_startup
        @parent.rpc_mutex.synchronize do
          @started = true
          @parent.rpc_cond.broadcast
        end
      end

      # Terminates the worker process if it was started. Process.kill takes
      # the signal first; the process may already be gone.
      def stop_worker_process(pid)
        Process.kill 'TERM', pid if pid
      rescue Errno::ESRCH, Errno::EPERM
        nil
      end
    end
  end
end
|
data/lib/sunscraper.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# An interpreter that does not define RUBY_ENGINE and reports a 1.8 version
# is treated as MRI 1.8.x, which is rejected at load time.
unless defined?(RUBY_ENGINE) || RUBY_VERSION !~ /^1.8/
  raise RuntimeError, "Sunscraper does not work on Ruby MRI 1.8.x."
end
|
2
4
|
|
3
5
|
# Sunscraper loads an HTML page in a headless browser and waits for `Sunscraper.finish()`
|
4
6
|
# method to be called. It blocks the calling thread, but is threadsafe, does
|
@@ -8,13 +10,26 @@ module Sunscraper
|
|
8
10
|
class ScrapeTimeout < StandardError; end
|
9
11
|
|
10
12
|
class << self
|
13
|
+
# Tells whether the interpreter runs on OS X (darwin).
#
# @return a truthy value on darwin, a falsy one elsewhere
def os_x?
  # RbConfig is read whenever RUBY_PLATFORM does not match; load it here
  # rather than relying on something else having required it already.
  require 'rbconfig'

  RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
end
|
16
|
+
|
17
|
+
attr_reader :worker
|
18
|
+
# Selects the worker backend.
#
# @param [Symbol] worker_type either :embed or :standalone
# @raise [RuntimeError] for any other value
def worker=(worker_type)
  unless [:embed, :standalone].include?(worker_type)
    raise RuntimeError, "Invalid Sunscraper worker type: #{worker_type.inspect}"
  end

  @worker = worker_type
end
|
25
|
+
|
11
26
|
# Scrape an inline HTML. The content is loaded without a particular base URL.
|
12
27
|
# If your application depends on base URL being available, use {scrape_url}.
|
13
28
|
#
|
14
29
|
# @param [Integer] timeout timeout in milliseconds
|
15
30
|
def scrape_html(html, timeout=5000)
|
16
|
-
scrape(timeout) do |context|
|
17
|
-
|
31
|
+
scrape(timeout) do |worker, context|
|
32
|
+
worker.load_html context, html
|
18
33
|
end
|
19
34
|
end
|
20
35
|
|
@@ -22,21 +37,21 @@ module Sunscraper
|
|
22
37
|
#
|
23
38
|
# @param [Integer] timeout timeout in milliseconds
|
24
39
|
def scrape_url(url, timeout=5000)
|
25
|
-
scrape(timeout) do |context|
|
26
|
-
|
40
|
+
scrape(timeout) do |worker, context|
|
41
|
+
worker.load_url context, url
|
27
42
|
end
|
28
43
|
end
|
29
44
|
|
30
45
|
private
|
31
46
|
|
32
47
|
def scrape(timeout)
|
33
|
-
|
34
|
-
|
35
|
-
yield context
|
48
|
+
worker = load_worker
|
36
49
|
|
37
|
-
|
50
|
+
context = worker.create
|
51
|
+
yield worker, context
|
52
|
+
worker.wait(context, timeout)
|
38
53
|
|
39
|
-
data =
|
54
|
+
data = worker.fetch(context)
|
40
55
|
|
41
56
|
if data == "!SUNSCRAPER_TIMEOUT"
|
42
57
|
raise ScrapeTimeout, "Sunscraper has timed out waiting for the callback"
|
@@ -44,7 +59,29 @@ module Sunscraper
|
|
44
59
|
data
|
45
60
|
end
|
46
61
|
ensure
|
47
|
-
|
62
|
+
worker.discard(context) if context
|
63
|
+
end
|
64
|
+
|
65
|
+
# Loads the implementation of the selected worker on demand.
#
# @return [Module, nil] Sunscraper::Standalone or Sunscraper::Library;
#   nil when @worker is neither :standalone nor :embed
def load_worker
  if @worker == :standalone
    require 'sunscraper/standalone'

    Sunscraper::Standalone
  elsif @worker == :embed
    require 'sunscraper/library'

    Sunscraper::Library
  end
end
|
49
78
|
end
|
50
79
|
end
|
80
|
+
|
81
|
+
# Default backend: the standalone worker on OS X, the embedded one
# everywhere else.
Sunscraper.worker = Sunscraper.os_x? ? :standalone : :embed
|