sunscraper 1.1.0.beta3 → 1.2.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/common/common.pro +13 -0
- data/ext/common/libsunscraper_common.a +0 -0
- data/ext/common/sunscraperproxy.cpp +11 -0
- data/ext/{standalone → common}/sunscraperproxy.h +5 -5
- data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
- data/ext/{embed → common}/sunscraperwebpage.h +0 -0
- data/ext/common/sunscraperworker.cpp +124 -0
- data/ext/common/sunscraperworker.h +50 -0
- data/ext/embed/embed.pro +14 -12
- data/ext/embed/sunscraperexternal.cpp +17 -16
- data/ext/embed/sunscraperinterface.cpp +206 -0
- data/ext/embed/sunscraperinterface.h +66 -0
- data/ext/embed/sunscraperlibrary.cpp +2 -12
- data/ext/embed/sunscraperlibrary.h +0 -1
- data/ext/embed/sunscraperthread.cpp +49 -0
- data/ext/embed/sunscraperthread.h +24 -0
- data/ext/extconf.rb +5 -3
- data/ext/standalone/standalone.pro +12 -6
- data/ext/standalone/sunscrapermain.cpp +13 -3
- data/ext/standalone/sunscraperrpc.cpp +76 -88
- data/ext/standalone/sunscraperrpc.h +19 -22
- data/ext/standalone/sunscraperrpcserver.cpp +26 -0
- data/ext/standalone/sunscraperrpcserver.h +24 -0
- data/ext/sunscraper-ext.pro +1 -1
- data/lib/sunscraper.rb +14 -14
- data/lib/sunscraper/library.rb +9 -9
- data/lib/sunscraper/standalone.rb +53 -107
- data/spec/sunscraper_spec.rb +86 -44
- data/sunscraper.gemspec +1 -1
- metadata +19 -17
- data/ext/embed/sunscraper.cpp +0 -92
- data/ext/embed/sunscraper.h +0 -47
- data/ext/embed/sunscraperproxy.cpp +0 -14
- data/ext/embed/sunscraperproxy.h +0 -24
- data/ext/embed/sunscraperworker.cpp +0 -163
- data/ext/embed/sunscraperworker.h +0 -58
- data/ext/standalone/sunscraperproxy.cpp +0 -14
- data/ext/standalone/sunscraperworker.cpp +0 -60
- data/ext/standalone/sunscraperworker.h +0 -34
@@ -0,0 +1,66 @@
|
|
1
|
+
#ifndef SUNSCRAPERINTERFACE_H
|
2
|
+
#define SUNSCRAPERINTERFACE_H
|
3
|
+
|
4
|
+
#include <QObject>
|
5
|
+
#include <QMutex>
|
6
|
+
#include <QSemaphore>
|
7
|
+
#include <QMap>
|
8
|
+
#include <QUrl>
|
9
|
+
#include <QByteArray>
|
10
|
+
|
11
|
+
class SunscraperWorker;
|
12
|
+
|
13
|
+
class SunscraperInterface : public QObject
|
14
|
+
{
|
15
|
+
Q_OBJECT
|
16
|
+
|
17
|
+
public:
|
18
|
+
static SunscraperInterface *instance();
|
19
|
+
|
20
|
+
unsigned createQuery();
|
21
|
+
|
22
|
+
void loadHtml(unsigned queryId, QString html, QUrl baseUrl);
|
23
|
+
void loadUrl(unsigned queryId, QUrl url);
|
24
|
+
|
25
|
+
bool wait(unsigned queryId, unsigned timeout);
|
26
|
+
|
27
|
+
QByteArray fetch(unsigned queryId);
|
28
|
+
|
29
|
+
void finalize(unsigned queryId);
|
30
|
+
|
31
|
+
private slots:
|
32
|
+
void onFinish(unsigned queryId);
|
33
|
+
void onTimeout(unsigned queryId);
|
34
|
+
void onFetchDone(unsigned queryId, QString html);
|
35
|
+
|
36
|
+
signals:
|
37
|
+
void requestLoadUrl(unsigned queryId, QUrl url);
|
38
|
+
void requestLoadHtml(unsigned queryId, QString html, QUrl url);
|
39
|
+
void requestTimeout(unsigned queryId, unsigned timeout);
|
40
|
+
void requestFetch(unsigned queryId);
|
41
|
+
void requestFinalize(unsigned queryId);
|
42
|
+
|
43
|
+
private:
|
44
|
+
static QMutex m_initializationMutex;
|
45
|
+
static SunscraperInterface *m_instance;
|
46
|
+
|
47
|
+
QMutex m_queryIdMutex;
|
48
|
+
unsigned m_nextQueryId;
|
49
|
+
|
50
|
+
QMutex m_semaphoresMutex;
|
51
|
+
QMap<unsigned, QSemaphore *> m_semaphores;
|
52
|
+
|
53
|
+
QMutex m_resultsMutex;
|
54
|
+
QMap<unsigned, bool> m_results;
|
55
|
+
QMap<unsigned, QByteArray> m_htmlCache;
|
56
|
+
|
57
|
+
SunscraperWorker *m_worker;
|
58
|
+
|
59
|
+
SunscraperInterface();
|
60
|
+
|
61
|
+
void initSemaphore(unsigned queryId);
|
62
|
+
void waitOnSemaphore(unsigned queryId);
|
63
|
+
void signalSemaphore(unsigned queryId);
|
64
|
+
};
|
65
|
+
|
66
|
+
#endif // SUNSCRAPERINTERFACE_H
|
@@ -1,20 +1,10 @@
|
|
1
1
|
#include "sunscraperlibrary.h"
|
2
|
-
#include "
|
2
|
+
#include "sunscraperthread.h"
|
3
3
|
#include <QtDebug>
|
4
4
|
|
5
5
|
SunscraperLibrary SunscraperLibrary::m_instance;
|
6
6
|
|
7
7
|
SunscraperLibrary::SunscraperLibrary()
|
8
8
|
{
|
9
|
-
|
10
|
-
}
|
11
|
-
|
12
|
-
SunscraperLibrary::~SunscraperLibrary()
|
13
|
-
{
|
14
|
-
/* Do nothing. This is on purpose. */
|
15
|
-
}
|
16
|
-
|
17
|
-
SunscraperLibrary *SunscraperLibrary::instance()
|
18
|
-
{
|
19
|
-
return &m_instance;
|
9
|
+
SunscraperThread::invoke();
|
20
10
|
}
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#include <QApplication>
|
2
|
+
#include <QtDebug>
|
3
|
+
#include "sunscraperthread.h"
|
4
|
+
#include "sunscraperworker.h"
|
5
|
+
|
6
|
+
#if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
|
7
|
+
pthread_t SunscraperThread::m_thread;
|
8
|
+
#endif
|
9
|
+
|
10
|
+
void SunscraperThread::invoke()
|
11
|
+
{
|
12
|
+
#if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
|
13
|
+
pthread_create(&m_thread, NULL, &SunscraperThread::thread_routine, NULL);
|
14
|
+
#endif
|
15
|
+
}
|
16
|
+
|
17
|
+
void *SunscraperThread::thread_routine(void *)
|
18
|
+
{
|
19
|
+
/* Better error messages. */
|
20
|
+
int argc = 1;
|
21
|
+
char *argv[] = { (char*) "Sunscraper", NULL};
|
22
|
+
|
23
|
+
/* Why (char*)? Because argv can (theoretically) be modified. *
|
24
|
+
* But Qt won't do that with argv[0]. I know, trust me. */
|
25
|
+
|
26
|
+
QApplication app(argc, argv);
|
27
|
+
app.setApplicationName("Sunscraper-Embed");
|
28
|
+
|
29
|
+
SunscraperWorker::unlock();
|
30
|
+
|
31
|
+
/*
|
32
|
+
* The magic value 42 means we want to exit from the loop.
|
33
|
+
* E.g. alerts from within the page may exit the loop with value 0.
|
34
|
+
*/
|
35
|
+
while(app.exec() != 42);
|
36
|
+
|
37
|
+
/* Our host application exits. */
|
38
|
+
|
39
|
+
return NULL;
|
40
|
+
}
|
41
|
+
|
42
|
+
void SunscraperThread::commitSuicide()
|
43
|
+
{
|
44
|
+
QApplication::exit(42);
|
45
|
+
|
46
|
+
#if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
|
47
|
+
pthread_join(m_thread, NULL);
|
48
|
+
#endif
|
49
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#ifndef SUNSCRAPERTHREAD_H
|
2
|
+
#define SUNSCRAPERTHREAD_H
|
3
|
+
|
4
|
+
#include <QThread>
|
5
|
+
#include <QSemaphore>
|
6
|
+
|
7
|
+
class SunscraperThread : public QThread
|
8
|
+
{
|
9
|
+
Q_OBJECT
|
10
|
+
public:
|
11
|
+
static void invoke();
|
12
|
+
static void commitSuicide();
|
13
|
+
|
14
|
+
private:
|
15
|
+
#if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
|
16
|
+
static pthread_t m_thread;
|
17
|
+
#else
|
18
|
+
#error Your platform is unsupported. Implement SunscraperWorker::invoke() and send a pull request.
|
19
|
+
#endif
|
20
|
+
|
21
|
+
static void *thread_routine(void *arg);
|
22
|
+
};
|
23
|
+
|
24
|
+
#endif /* SUNSCRAPERTHREAD_H */
|
data/ext/extconf.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
# This Makefile will get replaced by qmake.
|
2
2
|
|
3
|
+
require 'rbconfig'
|
4
|
+
|
3
5
|
if RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
|
4
6
|
# Can't you, OS X, have a build system like all sane people?
|
5
7
|
# Win32 wins again.
|
6
|
-
qmake = %{qmake -spec macx-g++}
|
8
|
+
qmake = %{qmake CONFIG+=debug -spec macx-g++}
|
7
9
|
|
8
10
|
File.open("Makefile", "w") do |mf|
|
9
11
|
mf.puts <<-ENDM
|
@@ -16,9 +18,9 @@ install:
|
|
16
18
|
end
|
17
19
|
else
|
18
20
|
if Gem.win_platform?
|
19
|
-
qmake = %{qmake -spec win32-g++}
|
21
|
+
qmake = %{qmake CONFIG+=debug -spec win32-g++}
|
20
22
|
else
|
21
|
-
qmake = %{qmake}
|
23
|
+
qmake = %{qmake CONFIG+=debug}
|
22
24
|
end
|
23
25
|
|
24
26
|
File.open("Makefile", "w") do |mf|
|
@@ -3,11 +3,17 @@ QT += webkit network
|
|
3
3
|
TARGET = sunscraper
|
4
4
|
TEMPLATE = app
|
5
5
|
|
6
|
-
SOURCES +=
|
7
|
-
|
8
|
-
sunscraperrpc.cpp \
|
6
|
+
SOURCES += sunscraperrpc.cpp \
|
7
|
+
sunscraperrpcserver.cpp \
|
9
8
|
sunscrapermain.cpp
|
10
9
|
|
11
|
-
HEADERS +=
|
12
|
-
|
13
|
-
|
10
|
+
HEADERS += sunscraperrpc.h \
|
11
|
+
sunscraperrpcserver.h
|
12
|
+
|
13
|
+
INCLUDEPATH += ../common
|
14
|
+
|
15
|
+
unix:{
|
16
|
+
LIBS += -L../common -lsunscraper_common
|
17
|
+
POST_TARGETDEPS += ../common/libsunscraper_common.a
|
18
|
+
LDFLAGS += -pthread
|
19
|
+
}
|
@@ -1,13 +1,23 @@
|
|
1
1
|
#include <QApplication>
|
2
2
|
#include <QStringList>
|
3
3
|
#include "sunscraperworker.h"
|
4
|
-
#include "
|
4
|
+
#include "sunscraperrpcserver.h"
|
5
5
|
|
6
6
|
int main(int argc, char **argv)
|
7
7
|
{
|
8
8
|
QApplication app(argc, argv);
|
9
|
+
app.setApplicationName("Sunscraper-Standalone");
|
9
10
|
|
10
|
-
|
11
|
+
SunscraperWorker::unlock();
|
11
12
|
|
12
|
-
|
13
|
+
SunscraperRPCServer *rpcServer = new SunscraperRPCServer();
|
14
|
+
|
15
|
+
QString socketPath = app.arguments().at(1);
|
16
|
+
if(!rpcServer->listen(socketPath)) {
|
17
|
+
qFatal("Cannot listen on %s", socketPath.toLocal8Bit().constData());
|
18
|
+
}
|
19
|
+
|
20
|
+
app.exec();
|
21
|
+
|
22
|
+
qFatal("finished");
|
13
23
|
}
|
@@ -4,24 +4,32 @@
|
|
4
4
|
#include <QApplication>
|
5
5
|
#include <QtDebug>
|
6
6
|
#include <arpa/inet.h>
|
7
|
+
#include <sunscraperworker.h>
|
7
8
|
#include "sunscraperrpc.h"
|
8
|
-
#include "sunscraperworker.h"
|
9
9
|
|
10
|
-
SunscraperRPC::
|
11
|
-
|
10
|
+
SunscraperWorker *SunscraperRPC::m_worker;
|
11
|
+
unsigned SunscraperRPC::m_nextQueryId;
|
12
|
+
|
13
|
+
SunscraperRPC::SunscraperRPC(QLocalSocket *socket) :
|
14
|
+
m_socket(socket), m_state(StateHeader)
|
12
15
|
{
|
13
|
-
|
14
|
-
|
16
|
+
m_nextQueryId += 1;
|
17
|
+
m_queryId = m_nextQueryId;
|
18
|
+
|
15
19
|
connect(m_socket, SIGNAL(readyRead()), this, SLOT(onInputReadable()));
|
16
20
|
connect(m_socket, SIGNAL(disconnected()), this, SLOT(onInputDisconnected()));
|
17
21
|
|
18
|
-
m_worker
|
19
|
-
|
22
|
+
if(m_worker == NULL)
|
23
|
+
m_worker = new SunscraperWorker();
|
24
|
+
|
25
|
+
connect(m_worker, SIGNAL(finished(uint)), this, SLOT(onFinish(uint)));
|
26
|
+
connect(m_worker, SIGNAL(timedOut(uint)), this, SLOT(onTimeout(uint)));
|
27
|
+
connect(m_worker, SIGNAL(htmlFetched(uint,QString)), this, SLOT(onFetchDone(uint,QString)));
|
20
28
|
}
|
21
29
|
|
22
30
|
SunscraperRPC::~SunscraperRPC()
|
23
31
|
{
|
24
|
-
delete
|
32
|
+
delete m_socket;
|
25
33
|
}
|
26
34
|
|
27
35
|
void SunscraperRPC::onInputReadable()
|
@@ -32,9 +40,13 @@ void SunscraperRPC::onInputReadable()
|
|
32
40
|
while(moreData) {
|
33
41
|
switch(m_state) {
|
34
42
|
case StateHeader:
|
35
|
-
if((unsigned) m_buffer.length() >= sizeof(
|
36
|
-
|
37
|
-
|
43
|
+
if((unsigned) m_buffer.length() >= sizeof(quint32) * 2) {
|
44
|
+
QDataStream stream(m_buffer);
|
45
|
+
stream >> (quint32&) m_pendingRequest;
|
46
|
+
stream >> (quint32&) m_pendingDataLength;
|
47
|
+
|
48
|
+
m_buffer.remove(0, sizeof(quint32) * 2);
|
49
|
+
|
38
50
|
m_state = StateData;
|
39
51
|
} else {
|
40
52
|
moreData = false;
|
@@ -43,12 +55,12 @@ void SunscraperRPC::onInputReadable()
|
|
43
55
|
break;
|
44
56
|
|
45
57
|
case StateData:
|
46
|
-
unsigned length
|
58
|
+
if((unsigned) m_buffer.length() >= m_pendingDataLength) {
|
59
|
+
QByteArray data = m_buffer.left(m_pendingDataLength);
|
60
|
+
m_buffer.remove(0, m_pendingDataLength);
|
61
|
+
|
62
|
+
processRequest(m_pendingRequest, data);
|
47
63
|
|
48
|
-
if((unsigned) m_buffer.length() >= length) {
|
49
|
-
QByteArray data = m_buffer.left(length);
|
50
|
-
m_buffer.remove(0, length);
|
51
|
-
processRequest(m_pendingHeader, data);
|
52
64
|
m_state = StateHeader;
|
53
65
|
} else {
|
54
66
|
moreData = false;
|
@@ -61,123 +73,99 @@ void SunscraperRPC::onInputReadable()
|
|
61
73
|
|
62
74
|
void SunscraperRPC::onInputDisconnected()
|
63
75
|
{
|
64
|
-
|
65
|
-
|
76
|
+
m_worker->finalize(m_queryId);
|
77
|
+
|
78
|
+
emit disconnected();
|
66
79
|
}
|
67
80
|
|
68
|
-
void SunscraperRPC::processRequest(
|
81
|
+
void SunscraperRPC::processRequest(unsigned requestType, QByteArray data)
|
69
82
|
{
|
70
|
-
unsigned queryId, requestType;
|
71
|
-
|
72
|
-
queryId = ntohl(header.queryId);
|
73
|
-
requestType = ntohl(header.requestType);
|
74
|
-
|
75
83
|
switch(requestType) {
|
76
84
|
case RPC_LOAD_HTML: {
|
77
|
-
|
85
|
+
QDataStream stream(data);
|
86
|
+
|
87
|
+
QByteArray html;
|
88
|
+
stream >> html;
|
89
|
+
|
90
|
+
QByteArray baseUrl;
|
91
|
+
stream >> baseUrl;
|
92
|
+
|
93
|
+
m_worker->loadHtml(m_queryId, html, QUrl(baseUrl));
|
78
94
|
|
79
95
|
break;
|
80
96
|
}
|
81
97
|
|
82
98
|
case RPC_LOAD_URL: {
|
83
|
-
m_worker->loadUrl(
|
99
|
+
m_worker->loadUrl(m_queryId, QUrl(data));
|
84
100
|
|
85
101
|
break;
|
86
102
|
}
|
87
103
|
|
88
104
|
case RPC_WAIT: {
|
89
|
-
if(
|
90
|
-
|
91
|
-
reply.queryId = htonl(queryId);
|
92
|
-
reply.requestType = htonl(RPC_WAIT);
|
93
|
-
|
94
|
-
sendReply(reply, QByteArray());
|
95
|
-
} else {
|
96
|
-
Q_ASSERT(!m_waitQueue.contains(queryId));
|
97
|
-
Q_ASSERT(!m_timers.contains(queryId));
|
98
|
-
|
99
|
-
m_waitQueue.append(queryId);
|
105
|
+
if(!m_result) {
|
106
|
+
QDataStream stream(data);
|
100
107
|
|
101
108
|
unsigned timeout;
|
102
|
-
|
103
|
-
QDataStream stream(data);
|
104
109
|
stream >> timeout;
|
105
110
|
|
106
|
-
|
107
|
-
timer->setInterval(timeout);
|
108
|
-
timer->setSingleShot(true);
|
109
|
-
timer->start();
|
110
|
-
connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
|
111
|
-
|
112
|
-
m_timers[queryId] = timer;
|
111
|
+
m_worker->setTimeout(m_queryId, timeout);
|
113
112
|
}
|
114
113
|
|
115
114
|
break;
|
116
115
|
}
|
117
116
|
|
118
117
|
case RPC_FETCH: {
|
119
|
-
|
120
|
-
reply.queryId = htonl(queryId);
|
121
|
-
reply.requestType = htonl(RPC_FETCH);
|
122
|
-
|
123
|
-
if(m_results.contains(queryId)) {
|
124
|
-
sendReply(reply, m_results[queryId].toLocal8Bit());
|
125
|
-
} else {
|
126
|
-
sendReply(reply, "!SUNSCRAPER_TIMEOUT");
|
127
|
-
}
|
118
|
+
m_worker->fetchHtml(m_queryId);
|
128
119
|
|
129
120
|
break;
|
130
121
|
}
|
122
|
+
}
|
123
|
+
}
|
131
124
|
|
132
|
-
|
133
|
-
|
134
|
-
|
125
|
+
void SunscraperRPC::onFinish(unsigned eventQueryId)
|
126
|
+
{
|
127
|
+
if(eventQueryId != m_queryId)
|
128
|
+
return;
|
135
129
|
|
136
|
-
|
137
|
-
QTimer *timer = m_timers[queryId];
|
138
|
-
delete timer;
|
130
|
+
QByteArray data;
|
139
131
|
|
140
|
-
|
141
|
-
|
132
|
+
QDataStream stream(&data, QIODevice::WriteOnly);
|
133
|
+
stream << (int) true;
|
142
134
|
|
143
|
-
|
135
|
+
sendReply(data);
|
144
136
|
|
145
|
-
|
146
|
-
}
|
147
|
-
}
|
137
|
+
m_result = true;
|
148
138
|
}
|
149
139
|
|
150
|
-
void SunscraperRPC::
|
140
|
+
void SunscraperRPC::onTimeout(unsigned eventQueryId)
|
151
141
|
{
|
152
|
-
|
142
|
+
if(eventQueryId != m_queryId)
|
143
|
+
return;
|
153
144
|
|
154
|
-
|
155
|
-
Header reply;
|
156
|
-
reply.queryId = htonl(queryId);
|
157
|
-
reply.requestType = htonl(RPC_WAIT);
|
145
|
+
QByteArray data;
|
158
146
|
|
159
|
-
|
160
|
-
|
147
|
+
QDataStream stream(&data, QIODevice::WriteOnly);
|
148
|
+
stream << (int) false;
|
149
|
+
|
150
|
+
sendReply(data);
|
151
|
+
|
152
|
+
m_result = false;
|
161
153
|
}
|
162
154
|
|
163
|
-
void SunscraperRPC::
|
155
|
+
void SunscraperRPC::onFetchDone(unsigned eventQueryId, QString data)
|
164
156
|
{
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
Header reply;
|
169
|
-
reply.queryId = htonl(queryId);
|
170
|
-
reply.requestType = htonl(RPC_WAIT);
|
157
|
+
if(eventQueryId != m_queryId)
|
158
|
+
return;
|
171
159
|
|
172
|
-
sendReply(
|
160
|
+
sendReply(data.toLocal8Bit());
|
173
161
|
}
|
174
162
|
|
175
|
-
void SunscraperRPC::sendReply(
|
163
|
+
void SunscraperRPC::sendReply(QByteArray data)
|
176
164
|
{
|
177
|
-
|
165
|
+
QByteArray packet;
|
178
166
|
|
179
|
-
|
180
|
-
|
167
|
+
QDataStream stream(&packet, QIODevice::WriteOnly);
|
168
|
+
stream << data;
|
181
169
|
|
182
|
-
m_socket->write(
|
170
|
+
m_socket->write(packet);
|
183
171
|
}
|