sunscraper 1.1.0.beta3 → 1.2.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/common/common.pro +13 -0
- data/ext/common/libsunscraper_common.a +0 -0
- data/ext/common/sunscraperproxy.cpp +11 -0
- data/ext/{standalone → common}/sunscraperproxy.h +5 -5
- data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
- data/ext/{embed → common}/sunscraperwebpage.h +0 -0
- data/ext/common/sunscraperworker.cpp +124 -0
- data/ext/common/sunscraperworker.h +50 -0
- data/ext/embed/embed.pro +14 -12
- data/ext/embed/sunscraperexternal.cpp +17 -16
- data/ext/embed/sunscraperinterface.cpp +206 -0
- data/ext/embed/sunscraperinterface.h +66 -0
- data/ext/embed/sunscraperlibrary.cpp +2 -12
- data/ext/embed/sunscraperlibrary.h +0 -1
- data/ext/embed/sunscraperthread.cpp +49 -0
- data/ext/embed/sunscraperthread.h +24 -0
- data/ext/extconf.rb +5 -3
- data/ext/standalone/standalone.pro +12 -6
- data/ext/standalone/sunscrapermain.cpp +13 -3
- data/ext/standalone/sunscraperrpc.cpp +76 -88
- data/ext/standalone/sunscraperrpc.h +19 -22
- data/ext/standalone/sunscraperrpcserver.cpp +26 -0
- data/ext/standalone/sunscraperrpcserver.h +24 -0
- data/ext/sunscraper-ext.pro +1 -1
- data/lib/sunscraper.rb +14 -14
- data/lib/sunscraper/library.rb +9 -9
- data/lib/sunscraper/standalone.rb +53 -107
- data/spec/sunscraper_spec.rb +86 -44
- data/sunscraper.gemspec +1 -1
- metadata +19 -17
- data/ext/embed/sunscraper.cpp +0 -92
- data/ext/embed/sunscraper.h +0 -47
- data/ext/embed/sunscraperproxy.cpp +0 -14
- data/ext/embed/sunscraperproxy.h +0 -24
- data/ext/embed/sunscraperworker.cpp +0 -163
- data/ext/embed/sunscraperworker.h +0 -58
- data/ext/standalone/sunscraperproxy.cpp +0 -14
- data/ext/standalone/sunscraperworker.cpp +0 -60
- data/ext/standalone/sunscraperworker.h +0 -34
@@ -0,0 +1,66 @@
|
|
1
|
+
#ifndef SUNSCRAPERINTERFACE_H
|
2
|
+
#define SUNSCRAPERINTERFACE_H
|
3
|
+
|
4
|
+
#include <QObject>
|
5
|
+
#include <QMutex>
|
6
|
+
#include <QSemaphore>
|
7
|
+
#include <QMap>
|
8
|
+
#include <QUrl>
|
9
|
+
#include <QByteArray>
|
10
|
+
|
11
|
+
class SunscraperWorker;
|
12
|
+
|
13
|
+
class SunscraperInterface : public QObject
|
14
|
+
{
|
15
|
+
Q_OBJECT
|
16
|
+
|
17
|
+
public:
|
18
|
+
static SunscraperInterface *instance();
|
19
|
+
|
20
|
+
unsigned createQuery();
|
21
|
+
|
22
|
+
void loadHtml(unsigned queryId, QString html, QUrl baseUrl);
|
23
|
+
void loadUrl(unsigned queryId, QUrl url);
|
24
|
+
|
25
|
+
bool wait(unsigned queryId, unsigned timeout);
|
26
|
+
|
27
|
+
QByteArray fetch(unsigned queryId);
|
28
|
+
|
29
|
+
void finalize(unsigned queryId);
|
30
|
+
|
31
|
+
private slots:
|
32
|
+
void onFinish(unsigned queryId);
|
33
|
+
void onTimeout(unsigned queryId);
|
34
|
+
void onFetchDone(unsigned queryId, QString html);
|
35
|
+
|
36
|
+
signals:
|
37
|
+
void requestLoadUrl(unsigned queryId, QUrl url);
|
38
|
+
void requestLoadHtml(unsigned queryId, QString html, QUrl url);
|
39
|
+
void requestTimeout(unsigned queryId, unsigned timeout);
|
40
|
+
void requestFetch(unsigned queryId);
|
41
|
+
void requestFinalize(unsigned queryId);
|
42
|
+
|
43
|
+
private:
|
44
|
+
static QMutex m_initializationMutex;
|
45
|
+
static SunscraperInterface *m_instance;
|
46
|
+
|
47
|
+
QMutex m_queryIdMutex;
|
48
|
+
unsigned m_nextQueryId;
|
49
|
+
|
50
|
+
QMutex m_semaphoresMutex;
|
51
|
+
QMap<unsigned, QSemaphore *> m_semaphores;
|
52
|
+
|
53
|
+
QMutex m_resultsMutex;
|
54
|
+
QMap<unsigned, bool> m_results;
|
55
|
+
QMap<unsigned, QByteArray> m_htmlCache;
|
56
|
+
|
57
|
+
SunscraperWorker *m_worker;
|
58
|
+
|
59
|
+
SunscraperInterface();
|
60
|
+
|
61
|
+
void initSemaphore(unsigned queryId);
|
62
|
+
void waitOnSemaphore(unsigned queryId);
|
63
|
+
void signalSemaphore(unsigned queryId);
|
64
|
+
};
|
65
|
+
|
66
|
+
#endif // SUNSCRAPERINTERFACE_H
|
@@ -1,20 +1,10 @@
|
|
1
1
|
#include "sunscraperlibrary.h"
|
2
|
-
#include "
|
2
|
+
#include "sunscraperthread.h"
|
3
3
|
#include <QtDebug>
|
4
4
|
|
5
5
|
SunscraperLibrary SunscraperLibrary::m_instance;
|
6
6
|
|
7
7
|
SunscraperLibrary::SunscraperLibrary()
|
8
8
|
{
|
9
|
-
|
10
|
-
}
|
11
|
-
|
12
|
-
SunscraperLibrary::~SunscraperLibrary()
|
13
|
-
{
|
14
|
-
/* Do nothing. This is on purpose. */
|
15
|
-
}
|
16
|
-
|
17
|
-
SunscraperLibrary *SunscraperLibrary::instance()
|
18
|
-
{
|
19
|
-
return &m_instance;
|
9
|
+
SunscraperThread::invoke();
|
20
10
|
}
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#include <QApplication>
|
2
|
+
#include <QtDebug>
|
3
|
+
#include "sunscraperthread.h"
|
4
|
+
#include "sunscraperworker.h"
|
5
|
+
|
6
|
+
#if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
|
7
|
+
pthread_t SunscraperThread::m_thread;
|
8
|
+
#endif
|
9
|
+
|
10
|
+
void SunscraperThread::invoke()
|
11
|
+
{
|
12
|
+
#if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
|
13
|
+
pthread_create(&m_thread, NULL, &SunscraperThread::thread_routine, NULL);
|
14
|
+
#endif
|
15
|
+
}
|
16
|
+
|
17
|
+
void *SunscraperThread::thread_routine(void *)
|
18
|
+
{
|
19
|
+
/* Better error messages. */
|
20
|
+
int argc = 1;
|
21
|
+
char *argv[] = { (char*) "Sunscraper", NULL};
|
22
|
+
|
23
|
+
/* Why (char*)? Because argv can (theoretically) be modified. *
|
24
|
+
* But Qt won't do that with argv[0]. I know, trust me. */
|
25
|
+
|
26
|
+
QApplication app(argc, argv);
|
27
|
+
app.setApplicationName("Sunscraper-Embed");
|
28
|
+
|
29
|
+
SunscraperWorker::unlock();
|
30
|
+
|
31
|
+
/*
|
32
|
+
* The magic value 42 means we want to exit from the loop.
|
33
|
+
* E.g. alerts from within the page may exit the loop with value 0.
|
34
|
+
*/
|
35
|
+
while(app.exec() != 42);
|
36
|
+
|
37
|
+
/* Our host application exits. */
|
38
|
+
|
39
|
+
return NULL;
|
40
|
+
}
|
41
|
+
|
42
|
+
void SunscraperThread::commitSuicide()
|
43
|
+
{
|
44
|
+
QApplication::exit(42);
|
45
|
+
|
46
|
+
#if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
|
47
|
+
pthread_join(m_thread, NULL);
|
48
|
+
#endif
|
49
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#ifndef SUNSCRAPERTHREAD_H
|
2
|
+
#define SUNSCRAPERTHREAD_H
|
3
|
+
|
4
|
+
#include <QThread>
|
5
|
+
#include <QSemaphore>
|
6
|
+
|
7
|
+
class SunscraperThread : public QThread
|
8
|
+
{
|
9
|
+
Q_OBJECT
|
10
|
+
public:
|
11
|
+
static void invoke();
|
12
|
+
static void commitSuicide();
|
13
|
+
|
14
|
+
private:
|
15
|
+
#if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
|
16
|
+
static pthread_t m_thread;
|
17
|
+
#else
|
18
|
+
#error Your platform is unsupported. Implement SunscraperWorker::invoke() and send a pull request.
|
19
|
+
#endif
|
20
|
+
|
21
|
+
static void *thread_routine(void *arg);
|
22
|
+
};
|
23
|
+
|
24
|
+
#endif /* SUNSCRAPERTHREAD_H */
|
data/ext/extconf.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
# This Makefile will get replaced by qmake.
|
2
2
|
|
3
|
+
require 'rbconfig'
|
4
|
+
|
3
5
|
if RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
|
4
6
|
# Can't you, OS X, have a build system like all sane people?
|
5
7
|
# Win32 wins again.
|
6
|
-
qmake = %{qmake -spec macx-g++}
|
8
|
+
qmake = %{qmake CONFIG+=debug -spec macx-g++}
|
7
9
|
|
8
10
|
File.open("Makefile", "w") do |mf|
|
9
11
|
mf.puts <<-ENDM
|
@@ -16,9 +18,9 @@ install:
|
|
16
18
|
end
|
17
19
|
else
|
18
20
|
if Gem.win_platform?
|
19
|
-
qmake = %{qmake -spec win32-g++}
|
21
|
+
qmake = %{qmake CONFIG+=debug -spec win32-g++}
|
20
22
|
else
|
21
|
-
qmake = %{qmake}
|
23
|
+
qmake = %{qmake CONFIG+=debug}
|
22
24
|
end
|
23
25
|
|
24
26
|
File.open("Makefile", "w") do |mf|
|
@@ -3,11 +3,17 @@ QT += webkit network
|
|
3
3
|
TARGET = sunscraper
|
4
4
|
TEMPLATE = app
|
5
5
|
|
6
|
-
SOURCES +=
|
7
|
-
|
8
|
-
sunscraperrpc.cpp \
|
6
|
+
SOURCES += sunscraperrpc.cpp \
|
7
|
+
sunscraperrpcserver.cpp \
|
9
8
|
sunscrapermain.cpp
|
10
9
|
|
11
|
-
HEADERS +=
|
12
|
-
|
13
|
-
|
10
|
+
HEADERS += sunscraperrpc.h \
|
11
|
+
sunscraperrpcserver.h
|
12
|
+
|
13
|
+
INCLUDEPATH += ../common
|
14
|
+
|
15
|
+
unix:{
|
16
|
+
LIBS += -L../common -lsunscraper_common
|
17
|
+
POST_TARGETDEPS += ../common/libsunscraper_common.a
|
18
|
+
LDFLAGS += -pthread
|
19
|
+
}
|
@@ -1,13 +1,23 @@
|
|
1
1
|
#include <QApplication>
|
2
2
|
#include <QStringList>
|
3
3
|
#include "sunscraperworker.h"
|
4
|
-
#include "
|
4
|
+
#include "sunscraperrpcserver.h"
|
5
5
|
|
6
6
|
int main(int argc, char **argv)
|
7
7
|
{
|
8
8
|
QApplication app(argc, argv);
|
9
|
+
app.setApplicationName("Sunscraper-Standalone");
|
9
10
|
|
10
|
-
|
11
|
+
SunscraperWorker::unlock();
|
11
12
|
|
12
|
-
|
13
|
+
SunscraperRPCServer *rpcServer = new SunscraperRPCServer();
|
14
|
+
|
15
|
+
QString socketPath = app.arguments().at(1);
|
16
|
+
if(!rpcServer->listen(socketPath)) {
|
17
|
+
qFatal("Cannot listen on %s", socketPath.toLocal8Bit().constData());
|
18
|
+
}
|
19
|
+
|
20
|
+
app.exec();
|
21
|
+
|
22
|
+
qFatal("finished");
|
13
23
|
}
|
@@ -4,24 +4,32 @@
|
|
4
4
|
#include <QApplication>
|
5
5
|
#include <QtDebug>
|
6
6
|
#include <arpa/inet.h>
|
7
|
+
#include <sunscraperworker.h>
|
7
8
|
#include "sunscraperrpc.h"
|
8
|
-
#include "sunscraperworker.h"
|
9
9
|
|
10
|
-
SunscraperRPC::
|
11
|
-
|
10
|
+
SunscraperWorker *SunscraperRPC::m_worker;
|
11
|
+
unsigned SunscraperRPC::m_nextQueryId;
|
12
|
+
|
13
|
+
SunscraperRPC::SunscraperRPC(QLocalSocket *socket) :
|
14
|
+
m_socket(socket), m_state(StateHeader)
|
12
15
|
{
|
13
|
-
|
14
|
-
|
16
|
+
m_nextQueryId += 1;
|
17
|
+
m_queryId = m_nextQueryId;
|
18
|
+
|
15
19
|
connect(m_socket, SIGNAL(readyRead()), this, SLOT(onInputReadable()));
|
16
20
|
connect(m_socket, SIGNAL(disconnected()), this, SLOT(onInputDisconnected()));
|
17
21
|
|
18
|
-
m_worker
|
19
|
-
|
22
|
+
if(m_worker == NULL)
|
23
|
+
m_worker = new SunscraperWorker();
|
24
|
+
|
25
|
+
connect(m_worker, SIGNAL(finished(uint)), this, SLOT(onFinish(uint)));
|
26
|
+
connect(m_worker, SIGNAL(timedOut(uint)), this, SLOT(onTimeout(uint)));
|
27
|
+
connect(m_worker, SIGNAL(htmlFetched(uint,QString)), this, SLOT(onFetchDone(uint,QString)));
|
20
28
|
}
|
21
29
|
|
22
30
|
SunscraperRPC::~SunscraperRPC()
|
23
31
|
{
|
24
|
-
delete
|
32
|
+
delete m_socket;
|
25
33
|
}
|
26
34
|
|
27
35
|
void SunscraperRPC::onInputReadable()
|
@@ -32,9 +40,13 @@ void SunscraperRPC::onInputReadable()
|
|
32
40
|
while(moreData) {
|
33
41
|
switch(m_state) {
|
34
42
|
case StateHeader:
|
35
|
-
if((unsigned) m_buffer.length() >= sizeof(
|
36
|
-
|
37
|
-
|
43
|
+
if((unsigned) m_buffer.length() >= sizeof(quint32) * 2) {
|
44
|
+
QDataStream stream(m_buffer);
|
45
|
+
stream >> (quint32&) m_pendingRequest;
|
46
|
+
stream >> (quint32&) m_pendingDataLength;
|
47
|
+
|
48
|
+
m_buffer.remove(0, sizeof(quint32) * 2);
|
49
|
+
|
38
50
|
m_state = StateData;
|
39
51
|
} else {
|
40
52
|
moreData = false;
|
@@ -43,12 +55,12 @@ void SunscraperRPC::onInputReadable()
|
|
43
55
|
break;
|
44
56
|
|
45
57
|
case StateData:
|
46
|
-
unsigned length
|
58
|
+
if((unsigned) m_buffer.length() >= m_pendingDataLength) {
|
59
|
+
QByteArray data = m_buffer.left(m_pendingDataLength);
|
60
|
+
m_buffer.remove(0, m_pendingDataLength);
|
61
|
+
|
62
|
+
processRequest(m_pendingRequest, data);
|
47
63
|
|
48
|
-
if((unsigned) m_buffer.length() >= length) {
|
49
|
-
QByteArray data = m_buffer.left(length);
|
50
|
-
m_buffer.remove(0, length);
|
51
|
-
processRequest(m_pendingHeader, data);
|
52
64
|
m_state = StateHeader;
|
53
65
|
} else {
|
54
66
|
moreData = false;
|
@@ -61,123 +73,99 @@ void SunscraperRPC::onInputReadable()
|
|
61
73
|
|
62
74
|
void SunscraperRPC::onInputDisconnected()
|
63
75
|
{
|
64
|
-
|
65
|
-
|
76
|
+
m_worker->finalize(m_queryId);
|
77
|
+
|
78
|
+
emit disconnected();
|
66
79
|
}
|
67
80
|
|
68
|
-
void SunscraperRPC::processRequest(
|
81
|
+
void SunscraperRPC::processRequest(unsigned requestType, QByteArray data)
|
69
82
|
{
|
70
|
-
unsigned queryId, requestType;
|
71
|
-
|
72
|
-
queryId = ntohl(header.queryId);
|
73
|
-
requestType = ntohl(header.requestType);
|
74
|
-
|
75
83
|
switch(requestType) {
|
76
84
|
case RPC_LOAD_HTML: {
|
77
|
-
|
85
|
+
QDataStream stream(data);
|
86
|
+
|
87
|
+
QByteArray html;
|
88
|
+
stream >> html;
|
89
|
+
|
90
|
+
QByteArray baseUrl;
|
91
|
+
stream >> baseUrl;
|
92
|
+
|
93
|
+
m_worker->loadHtml(m_queryId, html, QUrl(baseUrl));
|
78
94
|
|
79
95
|
break;
|
80
96
|
}
|
81
97
|
|
82
98
|
case RPC_LOAD_URL: {
|
83
|
-
m_worker->loadUrl(
|
99
|
+
m_worker->loadUrl(m_queryId, QUrl(data));
|
84
100
|
|
85
101
|
break;
|
86
102
|
}
|
87
103
|
|
88
104
|
case RPC_WAIT: {
|
89
|
-
if(
|
90
|
-
|
91
|
-
reply.queryId = htonl(queryId);
|
92
|
-
reply.requestType = htonl(RPC_WAIT);
|
93
|
-
|
94
|
-
sendReply(reply, QByteArray());
|
95
|
-
} else {
|
96
|
-
Q_ASSERT(!m_waitQueue.contains(queryId));
|
97
|
-
Q_ASSERT(!m_timers.contains(queryId));
|
98
|
-
|
99
|
-
m_waitQueue.append(queryId);
|
105
|
+
if(!m_result) {
|
106
|
+
QDataStream stream(data);
|
100
107
|
|
101
108
|
unsigned timeout;
|
102
|
-
|
103
|
-
QDataStream stream(data);
|
104
109
|
stream >> timeout;
|
105
110
|
|
106
|
-
|
107
|
-
timer->setInterval(timeout);
|
108
|
-
timer->setSingleShot(true);
|
109
|
-
timer->start();
|
110
|
-
connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
|
111
|
-
|
112
|
-
m_timers[queryId] = timer;
|
111
|
+
m_worker->setTimeout(m_queryId, timeout);
|
113
112
|
}
|
114
113
|
|
115
114
|
break;
|
116
115
|
}
|
117
116
|
|
118
117
|
case RPC_FETCH: {
|
119
|
-
|
120
|
-
reply.queryId = htonl(queryId);
|
121
|
-
reply.requestType = htonl(RPC_FETCH);
|
122
|
-
|
123
|
-
if(m_results.contains(queryId)) {
|
124
|
-
sendReply(reply, m_results[queryId].toLocal8Bit());
|
125
|
-
} else {
|
126
|
-
sendReply(reply, "!SUNSCRAPER_TIMEOUT");
|
127
|
-
}
|
118
|
+
m_worker->fetchHtml(m_queryId);
|
128
119
|
|
129
120
|
break;
|
130
121
|
}
|
122
|
+
}
|
123
|
+
}
|
131
124
|
|
132
|
-
|
133
|
-
|
134
|
-
|
125
|
+
void SunscraperRPC::onFinish(unsigned eventQueryId)
|
126
|
+
{
|
127
|
+
if(eventQueryId != m_queryId)
|
128
|
+
return;
|
135
129
|
|
136
|
-
|
137
|
-
QTimer *timer = m_timers[queryId];
|
138
|
-
delete timer;
|
130
|
+
QByteArray data;
|
139
131
|
|
140
|
-
|
141
|
-
|
132
|
+
QDataStream stream(&data, QIODevice::WriteOnly);
|
133
|
+
stream << (int) true;
|
142
134
|
|
143
|
-
|
135
|
+
sendReply(data);
|
144
136
|
|
145
|
-
|
146
|
-
}
|
147
|
-
}
|
137
|
+
m_result = true;
|
148
138
|
}
|
149
139
|
|
150
|
-
void SunscraperRPC::
|
140
|
+
void SunscraperRPC::onTimeout(unsigned eventQueryId)
|
151
141
|
{
|
152
|
-
|
142
|
+
if(eventQueryId != m_queryId)
|
143
|
+
return;
|
153
144
|
|
154
|
-
|
155
|
-
Header reply;
|
156
|
-
reply.queryId = htonl(queryId);
|
157
|
-
reply.requestType = htonl(RPC_WAIT);
|
145
|
+
QByteArray data;
|
158
146
|
|
159
|
-
|
160
|
-
|
147
|
+
QDataStream stream(&data, QIODevice::WriteOnly);
|
148
|
+
stream << (int) false;
|
149
|
+
|
150
|
+
sendReply(data);
|
151
|
+
|
152
|
+
m_result = false;
|
161
153
|
}
|
162
154
|
|
163
|
-
void SunscraperRPC::
|
155
|
+
void SunscraperRPC::onFetchDone(unsigned eventQueryId, QString data)
|
164
156
|
{
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
Header reply;
|
169
|
-
reply.queryId = htonl(queryId);
|
170
|
-
reply.requestType = htonl(RPC_WAIT);
|
157
|
+
if(eventQueryId != m_queryId)
|
158
|
+
return;
|
171
159
|
|
172
|
-
sendReply(
|
160
|
+
sendReply(data.toLocal8Bit());
|
173
161
|
}
|
174
162
|
|
175
|
-
void SunscraperRPC::sendReply(
|
163
|
+
void SunscraperRPC::sendReply(QByteArray data)
|
176
164
|
{
|
177
|
-
|
165
|
+
QByteArray packet;
|
178
166
|
|
179
|
-
|
180
|
-
|
167
|
+
QDataStream stream(&packet, QIODevice::WriteOnly);
|
168
|
+
stream << data;
|
181
169
|
|
182
|
-
m_socket->write(
|
170
|
+
m_socket->write(packet);
|
183
171
|
}
|