sunscraper 1.1.0.beta3 → 1.2.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. data/ext/common/common.pro +13 -0
  2. data/ext/common/libsunscraper_common.a +0 -0
  3. data/ext/common/sunscraperproxy.cpp +11 -0
  4. data/ext/{standalone → common}/sunscraperproxy.h +5 -5
  5. data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
  6. data/ext/{embed → common}/sunscraperwebpage.h +0 -0
  7. data/ext/common/sunscraperworker.cpp +124 -0
  8. data/ext/common/sunscraperworker.h +50 -0
  9. data/ext/embed/embed.pro +14 -12
  10. data/ext/embed/sunscraperexternal.cpp +17 -16
  11. data/ext/embed/sunscraperinterface.cpp +206 -0
  12. data/ext/embed/sunscraperinterface.h +66 -0
  13. data/ext/embed/sunscraperlibrary.cpp +2 -12
  14. data/ext/embed/sunscraperlibrary.h +0 -1
  15. data/ext/embed/sunscraperthread.cpp +49 -0
  16. data/ext/embed/sunscraperthread.h +24 -0
  17. data/ext/extconf.rb +5 -3
  18. data/ext/standalone/standalone.pro +12 -6
  19. data/ext/standalone/sunscrapermain.cpp +13 -3
  20. data/ext/standalone/sunscraperrpc.cpp +76 -88
  21. data/ext/standalone/sunscraperrpc.h +19 -22
  22. data/ext/standalone/sunscraperrpcserver.cpp +26 -0
  23. data/ext/standalone/sunscraperrpcserver.h +24 -0
  24. data/ext/sunscraper-ext.pro +1 -1
  25. data/lib/sunscraper.rb +14 -14
  26. data/lib/sunscraper/library.rb +9 -9
  27. data/lib/sunscraper/standalone.rb +53 -107
  28. data/spec/sunscraper_spec.rb +86 -44
  29. data/sunscraper.gemspec +1 -1
  30. metadata +19 -17
  31. data/ext/embed/sunscraper.cpp +0 -92
  32. data/ext/embed/sunscraper.h +0 -47
  33. data/ext/embed/sunscraperproxy.cpp +0 -14
  34. data/ext/embed/sunscraperproxy.h +0 -24
  35. data/ext/embed/sunscraperworker.cpp +0 -163
  36. data/ext/embed/sunscraperworker.h +0 -58
  37. data/ext/standalone/sunscraperproxy.cpp +0 -14
  38. data/ext/standalone/sunscraperworker.cpp +0 -60
  39. data/ext/standalone/sunscraperworker.h +0 -34
@@ -0,0 +1,66 @@
1
+ #ifndef SUNSCRAPERINTERFACE_H
2
+ #define SUNSCRAPERINTERFACE_H
3
+
4
+ #include <QObject>
5
+ #include <QMutex>
6
+ #include <QSemaphore>
7
+ #include <QMap>
8
+ #include <QUrl>
9
+ #include <QByteArray>
10
+
11
+ class SunscraperWorker;
12
+
13
+ class SunscraperInterface : public QObject
14
+ {
15
+ Q_OBJECT
16
+
17
+ public:
18
+ static SunscraperInterface *instance();
19
+
20
+ unsigned createQuery();
21
+
22
+ void loadHtml(unsigned queryId, QString html, QUrl baseUrl);
23
+ void loadUrl(unsigned queryId, QUrl url);
24
+
25
+ bool wait(unsigned queryId, unsigned timeout);
26
+
27
+ QByteArray fetch(unsigned queryId);
28
+
29
+ void finalize(unsigned queryId);
30
+
31
+ private slots:
32
+ void onFinish(unsigned queryId);
33
+ void onTimeout(unsigned queryId);
34
+ void onFetchDone(unsigned queryId, QString html);
35
+
36
+ signals:
37
+ void requestLoadUrl(unsigned queryId, QUrl url);
38
+ void requestLoadHtml(unsigned queryId, QString html, QUrl url);
39
+ void requestTimeout(unsigned queryId, unsigned timeout);
40
+ void requestFetch(unsigned queryId);
41
+ void requestFinalize(unsigned queryId);
42
+
43
+ private:
44
+ static QMutex m_initializationMutex;
45
+ static SunscraperInterface *m_instance;
46
+
47
+ QMutex m_queryIdMutex;
48
+ unsigned m_nextQueryId;
49
+
50
+ QMutex m_semaphoresMutex;
51
+ QMap<unsigned, QSemaphore *> m_semaphores;
52
+
53
+ QMutex m_resultsMutex;
54
+ QMap<unsigned, bool> m_results;
55
+ QMap<unsigned, QByteArray> m_htmlCache;
56
+
57
+ SunscraperWorker *m_worker;
58
+
59
+ SunscraperInterface();
60
+
61
+ void initSemaphore(unsigned queryId);
62
+ void waitOnSemaphore(unsigned queryId);
63
+ void signalSemaphore(unsigned queryId);
64
+ };
65
+
66
+ #endif // SUNSCRAPERINTERFACE_H
@@ -1,20 +1,10 @@
1
1
  #include "sunscraperlibrary.h"
2
- #include "sunscraperworker.h"
2
+ #include "sunscraperthread.h"
3
3
  #include <QtDebug>
4
4
 
5
5
  SunscraperLibrary SunscraperLibrary::m_instance;
6
6
 
7
7
  SunscraperLibrary::SunscraperLibrary()
8
8
  {
9
- SunscraperWorker::invoke();
10
- }
11
-
12
- SunscraperLibrary::~SunscraperLibrary()
13
- {
14
- /* Do nothing. This is on purpose. */
15
- }
16
-
17
- SunscraperLibrary *SunscraperLibrary::instance()
18
- {
19
- return &m_instance;
9
+ SunscraperThread::invoke();
20
10
  }
@@ -10,7 +10,6 @@ public:
10
10
  private:
11
11
  SunscraperLibrary();
12
12
  SunscraperLibrary(SunscraperLibrary &);
13
- ~SunscraperLibrary();
14
13
 
15
14
  static SunscraperLibrary m_instance;
16
15
  };
@@ -0,0 +1,49 @@
1
+ #include <QApplication>
2
+ #include <QtDebug>
3
+ #include "sunscraperthread.h"
4
+ #include "sunscraperworker.h"
5
+
6
+ #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
7
+ pthread_t SunscraperThread::m_thread;
8
+ #endif
9
+
10
+ void SunscraperThread::invoke()
11
+ {
12
+ #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
13
+ pthread_create(&m_thread, NULL, &SunscraperThread::thread_routine, NULL);
14
+ #endif
15
+ }
16
+
17
+ void *SunscraperThread::thread_routine(void *)
18
+ {
19
+ /* Better error messages. */
20
+ int argc = 1;
21
+ char *argv[] = { (char*) "Sunscraper", NULL};
22
+
23
+ /* Why (char*)? Because argv can (theoretically) be modified. *
24
+ * But Qt won't do that with argv[0]. I know, trust me. */
25
+
26
+ QApplication app(argc, argv);
27
+ app.setApplicationName("Sunscraper-Embed");
28
+
29
+ SunscraperWorker::unlock();
30
+
31
+ /*
32
+ * The magic value 42 means we want to exit from the loop.
33
+ * E.g. alerts from within the page may exit the loop with value 0.
34
+ */
35
+ while(app.exec() != 42);
36
+
37
+ /* Our host application exits. */
38
+
39
+ return NULL;
40
+ }
41
+
42
+ void SunscraperThread::commitSuicide()
43
+ {
44
+ QApplication::exit(42);
45
+
46
+ #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
47
+ pthread_join(m_thread, NULL);
48
+ #endif
49
+ }
@@ -0,0 +1,24 @@
1
+ #ifndef SUNSCRAPERTHREAD_H
2
+ #define SUNSCRAPERTHREAD_H
3
+
4
+ #include <QThread>
5
+ #include <QSemaphore>
6
+
7
+ class SunscraperThread : public QThread
8
+ {
9
+ Q_OBJECT
10
+ public:
11
+ static void invoke();
12
+ static void commitSuicide();
13
+
14
+ private:
15
+ #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
16
+ static pthread_t m_thread;
17
+ #else
18
+ #error Your platform is unsupported. Implement SunscraperWorker::invoke() and send a pull request.
19
+ #endif
20
+
21
+ static void *thread_routine(void *arg);
22
+ };
23
+
24
+ #endif /* SUNSCRAPERTHREAD_H */
data/ext/extconf.rb CHANGED
@@ -1,9 +1,11 @@
1
1
  # This Makefile will get replaced by qmake.
2
2
 
3
+ require 'rbconfig'
4
+
3
5
  if RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
4
6
  # Cannot you OS X have a build system like all sane people?
5
7
  # Win32 wins again.
6
- qmake = %{qmake -spec macx-g++}
8
+ qmake = %{qmake CONFIG+=debug -spec macx-g++}
7
9
 
8
10
  File.open("Makefile", "w") do |mf|
9
11
  mf.puts <<-ENDM
@@ -16,9 +18,9 @@ install:
16
18
  end
17
19
  else
18
20
  if Gem.win_platform?
19
- qmake = %{qmake -spec win32-g++}
21
+ qmake = %{qmake CONFIG+=debug -spec win32-g++}
20
22
  else
21
- qmake = %{qmake}
23
+ qmake = %{qmake CONFIG+=debug}
22
24
  end
23
25
 
24
26
  File.open("Makefile", "w") do |mf|
@@ -3,11 +3,17 @@ QT += webkit network
3
3
  TARGET = sunscraper
4
4
  TEMPLATE = app
5
5
 
6
- SOURCES += sunscraperproxy.cpp \
7
- sunscraperworker.cpp \
8
- sunscraperrpc.cpp \
6
+ SOURCES += sunscraperrpc.cpp \
7
+ sunscraperrpcserver.cpp \
9
8
  sunscrapermain.cpp
10
9
 
11
- HEADERS += sunscraperproxy.h \
12
- sunscraperrpc.h \
13
- sunscraperworker.h
10
+ HEADERS += sunscraperrpc.h \
11
+ sunscraperrpcserver.h
12
+
13
+ INCLUDEPATH += ../common
14
+
15
+ unix:{
16
+ LIBS += -L../common -lsunscraper_common
17
+ POST_TARGETDEPS += ../common/libsunscraper_common.a
18
+ LDFLAGS += -pthread
19
+ }
@@ -1,13 +1,23 @@
1
1
  #include <QApplication>
2
2
  #include <QStringList>
3
3
  #include "sunscraperworker.h"
4
- #include "sunscraperrpc.h"
4
+ #include "sunscraperrpcserver.h"
5
5
 
6
6
  int main(int argc, char **argv)
7
7
  {
8
8
  QApplication app(argc, argv);
9
+ app.setApplicationName("Sunscraper-Standalone");
9
10
 
10
- SunscraperRPC rpc(app.arguments().at(1));
11
+ SunscraperWorker::unlock();
11
12
 
12
- return app.exec();
13
+ SunscraperRPCServer *rpcServer = new SunscraperRPCServer();
14
+
15
+ QString socketPath = app.arguments().at(1);
16
+ if(!rpcServer->listen(socketPath)) {
17
+ qFatal("Cannot listen on %s", socketPath.toLocal8Bit().constData());
18
+ }
19
+
20
+ app.exec();
21
+
22
+ qFatal("finished");
13
23
  }
@@ -4,24 +4,32 @@
4
4
  #include <QApplication>
5
5
  #include <QtDebug>
6
6
  #include <arpa/inet.h>
7
+ #include <sunscraperworker.h>
7
8
  #include "sunscraperrpc.h"
8
- #include "sunscraperworker.h"
9
9
 
10
- SunscraperRPC::SunscraperRPC(QString socketPath) :
11
- m_state(StateHeader)
10
+ SunscraperWorker *SunscraperRPC::m_worker;
11
+ unsigned SunscraperRPC::m_nextQueryId;
12
+
13
+ SunscraperRPC::SunscraperRPC(QLocalSocket *socket) :
14
+ m_socket(socket), m_state(StateHeader)
12
15
  {
13
- m_socket = new QLocalSocket(this);
14
- m_socket->connectToServer(socketPath);
16
+ m_nextQueryId += 1;
17
+ m_queryId = m_nextQueryId;
18
+
15
19
  connect(m_socket, SIGNAL(readyRead()), this, SLOT(onInputReadable()));
16
20
  connect(m_socket, SIGNAL(disconnected()), this, SLOT(onInputDisconnected()));
17
21
 
18
- m_worker = new SunscraperWorker(this);
19
- connect(m_worker, SIGNAL(finished(uint,QString)), this, SLOT(onPageRendered(uint,QString)));
22
+ if(m_worker == NULL)
23
+ m_worker = new SunscraperWorker();
24
+
25
+ connect(m_worker, SIGNAL(finished(uint)), this, SLOT(onFinish(uint)));
26
+ connect(m_worker, SIGNAL(timedOut(uint)), this, SLOT(onTimeout(uint)));
27
+ connect(m_worker, SIGNAL(htmlFetched(uint,QString)), this, SLOT(onFetchDone(uint,QString)));
20
28
  }
21
29
 
22
30
  SunscraperRPC::~SunscraperRPC()
23
31
  {
24
- delete m_worker;
32
+ delete m_socket;
25
33
  }
26
34
 
27
35
  void SunscraperRPC::onInputReadable()
@@ -32,9 +40,13 @@ void SunscraperRPC::onInputReadable()
32
40
  while(moreData) {
33
41
  switch(m_state) {
34
42
  case StateHeader:
35
- if((unsigned) m_buffer.length() >= sizeof(Header)) {
36
- memcpy((void*) &m_pendingHeader, m_buffer.constData(), sizeof(Header));
37
- m_buffer.remove(0, sizeof(Header));
43
+ if((unsigned) m_buffer.length() >= sizeof(quint32) * 2) {
44
+ QDataStream stream(m_buffer);
45
+ stream >> (quint32&) m_pendingRequest;
46
+ stream >> (quint32&) m_pendingDataLength;
47
+
48
+ m_buffer.remove(0, sizeof(quint32) * 2);
49
+
38
50
  m_state = StateData;
39
51
  } else {
40
52
  moreData = false;
@@ -43,12 +55,12 @@ void SunscraperRPC::onInputReadable()
43
55
  break;
44
56
 
45
57
  case StateData:
46
- unsigned length = ntohl(m_pendingHeader.dataLength);
58
+ if((unsigned) m_buffer.length() >= m_pendingDataLength) {
59
+ QByteArray data = m_buffer.left(m_pendingDataLength);
60
+ m_buffer.remove(0, m_pendingDataLength);
61
+
62
+ processRequest(m_pendingRequest, data);
47
63
 
48
- if((unsigned) m_buffer.length() >= length) {
49
- QByteArray data = m_buffer.left(length);
50
- m_buffer.remove(0, length);
51
- processRequest(m_pendingHeader, data);
52
64
  m_state = StateHeader;
53
65
  } else {
54
66
  moreData = false;
@@ -61,123 +73,99 @@ void SunscraperRPC::onInputReadable()
61
73
 
62
74
  void SunscraperRPC::onInputDisconnected()
63
75
  {
64
- /* Magic value. */
65
- QApplication::exit(42);
76
+ m_worker->finalize(m_queryId);
77
+
78
+ emit disconnected();
66
79
  }
67
80
 
68
- void SunscraperRPC::processRequest(Header header, QByteArray data)
81
+ void SunscraperRPC::processRequest(unsigned requestType, QByteArray data)
69
82
  {
70
- unsigned queryId, requestType;
71
-
72
- queryId = ntohl(header.queryId);
73
- requestType = ntohl(header.requestType);
74
-
75
83
  switch(requestType) {
76
84
  case RPC_LOAD_HTML: {
77
- m_worker->loadHtml(queryId, data);
85
+ QDataStream stream(data);
86
+
87
+ QByteArray html;
88
+ stream >> html;
89
+
90
+ QByteArray baseUrl;
91
+ stream >> baseUrl;
92
+
93
+ m_worker->loadHtml(m_queryId, html, QUrl(baseUrl));
78
94
 
79
95
  break;
80
96
  }
81
97
 
82
98
  case RPC_LOAD_URL: {
83
- m_worker->loadUrl(queryId, data);
99
+ m_worker->loadUrl(m_queryId, QUrl(data));
84
100
 
85
101
  break;
86
102
  }
87
103
 
88
104
  case RPC_WAIT: {
89
- if(m_results.contains(queryId)) {
90
- Header reply;
91
- reply.queryId = htonl(queryId);
92
- reply.requestType = htonl(RPC_WAIT);
93
-
94
- sendReply(reply, QByteArray());
95
- } else {
96
- Q_ASSERT(!m_waitQueue.contains(queryId));
97
- Q_ASSERT(!m_timers.contains(queryId));
98
-
99
- m_waitQueue.append(queryId);
105
+ if(!m_result) {
106
+ QDataStream stream(data);
100
107
 
101
108
  unsigned timeout;
102
-
103
- QDataStream stream(data);
104
109
  stream >> timeout;
105
110
 
106
- QTimer *timer = new QTimer(this);
107
- timer->setInterval(timeout);
108
- timer->setSingleShot(true);
109
- timer->start();
110
- connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
111
-
112
- m_timers[queryId] = timer;
111
+ m_worker->setTimeout(m_queryId, timeout);
113
112
  }
114
113
 
115
114
  break;
116
115
  }
117
116
 
118
117
  case RPC_FETCH: {
119
- Header reply;
120
- reply.queryId = htonl(queryId);
121
- reply.requestType = htonl(RPC_FETCH);
122
-
123
- if(m_results.contains(queryId)) {
124
- sendReply(reply, m_results[queryId].toLocal8Bit());
125
- } else {
126
- sendReply(reply, "!SUNSCRAPER_TIMEOUT");
127
- }
118
+ m_worker->fetchHtml(m_queryId);
128
119
 
129
120
  break;
130
121
  }
122
+ }
123
+ }
131
124
 
132
- case RPC_DISCARD: {
133
- m_results.remove(queryId);
134
- m_waitQueue.removeAll(queryId);
125
+ void SunscraperRPC::onFinish(unsigned eventQueryId)
126
+ {
127
+ if(eventQueryId != m_queryId)
128
+ return;
135
129
 
136
- if(m_timers.contains(queryId)) {
137
- QTimer *timer = m_timers[queryId];
138
- delete timer;
130
+ QByteArray data;
139
131
 
140
- m_timers.remove(queryId);
141
- }
132
+ QDataStream stream(&data, QIODevice::WriteOnly);
133
+ stream << (int) true;
142
134
 
143
- m_worker->finalize(queryId);
135
+ sendReply(data);
144
136
 
145
- break;
146
- }
147
- }
137
+ m_result = true;
148
138
  }
149
139
 
150
- void SunscraperRPC::onPageRendered(unsigned queryId, QString data)
140
+ void SunscraperRPC::onTimeout(unsigned eventQueryId)
151
141
  {
152
- m_results[queryId] = data;
142
+ if(eventQueryId != m_queryId)
143
+ return;
153
144
 
154
- if(m_waitQueue.contains(queryId)) {
155
- Header reply;
156
- reply.queryId = htonl(queryId);
157
- reply.requestType = htonl(RPC_WAIT);
145
+ QByteArray data;
158
146
 
159
- sendReply(reply, QByteArray());
160
- }
147
+ QDataStream stream(&data, QIODevice::WriteOnly);
148
+ stream << (int) false;
149
+
150
+ sendReply(data);
151
+
152
+ m_result = false;
161
153
  }
162
154
 
163
- void SunscraperRPC::onTimeout()
155
+ void SunscraperRPC::onFetchDone(unsigned eventQueryId, QString data)
164
156
  {
165
- QTimer *timer = static_cast<QTimer*>(QObject::sender());
166
- unsigned queryId = m_timers.key(timer);
167
-
168
- Header reply;
169
- reply.queryId = htonl(queryId);
170
- reply.requestType = htonl(RPC_WAIT);
157
+ if(eventQueryId != m_queryId)
158
+ return;
171
159
 
172
- sendReply(reply, QByteArray());
160
+ sendReply(data.toLocal8Bit());
173
161
  }
174
162
 
175
- void SunscraperRPC::sendReply(Header header, QByteArray data)
163
+ void SunscraperRPC::sendReply(QByteArray data)
176
164
  {
177
- header.dataLength = htonl(data.length());
165
+ QByteArray packet;
178
166
 
179
- QByteArray serialized((const char*) &header, sizeof(Header));
180
- serialized.append(data);
167
+ QDataStream stream(&packet, QIODevice::WriteOnly);
168
+ stream << data;
181
169
 
182
- m_socket->write(serialized);
170
+ m_socket->write(packet);
183
171
  }