sunscraper 1.1.0.beta3 → 1.2.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/ext/common/common.pro +13 -0
  2. data/ext/common/libsunscraper_common.a +0 -0
  3. data/ext/common/sunscraperproxy.cpp +11 -0
  4. data/ext/{standalone → common}/sunscraperproxy.h +5 -5
  5. data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
  6. data/ext/{embed → common}/sunscraperwebpage.h +0 -0
  7. data/ext/common/sunscraperworker.cpp +124 -0
  8. data/ext/common/sunscraperworker.h +50 -0
  9. data/ext/embed/embed.pro +14 -12
  10. data/ext/embed/sunscraperexternal.cpp +17 -16
  11. data/ext/embed/sunscraperinterface.cpp +206 -0
  12. data/ext/embed/sunscraperinterface.h +66 -0
  13. data/ext/embed/sunscraperlibrary.cpp +2 -12
  14. data/ext/embed/sunscraperlibrary.h +0 -1
  15. data/ext/embed/sunscraperthread.cpp +49 -0
  16. data/ext/embed/sunscraperthread.h +24 -0
  17. data/ext/extconf.rb +5 -3
  18. data/ext/standalone/standalone.pro +12 -6
  19. data/ext/standalone/sunscrapermain.cpp +13 -3
  20. data/ext/standalone/sunscraperrpc.cpp +76 -88
  21. data/ext/standalone/sunscraperrpc.h +19 -22
  22. data/ext/standalone/sunscraperrpcserver.cpp +26 -0
  23. data/ext/standalone/sunscraperrpcserver.h +24 -0
  24. data/ext/sunscraper-ext.pro +1 -1
  25. data/lib/sunscraper.rb +14 -14
  26. data/lib/sunscraper/library.rb +9 -9
  27. data/lib/sunscraper/standalone.rb +53 -107
  28. data/spec/sunscraper_spec.rb +86 -44
  29. data/sunscraper.gemspec +1 -1
  30. metadata +19 -17
  31. data/ext/embed/sunscraper.cpp +0 -92
  32. data/ext/embed/sunscraper.h +0 -47
  33. data/ext/embed/sunscraperproxy.cpp +0 -14
  34. data/ext/embed/sunscraperproxy.h +0 -24
  35. data/ext/embed/sunscraperworker.cpp +0 -163
  36. data/ext/embed/sunscraperworker.h +0 -58
  37. data/ext/standalone/sunscraperproxy.cpp +0 -14
  38. data/ext/standalone/sunscraperworker.cpp +0 -60
  39. data/ext/standalone/sunscraperworker.h +0 -34
data/ext/embed/sunscraperinterface.h ADDED
@@ -0,0 +1,66 @@
1
+ #ifndef SUNSCRAPERINTERFACE_H
2
+ #define SUNSCRAPERINTERFACE_H
3
+
4
+ #include <QObject>
5
+ #include <QMutex>
6
+ #include <QSemaphore>
7
+ #include <QMap>
8
+ #include <QUrl>
9
+ #include <QByteArray>
10
+
11
+ class SunscraperWorker;
12
+
13
+ class SunscraperInterface : public QObject
14
+ {
15
+ Q_OBJECT
16
+
17
+ public:
18
+ static SunscraperInterface *instance();
19
+
20
+ unsigned createQuery();
21
+
22
+ void loadHtml(unsigned queryId, QString html, QUrl baseUrl);
23
+ void loadUrl(unsigned queryId, QUrl url);
24
+
25
+ bool wait(unsigned queryId, unsigned timeout);
26
+
27
+ QByteArray fetch(unsigned queryId);
28
+
29
+ void finalize(unsigned queryId);
30
+
31
+ private slots:
32
+ void onFinish(unsigned queryId);
33
+ void onTimeout(unsigned queryId);
34
+ void onFetchDone(unsigned queryId, QString html);
35
+
36
+ signals:
37
+ void requestLoadUrl(unsigned queryId, QUrl url);
38
+ void requestLoadHtml(unsigned queryId, QString html, QUrl url);
39
+ void requestTimeout(unsigned queryId, unsigned timeout);
40
+ void requestFetch(unsigned queryId);
41
+ void requestFinalize(unsigned queryId);
42
+
43
+ private:
44
+ static QMutex m_initializationMutex;
45
+ static SunscraperInterface *m_instance;
46
+
47
+ QMutex m_queryIdMutex;
48
+ unsigned m_nextQueryId;
49
+
50
+ QMutex m_semaphoresMutex;
51
+ QMap<unsigned, QSemaphore *> m_semaphores;
52
+
53
+ QMutex m_resultsMutex;
54
+ QMap<unsigned, bool> m_results;
55
+ QMap<unsigned, QByteArray> m_htmlCache;
56
+
57
+ SunscraperWorker *m_worker;
58
+
59
+ SunscraperInterface();
60
+
61
+ void initSemaphore(unsigned queryId);
62
+ void waitOnSemaphore(unsigned queryId);
63
+ void signalSemaphore(unsigned queryId);
64
+ };
65
+
66
+ #endif // SUNSCRAPERINTERFACE_H
data/ext/embed/sunscraperlibrary.cpp CHANGED
@@ -1,20 +1,10 @@
1
1
  #include "sunscraperlibrary.h"
2
- #include "sunscraperworker.h"
2
+ #include "sunscraperthread.h"
3
3
  #include <QtDebug>
4
4
 
5
5
  SunscraperLibrary SunscraperLibrary::m_instance;
6
6
 
7
7
  SunscraperLibrary::SunscraperLibrary()
8
8
  {
9
- SunscraperWorker::invoke();
10
- }
11
-
12
- SunscraperLibrary::~SunscraperLibrary()
13
- {
14
- /* Do nothing. This is on purpose. */
15
- }
16
-
17
- SunscraperLibrary *SunscraperLibrary::instance()
18
- {
19
- return &m_instance;
9
+ SunscraperThread::invoke();
20
10
  }
data/ext/embed/sunscraperlibrary.h CHANGED
@@ -10,7 +10,6 @@ public:
10
10
  private:
11
11
  SunscraperLibrary();
12
12
  SunscraperLibrary(SunscraperLibrary &);
13
- ~SunscraperLibrary();
14
13
 
15
14
  static SunscraperLibrary m_instance;
16
15
  };
data/ext/embed/sunscraperthread.cpp ADDED
@@ -0,0 +1,49 @@
1
+ #include <QApplication>
2
+ #include <QtDebug>
3
+ #include "sunscraperthread.h"
4
+ #include "sunscraperworker.h"
5
+
6
+ #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
7
+ pthread_t SunscraperThread::m_thread;
8
+ #endif
9
+
10
+ void SunscraperThread::invoke()
11
+ {
12
+ #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
13
+ pthread_create(&m_thread, NULL, &SunscraperThread::thread_routine, NULL);
14
+ #endif
15
+ }
16
+
17
+ void *SunscraperThread::thread_routine(void *)
18
+ {
19
+ /* Better error messages. */
20
+ int argc = 1;
21
+ char *argv[] = { (char*) "Sunscraper", NULL};
22
+
23
+ /* Why (char*)? Because argv can (theoretically) be modified. *
24
+ * But Qt won't do that with argv[0]. I know, trust me. */
25
+
26
+ QApplication app(argc, argv);
27
+ app.setApplicationName("Sunscraper-Embed");
28
+
29
+ SunscraperWorker::unlock();
30
+
31
+ /*
32
+ * The magic value 42 means we want to exit from the loop.
33
+ * E.g. alerts from within the page may exit the loop with value 0.
34
+ */
35
+ while(app.exec() != 42);
36
+
37
+ /* Our host application exits. */
38
+
39
+ return NULL;
40
+ }
41
+
42
+ void SunscraperThread::commitSuicide()
43
+ {
44
+ QApplication::exit(42);
45
+
46
+ #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
47
+ pthread_join(m_thread, NULL);
48
+ #endif
49
+ }
data/ext/embed/sunscraperthread.h ADDED
@@ -0,0 +1,24 @@
1
+ #ifndef SUNSCRAPERTHREAD_H
2
+ #define SUNSCRAPERTHREAD_H
3
+
4
+ #include <QThread>
5
+ #include <QSemaphore>
6
+
7
+ class SunscraperThread : public QThread
8
+ {
9
+ Q_OBJECT
10
+ public:
11
+ static void invoke();
12
+ static void commitSuicide();
13
+
14
+ private:
15
+ #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
16
+ static pthread_t m_thread;
17
+ #else
18
+ #error Your platform is unsupported. Implement SunscraperWorker::invoke() and send a pull request.
19
+ #endif
20
+
21
+ static void *thread_routine(void *arg);
22
+ };
23
+
24
+ #endif /* SUNSCRAPERTHREAD_H */
data/ext/extconf.rb CHANGED
@@ -1,9 +1,11 @@
1
1
  # This Makefile will get replaced by qmake.
2
2
 
3
+ require 'rbconfig'
4
+
3
5
  if RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
4
6
  # Cannot you OS X have a build system like all sane people?
5
7
  # Win32 wins again.
6
- qmake = %{qmake -spec macx-g++}
8
+ qmake = %{qmake CONFIG+=debug -spec macx-g++}
7
9
 
8
10
  File.open("Makefile", "w") do |mf|
9
11
  mf.puts <<-ENDM
@@ -16,9 +18,9 @@ install:
16
18
  end
17
19
  else
18
20
  if Gem.win_platform?
19
- qmake = %{qmake -spec win32-g++}
21
+ qmake = %{qmake CONFIG+=debug -spec win32-g++}
20
22
  else
21
- qmake = %{qmake}
23
+ qmake = %{qmake CONFIG+=debug}
22
24
  end
23
25
 
24
26
  File.open("Makefile", "w") do |mf|
data/ext/standalone/standalone.pro CHANGED
@@ -3,11 +3,17 @@ QT += webkit network
3
3
  TARGET = sunscraper
4
4
  TEMPLATE = app
5
5
 
6
- SOURCES += sunscraperproxy.cpp \
7
- sunscraperworker.cpp \
8
- sunscraperrpc.cpp \
6
+ SOURCES += sunscraperrpc.cpp \
7
+ sunscraperrpcserver.cpp \
9
8
  sunscrapermain.cpp
10
9
 
11
- HEADERS += sunscraperproxy.h \
12
- sunscraperrpc.h \
13
- sunscraperworker.h
10
+ HEADERS += sunscraperrpc.h \
11
+ sunscraperrpcserver.h
12
+
13
+ INCLUDEPATH += ../common
14
+
15
+ unix:{
16
+ LIBS += -L../common -lsunscraper_common
17
+ POST_TARGETDEPS += ../common/libsunscraper_common.a
18
+ LDFLAGS += -pthread
19
+ }
data/ext/standalone/sunscrapermain.cpp CHANGED
@@ -1,13 +1,23 @@
1
1
  #include <QApplication>
2
2
  #include <QStringList>
3
3
  #include "sunscraperworker.h"
4
- #include "sunscraperrpc.h"
4
+ #include "sunscraperrpcserver.h"
5
5
 
6
6
  int main(int argc, char **argv)
7
7
  {
8
8
  QApplication app(argc, argv);
9
+ app.setApplicationName("Sunscraper-Standalone");
9
10
 
10
- SunscraperRPC rpc(app.arguments().at(1));
11
+ SunscraperWorker::unlock();
11
12
 
12
- return app.exec();
13
+ SunscraperRPCServer *rpcServer = new SunscraperRPCServer();
14
+
15
+ QString socketPath = app.arguments().at(1);
16
+ if(!rpcServer->listen(socketPath)) {
17
+ qFatal("Cannot listen on %s", socketPath.toLocal8Bit().constData());
18
+ }
19
+
20
+ app.exec();
21
+
22
+ qFatal("finished");
13
23
  }
data/ext/standalone/sunscraperrpc.cpp CHANGED
@@ -4,24 +4,32 @@
4
4
  #include <QApplication>
5
5
  #include <QtDebug>
6
6
  #include <arpa/inet.h>
7
+ #include <sunscraperworker.h>
7
8
  #include "sunscraperrpc.h"
8
- #include "sunscraperworker.h"
9
9
 
10
- SunscraperRPC::SunscraperRPC(QString socketPath) :
11
- m_state(StateHeader)
10
+ SunscraperWorker *SunscraperRPC::m_worker;
11
+ unsigned SunscraperRPC::m_nextQueryId;
12
+
13
+ SunscraperRPC::SunscraperRPC(QLocalSocket *socket) :
14
+ m_socket(socket), m_state(StateHeader)
12
15
  {
13
- m_socket = new QLocalSocket(this);
14
- m_socket->connectToServer(socketPath);
16
+ m_nextQueryId += 1;
17
+ m_queryId = m_nextQueryId;
18
+
15
19
  connect(m_socket, SIGNAL(readyRead()), this, SLOT(onInputReadable()));
16
20
  connect(m_socket, SIGNAL(disconnected()), this, SLOT(onInputDisconnected()));
17
21
 
18
- m_worker = new SunscraperWorker(this);
19
- connect(m_worker, SIGNAL(finished(uint,QString)), this, SLOT(onPageRendered(uint,QString)));
22
+ if(m_worker == NULL)
23
+ m_worker = new SunscraperWorker();
24
+
25
+ connect(m_worker, SIGNAL(finished(uint)), this, SLOT(onFinish(uint)));
26
+ connect(m_worker, SIGNAL(timedOut(uint)), this, SLOT(onTimeout(uint)));
27
+ connect(m_worker, SIGNAL(htmlFetched(uint,QString)), this, SLOT(onFetchDone(uint,QString)));
20
28
  }
21
29
 
22
30
  SunscraperRPC::~SunscraperRPC()
23
31
  {
24
- delete m_worker;
32
+ delete m_socket;
25
33
  }
26
34
 
27
35
  void SunscraperRPC::onInputReadable()
@@ -32,9 +40,13 @@ void SunscraperRPC::onInputReadable()
32
40
  while(moreData) {
33
41
  switch(m_state) {
34
42
  case StateHeader:
35
- if((unsigned) m_buffer.length() >= sizeof(Header)) {
36
- memcpy((void*) &m_pendingHeader, m_buffer.constData(), sizeof(Header));
37
- m_buffer.remove(0, sizeof(Header));
43
+ if((unsigned) m_buffer.length() >= sizeof(quint32) * 2) {
44
+ QDataStream stream(m_buffer);
45
+ stream >> (quint32&) m_pendingRequest;
46
+ stream >> (quint32&) m_pendingDataLength;
47
+
48
+ m_buffer.remove(0, sizeof(quint32) * 2);
49
+
38
50
  m_state = StateData;
39
51
  } else {
40
52
  moreData = false;
@@ -43,12 +55,12 @@ void SunscraperRPC::onInputReadable()
43
55
  break;
44
56
 
45
57
  case StateData:
46
- unsigned length = ntohl(m_pendingHeader.dataLength);
58
+ if((unsigned) m_buffer.length() >= m_pendingDataLength) {
59
+ QByteArray data = m_buffer.left(m_pendingDataLength);
60
+ m_buffer.remove(0, m_pendingDataLength);
61
+
62
+ processRequest(m_pendingRequest, data);
47
63
 
48
- if((unsigned) m_buffer.length() >= length) {
49
- QByteArray data = m_buffer.left(length);
50
- m_buffer.remove(0, length);
51
- processRequest(m_pendingHeader, data);
52
64
  m_state = StateHeader;
53
65
  } else {
54
66
  moreData = false;
@@ -61,123 +73,99 @@ void SunscraperRPC::onInputReadable()
61
73
 
62
74
  void SunscraperRPC::onInputDisconnected()
63
75
  {
64
- /* Magic value. */
65
- QApplication::exit(42);
76
+ m_worker->finalize(m_queryId);
77
+
78
+ emit disconnected();
66
79
  }
67
80
 
68
- void SunscraperRPC::processRequest(Header header, QByteArray data)
81
+ void SunscraperRPC::processRequest(unsigned requestType, QByteArray data)
69
82
  {
70
- unsigned queryId, requestType;
71
-
72
- queryId = ntohl(header.queryId);
73
- requestType = ntohl(header.requestType);
74
-
75
83
  switch(requestType) {
76
84
  case RPC_LOAD_HTML: {
77
- m_worker->loadHtml(queryId, data);
85
+ QDataStream stream(data);
86
+
87
+ QByteArray html;
88
+ stream >> html;
89
+
90
+ QByteArray baseUrl;
91
+ stream >> baseUrl;
92
+
93
+ m_worker->loadHtml(m_queryId, html, QUrl(baseUrl));
78
94
 
79
95
  break;
80
96
  }
81
97
 
82
98
  case RPC_LOAD_URL: {
83
- m_worker->loadUrl(queryId, data);
99
+ m_worker->loadUrl(m_queryId, QUrl(data));
84
100
 
85
101
  break;
86
102
  }
87
103
 
88
104
  case RPC_WAIT: {
89
- if(m_results.contains(queryId)) {
90
- Header reply;
91
- reply.queryId = htonl(queryId);
92
- reply.requestType = htonl(RPC_WAIT);
93
-
94
- sendReply(reply, QByteArray());
95
- } else {
96
- Q_ASSERT(!m_waitQueue.contains(queryId));
97
- Q_ASSERT(!m_timers.contains(queryId));
98
-
99
- m_waitQueue.append(queryId);
105
+ if(!m_result) {
106
+ QDataStream stream(data);
100
107
 
101
108
  unsigned timeout;
102
-
103
- QDataStream stream(data);
104
109
  stream >> timeout;
105
110
 
106
- QTimer *timer = new QTimer(this);
107
- timer->setInterval(timeout);
108
- timer->setSingleShot(true);
109
- timer->start();
110
- connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
111
-
112
- m_timers[queryId] = timer;
111
+ m_worker->setTimeout(m_queryId, timeout);
113
112
  }
114
113
 
115
114
  break;
116
115
  }
117
116
 
118
117
  case RPC_FETCH: {
119
- Header reply;
120
- reply.queryId = htonl(queryId);
121
- reply.requestType = htonl(RPC_FETCH);
122
-
123
- if(m_results.contains(queryId)) {
124
- sendReply(reply, m_results[queryId].toLocal8Bit());
125
- } else {
126
- sendReply(reply, "!SUNSCRAPER_TIMEOUT");
127
- }
118
+ m_worker->fetchHtml(m_queryId);
128
119
 
129
120
  break;
130
121
  }
122
+ }
123
+ }
131
124
 
132
- case RPC_DISCARD: {
133
- m_results.remove(queryId);
134
- m_waitQueue.removeAll(queryId);
125
+ void SunscraperRPC::onFinish(unsigned eventQueryId)
126
+ {
127
+ if(eventQueryId != m_queryId)
128
+ return;
135
129
 
136
- if(m_timers.contains(queryId)) {
137
- QTimer *timer = m_timers[queryId];
138
- delete timer;
130
+ QByteArray data;
139
131
 
140
- m_timers.remove(queryId);
141
- }
132
+ QDataStream stream(&data, QIODevice::WriteOnly);
133
+ stream << (int) true;
142
134
 
143
- m_worker->finalize(queryId);
135
+ sendReply(data);
144
136
 
145
- break;
146
- }
147
- }
137
+ m_result = true;
148
138
  }
149
139
 
150
- void SunscraperRPC::onPageRendered(unsigned queryId, QString data)
140
+ void SunscraperRPC::onTimeout(unsigned eventQueryId)
151
141
  {
152
- m_results[queryId] = data;
142
+ if(eventQueryId != m_queryId)
143
+ return;
153
144
 
154
- if(m_waitQueue.contains(queryId)) {
155
- Header reply;
156
- reply.queryId = htonl(queryId);
157
- reply.requestType = htonl(RPC_WAIT);
145
+ QByteArray data;
158
146
 
159
- sendReply(reply, QByteArray());
160
- }
147
+ QDataStream stream(&data, QIODevice::WriteOnly);
148
+ stream << (int) false;
149
+
150
+ sendReply(data);
151
+
152
+ m_result = false;
161
153
  }
162
154
 
163
- void SunscraperRPC::onTimeout()
155
+ void SunscraperRPC::onFetchDone(unsigned eventQueryId, QString data)
164
156
  {
165
- QTimer *timer = static_cast<QTimer*>(QObject::sender());
166
- unsigned queryId = m_timers.key(timer);
167
-
168
- Header reply;
169
- reply.queryId = htonl(queryId);
170
- reply.requestType = htonl(RPC_WAIT);
157
+ if(eventQueryId != m_queryId)
158
+ return;
171
159
 
172
- sendReply(reply, QByteArray());
160
+ sendReply(data.toLocal8Bit());
173
161
  }
174
162
 
175
- void SunscraperRPC::sendReply(Header header, QByteArray data)
163
+ void SunscraperRPC::sendReply(QByteArray data)
176
164
  {
177
- header.dataLength = htonl(data.length());
165
+ QByteArray packet;
178
166
 
179
- QByteArray serialized((const char*) &header, sizeof(Header));
180
- serialized.append(data);
167
+ QDataStream stream(&packet, QIODevice::WriteOnly);
168
+ stream << data;
181
169
 
182
- m_socket->write(serialized);
170
+ m_socket->write(packet);
183
171
  }