sunscraper 1.1.0.beta3 → 1.2.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/ext/common/common.pro +13 -0
  2. data/ext/common/libsunscraper_common.a +0 -0
  3. data/ext/common/sunscraperproxy.cpp +11 -0
  4. data/ext/{standalone → common}/sunscraperproxy.h +5 -5
  5. data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
  6. data/ext/{embed → common}/sunscraperwebpage.h +0 -0
  7. data/ext/common/sunscraperworker.cpp +124 -0
  8. data/ext/common/sunscraperworker.h +50 -0
  9. data/ext/embed/embed.pro +14 -12
  10. data/ext/embed/sunscraperexternal.cpp +17 -16
  11. data/ext/embed/sunscraperinterface.cpp +206 -0
  12. data/ext/embed/sunscraperinterface.h +66 -0
  13. data/ext/embed/sunscraperlibrary.cpp +2 -12
  14. data/ext/embed/sunscraperlibrary.h +0 -1
  15. data/ext/embed/sunscraperthread.cpp +49 -0
  16. data/ext/embed/sunscraperthread.h +24 -0
  17. data/ext/extconf.rb +5 -3
  18. data/ext/standalone/standalone.pro +12 -6
  19. data/ext/standalone/sunscrapermain.cpp +13 -3
  20. data/ext/standalone/sunscraperrpc.cpp +76 -88
  21. data/ext/standalone/sunscraperrpc.h +19 -22
  22. data/ext/standalone/sunscraperrpcserver.cpp +26 -0
  23. data/ext/standalone/sunscraperrpcserver.h +24 -0
  24. data/ext/sunscraper-ext.pro +1 -1
  25. data/lib/sunscraper.rb +14 -14
  26. data/lib/sunscraper/library.rb +9 -9
  27. data/lib/sunscraper/standalone.rb +53 -107
  28. data/spec/sunscraper_spec.rb +86 -44
  29. data/sunscraper.gemspec +1 -1
  30. metadata +19 -17
  31. data/ext/embed/sunscraper.cpp +0 -92
  32. data/ext/embed/sunscraper.h +0 -47
  33. data/ext/embed/sunscraperproxy.cpp +0 -14
  34. data/ext/embed/sunscraperproxy.h +0 -24
  35. data/ext/embed/sunscraperworker.cpp +0 -163
  36. data/ext/embed/sunscraperworker.h +0 -58
  37. data/ext/standalone/sunscraperproxy.cpp +0 -14
  38. data/ext/standalone/sunscraperworker.cpp +0 -60
  39. data/ext/standalone/sunscraperworker.h +0 -34
@@ -0,0 +1,13 @@
1
+ TEMPLATE = lib
2
+ TARGET = sunscraper_common
3
+
4
+ QT += webkit network
5
+ CONFIG += staticlib
6
+
7
+ SOURCES += sunscraperproxy.cpp \
8
+ sunscraperwebpage.cpp \
9
+ sunscraperworker.cpp
10
+
11
+ HEADERS += sunscraperproxy.h \
12
+ sunscraperwebpage.h \
13
+ sunscraperworker.h
Binary file
@@ -0,0 +1,11 @@
1
+ #include "sunscraperproxy.h"
2
+
3
+ SunscraperProxy::SunscraperProxy(QObject *parent, unsigned queryId) :
4
+ QObject(parent), m_queryId(queryId)
5
+ {
6
+ }
7
+
8
+ void SunscraperProxy::finish()
9
+ {
10
+ emit finished(m_queryId);
11
+ }
@@ -3,22 +3,22 @@
3
3
 
4
4
  #include <QObject>
5
5
 
6
- class QWebPage;
7
-
8
6
  class SunscraperProxy : public QObject
9
7
  {
10
8
  Q_OBJECT
9
+
11
10
  public:
12
- SunscraperProxy(QWebPage *parent, unsigned queryId);
11
+ SunscraperProxy(QObject *parent, unsigned queryId);
13
12
 
14
13
  Q_INVOKABLE void finish();
15
14
 
16
15
  signals:
17
- void finished(unsigned queryId, QString html);
16
+ void finished(unsigned queryId);
18
17
 
19
18
  private:
20
- QWebPage *m_webPage;
21
19
  unsigned m_queryId;
20
+
21
+ SunscraperProxy();
22
22
  };
23
23
 
24
24
  #endif // SUNSCRAPERPROXY_H
File without changes
File without changes
@@ -0,0 +1,124 @@
1
+ #include <QWebPage>
2
+ #include <QWebFrame>
3
+ #include <QTimer>
4
+ #include <QtDebug>
5
+ #include "sunscraperworker.h"
6
+ #include "sunscraperwebpage.h"
7
+ #include "sunscraperproxy.h"
8
+
9
+ QSemaphore SunscraperWorker::m_initializationLock(0);
10
+
11
+ SunscraperWorker::SunscraperWorker(QObject *parent) :
12
+ QObject(parent)
13
+ {
14
+ m_initializationLock.acquire(1);
15
+ m_initializationLock.release(1);
16
+ }
17
+
18
+ void SunscraperWorker::unlock()
19
+ {
20
+ m_initializationLock.release(1);
21
+ }
22
+
23
+ void SunscraperWorker::loadHtml(unsigned queryId, QString html, QUrl baseUrl)
24
+ {
25
+ QWebPage *webPage = initializeWebPage(queryId);
26
+ webPage->mainFrame()->setHtml(html, baseUrl);
27
+ }
28
+
29
+ void SunscraperWorker::loadUrl(unsigned queryId, QUrl url)
30
+ {
31
+ QWebPage *webPage = initializeWebPage(queryId);
32
+ webPage->mainFrame()->load(url);
33
+ }
34
+
35
+ void SunscraperWorker::setTimeout(unsigned queryId, unsigned timeout)
36
+ {
37
+ Q_ASSERT(m_timers[queryId] == NULL);
38
+
39
+ QTimer *timer = new QTimer();
40
+ timer->setInterval(timeout);
41
+ timer->setSingleShot(true);
42
+
43
+ connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
44
+
45
+ timer->start();
46
+ m_timers[queryId] = timer;
47
+ }
48
+
49
+ void SunscraperWorker::finalize(unsigned queryId)
50
+ {
51
+ if(m_webPages.contains(queryId)) {
52
+ m_webPages[queryId]->deleteLater();
53
+ m_webPages.remove(queryId);
54
+ }
55
+
56
+ if(m_timers.contains(queryId)) {
57
+ m_timers[queryId]->deleteLater();
58
+ m_timers.remove(queryId);
59
+ }
60
+ }
61
+
62
+ QWebPage *SunscraperWorker::initializeWebPage(unsigned queryId)
63
+ {
64
+ Q_ASSERT(m_webPages[queryId] == NULL);
65
+
66
+ SunscraperWebPage *webPage = new SunscraperWebPage();
67
+ webPage->settings()->setAttribute(QWebSettings::LocalStorageEnabled, true);
68
+
69
+ connect(webPage, SIGNAL(frameCreated(QWebFrame*)), this, SLOT(onFrameCreated(QWebFrame*)));
70
+ connect(webPage, SIGNAL(consoleMessage(QString)), this, SLOT(onMessage(QString)));
71
+
72
+ m_webPages[queryId] = webPage;
73
+
74
+ return webPage;
75
+ }
76
+
77
+ void SunscraperWorker::onFrameCreated(QWebFrame *frame)
78
+ {
79
+ connect(frame, SIGNAL(javaScriptWindowObjectCleared()),
80
+ this, SLOT(onJavascriptObjectCleared()));
81
+ }
82
+
83
+ void SunscraperWorker::onJavascriptObjectCleared()
84
+ {
85
+ QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
86
+ QWebPage *page = origin->page();
87
+
88
+ unsigned queryId = m_webPages.key(page, 0);
89
+ Q_ASSERT(queryId != 0);
90
+
91
+ SunscraperProxy *proxy = new SunscraperProxy(page, queryId);
92
+ connect(proxy, SIGNAL(finished(uint)), this, SLOT(onFinish(uint)));
93
+
94
+ origin->addToJavaScriptWindowObject("Sunscraper", proxy, QScriptEngine::QtOwnership);
95
+ }
96
+
97
+ void SunscraperWorker::onFinish(unsigned queryId)
98
+ {
99
+ Q_ASSERT(m_webPages[queryId] != NULL);
100
+
101
+ emit finished(queryId);
102
+ }
103
+
104
+ void SunscraperWorker::onTimeout()
105
+ {
106
+ QTimer *origin = static_cast<QTimer *>(QObject::sender());
107
+
108
+ unsigned queryId = m_timers.key(origin, 0);
109
+ Q_ASSERT(queryId != 0);
110
+
111
+ m_timers[queryId]->deleteLater();
112
+ m_timers.remove(queryId);
113
+
114
+ emit timedOut(queryId);
115
+ }
116
+
117
+ void SunscraperWorker::onMessage(QString message)
118
+ {
119
+ qDebug() << "Sunscraper Console:" << message;
120
+ }
121
+
122
+ void SunscraperWorker::fetchHtml(unsigned queryId) {
123
+ emit htmlFetched(queryId, m_webPages[queryId]->mainFrame()->toHtml());
124
+ }
@@ -0,0 +1,50 @@
1
+ #ifndef SUNSCRAPERWORKER_H
2
+ #define SUNSCRAPERWORKER_H
3
+
4
+ #include <QObject>
5
+ #include <QSemaphore>
6
+ #include <QMap>
7
+ #include <QUrl>
8
+
9
+ class QWebPage;
10
+ class QWebFrame;
11
+ class QTimer;
12
+
13
+ class SunscraperWorker : public QObject
14
+ {
15
+ Q_OBJECT
16
+
17
+ public:
18
+ SunscraperWorker(QObject * parent = 0);
19
+
20
+ static void unlock();
21
+
22
+ signals:
23
+ void finished(unsigned queryId);
24
+ void timedOut(unsigned queryId);
25
+ void htmlFetched(unsigned queryId, QString data);
26
+
27
+ public slots:
28
+ void loadHtml(unsigned queryId, QString html, QUrl baseUrl);
29
+ void loadUrl(unsigned queryId, QUrl url);
30
+ void setTimeout(unsigned queryId, unsigned timeout);
31
+ void fetchHtml(unsigned queryId);
32
+ void finalize(unsigned queryId);
33
+
34
+ private slots:
35
+ void onFrameCreated(QWebFrame *frame);
36
+ void onJavascriptObjectCleared();
37
+ void onFinish(unsigned queryId);
38
+ void onTimeout();
39
+ void onMessage(QString message);
40
+
41
+ private:
42
+ static QSemaphore m_initializationLock;
43
+
44
+ QMap<unsigned, QWebPage *> m_webPages;
45
+ QMap<unsigned, QTimer *> m_timers;
46
+
47
+ QWebPage *initializeWebPage(unsigned queryId);
48
+ };
49
+
50
+ #endif // SUNSCRAPERWORKER_H
data/ext/embed/embed.pro CHANGED
@@ -1,19 +1,21 @@
1
1
  QT += webkit
2
2
 
3
- TARGET = sunscraper
3
+ TARGET = sunscraper
4
4
  TEMPLATE = lib
5
5
 
6
- SOURCES += sunscraperlibrary.cpp \
7
- sunscraperworker.cpp \
8
- sunscraperwebpage.cpp \
9
- sunscraperexternal.cpp \
10
- sunscraper.cpp \
11
- sunscraperproxy.cpp
6
+ SOURCES += sunscraperexternal.cpp \
7
+ sunscraperlibrary.cpp \
8
+ sunscraperinterface.cpp \
9
+ sunscraperthread.cpp
12
10
 
13
11
  HEADERS += sunscraperlibrary.h \
14
- sunscraperworker.h \
15
- sunscraperwebpage.h \
16
- sunscraper.h \
17
- sunscraperproxy.h
12
+ sunscraperinterface.h \
13
+ sunscraperthread.h
18
14
 
19
- linux:LDFLAGS += -pthread
15
+ INCLUDEPATH += ../common
16
+
17
+ unix:{
18
+ LIBS += -L../common -lsunscraper_common
19
+ POST_TARGETDEPS += ../common/libsunscraper_common.a
20
+ LDFLAGS += -pthread
21
+ }
@@ -1,39 +1,40 @@
1
- #include "sunscraper.h"
2
- #include "sunscraperworker.h"
1
+ #include "sunscraperinterface.h"
2
+ #include "sunscraperthread.h"
3
3
 
4
4
  extern "C" {
5
- Sunscraper *sunscraper_create()
5
+ unsigned sunscraper_create()
6
6
  {
7
- return new Sunscraper();
7
+ return SunscraperInterface::instance()->createQuery();
8
8
  }
9
9
 
10
- void sunscraper_load_html(Sunscraper *sunscraper, const char *html)
10
+ void sunscraper_load_html(unsigned queryId, const char *html, const char *url)
11
11
  {
12
- sunscraper->loadHtml(html);
12
+ SunscraperInterface::instance()->loadHtml(queryId, html, QUrl(url));
13
13
  }
14
14
 
15
- void sunscraper_load_url(Sunscraper *sunscraper, const char *url)
15
+ void sunscraper_load_url(unsigned queryId, const char *url)
16
16
  {
17
- sunscraper->loadUrl(url);
17
+ SunscraperInterface::instance()->loadUrl(queryId, QUrl(url));
18
18
  }
19
19
 
20
- void sunscraper_wait(Sunscraper *sunscraper, unsigned timeout)
20
+ int sunscraper_wait(unsigned queryId, unsigned timeout)
21
21
  {
22
- sunscraper->wait(timeout);
22
+ return SunscraperInterface::instance()->wait(queryId, timeout);
23
23
  }
24
24
 
25
- const char *sunscraper_fetch(Sunscraper *sunscraper)
25
+ const char *sunscraper_fetch(unsigned queryId)
26
26
  {
27
- return sunscraper->fetchAsCString();
27
+ /* VERIFY that the string won't be deleted prematurely */
28
+ return SunscraperInterface::instance()->fetch(queryId).constData();
28
29
  }
29
30
 
30
- void sunscraper_discard(Sunscraper *sunscraper)
31
+ void sunscraper_finalize(unsigned queryId)
31
32
  {
32
- delete sunscraper;
33
+ SunscraperInterface::instance()->finalize(queryId);
33
34
  }
34
35
 
35
- void sunscraper_finalize()
36
+ void sunscraper_quit()
36
37
  {
37
- SunscraperWorker::commitSuicide();
38
+ SunscraperThread::commitSuicide();
38
39
  }
39
40
  }
@@ -0,0 +1,206 @@
1
+ #include <QApplication>
2
+ #include <QThread>
3
+ #include <QWebPage>
4
+ #include <QWebFrame>
5
+ #include <QMutexLocker>
6
+ #include <QtDebug>
7
+ #include "sunscraperinterface.h"
8
+ #include "sunscraperlibrary.h"
9
+ #include "sunscraperworker.h"
10
+
11
+ // #define DEBUG_SUNSCRAPERINTERFACE
12
+
13
+ QMutex SunscraperInterface::m_initializationMutex;
14
+ SunscraperInterface *SunscraperInterface::m_instance;
15
+
16
+ SunscraperInterface::SunscraperInterface() :
17
+ m_nextQueryId(0)
18
+ {
19
+ m_worker = new SunscraperWorker();
20
+ m_worker->moveToThread(QApplication::instance()->thread());
21
+
22
+ connect(this, SIGNAL(requestLoadHtml(uint,QString,QUrl)),
23
+ m_worker, SLOT(loadHtml(uint,QString,QUrl)), Qt::QueuedConnection);
24
+ connect(this, SIGNAL(requestLoadUrl(uint,QUrl)),
25
+ m_worker, SLOT(loadUrl(uint,QUrl)), Qt::QueuedConnection);
26
+ connect(this, SIGNAL(requestTimeout(uint,uint)),
27
+ m_worker, SLOT(setTimeout(uint,uint)), Qt::QueuedConnection);
28
+ connect(this, SIGNAL(requestFetch(uint)),
29
+ m_worker, SLOT(fetchHtml(uint)), Qt::QueuedConnection);
30
+ connect(this, SIGNAL(requestFinalize(uint)),
31
+ m_worker, SLOT(finalize(uint)), Qt::QueuedConnection);
32
+
33
+ connect(m_worker, SIGNAL(finished(uint)),
34
+ this, SLOT(onFinish(uint)), Qt::DirectConnection);
35
+ connect(m_worker, SIGNAL(timedOut(uint)),
36
+ this, SLOT(onTimeout(uint)), Qt::DirectConnection);
37
+ connect(m_worker, SIGNAL(htmlFetched(uint,QString)),
38
+ this, SLOT(onFetchDone(uint,QString)), Qt::DirectConnection);
39
+ }
40
+
41
+ SunscraperInterface *SunscraperInterface::instance()
42
+ {
43
+ QMutexLocker locker(&m_initializationMutex);
44
+
45
+ if(m_instance == NULL)
46
+ m_instance = new SunscraperInterface();
47
+
48
+ return m_instance;
49
+ }
50
+
51
+ void SunscraperInterface::initSemaphore(unsigned queryId)
52
+ {
53
+ QMutexLocker locker(&m_semaphoresMutex);
54
+
55
+ Q_ASSERT(m_semaphores[queryId] == NULL);
56
+
57
+ QSemaphore *semaphore = new QSemaphore(0);
58
+ m_semaphores[queryId] = semaphore;
59
+ }
60
+
61
+ void SunscraperInterface::waitOnSemaphore(unsigned queryId)
62
+ {
63
+ m_semaphoresMutex.lock();
64
+
65
+ Q_ASSERT(m_semaphores[queryId] != NULL);
66
+
67
+ QSemaphore *semaphore = m_semaphores[queryId];
68
+
69
+ m_semaphoresMutex.unlock();
70
+
71
+ semaphore->acquire(1);
72
+
73
+ m_semaphoresMutex.lock();
74
+
75
+ delete semaphore;
76
+ m_semaphores.remove(queryId);
77
+
78
+ m_semaphoresMutex.unlock();
79
+ }
80
+
81
+ void SunscraperInterface::signalSemaphore(unsigned queryId)
82
+ {
83
+ QMutexLocker locker(&m_semaphoresMutex);
84
+
85
+ Q_ASSERT(m_semaphores[queryId] != NULL);
86
+
87
+ m_semaphores[queryId]->release(1);
88
+ }
89
+
90
+ unsigned SunscraperInterface::createQuery()
91
+ {
92
+ QMutexLocker locker(&m_queryIdMutex);
93
+
94
+ m_nextQueryId += 1;
95
+
96
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
97
+ qDebug() << "createQuery" << m_nextQueryId;
98
+ #endif
99
+
100
+ return m_nextQueryId;
101
+ }
102
+
103
+ void SunscraperInterface::loadHtml(unsigned queryId, QString html, QUrl baseUrl)
104
+ {
105
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
106
+ qDebug() << "loadHtml" << queryId << html << baseUrl;
107
+ #endif
108
+
109
+ emit requestLoadHtml(queryId, html, baseUrl);
110
+ }
111
+
112
+ void SunscraperInterface::loadUrl(unsigned queryId, QUrl url)
113
+ {
114
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
115
+ qDebug() << "loadUrl" << queryId << url;
116
+ #endif
117
+
118
+ emit requestLoadUrl(queryId, url);
119
+ }
120
+
121
+ bool SunscraperInterface::wait(unsigned queryId, unsigned timeout)
122
+ {
123
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
124
+ qDebug() << "wait" << queryId << timeout;
125
+ #endif
126
+
127
+ initSemaphore(queryId);
128
+ emit requestTimeout(queryId, timeout);
129
+ waitOnSemaphore(queryId);
130
+
131
+ /* There was either a finish or timeout */
132
+
133
+ {
134
+ QMutexLocker locker(&m_resultsMutex);
135
+
136
+ bool success = m_results[queryId];
137
+ m_results.remove(queryId);
138
+
139
+ return success;
140
+ }
141
+ }
142
+
143
+ void SunscraperInterface::onFinish(unsigned queryId)
144
+ {
145
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
146
+ qDebug() << "onFinish" << queryId;
147
+ #endif
148
+
149
+ QMutexLocker locker(&m_resultsMutex);
150
+ m_results[queryId] = true;
151
+
152
+ signalSemaphore(queryId);
153
+ }
154
+
155
+ void SunscraperInterface::onTimeout(unsigned queryId)
156
+ {
157
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
158
+ qDebug() << "onTimeout" << queryId;
159
+ #endif
160
+
161
+ QMutexLocker locker(&m_resultsMutex);
162
+ m_results[queryId] = false;
163
+
164
+ signalSemaphore(queryId);
165
+ }
166
+
167
+ void SunscraperInterface::onFetchDone(unsigned queryId, QString html)
168
+ {
169
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
170
+ qDebug() << "onFetchDone" << queryId;
171
+ #endif
172
+
173
+ QMutexLocker locker(&m_resultsMutex);
174
+ m_htmlCache[queryId] = html.toLocal8Bit();
175
+
176
+ signalSemaphore(queryId);
177
+ }
178
+
179
+ QByteArray SunscraperInterface::fetch(unsigned queryId)
180
+ {
181
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
182
+ qDebug() << "fetch" << queryId;
183
+ #endif
184
+
185
+ initSemaphore(queryId);
186
+ emit requestFetch(queryId);
187
+ waitOnSemaphore(queryId);
188
+
189
+ {
190
+ QMutexLocker locker(&m_resultsMutex);
191
+ return m_htmlCache[queryId];
192
+ }
193
+ }
194
+
195
+ void SunscraperInterface::finalize(unsigned queryId)
196
+ {
197
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
198
+ qDebug() << "finalize" << queryId;
199
+ #endif
200
+
201
+ emit requestFinalize(queryId);
202
+
203
+ QMutexLocker locker(&m_resultsMutex);
204
+ m_results.remove(queryId);
205
+ m_htmlCache.remove(queryId);
206
+ }