sunscraper 1.1.0.beta3 → 1.2.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. data/ext/common/common.pro +13 -0
  2. data/ext/common/libsunscraper_common.a +0 -0
  3. data/ext/common/sunscraperproxy.cpp +11 -0
  4. data/ext/{standalone → common}/sunscraperproxy.h +5 -5
  5. data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
  6. data/ext/{embed → common}/sunscraperwebpage.h +0 -0
  7. data/ext/common/sunscraperworker.cpp +124 -0
  8. data/ext/common/sunscraperworker.h +50 -0
  9. data/ext/embed/embed.pro +14 -12
  10. data/ext/embed/sunscraperexternal.cpp +17 -16
  11. data/ext/embed/sunscraperinterface.cpp +206 -0
  12. data/ext/embed/sunscraperinterface.h +66 -0
  13. data/ext/embed/sunscraperlibrary.cpp +2 -12
  14. data/ext/embed/sunscraperlibrary.h +0 -1
  15. data/ext/embed/sunscraperthread.cpp +49 -0
  16. data/ext/embed/sunscraperthread.h +24 -0
  17. data/ext/extconf.rb +5 -3
  18. data/ext/standalone/standalone.pro +12 -6
  19. data/ext/standalone/sunscrapermain.cpp +13 -3
  20. data/ext/standalone/sunscraperrpc.cpp +76 -88
  21. data/ext/standalone/sunscraperrpc.h +19 -22
  22. data/ext/standalone/sunscraperrpcserver.cpp +26 -0
  23. data/ext/standalone/sunscraperrpcserver.h +24 -0
  24. data/ext/sunscraper-ext.pro +1 -1
  25. data/lib/sunscraper.rb +14 -14
  26. data/lib/sunscraper/library.rb +9 -9
  27. data/lib/sunscraper/standalone.rb +53 -107
  28. data/spec/sunscraper_spec.rb +86 -44
  29. data/sunscraper.gemspec +1 -1
  30. metadata +19 -17
  31. data/ext/embed/sunscraper.cpp +0 -92
  32. data/ext/embed/sunscraper.h +0 -47
  33. data/ext/embed/sunscraperproxy.cpp +0 -14
  34. data/ext/embed/sunscraperproxy.h +0 -24
  35. data/ext/embed/sunscraperworker.cpp +0 -163
  36. data/ext/embed/sunscraperworker.h +0 -58
  37. data/ext/standalone/sunscraperproxy.cpp +0 -14
  38. data/ext/standalone/sunscraperworker.cpp +0 -60
  39. data/ext/standalone/sunscraperworker.h +0 -34
@@ -0,0 +1,13 @@
1
+ TEMPLATE = lib
2
+ TARGET = sunscraper_common
3
+
4
+ QT += webkit network
5
+ CONFIG += staticlib
6
+
7
+ SOURCES += sunscraperproxy.cpp \
8
+ sunscraperwebpage.cpp \
9
+ sunscraperworker.cpp
10
+
11
+ HEADERS += sunscraperproxy.h \
12
+ sunscraperwebpage.h \
13
+ sunscraperworker.h
Binary file
@@ -0,0 +1,11 @@
1
+ #include "sunscraperproxy.h"
2
+
3
+ SunscraperProxy::SunscraperProxy(QObject *parent, unsigned queryId) : // Builds a proxy tied to one query: parent is forwarded to QObject, queryId is kept for finish().
4
+ QObject(parent), m_queryId(queryId)
5
+ {
6
+ }
7
+
8
+ void SunscraperProxy::finish() // Emits finished() carrying the query id this proxy was constructed with.
9
+ {
10
+ emit finished(m_queryId);
11
+ }
@@ -3,22 +3,22 @@
3
3
 
4
4
  #include <QObject>
5
5
 
6
- class QWebPage;
7
-
8
6
  class SunscraperProxy : public QObject // QObject with one invokable method, finish(), which is reported through the finished() signal.
9
7
  {
10
8
  Q_OBJECT
9
+
11
10
  public:
12
- SunscraperProxy(QWebPage *parent, unsigned queryId);
11
+ SunscraperProxy(QObject *parent, unsigned queryId); // parent is passed on to QObject; queryId is the value finished() will carry
13
12
 
14
13
  Q_INVOKABLE void finish();
15
14
 
16
15
  signals:
17
- void finished(unsigned queryId, QString html);
16
+ void finished(unsigned queryId); // emitted by finish(); no longer carries the page HTML
18
17
 
19
18
  private:
20
- QWebPage *m_webPage;
21
19
  unsigned m_queryId;
20
+
21
+ SunscraperProxy(); // private, and no definition appears in this diff — presumably to forbid default construction (confirm)
22
22
  };
23
23
 
24
24
  #endif // SUNSCRAPERPROXY_H
File without changes
File without changes
@@ -0,0 +1,124 @@
1
+ #include <QWebPage>
2
+ #include <QWebFrame>
3
+ #include <QTimer>
4
+ #include <QtDebug>
5
+ #include "sunscraperworker.h"
6
+ #include "sunscraperwebpage.h"
7
+ #include "sunscraperproxy.h"
8
+
9
+ QSemaphore SunscraperWorker::m_initializationLock(0); // starts with zero permits, so the SunscraperWorker constructor blocks until unlock() releases one
10
+
11
+ SunscraperWorker::SunscraperWorker(QObject *parent) : // Constructs the worker, waiting on the static initialization semaphore first.
12
+ QObject(parent)
13
+ {
14
+ m_initializationLock.acquire(1); // blocks until a permit exists, i.e. until unlock() has been called
15
+ m_initializationLock.release(1); // return the permit immediately so later constructions are not blocked
16
+ }
17
+
18
+ void SunscraperWorker::unlock() // Static: releases one permit on the initialization semaphore, letting a constructor blocked in acquire() proceed.
19
+ {
20
+ m_initializationLock.release(1);
21
+ }
22
+
23
+ void SunscraperWorker::loadHtml(unsigned queryId, QString html, QUrl baseUrl) // Slot: creates and registers a page for queryId, then sets html on its main frame with baseUrl as base URL.
24
+ {
25
+ QWebPage *webPage = initializeWebPage(queryId);
26
+ webPage->mainFrame()->setHtml(html, baseUrl);
27
+ }
28
+
29
+ void SunscraperWorker::loadUrl(unsigned queryId, QUrl url) // Slot: creates and registers a page for queryId, then starts loading url in its main frame.
30
+ {
31
+ QWebPage *webPage = initializeWebPage(queryId);
32
+ webPage->mainFrame()->load(url);
33
+ }
34
+
35
+ void SunscraperWorker::setTimeout(unsigned queryId, unsigned timeout) // Slot: arms a single-shot timer for queryId; timeout is handed to QTimer::setInterval (milliseconds) and expiry is handled by onTimeout().
36
+ {
37
+ Q_ASSERT(m_timers[queryId] == NULL); // NOTE(review): QMap::operator[] inserts a null entry for a missing key; this only runs when assertions are enabled
38
+
39
+ QTimer *timer = new QTimer(); // no parent: disposed of through deleteLater() in finalize() or onTimeout()
40
+ timer->setInterval(timeout);
41
+ timer->setSingleShot(true);
42
+
43
+ connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
44
+
45
+ timer->start(); // NOTE(review): onFinish() does not stop this timer; it keeps running until it fires or finalize() removes it
46
+ m_timers[queryId] = timer; // NOTE(review): a timer already stored for this id would be overwritten without being stopped or deleted
47
+ }
48
+
49
+ void SunscraperWorker::finalize(unsigned queryId) // Slot: schedules deletion of the page and the timer registered for queryId, if present, and forgets both.
50
+ {
51
+ if(m_webPages.contains(queryId)) {
52
+ m_webPages[queryId]->deleteLater(); // deferred deletion rather than an immediate delete
53
+ m_webPages.remove(queryId);
54
+ }
55
+
56
+ if(m_timers.contains(queryId)) { // absent when the timer already fired: onTimeout() removes it
57
+ m_timers[queryId]->deleteLater();
58
+ m_timers.remove(queryId);
59
+ }
60
+ }
61
+
62
+ QWebPage *SunscraperWorker::initializeWebPage(unsigned queryId) // Creates a SunscraperWebPage with local storage enabled, hooks its signals to this worker, stores it under queryId and returns it.
63
+ {
64
+ Q_ASSERT(m_webPages[queryId] == NULL); // NOTE(review): operator[] inserts a null entry for a missing key; only evaluated when assertions are enabled
65
+
66
+ SunscraperWebPage *webPage = new SunscraperWebPage(); // no parent: released via deleteLater() in finalize()
67
+ webPage->settings()->setAttribute(QWebSettings::LocalStorageEnabled, true);
68
+
69
+ connect(webPage, SIGNAL(frameCreated(QWebFrame*)), this, SLOT(onFrameCreated(QWebFrame*)));
70
+ connect(webPage, SIGNAL(consoleMessage(QString)), this, SLOT(onMessage(QString)));
71
+
72
+ m_webPages[queryId] = webPage; // NOTE(review): a page already stored for this id would be replaced without being deleted
73
+
74
+ return webPage;
75
+ }
76
+
77
+ void SunscraperWorker::onFrameCreated(QWebFrame *frame) // Slot: subscribes to the new frame's javaScriptWindowObjectCleared() so onJavascriptObjectCleared() runs each time it fires.
78
+ {
79
+ connect(frame, SIGNAL(javaScriptWindowObjectCleared()),
80
+ this, SLOT(onJavascriptObjectCleared()));
81
+ }
82
+
83
+ void SunscraperWorker::onJavascriptObjectCleared() // Slot: adds a new SunscraperProxy to the sending frame's JavaScript window object under the name "Sunscraper".
84
+ {
85
+ QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender()); // unchecked cast: relies on this slot only being connected to QWebFrame signals
86
+ QWebPage *page = origin->page();
87
+
88
+ unsigned queryId = m_webPages.key(page, 0); // reverse lookup page -> query id; 0 is the "not found" default
89
+ Q_ASSERT(queryId != 0);
90
+
91
+ SunscraperProxy *proxy = new SunscraperProxy(page, queryId); // page becomes the QObject parent; a new proxy is allocated on every call
92
+ connect(proxy, SIGNAL(finished(uint)), this, SLOT(onFinish(uint)));
93
+
94
+ origin->addToJavaScriptWindowObject("Sunscraper", proxy, QScriptEngine::QtOwnership); // QtOwnership: the script engine does not take ownership of proxy
95
+ }
96
+
97
+ void SunscraperWorker::onFinish(unsigned queryId) // Slot: re-emits a proxy's finished(queryId) as this worker's finished(queryId).
98
+ {
99
+ Q_ASSERT(m_webPages[queryId] != NULL); // NOTE(review): operator[] inserts a null entry for a missing key; only evaluated when assertions are enabled
100
+
101
+ emit finished(queryId);
102
+ }
103
+
104
+ void SunscraperWorker::onTimeout() // Slot: maps the expired timer back to its query id, disposes of the timer and emits timedOut(queryId).
105
+ {
106
+ QTimer *origin = static_cast<QTimer *>(QObject::sender()); // unchecked cast of the signal sender
107
+
108
+ unsigned queryId = m_timers.key(origin, 0); // reverse lookup timer -> query id; 0 is the "not found" default
109
+ Q_ASSERT(queryId != 0); // NOTE(review): with assertions compiled out, an unknown timer reaches m_timers[0] below, which inserts a null entry and dereferences it
110
+
111
+ m_timers[queryId]->deleteLater();
112
+ m_timers.remove(queryId);
113
+
114
+ emit timedOut(queryId);
115
+ }
116
+
117
+ void SunscraperWorker::onMessage(QString message) // Slot: writes a page's console message to qDebug() with a "Sunscraper Console:" prefix.
118
+ {
119
+ qDebug() << "Sunscraper Console:" << message;
120
+ }
121
+
122
+ void SunscraperWorker::fetchHtml(unsigned queryId) { // Slot: emits htmlFetched() with the HTML of the main frame of queryId's page.
123
+ emit htmlFetched(queryId, m_webPages[queryId]->mainFrame()->toHtml()); // NOTE(review): for an unknown queryId operator[] yields a null page, which is dereferenced here
124
+ }
@@ -0,0 +1,50 @@
1
+ #ifndef SUNSCRAPERWORKER_H
2
+ #define SUNSCRAPERWORKER_H
3
+
4
+ #include <QObject>
5
+ #include <QSemaphore>
6
+ #include <QMap>
7
+ #include <QUrl>
8
+
9
+ class QWebPage;
10
+ class QWebFrame;
11
+ class QTimer;
12
+
13
+ class SunscraperWorker : public QObject // Owns one QWebPage and at most one timeout QTimer per query id and reports progress through signals.
14
+ {
15
+ Q_OBJECT
16
+
17
+ public:
18
+ SunscraperWorker(QObject * parent = 0); // blocks until unlock() has been called at least once
19
+
20
+ static void unlock(); // releases the static initialization semaphore
21
+
22
+ signals:
23
+ void finished(unsigned queryId); // forwarded from a SunscraperProxy's finished()
24
+ void timedOut(unsigned queryId); // the timer armed by setTimeout() expired
25
+ void htmlFetched(unsigned queryId, QString data); // answer to fetchHtml(); data is the main frame's HTML
26
+
27
+ public slots:
28
+ void loadHtml(unsigned queryId, QString html, QUrl baseUrl);
29
+ void loadUrl(unsigned queryId, QUrl url);
30
+ void setTimeout(unsigned queryId, unsigned timeout);
31
+ void fetchHtml(unsigned queryId);
32
+ void finalize(unsigned queryId); // drops the page and timer kept for queryId
33
+
34
+ private slots:
35
+ void onFrameCreated(QWebFrame *frame);
36
+ void onJavascriptObjectCleared();
37
+ void onFinish(unsigned queryId);
38
+ void onTimeout();
39
+ void onMessage(QString message);
40
+
41
+ private:
42
+ static QSemaphore m_initializationLock; // defined with zero permits in the .cpp
43
+
44
+ QMap<unsigned, QWebPage *> m_webPages; // query id -> page; also searched in reverse with key()
45
+ QMap<unsigned, QTimer *> m_timers; // query id -> pending timeout timer
46
+
47
+ QWebPage *initializeWebPage(unsigned queryId);
48
+ };
49
+
50
+ #endif // SUNSCRAPERWORKER_H
data/ext/embed/embed.pro CHANGED
@@ -1,19 +1,21 @@
1
1
  QT += webkit
2
2
 
3
- TARGET = sunscraper
3
+ TARGET = sunscraper
4
4
  TEMPLATE = lib
5
5
 
6
- SOURCES += sunscraperlibrary.cpp \
7
- sunscraperworker.cpp \
8
- sunscraperwebpage.cpp \
9
- sunscraperexternal.cpp \
10
- sunscraper.cpp \
11
- sunscraperproxy.cpp
6
+ SOURCES += sunscraperexternal.cpp \
7
+ sunscraperlibrary.cpp \
8
+ sunscraperinterface.cpp \
9
+ sunscraperthread.cpp
12
10
 
13
11
  HEADERS += sunscraperlibrary.h \
14
- sunscraperworker.h \
15
- sunscraperwebpage.h \
16
- sunscraper.h \
17
- sunscraperproxy.h
12
+ sunscraperinterface.h \
13
+ sunscraperthread.h
18
14
 
19
- linux:LDFLAGS += -pthread
15
+ INCLUDEPATH += ../common
16
+
17
+ unix:{
18
+ LIBS += -L../common -lsunscraper_common
19
+ POST_TARGETDEPS += ../common/libsunscraper_common.a
20
+ LDFLAGS += -pthread
21
+ }
@@ -1,39 +1,40 @@
1
- #include "sunscraper.h"
2
- #include "sunscraperworker.h"
1
+ #include "sunscraperinterface.h"
2
+ #include "sunscraperthread.h"
3
3
 
4
4
  extern "C" { // C-linkage entry points; after this change a query is named by an unsigned id instead of a Sunscraper pointer
5
- Sunscraper *sunscraper_create()
5
+ unsigned sunscraper_create() // returns a new query id from SunscraperInterface::createQuery()
6
6
  {
7
- return new Sunscraper();
7
+ return SunscraperInterface::instance()->createQuery();
8
8
  }
9
9
 
10
- void sunscraper_load_html(Sunscraper *sunscraper, const char *html)
10
+ void sunscraper_load_html(unsigned queryId, const char *html, const char *url) // url is wrapped in a QUrl and passed as the base URL
11
11
  {
12
- sunscraper->loadHtml(html);
12
+ SunscraperInterface::instance()->loadHtml(queryId, html, QUrl(url));
13
13
  }
14
14
 
15
- void sunscraper_load_url(Sunscraper *sunscraper, const char *url)
15
+ void sunscraper_load_url(unsigned queryId, const char *url)
16
16
  {
17
- sunscraper->loadUrl(url);
17
+ SunscraperInterface::instance()->loadUrl(queryId, QUrl(url));
18
18
  }
19
19
 
20
- void sunscraper_wait(Sunscraper *sunscraper, unsigned timeout)
20
+ int sunscraper_wait(unsigned queryId, unsigned timeout) // bool from wait() converted to int: nonzero when the query finished, 0 when it timed out
21
21
  {
22
- sunscraper->wait(timeout);
22
+ return SunscraperInterface::instance()->wait(queryId, timeout);
23
23
  }
24
24
 
25
- const char *sunscraper_fetch(Sunscraper *sunscraper)
25
+ const char *sunscraper_fetch(unsigned queryId)
26
26
  {
27
- return sunscraper->fetchAsCString();
27
+ /* VERIFY that the string won't be deleted prematurely */
28
+ return SunscraperInterface::instance()->fetch(queryId).constData(); // NOTE(review): pointer comes from the temporary QByteArray returned by fetch(); it presumably stays valid only while m_htmlCache still shares that data, i.e. until finalize — confirm
28
29
  }
29
30
 
30
- void sunscraper_discard(Sunscraper *sunscraper)
31
+ void sunscraper_finalize(unsigned queryId) // name reused: it now releases one query (previously sunscraper_discard)
31
32
  {
32
- delete sunscraper;
33
+ SunscraperInterface::instance()->finalize(queryId);
33
34
  }
34
35
 
35
- void sunscraper_finalize()
36
+ void sunscraper_quit() // replaces the old argument-less sunscraper_finalize()
36
37
  {
37
- SunscraperWorker::commitSuicide();
38
+ SunscraperThread::commitSuicide();
38
39
  }
39
40
  }
@@ -0,0 +1,206 @@
1
+ #include <QApplication>
2
+ #include <QThread>
3
+ #include <QWebPage>
4
+ #include <QWebFrame>
5
+ #include <QMutexLocker>
6
+ #include <QtDebug>
7
+ #include "sunscraperinterface.h"
8
+ #include "sunscraperlibrary.h"
9
+ #include "sunscraperworker.h"
10
+
11
+ // #define DEBUG_SUNSCRAPERINTERFACE
12
+
13
+ QMutex SunscraperInterface::m_initializationMutex; // taken by instance() around the lazy creation of m_instance
14
+ SunscraperInterface *SunscraperInterface::m_instance; // no explicit initializer: static storage, so it starts out null
15
+
16
+ SunscraperInterface::SunscraperInterface() : // Creates the worker, moves it to the QApplication's thread and wires request signals to it and its result signals back to this object.
17
+ m_nextQueryId(0)
18
+ {
19
+ m_worker = new SunscraperWorker(); // NOTE(review): this constructor blocks until SunscraperWorker::unlock() has been called
20
+ m_worker->moveToThread(QApplication::instance()->thread()); // assumes a QApplication already exists; instance() would be null otherwise
21
+
22
+ connect(this, SIGNAL(requestLoadHtml(uint,QString,QUrl)), // requests to the worker are queued: the slot runs later from the worker thread's event loop
23
+ m_worker, SLOT(loadHtml(uint,QString,QUrl)), Qt::QueuedConnection);
24
+ connect(this, SIGNAL(requestLoadUrl(uint,QUrl)),
25
+ m_worker, SLOT(loadUrl(uint,QUrl)), Qt::QueuedConnection);
26
+ connect(this, SIGNAL(requestTimeout(uint,uint)),
27
+ m_worker, SLOT(setTimeout(uint,uint)), Qt::QueuedConnection);
28
+ connect(this, SIGNAL(requestFetch(uint)),
29
+ m_worker, SLOT(fetchHtml(uint)), Qt::QueuedConnection);
30
+ connect(this, SIGNAL(requestFinalize(uint)),
31
+ m_worker, SLOT(finalize(uint)), Qt::QueuedConnection);
32
+
33
+ connect(m_worker, SIGNAL(finished(uint)), // results are delivered directly: these slots execute in whichever thread emits the worker signal
34
+ this, SLOT(onFinish(uint)), Qt::DirectConnection);
35
+ connect(m_worker, SIGNAL(timedOut(uint)),
36
+ this, SLOT(onTimeout(uint)), Qt::DirectConnection);
37
+ connect(m_worker, SIGNAL(htmlFetched(uint,QString)),
38
+ this, SLOT(onFetchDone(uint,QString)), Qt::DirectConnection);
39
+ }
40
+
41
+ SunscraperInterface *SunscraperInterface::instance() // Returns the shared instance, creating it on the first call.
42
+ {
43
+ QMutexLocker locker(&m_initializationMutex); // held for the whole call, so creation cannot happen twice
44
+
45
+ if(m_instance == NULL)
46
+ m_instance = new SunscraperInterface();
47
+
48
+ return m_instance;
49
+ }
50
+
51
+ void SunscraperInterface::initSemaphore(unsigned queryId) // Registers a new semaphore with zero permits for queryId, under m_semaphoresMutex.
52
+ {
53
+ QMutexLocker locker(&m_semaphoresMutex);
54
+
55
+ Q_ASSERT(m_semaphores[queryId] == NULL); // NOTE(review): operator[] inserts a null entry for a missing key; only evaluated when assertions are enabled
56
+
57
+ QSemaphore *semaphore = new QSemaphore(0); // deleted by waitOnSemaphore() once the wait completes
58
+ m_semaphores[queryId] = semaphore;
59
+ }
60
+
61
+ void SunscraperInterface::waitOnSemaphore(unsigned queryId) // Blocks until queryId's semaphore has been released once, then deletes and unregisters it.
62
+ {
63
+ m_semaphoresMutex.lock();
64
+
65
+ Q_ASSERT(m_semaphores[queryId] != NULL);
66
+
67
+ QSemaphore *semaphore = m_semaphores[queryId];
68
+
69
+ m_semaphoresMutex.unlock(); // the mutex is not held while blocking, so signalSemaphore() can acquire it
70
+
71
+ semaphore->acquire(1);
72
+
73
+ m_semaphoresMutex.lock();
74
+
75
+ delete semaphore; // NOTE(review): a second signalSemaphore() for this id after this point would find no entry
76
+ m_semaphores.remove(queryId);
77
+
78
+ m_semaphoresMutex.unlock();
79
+ }
80
+
81
+ void SunscraperInterface::signalSemaphore(unsigned queryId) // Releases queryId's semaphore once, which lets waitOnSemaphore() return.
82
+ {
83
+ QMutexLocker locker(&m_semaphoresMutex);
84
+
85
+ Q_ASSERT(m_semaphores[queryId] != NULL);
86
+
87
+ m_semaphores[queryId]->release(1); // NOTE(review): when no semaphore is registered for this id, operator[] yields null and this dereferences it
88
+ }
89
+
90
+ unsigned SunscraperInterface::createQuery() // Returns a new query id; the counter is incremented under m_queryIdMutex.
91
+ {
92
+ QMutexLocker locker(&m_queryIdMutex);
93
+
94
+ m_nextQueryId += 1; // counter starts at 0 and is bumped before use, so the first id handed out is 1
95
+
96
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
97
+ qDebug() << "createQuery" << m_nextQueryId;
98
+ #endif
99
+
100
+ return m_nextQueryId;
101
+ }
102
+
103
+ void SunscraperInterface::loadHtml(unsigned queryId, QString html, QUrl baseUrl) // Emits requestLoadHtml(), which is connected (queued) to the worker's loadHtml(); nothing is waited for here.
104
+ {
105
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
106
+ qDebug() << "loadHtml" << queryId << html << baseUrl;
107
+ #endif
108
+
109
+ emit requestLoadHtml(queryId, html, baseUrl);
110
+ }
111
+
112
+ void SunscraperInterface::loadUrl(unsigned queryId, QUrl url) // Emits requestLoadUrl(), which is connected (queued) to the worker's loadUrl(); nothing is waited for here.
113
+ {
114
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
115
+ qDebug() << "loadUrl" << queryId << url;
116
+ #endif
117
+
118
+ emit requestLoadUrl(queryId, url);
119
+ }
120
+
121
+ bool SunscraperInterface::wait(unsigned queryId, unsigned timeout) // Blocks until the worker reports finish or timeout for queryId; returns the value recorded by onFinish() (true) or onTimeout() (false).
122
+ {
123
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
124
+ qDebug() << "wait" << queryId << timeout;
125
+ #endif
126
+
127
+ initSemaphore(queryId); // register the semaphore before asking the worker to arm the timer
128
+ emit requestTimeout(queryId, timeout);
129
+ waitOnSemaphore(queryId);
130
+
131
+ /* There was either a finish or timeout */
132
+
133
+ {
134
+ QMutexLocker locker(&m_resultsMutex);
135
+
136
+ bool success = m_results[queryId];
137
+ m_results.remove(queryId); // the result is consumed: it is read once and then dropped
138
+
139
+ return success;
140
+ }
141
+ }
142
+
143
+ void SunscraperInterface::onFinish(unsigned queryId) // Slot for the worker's finished(): records success for queryId and signals its semaphore.
144
+ {
145
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
146
+ qDebug() << "onFinish" << queryId;
147
+ #endif
148
+
149
+ QMutexLocker locker(&m_resultsMutex);
150
+ m_results[queryId] = true;
151
+
152
+ signalSemaphore(queryId); // m_resultsMutex is still held here while signalSemaphore() takes m_semaphoresMutex
153
+ }
154
+
155
+ void SunscraperInterface::onTimeout(unsigned queryId) // Slot for the worker's timedOut(): records failure for queryId and signals its semaphore.
156
+ {
157
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
158
+ qDebug() << "onTimeout" << queryId;
159
+ #endif
160
+
161
+ QMutexLocker locker(&m_resultsMutex);
162
+ m_results[queryId] = false;
163
+
164
+ signalSemaphore(queryId); // m_resultsMutex is still held here while signalSemaphore() takes m_semaphoresMutex
165
+ }
166
+
167
+ void SunscraperInterface::onFetchDone(unsigned queryId, QString html) // Slot for the worker's htmlFetched(): caches the HTML for queryId and signals its semaphore.
168
+ {
169
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
170
+ qDebug() << "onFetchDone" << queryId;
171
+ #endif
172
+
173
+ QMutexLocker locker(&m_resultsMutex);
174
+ m_htmlCache[queryId] = html.toLocal8Bit(); // stored in the local 8-bit encoding, which is locale-dependent
175
+
176
+ signalSemaphore(queryId);
177
+ }
178
+
179
+ QByteArray SunscraperInterface::fetch(unsigned queryId) // Asks the worker for queryId's HTML, blocks until onFetchDone() has cached it, and returns the cached bytes.
180
+ {
181
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
182
+ qDebug() << "fetch" << queryId;
183
+ #endif
184
+
185
+ initSemaphore(queryId); // register the semaphore before sending the request
186
+ emit requestFetch(queryId);
187
+ waitOnSemaphore(queryId);
188
+
189
+ {
190
+ QMutexLocker locker(&m_resultsMutex);
191
+ return m_htmlCache[queryId]; // the entry is left in the cache; finalize() is what removes it
192
+ }
193
+ }
194
+
195
+ void SunscraperInterface::finalize(unsigned queryId) // Asks the worker to finalize queryId and drops the locally cached result and HTML for it.
196
+ {
197
+ #ifdef DEBUG_SUNSCRAPERINTERFACE
198
+ qDebug() << "finalize" << queryId;
199
+ #endif
200
+
201
+ emit requestFinalize(queryId); // queued connection: the worker-side cleanup happens asynchronously
202
+
203
+ QMutexLocker locker(&m_resultsMutex);
204
+ m_results.remove(queryId);
205
+ m_htmlCache.remove(queryId);
206
+ }