sunscraper 1.1.0.beta3 → 1.2.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. data/ext/common/common.pro +13 -0
  2. data/ext/common/libsunscraper_common.a +0 -0
  3. data/ext/common/sunscraperproxy.cpp +11 -0
  4. data/ext/{standalone → common}/sunscraperproxy.h +5 -5
  5. data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
  6. data/ext/{embed → common}/sunscraperwebpage.h +0 -0
  7. data/ext/common/sunscraperworker.cpp +124 -0
  8. data/ext/common/sunscraperworker.h +50 -0
  9. data/ext/embed/embed.pro +14 -12
  10. data/ext/embed/sunscraperexternal.cpp +17 -16
  11. data/ext/embed/sunscraperinterface.cpp +206 -0
  12. data/ext/embed/sunscraperinterface.h +66 -0
  13. data/ext/embed/sunscraperlibrary.cpp +2 -12
  14. data/ext/embed/sunscraperlibrary.h +0 -1
  15. data/ext/embed/sunscraperthread.cpp +49 -0
  16. data/ext/embed/sunscraperthread.h +24 -0
  17. data/ext/extconf.rb +5 -3
  18. data/ext/standalone/standalone.pro +12 -6
  19. data/ext/standalone/sunscrapermain.cpp +13 -3
  20. data/ext/standalone/sunscraperrpc.cpp +76 -88
  21. data/ext/standalone/sunscraperrpc.h +19 -22
  22. data/ext/standalone/sunscraperrpcserver.cpp +26 -0
  23. data/ext/standalone/sunscraperrpcserver.h +24 -0
  24. data/ext/sunscraper-ext.pro +1 -1
  25. data/lib/sunscraper.rb +14 -14
  26. data/lib/sunscraper/library.rb +9 -9
  27. data/lib/sunscraper/standalone.rb +53 -107
  28. data/spec/sunscraper_spec.rb +86 -44
  29. data/sunscraper.gemspec +1 -1
  30. metadata +19 -17
  31. data/ext/embed/sunscraper.cpp +0 -92
  32. data/ext/embed/sunscraper.h +0 -47
  33. data/ext/embed/sunscraperproxy.cpp +0 -14
  34. data/ext/embed/sunscraperproxy.h +0 -24
  35. data/ext/embed/sunscraperworker.cpp +0 -163
  36. data/ext/embed/sunscraperworker.h +0 -58
  37. data/ext/standalone/sunscraperproxy.cpp +0 -14
  38. data/ext/standalone/sunscraperworker.cpp +0 -60
  39. data/ext/standalone/sunscraperworker.h +0 -34
data/sunscraper.gemspec CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "sunscraper"
6
- s.version = "1.1.0.beta3"
6
+ s.version = "1.2.0.beta1"
7
7
  s.authors = ["Peter Zotov"]
8
8
  s.email = ["whitequark@whitequark.org"]
9
9
  s.homepage = "http://github.com/whitequark/sunscraper"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sunscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0.beta3
4
+ version: 1.2.0.beta1
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &81880600 !ruby/object:Gem::Requirement
16
+ requirement: &74877930 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *81880600
24
+ version_requirements: *74877930
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: ffi
27
- requirement: &81880200 !ruby/object:Gem::Requirement
27
+ requirement: &74877500 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,7 +32,7 @@ dependencies:
32
32
  version: 1.0.11
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *81880200
35
+ version_requirements: *74877500
36
36
  description: A WebKit-based, JavaScript-capable HTML scraper.
37
37
  email:
38
38
  - whitequark@whitequark.org
@@ -50,27 +50,29 @@ files:
50
50
  - README.md
51
51
  - Rakefile
52
52
  - ext/.gitignore
53
+ - ext/common/common.pro
54
+ - ext/common/libsunscraper_common.a
55
+ - ext/common/sunscraperproxy.cpp
56
+ - ext/common/sunscraperproxy.h
57
+ - ext/common/sunscraperwebpage.cpp
58
+ - ext/common/sunscraperwebpage.h
59
+ - ext/common/sunscraperworker.cpp
60
+ - ext/common/sunscraperworker.h
53
61
  - ext/embed/embed.pro
54
- - ext/embed/sunscraper.cpp
55
- - ext/embed/sunscraper.h
56
62
  - ext/embed/sunscraperexternal.cpp
63
+ - ext/embed/sunscraperinterface.cpp
64
+ - ext/embed/sunscraperinterface.h
57
65
  - ext/embed/sunscraperlibrary.cpp
58
66
  - ext/embed/sunscraperlibrary.h
59
- - ext/embed/sunscraperproxy.cpp
60
- - ext/embed/sunscraperproxy.h
61
- - ext/embed/sunscraperwebpage.cpp
62
- - ext/embed/sunscraperwebpage.h
63
- - ext/embed/sunscraperworker.cpp
64
- - ext/embed/sunscraperworker.h
67
+ - ext/embed/sunscraperthread.cpp
68
+ - ext/embed/sunscraperthread.h
65
69
  - ext/extconf.rb
66
70
  - ext/standalone/standalone.pro
67
71
  - ext/standalone/sunscrapermain.cpp
68
- - ext/standalone/sunscraperproxy.cpp
69
- - ext/standalone/sunscraperproxy.h
70
72
  - ext/standalone/sunscraperrpc.cpp
71
73
  - ext/standalone/sunscraperrpc.h
72
- - ext/standalone/sunscraperworker.cpp
73
- - ext/standalone/sunscraperworker.h
74
+ - ext/standalone/sunscraperrpcserver.cpp
75
+ - ext/standalone/sunscraperrpcserver.h
74
76
  - ext/sunscraper-ext.pro
75
77
  - lib/sunscraper.rb
76
78
  - lib/sunscraper/library.rb
data/ext/embed/sunscraper.cpp DELETED
@@ -1,92 +0,0 @@
1
- #include <QApplication>
2
- #include <QThread>
3
- #include <QTimer>
4
- #include <QWebPage>
5
- #include <QWebFrame>
6
- #include <QMutexLocker>
7
- #include <QEventLoop>
8
- #include <QtDebug>
9
- #include "sunscraper.h"
10
- #include "sunscraperlibrary.h"
11
- #include "sunscraperworker.h"
12
-
13
- unsigned Sunscraper::m_nextQueryId = 1;
14
- QMutex Sunscraper::m_staticMutex;
15
-
16
- Sunscraper::Sunscraper()
17
- {
18
- QMutexLocker locker(&m_staticMutex);
19
-
20
- m_queryId = m_nextQueryId++;
21
-
22
- SunscraperWorker *worker = SunscraperWorker::instance();
23
- if(worker == NULL)
24
- qFatal("Attempt to run Sunscraper before thread initialization");
25
-
26
- connect(this, SIGNAL(requestLoadHtml(uint,QString)),
27
- worker, SLOT(loadHtml(uint,QString)), Qt::QueuedConnection);
28
- connect(this, SIGNAL(requestLoadUrl(uint,QString)),
29
- worker, SLOT(loadUrl(uint,QString)), Qt::QueuedConnection);
30
- connect(this, SIGNAL(requestFinalize(uint)),
31
- worker, SLOT(finalize(uint)), Qt::QueuedConnection);
32
- connect(this, SIGNAL(requestTimeout(uint,uint)),
33
- worker, SLOT(setTimeout(uint, uint)), Qt::QueuedConnection);
34
-
35
- connect(worker, SIGNAL(finished(uint,QString)),
36
- this, SLOT(finished(uint,QString)), Qt::QueuedConnection);
37
- connect(worker, SIGNAL(timeout(uint)),
38
- this, SLOT(timeout(uint)), Qt::QueuedConnection);
39
-
40
- m_eventLoop = new QEventLoop;
41
- }
42
-
43
- void Sunscraper::loadHtml(QString html)
44
- {
45
- emit requestLoadHtml(m_queryId, html);
46
- }
47
-
48
- void Sunscraper::loadUrl(QString url)
49
- {
50
- emit requestLoadUrl(m_queryId, url);
51
- }
52
-
53
- void Sunscraper::wait(unsigned timeout)
54
- {
55
- emit requestTimeout(m_queryId, timeout);
56
-
57
- m_eventLoop->exec();
58
- }
59
-
60
- void Sunscraper::finished(unsigned eventQueryId, QString html)
61
- {
62
- if(eventQueryId != m_queryId)
63
- return;
64
-
65
- m_eventLoop->quit();
66
-
67
- m_html = html.toUtf8();
68
-
69
- emit requestFinalize(m_queryId);
70
- }
71
-
72
- void Sunscraper::timeout(unsigned eventQueryId)
73
- {
74
- if(eventQueryId != m_queryId)
75
- return;
76
-
77
- m_eventLoop->quit();
78
-
79
- m_html = "!SUNSCRAPER_TIMEOUT";
80
-
81
- emit requestFinalize(m_queryId);
82
- }
83
-
84
- QByteArray Sunscraper::fetch()
85
- {
86
- return m_html;
87
- }
88
-
89
- const char *Sunscraper::fetchAsCString()
90
- {
91
- return m_html.constData();
92
- }
data/ext/embed/sunscraper.h DELETED
@@ -1,47 +0,0 @@
1
- #ifndef SUNSCRAPER_H
2
- #define SUNSCRAPER_H
3
-
4
- #include <QObject>
5
- #include <QString>
6
- #include <QMutex>
7
- #include <QByteArray>
8
-
9
- class QWebPage;
10
- class QEventLoop;
11
-
12
- class Sunscraper : public QObject
13
- {
14
- Q_OBJECT
15
-
16
- public:
17
- Sunscraper();
18
-
19
- void loadHtml(QString html);
20
- void loadUrl(QString url);
21
-
22
- void wait(unsigned timeout);
23
-
24
- QByteArray fetch();
25
- const char *fetchAsCString();
26
-
27
- private slots:
28
- void finished(unsigned queryId, QString html);
29
- void timeout(unsigned queryId);
30
-
31
- signals:
32
- void requestLoadHtml(unsigned queryId, QString html);
33
- void requestLoadUrl(unsigned queryId, QString html);
34
- void requestTimeout(unsigned queryId, unsigned timeout);
35
- void requestFinalize(unsigned queryId);
36
-
37
- private:
38
- static unsigned m_nextQueryId;
39
- static QMutex m_staticMutex;
40
-
41
- QEventLoop *m_eventLoop;
42
-
43
- unsigned m_queryId;
44
- QByteArray m_html;
45
- };
46
-
47
- #endif // SUNSCRAPER_H
data/ext/embed/sunscraperproxy.cpp DELETED
@@ -1,14 +0,0 @@
1
- #include <QWebPage>
2
- #include <QWebFrame>
3
- #include <QtDebug>
4
- #include "sunscraperproxy.h"
5
-
6
- SunscraperProxy::SunscraperProxy(QWebPage *parent, unsigned queryId) :
7
- QObject(parent), m_webPage(parent), m_queryId(queryId)
8
- {
9
- }
10
-
11
- void SunscraperProxy::finish()
12
- {
13
- emit finished(m_queryId, m_webPage->mainFrame()->toHtml());
14
- }
data/ext/embed/sunscraperproxy.h DELETED
@@ -1,24 +0,0 @@
1
- #ifndef SUNSCRAPERPROXY_H
2
- #define SUNSCRAPERPROXY_H
3
-
4
- #include <QObject>
5
-
6
- class QWebPage;
7
-
8
- class SunscraperProxy : public QObject
9
- {
10
- Q_OBJECT
11
- public:
12
- SunscraperProxy(QWebPage *parent, unsigned queryId);
13
-
14
- Q_INVOKABLE void finish();
15
-
16
- signals:
17
- void finished(unsigned queryId, QString html);
18
-
19
- private:
20
- QWebPage *m_webPage;
21
- unsigned m_queryId;
22
- };
23
-
24
- #endif // SUNSCRAPERPROXY_H
data/ext/embed/sunscraperworker.cpp DELETED
@@ -1,163 +0,0 @@
1
- #include <QApplication>
2
- #include <QWebPage>
3
- #include <QWebFrame>
4
- #include <QTimer>
5
- #include <QWebView>
6
- #include "sunscraperworker.h"
7
- #include "sunscraperwebpage.h"
8
- #include "sunscraperproxy.h"
9
- #include <QtDebug>
10
- #include <time.h>
11
-
12
- #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
13
- pthread_t SunscraperWorker::m_thread;
14
- #endif
15
-
16
- SunscraperWorker *SunscraperWorker::m_instance;
17
- QSemaphore SunscraperWorker::m_initializationLock;
18
-
19
- SunscraperWorker::SunscraperWorker()
20
- {
21
- }
22
-
23
- SunscraperWorker *SunscraperWorker::instance()
24
- {
25
- m_initializationLock.acquire(1);
26
- m_initializationLock.release(1);
27
-
28
- return m_instance;
29
- }
30
-
31
- void SunscraperWorker::invoke()
32
- {
33
- #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
34
- pthread_create(&m_thread, NULL, &SunscraperWorker::thread_routine, NULL);
35
- #endif
36
- }
37
-
38
- void *SunscraperWorker::thread_routine(void *)
39
- {
40
- /* Better error messages. */
41
- int argc = 1;
42
- char *argv[] = { (char*) "Sunscraper", NULL};
43
-
44
- /* Why (char*)? Because argv can (theoretically) be modified. *
45
- * But Qt won't do that with argv[0]. I know, trust me. */
46
-
47
- //qDebug() << "a";
48
- //usleep(1000000);
49
- //qDebug() << "b";
50
-
51
- QApplication app(argc, argv);
52
-
53
- if(m_instance != NULL)
54
- qFatal("Attempt to invoke SunscraperWorker more than once");
55
-
56
- m_instance = new SunscraperWorker();
57
- m_initializationLock.release(1);
58
-
59
- /* The magic value 42 means we want exit from the loop. */
60
- while(app.exec() != 42);
61
-
62
- /* Our application exits. */
63
-
64
- return NULL;
65
- }
66
-
67
- void SunscraperWorker::commitSuicide()
68
- {
69
- QApplication::exit(42);
70
-
71
- #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
72
- pthread_join(m_thread, NULL);
73
- #endif
74
- }
75
-
76
- void SunscraperWorker::loadHtml(unsigned queryId, QString html)
77
- {
78
- QWebPage *webPage = initializeWebPage(queryId);
79
- webPage->mainFrame()->setHtml(html);
80
- }
81
-
82
- void SunscraperWorker::loadUrl(unsigned queryId, QString url)
83
- {
84
- QWebPage *webPage = initializeWebPage(queryId);
85
- webPage->mainFrame()->load(url);
86
- }
87
-
88
- void SunscraperWorker::setTimeout(unsigned queryId, unsigned timeout)
89
- {
90
- Q_ASSERT(m_timers[queryId] == NULL);
91
-
92
- QTimer *timer = new QTimer(this);
93
- timer->setInterval(timeout);
94
- timer->setSingleShot(true);
95
-
96
- connect(timer, SIGNAL(timeout()), this, SLOT(routeTimeout()));
97
-
98
- timer->start();
99
- m_timers[queryId] = timer;
100
- }
101
-
102
- void SunscraperWorker::finalize(unsigned queryId)
103
- {
104
- Q_ASSERT(m_webPages[queryId] != NULL);
105
-
106
- m_webPages[queryId]->deleteLater();
107
- m_webPages.remove(queryId);
108
-
109
- if(m_timers.contains(queryId)) {
110
- m_timers[queryId]->deleteLater();
111
- m_timers.remove(queryId);
112
- }
113
- }
114
-
115
- QWebPage *SunscraperWorker::initializeWebPage(unsigned queryId)
116
- {
117
- Q_ASSERT(m_webPages[queryId] == NULL);
118
-
119
- SunscraperWebPage *webPage = new SunscraperWebPage(this);
120
- webPage->settings()->setAttribute(QWebSettings::LocalStorageEnabled, true);
121
-
122
- connect(webPage, SIGNAL(frameCreated(QWebFrame*)), this, SLOT(attachFrame(QWebFrame*)));
123
- connect(webPage, SIGNAL(consoleMessage(QString)), this, SLOT(routeMessage(QString)));
124
-
125
- m_webPages[queryId] = webPage;
126
-
127
- return webPage;
128
- }
129
-
130
- void SunscraperWorker::attachFrame(QWebFrame *frame)
131
- {
132
- connect(frame, SIGNAL(javaScriptWindowObjectCleared()),
133
- this, SLOT(attachAPI()));
134
- }
135
-
136
- void SunscraperWorker::attachAPI()
137
- {
138
- QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
139
- QWebPage *page = origin->page();
140
-
141
- unsigned queryId = m_webPages.key(page, 0);
142
- Q_ASSERT(queryId != 0);
143
-
144
- SunscraperProxy *proxy = new SunscraperProxy(page, queryId);
145
- connect(proxy, SIGNAL(finished(uint,QString)), this, SIGNAL(finished(uint,QString)));
146
-
147
- origin->addToJavaScriptWindowObject("Sunscraper", proxy, QScriptEngine::QtOwnership);
148
- }
149
-
150
- void SunscraperWorker::routeTimeout()
151
- {
152
- QTimer *origin = static_cast<QTimer *>(QObject::sender());
153
-
154
- unsigned queryId = m_timers.key(origin, 0);
155
- Q_ASSERT(queryId != 0);
156
-
157
- emit timeout(queryId);
158
- }
159
-
160
- void SunscraperWorker::routeMessage(QString message)
161
- {
162
- qDebug() << "Sunscraper Console:" << message;
163
- }
data/ext/embed/sunscraperworker.h DELETED
@@ -1,58 +0,0 @@
1
- #ifndef SUNSCRAPERWORKER_H
2
- #define SUNSCRAPERWORKER_H
3
-
4
- #include <QObject>
5
- #include <QSemaphore>
6
- #include <QMap>
7
- #include <QUrl>
8
-
9
- class QWebPage;
10
- class QWebFrame;
11
- class QTimer;
12
-
13
- class SunscraperWorker : public QObject
14
- {
15
- Q_OBJECT
16
- public:
17
- static void invoke();
18
- static void commitSuicide();
19
- static SunscraperWorker *instance();
20
-
21
- signals:
22
- void finished(unsigned queryId, QString result);
23
- void timeout(unsigned queryId);
24
-
25
- public slots:
26
- void loadHtml(unsigned queryId, QString html);
27
- void loadUrl(unsigned queryId, QString url);
28
- void setTimeout(unsigned queryId, unsigned timeout);
29
- void finalize(unsigned queryId);
30
-
31
- private slots:
32
- void attachFrame(QWebFrame *frame);
33
- void attachAPI();
34
- void routeTimeout();
35
- void routeMessage(QString message);
36
-
37
- private:
38
- static SunscraperWorker *m_instance;
39
- static QSemaphore m_initializationLock;
40
-
41
- #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
42
- static pthread_t m_thread;
43
- #else
44
- #error Your platform is unsupported. Implement SunscraperWorker::invoke() and send a pull request.
45
- #endif
46
-
47
- static void *thread_routine(void *arg);
48
-
49
- QMap<unsigned, QWebPage *> m_webPages;
50
- QMap<unsigned, QTimer *> m_timers;
51
-
52
- SunscraperWorker();
53
- SunscraperWorker(SunscraperWorker &);
54
-
55
- QWebPage *initializeWebPage(unsigned queryId);
56
- };
57
-
58
- #endif // SUNSCRAPERWORKER_H