sunscraper 1.1.0.beta3 → 1.2.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/ext/common/common.pro +13 -0
  2. data/ext/common/libsunscraper_common.a +0 -0
  3. data/ext/common/sunscraperproxy.cpp +11 -0
  4. data/ext/{standalone → common}/sunscraperproxy.h +5 -5
  5. data/ext/{embed → common}/sunscraperwebpage.cpp +0 -0
  6. data/ext/{embed → common}/sunscraperwebpage.h +0 -0
  7. data/ext/common/sunscraperworker.cpp +124 -0
  8. data/ext/common/sunscraperworker.h +50 -0
  9. data/ext/embed/embed.pro +14 -12
  10. data/ext/embed/sunscraperexternal.cpp +17 -16
  11. data/ext/embed/sunscraperinterface.cpp +206 -0
  12. data/ext/embed/sunscraperinterface.h +66 -0
  13. data/ext/embed/sunscraperlibrary.cpp +2 -12
  14. data/ext/embed/sunscraperlibrary.h +0 -1
  15. data/ext/embed/sunscraperthread.cpp +49 -0
  16. data/ext/embed/sunscraperthread.h +24 -0
  17. data/ext/extconf.rb +5 -3
  18. data/ext/standalone/standalone.pro +12 -6
  19. data/ext/standalone/sunscrapermain.cpp +13 -3
  20. data/ext/standalone/sunscraperrpc.cpp +76 -88
  21. data/ext/standalone/sunscraperrpc.h +19 -22
  22. data/ext/standalone/sunscraperrpcserver.cpp +26 -0
  23. data/ext/standalone/sunscraperrpcserver.h +24 -0
  24. data/ext/sunscraper-ext.pro +1 -1
  25. data/lib/sunscraper.rb +14 -14
  26. data/lib/sunscraper/library.rb +9 -9
  27. data/lib/sunscraper/standalone.rb +53 -107
  28. data/spec/sunscraper_spec.rb +86 -44
  29. data/sunscraper.gemspec +1 -1
  30. metadata +19 -17
  31. data/ext/embed/sunscraper.cpp +0 -92
  32. data/ext/embed/sunscraper.h +0 -47
  33. data/ext/embed/sunscraperproxy.cpp +0 -14
  34. data/ext/embed/sunscraperproxy.h +0 -24
  35. data/ext/embed/sunscraperworker.cpp +0 -163
  36. data/ext/embed/sunscraperworker.h +0 -58
  37. data/ext/standalone/sunscraperproxy.cpp +0 -14
  38. data/ext/standalone/sunscraperworker.cpp +0 -60
  39. data/ext/standalone/sunscraperworker.h +0 -34
data/sunscraper.gemspec CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "sunscraper"
6
- s.version = "1.1.0.beta3"
6
+ s.version = "1.2.0.beta1"
7
7
  s.authors = ["Peter Zotov"]
8
8
  s.email = ["whitequark@whitequark.org"]
9
9
  s.homepage = "http://github.com/whitequark/sunscraper"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sunscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0.beta3
4
+ version: 1.2.0.beta1
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &81880600 !ruby/object:Gem::Requirement
16
+ requirement: &74877930 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *81880600
24
+ version_requirements: *74877930
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: ffi
27
- requirement: &81880200 !ruby/object:Gem::Requirement
27
+ requirement: &74877500 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,7 +32,7 @@ dependencies:
32
32
  version: 1.0.11
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *81880200
35
+ version_requirements: *74877500
36
36
  description: A WebKit-based, JavaScript-capable HTML scraper.
37
37
  email:
38
38
  - whitequark@whitequark.org
@@ -50,27 +50,29 @@ files:
50
50
  - README.md
51
51
  - Rakefile
52
52
  - ext/.gitignore
53
+ - ext/common/common.pro
54
+ - ext/common/libsunscraper_common.a
55
+ - ext/common/sunscraperproxy.cpp
56
+ - ext/common/sunscraperproxy.h
57
+ - ext/common/sunscraperwebpage.cpp
58
+ - ext/common/sunscraperwebpage.h
59
+ - ext/common/sunscraperworker.cpp
60
+ - ext/common/sunscraperworker.h
53
61
  - ext/embed/embed.pro
54
- - ext/embed/sunscraper.cpp
55
- - ext/embed/sunscraper.h
56
62
  - ext/embed/sunscraperexternal.cpp
63
+ - ext/embed/sunscraperinterface.cpp
64
+ - ext/embed/sunscraperinterface.h
57
65
  - ext/embed/sunscraperlibrary.cpp
58
66
  - ext/embed/sunscraperlibrary.h
59
- - ext/embed/sunscraperproxy.cpp
60
- - ext/embed/sunscraperproxy.h
61
- - ext/embed/sunscraperwebpage.cpp
62
- - ext/embed/sunscraperwebpage.h
63
- - ext/embed/sunscraperworker.cpp
64
- - ext/embed/sunscraperworker.h
67
+ - ext/embed/sunscraperthread.cpp
68
+ - ext/embed/sunscraperthread.h
65
69
  - ext/extconf.rb
66
70
  - ext/standalone/standalone.pro
67
71
  - ext/standalone/sunscrapermain.cpp
68
- - ext/standalone/sunscraperproxy.cpp
69
- - ext/standalone/sunscraperproxy.h
70
72
  - ext/standalone/sunscraperrpc.cpp
71
73
  - ext/standalone/sunscraperrpc.h
72
- - ext/standalone/sunscraperworker.cpp
73
- - ext/standalone/sunscraperworker.h
74
+ - ext/standalone/sunscraperrpcserver.cpp
75
+ - ext/standalone/sunscraperrpcserver.h
74
76
  - ext/sunscraper-ext.pro
75
77
  - lib/sunscraper.rb
76
78
  - lib/sunscraper/library.rb
@@ -1,92 +0,0 @@
1
- #include <QApplication>
2
- #include <QThread>
3
- #include <QTimer>
4
- #include <QWebPage>
5
- #include <QWebFrame>
6
- #include <QMutexLocker>
7
- #include <QEventLoop>
8
- #include <QtDebug>
9
- #include "sunscraper.h"
10
- #include "sunscraperlibrary.h"
11
- #include "sunscraperworker.h"
12
-
13
- unsigned Sunscraper::m_nextQueryId = 1;
14
- QMutex Sunscraper::m_staticMutex;
15
-
16
- Sunscraper::Sunscraper()
17
- {
18
- QMutexLocker locker(&m_staticMutex);
19
-
20
- m_queryId = m_nextQueryId++;
21
-
22
- SunscraperWorker *worker = SunscraperWorker::instance();
23
- if(worker == NULL)
24
- qFatal("Attempt to run Sunscraper before thread initialization");
25
-
26
- connect(this, SIGNAL(requestLoadHtml(uint,QString)),
27
- worker, SLOT(loadHtml(uint,QString)), Qt::QueuedConnection);
28
- connect(this, SIGNAL(requestLoadUrl(uint,QString)),
29
- worker, SLOT(loadUrl(uint,QString)), Qt::QueuedConnection);
30
- connect(this, SIGNAL(requestFinalize(uint)),
31
- worker, SLOT(finalize(uint)), Qt::QueuedConnection);
32
- connect(this, SIGNAL(requestTimeout(uint,uint)),
33
- worker, SLOT(setTimeout(uint, uint)), Qt::QueuedConnection);
34
-
35
- connect(worker, SIGNAL(finished(uint,QString)),
36
- this, SLOT(finished(uint,QString)), Qt::QueuedConnection);
37
- connect(worker, SIGNAL(timeout(uint)),
38
- this, SLOT(timeout(uint)), Qt::QueuedConnection);
39
-
40
- m_eventLoop = new QEventLoop;
41
- }
42
-
43
- void Sunscraper::loadHtml(QString html)
44
- {
45
- emit requestLoadHtml(m_queryId, html);
46
- }
47
-
48
- void Sunscraper::loadUrl(QString url)
49
- {
50
- emit requestLoadUrl(m_queryId, url);
51
- }
52
-
53
- void Sunscraper::wait(unsigned timeout)
54
- {
55
- emit requestTimeout(m_queryId, timeout);
56
-
57
- m_eventLoop->exec();
58
- }
59
-
60
- void Sunscraper::finished(unsigned eventQueryId, QString html)
61
- {
62
- if(eventQueryId != m_queryId)
63
- return;
64
-
65
- m_eventLoop->quit();
66
-
67
- m_html = html.toUtf8();
68
-
69
- emit requestFinalize(m_queryId);
70
- }
71
-
72
- void Sunscraper::timeout(unsigned eventQueryId)
73
- {
74
- if(eventQueryId != m_queryId)
75
- return;
76
-
77
- m_eventLoop->quit();
78
-
79
- m_html = "!SUNSCRAPER_TIMEOUT";
80
-
81
- emit requestFinalize(m_queryId);
82
- }
83
-
84
- QByteArray Sunscraper::fetch()
85
- {
86
- return m_html;
87
- }
88
-
89
- const char *Sunscraper::fetchAsCString()
90
- {
91
- return m_html.constData();
92
- }
@@ -1,47 +0,0 @@
1
- #ifndef SUNSCRAPER_H
2
- #define SUNSCRAPER_H
3
-
4
- #include <QObject>
5
- #include <QString>
6
- #include <QMutex>
7
- #include <QByteArray>
8
-
9
- class QWebPage;
10
- class QEventLoop;
11
-
12
- class Sunscraper : public QObject
13
- {
14
- Q_OBJECT
15
-
16
- public:
17
- Sunscraper();
18
-
19
- void loadHtml(QString html);
20
- void loadUrl(QString url);
21
-
22
- void wait(unsigned timeout);
23
-
24
- QByteArray fetch();
25
- const char *fetchAsCString();
26
-
27
- private slots:
28
- void finished(unsigned queryId, QString html);
29
- void timeout(unsigned queryId);
30
-
31
- signals:
32
- void requestLoadHtml(unsigned queryId, QString html);
33
- void requestLoadUrl(unsigned queryId, QString html);
34
- void requestTimeout(unsigned queryId, unsigned timeout);
35
- void requestFinalize(unsigned queryId);
36
-
37
- private:
38
- static unsigned m_nextQueryId;
39
- static QMutex m_staticMutex;
40
-
41
- QEventLoop *m_eventLoop;
42
-
43
- unsigned m_queryId;
44
- QByteArray m_html;
45
- };
46
-
47
- #endif // SUNSCRAPER_H
@@ -1,14 +0,0 @@
1
- #include <QWebPage>
2
- #include <QWebFrame>
3
- #include <QtDebug>
4
- #include "sunscraperproxy.h"
5
-
6
- SunscraperProxy::SunscraperProxy(QWebPage *parent, unsigned queryId) :
7
- QObject(parent), m_webPage(parent), m_queryId(queryId)
8
- {
9
- }
10
-
11
- void SunscraperProxy::finish()
12
- {
13
- emit finished(m_queryId, m_webPage->mainFrame()->toHtml());
14
- }
@@ -1,24 +0,0 @@
1
- #ifndef SUNSCRAPERPROXY_H
2
- #define SUNSCRAPERPROXY_H
3
-
4
- #include <QObject>
5
-
6
- class QWebPage;
7
-
8
- class SunscraperProxy : public QObject
9
- {
10
- Q_OBJECT
11
- public:
12
- SunscraperProxy(QWebPage *parent, unsigned queryId);
13
-
14
- Q_INVOKABLE void finish();
15
-
16
- signals:
17
- void finished(unsigned queryId, QString html);
18
-
19
- private:
20
- QWebPage *m_webPage;
21
- unsigned m_queryId;
22
- };
23
-
24
- #endif // SUNSCRAPERPROXY_H
@@ -1,163 +0,0 @@
1
- #include <QApplication>
2
- #include <QWebPage>
3
- #include <QWebFrame>
4
- #include <QTimer>
5
- #include <QWebView>
6
- #include "sunscraperworker.h"
7
- #include "sunscraperwebpage.h"
8
- #include "sunscraperproxy.h"
9
- #include <QtDebug>
10
- #include <time.h>
11
-
12
- #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
13
- pthread_t SunscraperWorker::m_thread;
14
- #endif
15
-
16
- SunscraperWorker *SunscraperWorker::m_instance;
17
- QSemaphore SunscraperWorker::m_initializationLock;
18
-
19
- SunscraperWorker::SunscraperWorker()
20
- {
21
- }
22
-
23
- SunscraperWorker *SunscraperWorker::instance()
24
- {
25
- m_initializationLock.acquire(1);
26
- m_initializationLock.release(1);
27
-
28
- return m_instance;
29
- }
30
-
31
- void SunscraperWorker::invoke()
32
- {
33
- #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
34
- pthread_create(&m_thread, NULL, &SunscraperWorker::thread_routine, NULL);
35
- #endif
36
- }
37
-
38
- void *SunscraperWorker::thread_routine(void *)
39
- {
40
- /* Better error messages. */
41
- int argc = 1;
42
- char *argv[] = { (char*) "Sunscraper", NULL};
43
-
44
- /* Why (char*)? Because argv can (theoretically) be modified. *
45
- * But Qt won't do that with argv[0]. I know, trust me. */
46
-
47
- //qDebug() << "a";
48
- //usleep(1000000);
49
- //qDebug() << "b";
50
-
51
- QApplication app(argc, argv);
52
-
53
- if(m_instance != NULL)
54
- qFatal("Attempt to invoke SunscraperWorker more than once");
55
-
56
- m_instance = new SunscraperWorker();
57
- m_initializationLock.release(1);
58
-
59
- /* The magic value 42 means we want exit from the loop. */
60
- while(app.exec() != 42);
61
-
62
- /* Our application exits. */
63
-
64
- return NULL;
65
- }
66
-
67
- void SunscraperWorker::commitSuicide()
68
- {
69
- QApplication::exit(42);
70
-
71
- #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
72
- pthread_join(m_thread, NULL);
73
- #endif
74
- }
75
-
76
- void SunscraperWorker::loadHtml(unsigned queryId, QString html)
77
- {
78
- QWebPage *webPage = initializeWebPage(queryId);
79
- webPage->mainFrame()->setHtml(html);
80
- }
81
-
82
- void SunscraperWorker::loadUrl(unsigned queryId, QString url)
83
- {
84
- QWebPage *webPage = initializeWebPage(queryId);
85
- webPage->mainFrame()->load(url);
86
- }
87
-
88
- void SunscraperWorker::setTimeout(unsigned queryId, unsigned timeout)
89
- {
90
- Q_ASSERT(m_timers[queryId] == NULL);
91
-
92
- QTimer *timer = new QTimer(this);
93
- timer->setInterval(timeout);
94
- timer->setSingleShot(true);
95
-
96
- connect(timer, SIGNAL(timeout()), this, SLOT(routeTimeout()));
97
-
98
- timer->start();
99
- m_timers[queryId] = timer;
100
- }
101
-
102
- void SunscraperWorker::finalize(unsigned queryId)
103
- {
104
- Q_ASSERT(m_webPages[queryId] != NULL);
105
-
106
- m_webPages[queryId]->deleteLater();
107
- m_webPages.remove(queryId);
108
-
109
- if(m_timers.contains(queryId)) {
110
- m_timers[queryId]->deleteLater();
111
- m_timers.remove(queryId);
112
- }
113
- }
114
-
115
- QWebPage *SunscraperWorker::initializeWebPage(unsigned queryId)
116
- {
117
- Q_ASSERT(m_webPages[queryId] == NULL);
118
-
119
- SunscraperWebPage *webPage = new SunscraperWebPage(this);
120
- webPage->settings()->setAttribute(QWebSettings::LocalStorageEnabled, true);
121
-
122
- connect(webPage, SIGNAL(frameCreated(QWebFrame*)), this, SLOT(attachFrame(QWebFrame*)));
123
- connect(webPage, SIGNAL(consoleMessage(QString)), this, SLOT(routeMessage(QString)));
124
-
125
- m_webPages[queryId] = webPage;
126
-
127
- return webPage;
128
- }
129
-
130
- void SunscraperWorker::attachFrame(QWebFrame *frame)
131
- {
132
- connect(frame, SIGNAL(javaScriptWindowObjectCleared()),
133
- this, SLOT(attachAPI()));
134
- }
135
-
136
- void SunscraperWorker::attachAPI()
137
- {
138
- QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
139
- QWebPage *page = origin->page();
140
-
141
- unsigned queryId = m_webPages.key(page, 0);
142
- Q_ASSERT(queryId != 0);
143
-
144
- SunscraperProxy *proxy = new SunscraperProxy(page, queryId);
145
- connect(proxy, SIGNAL(finished(uint,QString)), this, SIGNAL(finished(uint,QString)));
146
-
147
- origin->addToJavaScriptWindowObject("Sunscraper", proxy, QScriptEngine::QtOwnership);
148
- }
149
-
150
- void SunscraperWorker::routeTimeout()
151
- {
152
- QTimer *origin = static_cast<QTimer *>(QObject::sender());
153
-
154
- unsigned queryId = m_timers.key(origin, 0);
155
- Q_ASSERT(queryId != 0);
156
-
157
- emit timeout(queryId);
158
- }
159
-
160
- void SunscraperWorker::routeMessage(QString message)
161
- {
162
- qDebug() << "Sunscraper Console:" << message;
163
- }
@@ -1,58 +0,0 @@
1
- #ifndef SUNSCRAPERWORKER_H
2
- #define SUNSCRAPERWORKER_H
3
-
4
- #include <QObject>
5
- #include <QSemaphore>
6
- #include <QMap>
7
- #include <QUrl>
8
-
9
- class QWebPage;
10
- class QWebFrame;
11
- class QTimer;
12
-
13
- class SunscraperWorker : public QObject
14
- {
15
- Q_OBJECT
16
- public:
17
- static void invoke();
18
- static void commitSuicide();
19
- static SunscraperWorker *instance();
20
-
21
- signals:
22
- void finished(unsigned queryId, QString result);
23
- void timeout(unsigned queryId);
24
-
25
- public slots:
26
- void loadHtml(unsigned queryId, QString html);
27
- void loadUrl(unsigned queryId, QString url);
28
- void setTimeout(unsigned queryId, unsigned timeout);
29
- void finalize(unsigned queryId);
30
-
31
- private slots:
32
- void attachFrame(QWebFrame *frame);
33
- void attachAPI();
34
- void routeTimeout();
35
- void routeMessage(QString message);
36
-
37
- private:
38
- static SunscraperWorker *m_instance;
39
- static QSemaphore m_initializationLock;
40
-
41
- #if defined(Q_OS_LINUX) || defined(Q_OS_UNIX)
42
- static pthread_t m_thread;
43
- #else
44
- #error Your platform is unsupported. Implement SunscraperWorker::invoke() and send a pull request.
45
- #endif
46
-
47
- static void *thread_routine(void *arg);
48
-
49
- QMap<unsigned, QWebPage *> m_webPages;
50
- QMap<unsigned, QTimer *> m_timers;
51
-
52
- SunscraperWorker();
53
- SunscraperWorker(SunscraperWorker &);
54
-
55
- QWebPage *initializeWebPage(unsigned queryId);
56
- };
57
-
58
- #endif // SUNSCRAPERWORKER_H