sunscraper 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ doc/*
6
+ .rbx/*
7
+ .yardoc/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
@@ -0,0 +1 @@
1
+ --no-private --markup markdown - LICENSE
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Dependencies are declared in sunscraper.gemspec; `gemspec` pulls them in from there.
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (C) 2012 Peter Zotov <whitequark@whitequark.org>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7
+ of the Software, and to permit persons to whom the Software is furnished to do
8
+ so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,68 @@
1
+ Sunscraper
2
+ ==========
3
+
4
+ Sunscraper is a gem for prerendering pages with hashbang URLs like `http://whatever.com/#!/page`.
5
+
6
+ It works by loading content in an embedded web browser and waiting for the page to call the
7
+ `Sunscraper.finish()` JavaScript method.
8
+
9
+ HTML = %{
10
+ <html>
11
+ <head>
12
+ <script type="text/javascript">
13
+ document.addEventListener("DOMContentLoaded", function() {
14
+ document.getElementById('fuga').textContent =
15
+ ("!skrow tI").split("").reverse().join("");
16
+ Sunscraper.finish();
17
+ }, true);
18
+ </script>
19
+ </head>
20
+ <body>
21
+ <div id='fuga'></div>
22
+ </body>
23
+ </html>
24
+ }
25
+
26
+ Sunscraper.scrape_html(HTML).include?('It works!') # => true
27
+
28
+ See also [documentation][].
29
+
30
+ [documentation]: http://rdoc.info/gems/sunscraper/Sunscraper
31
+
32
+ Installation
33
+ ------------
34
+
35
+ Sunscraper requires Qt 4.x and QtWebkit packages to be installed on the target system. *Sunscraper is not a Ruby
36
+ C extension*; it works by building a Qt shared library and loading it through [FFI][].
37
+
38
+ [FFI]: http://en.wikipedia.org/wiki/Foreign_Function_Interface
39
+
40
+ gem install sunscraper
41
+
42
+ Runtime requirements
43
+ --------------------
44
+
45
+ On Linux with Qt versions <= 4.8, Sunscraper requires a running X server and a valid `DISPLAY` environment
46
+ variable. Consider using [Xvfb][] on a GUI-less production server.
47
+
48
+ [Xvfb]: http://www.x.org/releases/X11R7.6/doc/man/man1/Xvfb.1.xhtml
49
+
50
+ Compatibility
51
+ -------------
52
+
53
+ Sunscraper should be compatible across all major Ruby implementations on all major operating systems,
54
+ including Ruby MRI 1.9, JRuby, Rubinius and MacRuby running on GNU/Linux, OS X and Windows.
55
+
56
+ JRuby versions up to 1.6.5 are known not to work due to a bug in its FFI library.
57
+
58
+ Ruby MRI 1.8 is not supported because it has a braindead threading model, and it never will be because I don't care.
59
+
60
+ Thread safety
61
+ -------------
62
+
63
+ Sunscraper is thread-safe.
64
+
65
+ License
66
+ -------
67
+
68
+ Sunscraper is distributed under the terms of the MIT license; see LICENSE in the source distribution.
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,4 @@
1
+ *.user
2
+ *.o
3
+ *.so*
4
+ moc_*
@@ -0,0 +1,270 @@
1
+ #############################################################################
2
+ # Makefile for building: libsunscraper.so.1.0.0
3
+ # Generated by qmake (2.01a) (Qt 4.7.4) on: Sat Feb 18 05:54:02 2012
4
+ # Project: sunscraper.pro
5
+ # Template: lib
6
+ # Command: /usr/bin/qmake-qt4 -spec /usr/share/qt4/mkspecs/linux-g++ CONFIG+=debug -o Makefile sunscraper.pro
7
+ #############################################################################
8
+
9
+ ####### Compiler, tools and options
10
+
11
+ CC = gcc
12
+ CXX = g++
13
+ DEFINES = -DQT_WEBKIT -DQT_WEBKIT_LIB -DQT_GUI_LIB -DQT_CORE_LIB -DQT_SHARED
14
+ CFLAGS = -pipe -g -Wall -W -D_REENTRANT -fPIC $(DEFINES)
15
+ CXXFLAGS = -pipe -g -Wall -W -D_REENTRANT -fPIC $(DEFINES)
16
+ INCPATH = -I/usr/share/qt4/mkspecs/linux-g++ -I. -I/usr/include/qt4/QtCore -I/usr/include/qt4/QtGui -I/usr/include/qt4/QtWebKit -I/usr/include/qt4 -I.
17
+ LINK = g++
18
+ LFLAGS = -shared -Wl,-soname,libsunscraper.so.1
19
+ LIBS = $(SUBLIBS) -L/usr/lib -lQtWebKit -lQtGui -lQtCore -lpthread
20
+ AR = ar cqs
21
+ RANLIB =
22
+ QMAKE = /usr/bin/qmake-qt4
23
+ TAR = tar -cf
24
+ COMPRESS = gzip -9f
25
+ COPY = cp -f
26
+ SED = sed
27
+ COPY_FILE = $(COPY)
28
+ COPY_DIR = $(COPY) -r
29
+ STRIP = strip
30
+ INSTALL_FILE = install -m 644 -p
31
+ INSTALL_DIR = $(COPY_DIR)
32
+ INSTALL_PROGRAM = install -m 755 -p
33
+ DEL_FILE = rm -f
34
+ SYMLINK = ln -f -s
35
+ DEL_DIR = rmdir
36
+ MOVE = mv -f
37
+ CHK_DIR_EXISTS= test -d
38
+ MKDIR = mkdir -p
39
+
40
+ ####### Output directory
41
+
42
+ OBJECTS_DIR = ./
43
+
44
+ ####### Files
45
+
46
+ SOURCES = sunscraperlibrary.cpp \
47
+ sunscraperthread.cpp \
48
+ sunscraperexternal.cpp \
49
+ sunscraper.cpp \
50
+ sunscraperproxy.cpp moc_sunscraperthread.cpp \
51
+ moc_sunscraper.cpp \
52
+ moc_sunscraperproxy.cpp
53
+ OBJECTS = sunscraperlibrary.o \
54
+ sunscraperthread.o \
55
+ sunscraperexternal.o \
56
+ sunscraper.o \
57
+ sunscraperproxy.o \
58
+ moc_sunscraperthread.o \
59
+ moc_sunscraper.o \
60
+ moc_sunscraperproxy.o
61
+ DIST = /usr/share/qt4/mkspecs/common/g++.conf \
62
+ /usr/share/qt4/mkspecs/common/unix.conf \
63
+ /usr/share/qt4/mkspecs/common/linux.conf \
64
+ /usr/share/qt4/mkspecs/qconfig.pri \
65
+ /usr/share/qt4/mkspecs/modules/qt_webkit_version.pri \
66
+ /usr/share/qt4/mkspecs/features/qt_functions.prf \
67
+ /usr/share/qt4/mkspecs/features/qt_config.prf \
68
+ /usr/share/qt4/mkspecs/features/exclusive_builds.prf \
69
+ /usr/share/qt4/mkspecs/features/default_pre.prf \
70
+ /usr/share/qt4/mkspecs/features/debug.prf \
71
+ /usr/share/qt4/mkspecs/features/default_post.prf \
72
+ /usr/share/qt4/mkspecs/features/warn_on.prf \
73
+ /usr/share/qt4/mkspecs/features/qt.prf \
74
+ /usr/share/qt4/mkspecs/features/unix/thread.prf \
75
+ /usr/share/qt4/mkspecs/features/moc.prf \
76
+ /usr/share/qt4/mkspecs/features/resources.prf \
77
+ /usr/share/qt4/mkspecs/features/uic.prf \
78
+ /usr/share/qt4/mkspecs/features/yacc.prf \
79
+ /usr/share/qt4/mkspecs/features/lex.prf \
80
+ /usr/share/qt4/mkspecs/features/include_source_dir.prf \
81
+ sunscraper.pro
82
+ QMAKE_TARGET = sunscraper
83
+ DESTDIR =
84
+ TARGET = libsunscraper.so.1.0.0
85
+ TARGETA = libsunscraper.a
86
+ TARGETD = libsunscraper.so.1.0.0
87
+ TARGET0 = libsunscraper.so
88
+ TARGET1 = libsunscraper.so.1
89
+ TARGET2 = libsunscraper.so.1.0
90
+
91
+ first: all
92
+ ####### Implicit rules
93
+
94
+ .SUFFIXES: .o .c .cpp .cc .cxx .C
95
+
96
+ .cpp.o:
97
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"
98
+
99
+ .cc.o:
100
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"
101
+
102
+ .cxx.o:
103
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"
104
+
105
+ .C.o:
106
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"
107
+
108
+ .c.o:
109
+ $(CC) -c $(CFLAGS) $(INCPATH) -o "$@" "$<"
110
+
111
+ ####### Build rules
112
+
113
+ all: Makefile $(TARGET)
114
+
115
+ $(TARGET): $(OBJECTS) $(SUBLIBS) $(OBJCOMP)
116
+ -$(DEL_FILE) $(TARGET) $(TARGET0) $(TARGET1) $(TARGET2)
117
+ $(LINK) $(LFLAGS) -o $(TARGET) $(OBJECTS) $(LIBS) $(OBJCOMP)
118
+ -ln -s $(TARGET) $(TARGET0)
119
+ -ln -s $(TARGET) $(TARGET1)
120
+ -ln -s $(TARGET) $(TARGET2)
121
+
122
+
123
+
124
+ staticlib: $(TARGETA)
125
+
126
+ $(TARGETA): $(OBJECTS) $(OBJCOMP)
127
+ -$(DEL_FILE) $(TARGETA)
128
+ $(AR) $(TARGETA) $(OBJECTS)
129
+
130
+ Makefile: sunscraper.pro /usr/share/qt4/mkspecs/linux-g++/qmake.conf /usr/share/qt4/mkspecs/common/g++.conf \
131
+ /usr/share/qt4/mkspecs/common/unix.conf \
132
+ /usr/share/qt4/mkspecs/common/linux.conf \
133
+ /usr/share/qt4/mkspecs/qconfig.pri \
134
+ /usr/share/qt4/mkspecs/modules/qt_webkit_version.pri \
135
+ /usr/share/qt4/mkspecs/features/qt_functions.prf \
136
+ /usr/share/qt4/mkspecs/features/qt_config.prf \
137
+ /usr/share/qt4/mkspecs/features/exclusive_builds.prf \
138
+ /usr/share/qt4/mkspecs/features/default_pre.prf \
139
+ /usr/share/qt4/mkspecs/features/debug.prf \
140
+ /usr/share/qt4/mkspecs/features/default_post.prf \
141
+ /usr/share/qt4/mkspecs/features/warn_on.prf \
142
+ /usr/share/qt4/mkspecs/features/qt.prf \
143
+ /usr/share/qt4/mkspecs/features/unix/thread.prf \
144
+ /usr/share/qt4/mkspecs/features/moc.prf \
145
+ /usr/share/qt4/mkspecs/features/resources.prf \
146
+ /usr/share/qt4/mkspecs/features/uic.prf \
147
+ /usr/share/qt4/mkspecs/features/yacc.prf \
148
+ /usr/share/qt4/mkspecs/features/lex.prf \
149
+ /usr/share/qt4/mkspecs/features/include_source_dir.prf \
150
+ /usr/lib/libQtWebKit.prl \
151
+ /usr/lib/libQtGui.prl \
152
+ /usr/lib/libQtCore.prl
153
+ $(QMAKE) -spec /usr/share/qt4/mkspecs/linux-g++ CONFIG+=debug -o Makefile sunscraper.pro
154
+ /usr/share/qt4/mkspecs/common/g++.conf:
155
+ /usr/share/qt4/mkspecs/common/unix.conf:
156
+ /usr/share/qt4/mkspecs/common/linux.conf:
157
+ /usr/share/qt4/mkspecs/qconfig.pri:
158
+ /usr/share/qt4/mkspecs/modules/qt_webkit_version.pri:
159
+ /usr/share/qt4/mkspecs/features/qt_functions.prf:
160
+ /usr/share/qt4/mkspecs/features/qt_config.prf:
161
+ /usr/share/qt4/mkspecs/features/exclusive_builds.prf:
162
+ /usr/share/qt4/mkspecs/features/default_pre.prf:
163
+ /usr/share/qt4/mkspecs/features/debug.prf:
164
+ /usr/share/qt4/mkspecs/features/default_post.prf:
165
+ /usr/share/qt4/mkspecs/features/warn_on.prf:
166
+ /usr/share/qt4/mkspecs/features/qt.prf:
167
+ /usr/share/qt4/mkspecs/features/unix/thread.prf:
168
+ /usr/share/qt4/mkspecs/features/moc.prf:
169
+ /usr/share/qt4/mkspecs/features/resources.prf:
170
+ /usr/share/qt4/mkspecs/features/uic.prf:
171
+ /usr/share/qt4/mkspecs/features/yacc.prf:
172
+ /usr/share/qt4/mkspecs/features/lex.prf:
173
+ /usr/share/qt4/mkspecs/features/include_source_dir.prf:
174
+ /usr/lib/libQtWebKit.prl:
175
+ /usr/lib/libQtGui.prl:
176
+ /usr/lib/libQtCore.prl:
177
+ qmake: FORCE
178
+ @$(QMAKE) -spec /usr/share/qt4/mkspecs/linux-g++ CONFIG+=debug -o Makefile sunscraper.pro
179
+
180
+ dist:
181
+ @$(CHK_DIR_EXISTS) .tmp/sunscraper1.0.0 || $(MKDIR) .tmp/sunscraper1.0.0
182
+ $(COPY_FILE) --parents $(SOURCES) $(DIST) .tmp/sunscraper1.0.0/ && $(COPY_FILE) --parents sunscraperlibrary.h sunscraperthread.h sunscraper.h sunscraperproxy.h .tmp/sunscraper1.0.0/ && $(COPY_FILE) --parents sunscraperlibrary.cpp sunscraperthread.cpp sunscraperexternal.cpp sunscraper.cpp sunscraperproxy.cpp .tmp/sunscraper1.0.0/ && (cd `dirname .tmp/sunscraper1.0.0` && $(TAR) sunscraper1.0.0.tar sunscraper1.0.0 && $(COMPRESS) sunscraper1.0.0.tar) && $(MOVE) `dirname .tmp/sunscraper1.0.0`/sunscraper1.0.0.tar.gz . && $(DEL_FILE) -r .tmp/sunscraper1.0.0
183
+
184
+
185
+ clean:compiler_clean
186
+ -$(DEL_FILE) $(OBJECTS)
187
+ -$(DEL_FILE) *~ core *.core
188
+
189
+
190
+ ####### Sub-libraries
191
+
192
+ distclean: clean
193
+ -$(DEL_FILE) $(TARGET)
194
+ -$(DEL_FILE) $(TARGET0) $(TARGET1) $(TARGET2) $(TARGETA)
195
+ -$(DEL_FILE) Makefile
196
+
197
+
198
+ check: first
199
+
200
+ mocclean: compiler_moc_header_clean compiler_moc_source_clean
201
+
202
+ mocables: compiler_moc_header_make_all compiler_moc_source_make_all
203
+
204
+ compiler_moc_header_make_all: moc_sunscraperthread.cpp moc_sunscraper.cpp moc_sunscraperproxy.cpp
205
+ compiler_moc_header_clean:
206
+ -$(DEL_FILE) moc_sunscraperthread.cpp moc_sunscraper.cpp moc_sunscraperproxy.cpp
207
+ moc_sunscraperthread.cpp: sunscraperthread.h
208
+ /usr/bin/moc-qt4 $(DEFINES) $(INCPATH) sunscraperthread.h -o moc_sunscraperthread.cpp
209
+
210
+ moc_sunscraper.cpp: sunscraper.h
211
+ /usr/bin/moc-qt4 $(DEFINES) $(INCPATH) sunscraper.h -o moc_sunscraper.cpp
212
+
213
+ moc_sunscraperproxy.cpp: sunscraperproxy.h
214
+ /usr/bin/moc-qt4 $(DEFINES) $(INCPATH) sunscraperproxy.h -o moc_sunscraperproxy.cpp
215
+
216
+ compiler_rcc_make_all:
217
+ compiler_rcc_clean:
218
+ compiler_image_collection_make_all: qmake_image_collection.cpp
219
+ compiler_image_collection_clean:
220
+ -$(DEL_FILE) qmake_image_collection.cpp
221
+ compiler_moc_source_make_all:
222
+ compiler_moc_source_clean:
223
+ compiler_uic_make_all:
224
+ compiler_uic_clean:
225
+ compiler_yacc_decl_make_all:
226
+ compiler_yacc_decl_clean:
227
+ compiler_yacc_impl_make_all:
228
+ compiler_yacc_impl_clean:
229
+ compiler_lex_make_all:
230
+ compiler_lex_clean:
231
+ compiler_clean: compiler_moc_header_clean
232
+
233
+ ####### Compile
234
+
235
+ sunscraperlibrary.o: sunscraperlibrary.cpp sunscraperlibrary.h \
236
+ sunscraperthread.h
237
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraperlibrary.o sunscraperlibrary.cpp
238
+
239
+ sunscraperthread.o: sunscraperthread.cpp sunscraperthread.h \
240
+ sunscraperproxy.h
241
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraperthread.o sunscraperthread.cpp
242
+
243
+ sunscraperexternal.o: sunscraperexternal.cpp sunscraper.h
244
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraperexternal.o sunscraperexternal.cpp
245
+
246
+ sunscraper.o: sunscraper.cpp sunscraper.h \
247
+ sunscraperlibrary.h \
248
+ sunscraperthread.h
249
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraper.o sunscraper.cpp
250
+
251
+ sunscraperproxy.o: sunscraperproxy.cpp sunscraperproxy.h
252
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraperproxy.o sunscraperproxy.cpp
253
+
254
+ moc_sunscraperthread.o: moc_sunscraperthread.cpp
255
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o moc_sunscraperthread.o moc_sunscraperthread.cpp
256
+
257
+ moc_sunscraper.o: moc_sunscraper.cpp
258
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o moc_sunscraper.o moc_sunscraper.cpp
259
+
260
+ moc_sunscraperproxy.o: moc_sunscraperproxy.cpp
261
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o moc_sunscraperproxy.o moc_sunscraperproxy.cpp
262
+
263
+ ####### Install
264
+
265
+ install: FORCE
266
+
267
+ uninstall: FORCE
268
+
269
+ FORCE:
270
+
@@ -0,0 +1,9 @@
1
+ # This Makefile will get replaced by qmake.
2
+
3
+ File.open("Makefile", "w") do |mf|
4
+ mf.puts <<-ENDM
5
+ all:
6
+ qmake
7
+ make
8
+ ENDM
9
+ end
@@ -0,0 +1,86 @@
1
+ #include <QApplication>
2
+ #include <QThread>
3
+ #include <QTimer>
4
+ #include <QWebPage>
5
+ #include <QWebFrame>
6
+ #include <QMutexLocker>
7
+ #include <QtDebug>
8
+ #include "sunscraper.h"
9
+ #include "sunscraperlibrary.h"
10
+ #include "sunscraperthread.h"
11
+
12
+ unsigned Sunscraper::_nextQueryId = 1;
13
+ QMutex Sunscraper::_staticMutex;
14
+
15
+ Sunscraper::Sunscraper()
16
+ {
17
+ QMutexLocker locker(&_staticMutex);
18
+
19
+ _queryId = _nextQueryId++;
20
+
21
+ SunscraperThread *worker = SunscraperLibrary::instance()->thread();
22
+
23
+ connect(this, SIGNAL(requestLoadHtml(uint,QString)),
24
+ worker, SLOT(loadHtml(uint,QString)), Qt::QueuedConnection);
25
+ connect(this, SIGNAL(requestLoadUrl(uint,QString)),
26
+ worker, SLOT(loadUrl(uint,QString)), Qt::QueuedConnection);
27
+ connect(this, SIGNAL(requestFinalize(uint)),
28
+ worker, SLOT(finalize(uint)), Qt::QueuedConnection);
29
+
30
+ connect(worker, SIGNAL(finished(uint,QString)),
31
+ this, SLOT(finished(uint,QString)), Qt::QueuedConnection);
32
+ }
33
+
34
+ void Sunscraper::loadHtml(QString html)
35
+ {
36
+ emit requestLoadHtml(_queryId, html);
37
+ }
38
+
39
+ void Sunscraper::loadUrl(QString url)
40
+ {
41
+ emit requestLoadUrl(_queryId, url);
42
+ }
43
+
44
+ void Sunscraper::wait(unsigned timeout)
45
+ {
46
+ QTimer _timeoutTimer;
47
+ connect(&_timeoutTimer, SIGNAL(timeout()), this, SLOT(timeout()));
48
+
49
+ _timeoutTimer.setInterval(timeout);
50
+ _timeoutTimer.start();
51
+
52
+ _eventLoop.exec();
53
+
54
+ _timeoutTimer.stop();
55
+ }
56
+
57
+ void Sunscraper::finished(unsigned eventQueryId, QString html)
58
+ {
59
+ if(eventQueryId != _queryId)
60
+ return;
61
+
62
+ _eventLoop.quit();
63
+
64
+ _html = html.toUtf8();
65
+
66
+ emit requestFinalize(_queryId);
67
+ }
68
+
69
+ void Sunscraper::timeout()
70
+ {
71
+ _eventLoop.quit();
72
+
73
+ _html = "!SUNSCRAPER_TIMEOUT";
74
+
75
+ emit requestFinalize(_queryId);
76
+ }
77
+
78
+ QByteArray Sunscraper::fetch()
79
+ {
80
+ return _html;
81
+ }
82
+
83
+ const char *Sunscraper::fetchAsCString()
84
+ {
85
+ return _html.constData();
86
+ }
@@ -0,0 +1,45 @@
1
+ #ifndef SUNSCRAPER_H
2
+ #define SUNSCRAPER_H
3
+
4
+ #include <QObject>
5
+ #include <QString>
6
+ #include <QMutex>
7
+ #include <QByteArray>
8
+ #include <QEventLoop>
9
+
10
+ class QWebPage;
11
+
12
+ class Sunscraper : public QObject
13
+ {
14
+     Q_OBJECT
15
+     // One scrape request, identified by _queryId and served by the library's worker thread.
16
+ public:
17
+     Sunscraper();
18
+     // Start a load. Both calls only emit the matching request signal with this query's id.
19
+     void loadHtml(QString html);
20
+     void loadUrl(QString url);
21
+     // Runs a local event loop until finished() or timeout() quits it; timeout is in milliseconds.
22
+     void wait(unsigned timeout);
23
+     // Result of the last wait(): the page HTML as UTF-8, or "!SUNSCRAPER_TIMEOUT" after a timeout.
24
+     QByteArray fetch();
25
+     const char *fetchAsCString(); // points into _html, so it is only valid while this object lives
26
+
27
+ private slots:
28
+     void finished(unsigned queryId, QString html); // ignores results that carry another query's id
29
+     void timeout();
30
+
31
+ signals:
32
+     void requestLoadHtml(unsigned queryId, QString html);
33
+     void requestLoadUrl(unsigned queryId, QString url);
34
+     void requestFinalize(unsigned queryId);
35
+
36
+ private:
37
+     static unsigned _nextQueryId; // next id to hand out; incremented under _staticMutex
38
+     static QMutex _staticMutex;
39
+
40
+     unsigned _queryId;
41
+     QEventLoop _eventLoop;
42
+     QByteArray _html;
43
+ };
44
+
45
+ #endif // SUNSCRAPER_H
@@ -0,0 +1,15 @@
1
+ QT += webkit
2
+
3
+ TARGET = sunscraper
4
+ TEMPLATE = lib
5
+
6
+ SOURCES += sunscraperlibrary.cpp \
7
+ sunscraperthread.cpp \
8
+ sunscraperexternal.cpp \
9
+ sunscraper.cpp \
10
+ sunscraperproxy.cpp
11
+
12
+ HEADERS += sunscraperlibrary.h \
13
+ sunscraperthread.h \
14
+ sunscraper.h \
15
+ sunscraperproxy.h
@@ -0,0 +1,33 @@
1
+ #include "sunscraper.h"
2
+
3
+ extern "C" {
4
+ Sunscraper *sunscraper_create()
5
+ {
6
+ return new Sunscraper();
7
+ }
8
+
9
+ void sunscraper_load_html(Sunscraper *sunscraper, const char *html)
10
+ {
11
+ sunscraper->loadHtml(html);
12
+ }
13
+
14
+ void sunscraper_load_url(Sunscraper *sunscraper, const char *url)
15
+ {
16
+ sunscraper->loadUrl(url);
17
+ }
18
+
19
+ void sunscraper_wait(Sunscraper *sunscraper, unsigned timeout)
20
+ {
21
+ sunscraper->wait(timeout);
22
+ }
23
+
24
+ const char *sunscraper_fetch(Sunscraper *sunscraper)
25
+ {
26
+ return sunscraper->fetchAsCString();
27
+ }
28
+
29
+ void sunscraper_discard(Sunscraper *sunscraper)
30
+ {
31
+ delete sunscraper;
32
+ }
33
+ }
@@ -0,0 +1,25 @@
1
+ #include "sunscraperlibrary.h"
2
+ #include "sunscraperthread.h"
3
+
4
+ SunscraperLibrary SunscraperLibrary::_instance;
5
+
6
+ SunscraperLibrary::SunscraperLibrary()
7
+ {
8
+ _apartmentThread = new SunscraperThread();
9
+ _apartmentThread->start();
10
+ }
11
+
12
+ SunscraperLibrary::~SunscraperLibrary()
13
+ {
14
+ /* Do nothing. This is on purpose. */
15
+ }
16
+
17
+ SunscraperLibrary *SunscraperLibrary::instance()
18
+ {
19
+ return &_instance;
20
+ }
21
+
22
+ SunscraperThread *SunscraperLibrary::thread()
23
+ {
24
+ return _apartmentThread;
25
+ }
@@ -0,0 +1,22 @@
1
+ #ifndef SUNSCRAPERLIBRARY_H
2
+ #define SUNSCRAPERLIBRARY_H
3
+
4
+ class SunscraperThread;
5
+ // Holder of the single static library instance, which creates and starts the worker thread.
6
+ class SunscraperLibrary {
7
+ public:
8
+ static SunscraperLibrary *instance(); // address of the static _instance
9
+
10
+ SunscraperThread *thread(); // the worker thread created by the constructor
11
+
12
+ private:
13
+ SunscraperLibrary(); // private: only the static _instance below is constructed
14
+ SunscraperLibrary(SunscraperLibrary &); // declared private; no definition is visible in this view
15
+ ~SunscraperLibrary();
16
+
17
+ static SunscraperLibrary _instance;
18
+
19
+ SunscraperThread *_apartmentThread;
20
+ };
21
+
22
+ #endif // SUNSCRAPERLIBRARY_H
@@ -0,0 +1,13 @@
1
+ #include <QWebPage>
2
+ #include <QWebFrame>
3
+ #include "sunscraperproxy.h"
4
+
5
+ SunscraperProxy::SunscraperProxy(QWebPage *parent, unsigned queryId) :
6
+ QObject(parent), _webPage(parent), _queryId(queryId)
7
+ {
8
+ }
9
+
10
+ void SunscraperProxy::finish()
11
+ {
12
+ emit finished(_queryId, _webPage->mainFrame()->toHtml());
13
+ }
@@ -0,0 +1,24 @@
1
+ #ifndef SUNSCRAPERPROXY_H
2
+ #define SUNSCRAPERPROXY_H
3
+
4
+ #include <QObject>
5
+
6
+ class QWebPage;
7
+
8
+ class SunscraperProxy : public QObject
9
+ {
10
+ Q_OBJECT
11
+ public:
12
+ SunscraperProxy(QWebPage *parent, unsigned queryId);
13
+
14
+ Q_INVOKABLE void finish();
15
+
16
+ signals:
17
+ void finished(unsigned _queryId, QString html);
18
+
19
+ private:
20
+ QWebPage *_webPage;
21
+ unsigned _queryId;
22
+ };
23
+
24
+ #endif // SUNSCRAPERPROXY_H
@@ -0,0 +1,67 @@
1
+ #include <QApplication>
2
+ #include <QWebPage>
3
+ #include <QWebFrame>
4
+ #include "sunscraperthread.h"
5
+ #include "sunscraperproxy.h"
6
+
7
+ SunscraperThread::SunscraperThread()
8
+ {
9
+ }
10
+
11
+ void SunscraperThread::run()
12
+ {
13
+ static int argc;
14
+ static char **argv = {NULL};
15
+
16
+ QApplication app(argc, argv);
17
+ app.exec();
18
+
19
+ qFatal("Sunscraper apartment thread event loop should never end");
20
+ }
21
+
22
+ void SunscraperThread::loadHtml(unsigned queryId, QString html)
23
+ {
24
+ QWebPage *webPage = initializeWebPage(queryId);
25
+ webPage->mainFrame()->setHtml(html);
26
+ }
27
+
28
+ void SunscraperThread::loadUrl(unsigned queryId, QString url)
29
+ {
30
+ QWebPage *webPage = initializeWebPage(queryId);
31
+ webPage->mainFrame()->load(url);
32
+ }
33
+
34
+ void SunscraperThread::finalize(unsigned queryId)
35
+ {
36
+     // Releases the page registered for queryId. take() removes the entry and returns NULL for an
37
+     // unknown id, whereas operator[] would insert a NULL entry that was then dereferenced.
38
+     QWebPage *webPage = _webPages.take(queryId);
39
+     if(webPage != NULL) webPage->deleteLater();
40
+ }
41
+
42
+ QWebPage *SunscraperThread::initializeWebPage(unsigned queryId)
43
+ {
44
+ Q_ASSERT(_webPages[queryId] == NULL);
45
+
46
+ QWebPage *webPage = new QWebPage(this);
47
+ connect(webPage->mainFrame(), SIGNAL(javaScriptWindowObjectCleared()),
48
+ this, SLOT(attachAPI()));
49
+
50
+ _webPages[queryId] = webPage;
51
+
52
+ return webPage;
53
+ }
54
+
55
+ void SunscraperThread::attachAPI()
56
+ {
57
+ QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
58
+ QWebPage *page = origin->page();
59
+
60
+ unsigned queryId = _webPages.key(page, 0);
61
+ Q_ASSERT(queryId != 0);
62
+
63
+ SunscraperProxy *proxy = new SunscraperProxy(page, queryId);
64
+ connect(proxy, SIGNAL(finished(uint,QString)), this, SIGNAL(finished(uint,QString)));
65
+
66
+ origin->addToJavaScriptWindowObject("Sunscraper", proxy, QScriptEngine::QtOwnership);
67
+ }
@@ -0,0 +1,34 @@
1
+ #ifndef SUNSCRAPERTHREAD_H
2
+ #define SUNSCRAPERTHREAD_H
3
+
4
+ #include <QThread>
5
+ #include <QMap>
6
+
7
+ class QWebPage;
8
+
9
+ class SunscraperThread : public QThread
10
+ {
11
+ Q_OBJECT
12
+ public:
13
+ SunscraperThread();
14
+
15
+ void run();
16
+
17
+ signals:
18
+ void finished(unsigned queryId, QString result);
19
+
20
+ public slots:
21
+ void loadHtml(unsigned queryId, QString html);
22
+ void loadUrl(unsigned queryId, QString url);
23
+ void finalize(unsigned queryId);
24
+
25
+ private slots:
26
+ void attachAPI();
27
+
28
+ private:
29
+ QMap<unsigned, QWebPage *> _webPages;
30
+
31
+ QWebPage *initializeWebPage(unsigned queryId);
32
+ };
33
+
34
+ #endif // SUNSCRAPERTHREAD_H
@@ -0,0 +1,50 @@
1
+ require 'sunscraper/library'
2
+
3
+ # Sunscraper loads an HTML page in a headless browser and waits for `Sunscraper.finish()`
4
+ # method to be called. It blocks the calling thread, but is threadsafe, does
5
+ # not acquire GIL and thus can be called from multiple threads simultaneously.
6
+ module Sunscraper
7
+ # ScrapeTimeout error is raised when the page could not be loaded fast enough.
8
+ class ScrapeTimeout < StandardError; end
9
+
10
+ class << self
11
+ # Scrape an inline HTML. The content is loaded without a particular base URL.
12
+ # If your application depends on base URL being available, use {scrape_url}.
13
+ #
14
+ # @param [Integer] timeout timeout in milliseconds
15
+ def scrape_html(html, timeout=5000)
16
+ scrape(timeout) do |context|
17
+ Library.load_html context, html
18
+ end
19
+ end
20
+
21
+ # Scrape an URL.
22
+ #
23
+ # @param [Integer] timeout timeout in milliseconds
24
+ def scrape_url(url, timeout=5000)
25
+ scrape(timeout) do |context|
26
+ Library.load_url context, url
27
+ end
28
+ end
29
+
30
+ private
31
+
32
+ def scrape(timeout)
33
+ context = Library.create
34
+
35
+ yield context
36
+
37
+ Library.wait(context, timeout)
38
+
39
+ data = Library.fetch(context)
40
+
41
+ if data == "!SUNSCRAPER_TIMEOUT"
42
+ raise ScrapeTimeout, "Sunscraper has timed out waiting for the callback"
43
+ else
44
+ data
45
+ end
46
+ ensure
47
+ Library.discard(context) if context
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,37 @@
1
+ if !defined?(RUBY_ENGINE) && RUBY_VERSION =~ /^1.8/
2
+ raise RuntimeError, "Sunscraper does not work on Ruby MRI 1.8.x."
3
+ end
4
+
5
+ require 'ffi'
6
+
7
+ # @private
8
+ module Sunscraper::Library
9
+ extend FFI::Library
10
+
11
+ # RbConfig sniffing does not work on JRuby.
12
+ if Gem.win_platform?
13
+ extension = 'dll'
14
+ elsif RUBY_PLATFORM =~ /darwin/i
15
+ extension = 'dylib'
16
+ else
17
+ extension = 'so'
18
+ end
19
+
20
+ ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
21
+ 'ext', "libsunscraper.#{extension}")
22
+
23
+ attach_function 'create', :sunscraper_create, [], :pointer
24
+ attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
25
+ attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
26
+ attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
27
+ attach_function 'discard', :sunscraper_discard, [:pointer], :void
28
+
29
+ if RUBY_ENGINE == 'ruby'
30
+ # MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
31
+ attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
32
+ else
33
+ # Rubinius does not have GVL neither it has options in attach_function.
34
+ # Same for JRuby.
35
+ attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
36
+ end
37
+ end
@@ -0,0 +1,4 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'sunscraper'
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
+ require 'webrick'
4
+
5
+ HTML = <<HTML
6
+ <html>
7
+ <head>
8
+ <script type="text/javascript">
9
+ document.addEventListener("DOMContentLoaded", function() {
10
+ document.getElementById('fuga').textContent =
11
+ ("!skrow tI").split("").reverse().join("");
12
+ Sunscraper.finish();
13
+ }, true);
14
+ </script>
15
+ </head>
16
+ <body>
17
+ <div id='fuga'></div>
18
+ </body>
19
+ </html>
20
+ HTML
21
+
22
+ PORT = 45555
23
+
24
+ describe Sunscraper do
25
+ it "can scrape an HTML provided as a string" do
26
+ Sunscraper.scrape_html(HTML).should include('It works!')
27
+ end
28
+
29
+ it "can scrape an URL" do
30
+ server = WEBrick::HTTPServer.new :Port => PORT, :Logger => WEBrick::Log.new('/dev/null'), :AccessLog => []
31
+ server.mount_proc '/' do |req, res|
32
+ res.body = HTML
33
+ end
34
+ Thread.new { server.start }
35
+
36
+ Sunscraper.scrape_url("http://localhost:#{PORT}/").should include('It works!')
37
+
38
+ server.stop
39
+ end
40
+
41
+ it "should time out if callback is not called" do
42
+ lambda { Sunscraper.scrape_html("<!-- nothing. at least no callbacks -->", 1000) }.
43
+ should raise_exception(Sunscraper::ScrapeTimeout)
44
+ end
45
+ end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = "sunscraper"
6
+ s.version = "1.0.0"
7
+ s.authors = ["Peter Zotov"]
8
+ s.email = ["whitequark@whitequark.org"]
9
+ s.homepage = "http://github.com/roundlake/sunscraper"
10
+ s.summary = %q{A WebKit-based, JavaScript-capable HTML scraper.}
11
+ s.description = s.summary
12
+
13
+ s.rubyforge_project = "sunscraper"
14
+
15
+ s.files = `git ls-files`.split("\n")
16
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
17
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
18
+ s.extensions = ["ext/extconf.rb"]
19
+ s.require_paths = ["lib"]
20
+
21
+ s.add_development_dependency "rspec"
22
+ s.add_runtime_dependency "ffi", '>= 1.0.11'
23
+ end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sunscraper
3
+ version: !ruby/object:Gem::Version
4
+ hash: 540260530
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 0
10
+ version: 1.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Peter Zotov
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-02-18 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 881230260
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :development
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: ffi
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ hash: 716237343
43
+ segments:
44
+ - 1
45
+ - 0
46
+ - 11
47
+ version: 1.0.11
48
+ type: :runtime
49
+ version_requirements: *id002
50
+ description: A WebKit-based, JavaScript-capable HTML scraper.
51
+ email:
52
+ - whitequark@whitequark.org
53
+ executables: []
54
+
55
+ extensions:
56
+ - ext/extconf.rb
57
+ extra_rdoc_files: []
58
+
59
+ files:
60
+ - .gitignore
61
+ - .rspec
62
+ - .yardopts
63
+ - Gemfile
64
+ - LICENSE
65
+ - README.md
66
+ - Rakefile
67
+ - ext/.gitignore
68
+ - ext/Makefile
69
+ - ext/extconf.rb
70
+ - ext/sunscraper.cpp
71
+ - ext/sunscraper.h
72
+ - ext/sunscraper.pro
73
+ - ext/sunscraperexternal.cpp
74
+ - ext/sunscraperlibrary.cpp
75
+ - ext/sunscraperlibrary.h
76
+ - ext/sunscraperproxy.cpp
77
+ - ext/sunscraperproxy.h
78
+ - ext/sunscraperthread.cpp
79
+ - ext/sunscraperthread.h
80
+ - lib/sunscraper.rb
81
+ - lib/sunscraper/library.rb
82
+ - spec/spec_helper.rb
83
+ - spec/sunscraper_spec.rb
84
+ - sunscraper.gemspec
85
+ homepage: http://github.com/roundlake/sunscraper
86
+ licenses: []
87
+
88
+ post_install_message:
89
+ rdoc_options: []
90
+
91
+ require_paths:
92
+ - lib
93
+ required_ruby_version: !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ hash: 881230260
99
+ segments:
100
+ - 0
101
+ version: "0"
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ hash: 881230260
108
+ segments:
109
+ - 0
110
+ version: "0"
111
+ requirements: []
112
+
113
+ rubyforge_project: sunscraper
114
+ rubygems_version: 1.8.12
115
+ signing_key:
116
+ specification_version: 3
117
+ summary: A WebKit-based, JavaScript-capable HTML scraper.
118
+ test_files: []
119
+
120
+ has_rdoc: