sunscraper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ doc/*
6
+ .rbx/*
7
+ .yardoc/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
@@ -0,0 +1 @@
1
+ --no-private --markup markdown - LICENSE
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+ # Fetch gems over HTTPS so downloads cannot be tampered with in transit.
3
+ # Specify your gem's dependencies in sunscraper.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (C) 2012 Peter Zotov <whitequark@whitequark.org>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7
+ of the Software, and to permit persons to whom the Software is furnished to do
8
+ so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,68 @@
1
+ Sunscraper
2
+ ==========
3
+
4
+ Sunscraper is a gem for prerendering pages with hashbang URLs like `http://whatever.com/#!/page`.
5
+
6
+ It works by loading content in the embedded web browser and waiting for a JavaScript method to be
7
+ called.
8
+
9
+ HTML = %{
10
+ <html>
11
+ <head>
12
+ <script type="text/javascript">
13
+ document.addEventListener("DOMContentLoaded", function() {
14
+ document.getElementById('fuga').textContent =
15
+ ("!skrow tI").split("").reverse().join("");
16
+ Sunscraper.finish();
17
+ }, true);
18
+ </script>
19
+ </head>
20
+ <body>
21
+ <div id='fuga'></div>
22
+ </body>
23
+ </html>
24
+ }
25
+
26
+ Sunscraper.scrape_html(HTML).include?('It works!') # => true
27
+
28
+ See also [documentation][].
29
+
30
+ [documentation]: http://rdoc.info/gems/sunscraper/Sunscraper
31
+
32
+ Installation
33
+ ------------
34
+
35
+ Sunscraper requires Qt 4.x and QtWebkit packages to be installed on the target system. *Sunscraper is not a Ruby
36
+ C extension*; it works by building a Qt shared library and loading it through [FFI][].
37
+
38
+ [FFI]: http://en.wikipedia.org/wiki/Foreign_Function_Interface
39
+
40
+ gem install sunscraper
41
+
42
+ Runtime requirements
43
+ --------------------
44
+
45
+ On Linux with Qt versions <= 4.8, Sunscraper requires a running X server and a valid `DISPLAY` environment
46
+ variable. Consider using [Xvfb][] on a GUI-less production server.
47
+
48
+ [Xvfb]: http://www.x.org/releases/X11R7.6/doc/man/man1/Xvfb.1.xhtml
49
+
50
+ Compatibility
51
+ -------------
52
+
53
+ Sunscraper should be compatible with all major Ruby implementations on all major operating systems, including
54
+ Ruby MRI 1.9, JRuby, Rubinius and MacRuby running on GNU/Linux, OS X and Windows.
55
+
56
+ JRuby versions up to 1.6.5 are known not to work due to a bug in its FFI library.
57
+
58
+ Ruby MRI 1.8 is not supported because of its braindead threading model, and it never will be because I don't care.
59
+
60
+ Thread safety
61
+ -------------
62
+
63
+ Sunscraper is thread-safe.
64
+
65
+ License
66
+ -------
67
+
68
+ Sunscraper is distributed under the terms of the MIT license; see LICENSE in the source distribution.
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,4 @@
1
+ *.user
2
+ *.o
3
+ *.so*
4
+ moc_*
@@ -0,0 +1,270 @@
1
+ #############################################################################
2
+ # Makefile for building: libsunscraper.so.1.0.0
3
+ # Generated by qmake (2.01a) (Qt 4.7.4) on: Sat Feb 18 05:54:02 2012
4
+ # Project: sunscraper.pro
5
+ # Template: lib
6
+ # Command: /usr/bin/qmake-qt4 -spec /usr/share/qt4/mkspecs/linux-g++ CONFIG+=debug -o Makefile sunscraper.pro
7
+ #############################################################################
8
+
9
+ ####### Compiler, tools and options
10
+
11
+ CC = gcc
12
+ CXX = g++
13
+ DEFINES = -DQT_WEBKIT -DQT_WEBKIT_LIB -DQT_GUI_LIB -DQT_CORE_LIB -DQT_SHARED
14
+ CFLAGS = -pipe -g -Wall -W -D_REENTRANT -fPIC $(DEFINES)
15
+ CXXFLAGS = -pipe -g -Wall -W -D_REENTRANT -fPIC $(DEFINES)
16
+ INCPATH = -I/usr/share/qt4/mkspecs/linux-g++ -I. -I/usr/include/qt4/QtCore -I/usr/include/qt4/QtGui -I/usr/include/qt4/QtWebKit -I/usr/include/qt4 -I.
17
+ LINK = g++
18
+ LFLAGS = -shared -Wl,-soname,libsunscraper.so.1
19
+ LIBS = $(SUBLIBS) -L/usr/lib -lQtWebKit -lQtGui -lQtCore -lpthread
20
+ AR = ar cqs
21
+ RANLIB =
22
+ QMAKE = /usr/bin/qmake-qt4
23
+ TAR = tar -cf
24
+ COMPRESS = gzip -9f
25
+ COPY = cp -f
26
+ SED = sed
27
+ COPY_FILE = $(COPY)
28
+ COPY_DIR = $(COPY) -r
29
+ STRIP = strip
30
+ INSTALL_FILE = install -m 644 -p
31
+ INSTALL_DIR = $(COPY_DIR)
32
+ INSTALL_PROGRAM = install -m 755 -p
33
+ DEL_FILE = rm -f
34
+ SYMLINK = ln -f -s
35
+ DEL_DIR = rmdir
36
+ MOVE = mv -f
37
+ CHK_DIR_EXISTS= test -d
38
+ MKDIR = mkdir -p
39
+
40
+ ####### Output directory
41
+
42
+ OBJECTS_DIR = ./
43
+
44
+ ####### Files
45
+
46
+ SOURCES = sunscraperlibrary.cpp \
47
+ sunscraperthread.cpp \
48
+ sunscraperexternal.cpp \
49
+ sunscraper.cpp \
50
+ sunscraperproxy.cpp moc_sunscraperthread.cpp \
51
+ moc_sunscraper.cpp \
52
+ moc_sunscraperproxy.cpp
53
+ OBJECTS = sunscraperlibrary.o \
54
+ sunscraperthread.o \
55
+ sunscraperexternal.o \
56
+ sunscraper.o \
57
+ sunscraperproxy.o \
58
+ moc_sunscraperthread.o \
59
+ moc_sunscraper.o \
60
+ moc_sunscraperproxy.o
61
+ DIST = /usr/share/qt4/mkspecs/common/g++.conf \
62
+ /usr/share/qt4/mkspecs/common/unix.conf \
63
+ /usr/share/qt4/mkspecs/common/linux.conf \
64
+ /usr/share/qt4/mkspecs/qconfig.pri \
65
+ /usr/share/qt4/mkspecs/modules/qt_webkit_version.pri \
66
+ /usr/share/qt4/mkspecs/features/qt_functions.prf \
67
+ /usr/share/qt4/mkspecs/features/qt_config.prf \
68
+ /usr/share/qt4/mkspecs/features/exclusive_builds.prf \
69
+ /usr/share/qt4/mkspecs/features/default_pre.prf \
70
+ /usr/share/qt4/mkspecs/features/debug.prf \
71
+ /usr/share/qt4/mkspecs/features/default_post.prf \
72
+ /usr/share/qt4/mkspecs/features/warn_on.prf \
73
+ /usr/share/qt4/mkspecs/features/qt.prf \
74
+ /usr/share/qt4/mkspecs/features/unix/thread.prf \
75
+ /usr/share/qt4/mkspecs/features/moc.prf \
76
+ /usr/share/qt4/mkspecs/features/resources.prf \
77
+ /usr/share/qt4/mkspecs/features/uic.prf \
78
+ /usr/share/qt4/mkspecs/features/yacc.prf \
79
+ /usr/share/qt4/mkspecs/features/lex.prf \
80
+ /usr/share/qt4/mkspecs/features/include_source_dir.prf \
81
+ sunscraper.pro
82
+ QMAKE_TARGET = sunscraper
83
+ DESTDIR =
84
+ TARGET = libsunscraper.so.1.0.0
85
+ TARGETA = libsunscraper.a
86
+ TARGETD = libsunscraper.so.1.0.0
87
+ TARGET0 = libsunscraper.so
88
+ TARGET1 = libsunscraper.so.1
89
+ TARGET2 = libsunscraper.so.1.0
90
+
91
+ first: all
92
+ ####### Implicit rules
93
+
94
+ .SUFFIXES: .o .c .cpp .cc .cxx .C
95
+
96
+ .cpp.o:
97
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"
98
+
99
+ .cc.o:
100
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"
101
+
102
+ .cxx.o:
103
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"
104
+
105
+ .C.o:
106
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"
107
+
108
+ .c.o:
109
+ $(CC) -c $(CFLAGS) $(INCPATH) -o "$@" "$<"
110
+
111
+ ####### Build rules
112
+
113
+ all: Makefile $(TARGET)
114
+
115
+ $(TARGET): $(OBJECTS) $(SUBLIBS) $(OBJCOMP)
116
+ -$(DEL_FILE) $(TARGET) $(TARGET0) $(TARGET1) $(TARGET2)
117
+ $(LINK) $(LFLAGS) -o $(TARGET) $(OBJECTS) $(LIBS) $(OBJCOMP)
118
+ -ln -s $(TARGET) $(TARGET0)
119
+ -ln -s $(TARGET) $(TARGET1)
120
+ -ln -s $(TARGET) $(TARGET2)
121
+
122
+
123
+
124
+ staticlib: $(TARGETA)
125
+
126
+ $(TARGETA): $(OBJECTS) $(OBJCOMP)
127
+ -$(DEL_FILE) $(TARGETA)
128
+ $(AR) $(TARGETA) $(OBJECTS)
129
+
130
+ Makefile: sunscraper.pro /usr/share/qt4/mkspecs/linux-g++/qmake.conf /usr/share/qt4/mkspecs/common/g++.conf \
131
+ /usr/share/qt4/mkspecs/common/unix.conf \
132
+ /usr/share/qt4/mkspecs/common/linux.conf \
133
+ /usr/share/qt4/mkspecs/qconfig.pri \
134
+ /usr/share/qt4/mkspecs/modules/qt_webkit_version.pri \
135
+ /usr/share/qt4/mkspecs/features/qt_functions.prf \
136
+ /usr/share/qt4/mkspecs/features/qt_config.prf \
137
+ /usr/share/qt4/mkspecs/features/exclusive_builds.prf \
138
+ /usr/share/qt4/mkspecs/features/default_pre.prf \
139
+ /usr/share/qt4/mkspecs/features/debug.prf \
140
+ /usr/share/qt4/mkspecs/features/default_post.prf \
141
+ /usr/share/qt4/mkspecs/features/warn_on.prf \
142
+ /usr/share/qt4/mkspecs/features/qt.prf \
143
+ /usr/share/qt4/mkspecs/features/unix/thread.prf \
144
+ /usr/share/qt4/mkspecs/features/moc.prf \
145
+ /usr/share/qt4/mkspecs/features/resources.prf \
146
+ /usr/share/qt4/mkspecs/features/uic.prf \
147
+ /usr/share/qt4/mkspecs/features/yacc.prf \
148
+ /usr/share/qt4/mkspecs/features/lex.prf \
149
+ /usr/share/qt4/mkspecs/features/include_source_dir.prf \
150
+ /usr/lib/libQtWebKit.prl \
151
+ /usr/lib/libQtGui.prl \
152
+ /usr/lib/libQtCore.prl
153
+ $(QMAKE) -spec /usr/share/qt4/mkspecs/linux-g++ CONFIG+=debug -o Makefile sunscraper.pro
154
+ /usr/share/qt4/mkspecs/common/g++.conf:
155
+ /usr/share/qt4/mkspecs/common/unix.conf:
156
+ /usr/share/qt4/mkspecs/common/linux.conf:
157
+ /usr/share/qt4/mkspecs/qconfig.pri:
158
+ /usr/share/qt4/mkspecs/modules/qt_webkit_version.pri:
159
+ /usr/share/qt4/mkspecs/features/qt_functions.prf:
160
+ /usr/share/qt4/mkspecs/features/qt_config.prf:
161
+ /usr/share/qt4/mkspecs/features/exclusive_builds.prf:
162
+ /usr/share/qt4/mkspecs/features/default_pre.prf:
163
+ /usr/share/qt4/mkspecs/features/debug.prf:
164
+ /usr/share/qt4/mkspecs/features/default_post.prf:
165
+ /usr/share/qt4/mkspecs/features/warn_on.prf:
166
+ /usr/share/qt4/mkspecs/features/qt.prf:
167
+ /usr/share/qt4/mkspecs/features/unix/thread.prf:
168
+ /usr/share/qt4/mkspecs/features/moc.prf:
169
+ /usr/share/qt4/mkspecs/features/resources.prf:
170
+ /usr/share/qt4/mkspecs/features/uic.prf:
171
+ /usr/share/qt4/mkspecs/features/yacc.prf:
172
+ /usr/share/qt4/mkspecs/features/lex.prf:
173
+ /usr/share/qt4/mkspecs/features/include_source_dir.prf:
174
+ /usr/lib/libQtWebKit.prl:
175
+ /usr/lib/libQtGui.prl:
176
+ /usr/lib/libQtCore.prl:
177
+ qmake: FORCE
178
+ @$(QMAKE) -spec /usr/share/qt4/mkspecs/linux-g++ CONFIG+=debug -o Makefile sunscraper.pro
179
+
180
+ dist:
181
+ @$(CHK_DIR_EXISTS) .tmp/sunscraper1.0.0 || $(MKDIR) .tmp/sunscraper1.0.0
182
+ $(COPY_FILE) --parents $(SOURCES) $(DIST) .tmp/sunscraper1.0.0/ && $(COPY_FILE) --parents sunscraperlibrary.h sunscraperthread.h sunscraper.h sunscraperproxy.h .tmp/sunscraper1.0.0/ && $(COPY_FILE) --parents sunscraperlibrary.cpp sunscraperthread.cpp sunscraperexternal.cpp sunscraper.cpp sunscraperproxy.cpp .tmp/sunscraper1.0.0/ && (cd `dirname .tmp/sunscraper1.0.0` && $(TAR) sunscraper1.0.0.tar sunscraper1.0.0 && $(COMPRESS) sunscraper1.0.0.tar) && $(MOVE) `dirname .tmp/sunscraper1.0.0`/sunscraper1.0.0.tar.gz . && $(DEL_FILE) -r .tmp/sunscraper1.0.0
183
+
184
+
185
+ clean:compiler_clean
186
+ -$(DEL_FILE) $(OBJECTS)
187
+ -$(DEL_FILE) *~ core *.core
188
+
189
+
190
+ ####### Sub-libraries
191
+
192
+ distclean: clean
193
+ -$(DEL_FILE) $(TARGET)
194
+ -$(DEL_FILE) $(TARGET0) $(TARGET1) $(TARGET2) $(TARGETA)
195
+ -$(DEL_FILE) Makefile
196
+
197
+
198
+ check: first
199
+
200
+ mocclean: compiler_moc_header_clean compiler_moc_source_clean
201
+
202
+ mocables: compiler_moc_header_make_all compiler_moc_source_make_all
203
+
204
+ compiler_moc_header_make_all: moc_sunscraperthread.cpp moc_sunscraper.cpp moc_sunscraperproxy.cpp
205
+ compiler_moc_header_clean:
206
+ -$(DEL_FILE) moc_sunscraperthread.cpp moc_sunscraper.cpp moc_sunscraperproxy.cpp
207
+ moc_sunscraperthread.cpp: sunscraperthread.h
208
+ /usr/bin/moc-qt4 $(DEFINES) $(INCPATH) sunscraperthread.h -o moc_sunscraperthread.cpp
209
+
210
+ moc_sunscraper.cpp: sunscraper.h
211
+ /usr/bin/moc-qt4 $(DEFINES) $(INCPATH) sunscraper.h -o moc_sunscraper.cpp
212
+
213
+ moc_sunscraperproxy.cpp: sunscraperproxy.h
214
+ /usr/bin/moc-qt4 $(DEFINES) $(INCPATH) sunscraperproxy.h -o moc_sunscraperproxy.cpp
215
+
216
+ compiler_rcc_make_all:
217
+ compiler_rcc_clean:
218
+ compiler_image_collection_make_all: qmake_image_collection.cpp
219
+ compiler_image_collection_clean:
220
+ -$(DEL_FILE) qmake_image_collection.cpp
221
+ compiler_moc_source_make_all:
222
+ compiler_moc_source_clean:
223
+ compiler_uic_make_all:
224
+ compiler_uic_clean:
225
+ compiler_yacc_decl_make_all:
226
+ compiler_yacc_decl_clean:
227
+ compiler_yacc_impl_make_all:
228
+ compiler_yacc_impl_clean:
229
+ compiler_lex_make_all:
230
+ compiler_lex_clean:
231
+ compiler_clean: compiler_moc_header_clean
232
+
233
+ ####### Compile
234
+
235
+ sunscraperlibrary.o: sunscraperlibrary.cpp sunscraperlibrary.h \
236
+ sunscraperthread.h
237
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraperlibrary.o sunscraperlibrary.cpp
238
+
239
+ sunscraperthread.o: sunscraperthread.cpp sunscraperthread.h \
240
+ sunscraperproxy.h
241
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraperthread.o sunscraperthread.cpp
242
+
243
+ sunscraperexternal.o: sunscraperexternal.cpp sunscraper.h
244
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraperexternal.o sunscraperexternal.cpp
245
+
246
+ sunscraper.o: sunscraper.cpp sunscraper.h \
247
+ sunscraperlibrary.h \
248
+ sunscraperthread.h
249
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraper.o sunscraper.cpp
250
+
251
+ sunscraperproxy.o: sunscraperproxy.cpp sunscraperproxy.h
252
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o sunscraperproxy.o sunscraperproxy.cpp
253
+
254
+ moc_sunscraperthread.o: moc_sunscraperthread.cpp
255
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o moc_sunscraperthread.o moc_sunscraperthread.cpp
256
+
257
+ moc_sunscraper.o: moc_sunscraper.cpp
258
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o moc_sunscraper.o moc_sunscraper.cpp
259
+
260
+ moc_sunscraperproxy.o: moc_sunscraperproxy.cpp
261
+ $(CXX) -c $(CXXFLAGS) $(INCPATH) -o moc_sunscraperproxy.o moc_sunscraperproxy.cpp
262
+
263
+ ####### Install
264
+
265
+ install: FORCE
266
+
267
+ uninstall: FORCE
268
+
269
+ FORCE:
270
+
@@ -0,0 +1,9 @@
1
+ # This Makefile will get replaced by qmake.
2
+
3
+ File.open("Makefile", "w") do |mf|
4
+ mf.puts <<-ENDM
5
+ all:
6
+ qmake
7
+ make
8
+ ENDM
9
+ end
@@ -0,0 +1,86 @@
1
+ #include <QApplication>
2
+ #include <QThread>
3
+ #include <QTimer>
4
+ #include <QWebPage>
5
+ #include <QWebFrame>
6
+ #include <QMutexLocker>
7
+ #include <QtDebug>
8
+ #include "sunscraper.h"
9
+ #include "sunscraperlibrary.h"
10
+ #include "sunscraperthread.h"
11
+
12
+ unsigned Sunscraper::_nextQueryId = 1;
13
+ QMutex Sunscraper::_staticMutex;
14
+
15
+ Sunscraper::Sunscraper()
16
+ {
17
+ QMutexLocker locker(&_staticMutex);
18
+
19
+ _queryId = _nextQueryId++;
20
+
21
+ SunscraperThread *worker = SunscraperLibrary::instance()->thread();
22
+
23
+ connect(this, SIGNAL(requestLoadHtml(uint,QString)),
24
+ worker, SLOT(loadHtml(uint,QString)), Qt::QueuedConnection);
25
+ connect(this, SIGNAL(requestLoadUrl(uint,QString)),
26
+ worker, SLOT(loadUrl(uint,QString)), Qt::QueuedConnection);
27
+ connect(this, SIGNAL(requestFinalize(uint)),
28
+ worker, SLOT(finalize(uint)), Qt::QueuedConnection);
29
+
30
+ connect(worker, SIGNAL(finished(uint,QString)),
31
+ this, SLOT(finished(uint,QString)), Qt::QueuedConnection);
32
+ }
33
+
34
+ void Sunscraper::loadHtml(QString html)
35
+ {
36
+ emit requestLoadHtml(_queryId, html);
37
+ }
38
+
39
+ void Sunscraper::loadUrl(QString url)
40
+ {
41
+ emit requestLoadUrl(_queryId, url);
42
+ }
43
+
44
+ void Sunscraper::wait(unsigned timeout)
45
+ {
46
+ QTimer _timeoutTimer;
47
+ connect(&_timeoutTimer, SIGNAL(timeout()), this, SLOT(timeout()));
48
+ /* Arm the timer before blocking; the Ruby wrapper documents this value as milliseconds. */
49
+ _timeoutTimer.setInterval(timeout);
50
+ _timeoutTimer.start();
51
+ /* Block in the nested event loop until the finished() or timeout() slot calls quit(). */
52
+ _eventLoop.exec();
53
+
54
+ _timeoutTimer.stop();
55
+ }
56
+
57
+ void Sunscraper::finished(unsigned eventQueryId, QString html)
58
+ {
59
+ if(eventQueryId != _queryId)
60
+ return;
61
+
62
+ _eventLoop.quit();
63
+
64
+ _html = html.toUtf8();
65
+
66
+ emit requestFinalize(_queryId);
67
+ }
68
+
69
+ void Sunscraper::timeout()
70
+ {
71
+ _eventLoop.quit();
72
+
73
+ _html = "!SUNSCRAPER_TIMEOUT";
74
+
75
+ emit requestFinalize(_queryId);
76
+ }
77
+
78
+ QByteArray Sunscraper::fetch()
79
+ {
80
+ return _html;
81
+ }
82
+
83
+ const char *Sunscraper::fetchAsCString()
84
+ {
85
+ return _html.constData();
86
+ }
@@ -0,0 +1,45 @@
1
+ #ifndef SUNSCRAPER_H
2
+ #define SUNSCRAPER_H
3
+
4
+ #include <QObject>
5
+ #include <QString>
6
+ #include <QMutex>
7
+ #include <QByteArray>
8
+ #include <QEventLoop>
9
+
10
+ class QWebPage;
11
+
12
+ class Sunscraper : public QObject
13
+ {
14
+ Q_OBJECT
15
+
16
+ public:
17
+ Sunscraper();
18
+
19
+ void loadHtml(QString html);
20
+ void loadUrl(QString url);
21
+
22
+ void wait(unsigned timeout);
23
+
24
+ QByteArray fetch();
25
+ const char *fetchAsCString();
26
+
27
+ private slots:
28
+ void finished(unsigned queryId, QString html);
29
+ void timeout();
30
+
31
+ signals:
32
+ void requestLoadHtml(unsigned queryId, QString html);
33
+ void requestLoadUrl(unsigned queryId, QString url); // asks the worker thread to load the given URL for this query
34
+ void requestFinalize(unsigned queryId);
35
+
36
+ private:
37
+ static unsigned _nextQueryId;
38
+ static QMutex _staticMutex;
39
+
40
+ unsigned _queryId;
41
+ QEventLoop _eventLoop;
42
+ QByteArray _html;
43
+ };
44
+
45
+ #endif // SUNSCRAPER_H
@@ -0,0 +1,15 @@
1
+ QT += webkit
2
+
3
+ TARGET = sunscraper
4
+ TEMPLATE = lib
5
+
6
+ SOURCES += sunscraperlibrary.cpp \
7
+ sunscraperthread.cpp \
8
+ sunscraperexternal.cpp \
9
+ sunscraper.cpp \
10
+ sunscraperproxy.cpp
11
+
12
+ HEADERS += sunscraperlibrary.h \
13
+ sunscraperthread.h \
14
+ sunscraper.h \
15
+ sunscraperproxy.h
@@ -0,0 +1,33 @@
1
+ #include "sunscraper.h"
2
+
3
+ extern "C" {
4
+ Sunscraper *sunscraper_create()
5
+ {
6
+ return new Sunscraper();
7
+ }
8
+
9
+ void sunscraper_load_html(Sunscraper *sunscraper, const char *html)
10
+ {
11
+ sunscraper->loadHtml(html);
12
+ }
13
+
14
+ void sunscraper_load_url(Sunscraper *sunscraper, const char *url)
15
+ {
16
+ sunscraper->loadUrl(url);
17
+ }
18
+
19
+ void sunscraper_wait(Sunscraper *sunscraper, unsigned timeout)
20
+ {
21
+ sunscraper->wait(timeout);
22
+ }
23
+
24
+ const char *sunscraper_fetch(Sunscraper *sunscraper)
25
+ {
26
+ return sunscraper->fetchAsCString();
27
+ }
28
+
29
+ void sunscraper_discard(Sunscraper *sunscraper)
30
+ {
31
+ delete sunscraper;
32
+ }
33
+ }
@@ -0,0 +1,25 @@
1
+ #include "sunscraperlibrary.h"
2
+ #include "sunscraperthread.h"
3
+
4
+ SunscraperLibrary SunscraperLibrary::_instance;
5
+
6
+ SunscraperLibrary::SunscraperLibrary()
7
+ {
8
+ _apartmentThread = new SunscraperThread();
9
+ _apartmentThread->start();
10
+ }
11
+
12
+ SunscraperLibrary::~SunscraperLibrary()
13
+ {
14
+ /* Do nothing. This is on purpose. */
15
+ }
16
+
17
+ SunscraperLibrary *SunscraperLibrary::instance()
18
+ {
19
+ return &_instance;
20
+ }
21
+
22
+ SunscraperThread *SunscraperLibrary::thread()
23
+ {
24
+ return _apartmentThread;
25
+ }
@@ -0,0 +1,22 @@
1
+ #ifndef SUNSCRAPERLIBRARY_H
2
+ #define SUNSCRAPERLIBRARY_H
3
+
4
+ class SunscraperThread;
5
+
6
+ class SunscraperLibrary {
7
+ public:
8
+ static SunscraperLibrary *instance();
9
+
10
+ SunscraperThread *thread();
11
+
12
+ private:
13
+ SunscraperLibrary();
14
+ SunscraperLibrary(SunscraperLibrary &);
15
+ ~SunscraperLibrary();
16
+
17
+ static SunscraperLibrary _instance;
18
+
19
+ SunscraperThread *_apartmentThread;
20
+ };
21
+
22
+ #endif // SUNSCRAPERLIBRARY_H
@@ -0,0 +1,13 @@
1
+ #include <QWebPage>
2
+ #include <QWebFrame>
3
+ #include "sunscraperproxy.h"
4
+
5
+ SunscraperProxy::SunscraperProxy(QWebPage *parent, unsigned queryId) :
6
+ QObject(parent), _webPage(parent), _queryId(queryId)
7
+ {
8
+ }
9
+
10
+ void SunscraperProxy::finish()
11
+ {
12
+ emit finished(_queryId, _webPage->mainFrame()->toHtml());
13
+ }
@@ -0,0 +1,24 @@
1
+ #ifndef SUNSCRAPERPROXY_H
2
+ #define SUNSCRAPERPROXY_H
3
+
4
+ #include <QObject>
5
+
6
+ class QWebPage;
7
+
8
+ class SunscraperProxy : public QObject
9
+ {
10
+ Q_OBJECT
11
+ public:
12
+ SunscraperProxy(QWebPage *parent, unsigned queryId);
13
+
14
+ Q_INVOKABLE void finish();
15
+
16
+ signals:
17
+ void finished(unsigned _queryId, QString html);
18
+
19
+ private:
20
+ QWebPage *_webPage;
21
+ unsigned _queryId;
22
+ };
23
+
24
+ #endif // SUNSCRAPERPROXY_H
@@ -0,0 +1,67 @@
1
+ #include <QApplication>
2
+ #include <QWebPage>
3
+ #include <QWebFrame>
4
+ #include "sunscraperthread.h"
5
+ #include "sunscraperproxy.h"
6
+
7
+ SunscraperThread::SunscraperThread()
8
+ {
9
+ }
10
+
11
+ void SunscraperThread::run()
12
+ {
13
+ static int argc = 1;
14
+ static char *argv[] = {(char *) "sunscraper", NULL};
15
+ /* Qt requires argc >= 1 with a valid argv[0]; both are static so they outlive the QApplication. */
16
+ QApplication app(argc, argv);
17
+ app.exec();
18
+ /* exec() returning means the event loop was terminated, which must never happen here. */
19
+ qFatal("Sunscraper apartment thread event loop should never end");
20
+ }
21
+
22
+ void SunscraperThread::loadHtml(unsigned queryId, QString html)
23
+ {
24
+ QWebPage *webPage = initializeWebPage(queryId);
25
+ webPage->mainFrame()->setHtml(html);
26
+ }
27
+
28
+ void SunscraperThread::loadUrl(unsigned queryId, QString url)
29
+ {
30
+ QWebPage *webPage = initializeWebPage(queryId);
31
+ webPage->mainFrame()->load(url);
32
+ }
33
+
34
+ void SunscraperThread::finalize(unsigned queryId)
35
+ {
36
+ /* Release the page of a query. take() removes the entry without inserting a NULL one for unknown ids. */
37
+ QWebPage *webPage = _webPages.take(queryId);
38
+ if(webPage != NULL) /* unknown or already finalized queries are ignored instead of dereferencing NULL */
39
+ webPage->deleteLater();
40
+ }
41
+
42
+ QWebPage *SunscraperThread::initializeWebPage(unsigned queryId)
43
+ {
44
+ Q_ASSERT(_webPages[queryId] == NULL);
45
+
46
+ QWebPage *webPage = new QWebPage(this);
47
+ connect(webPage->mainFrame(), SIGNAL(javaScriptWindowObjectCleared()),
48
+ this, SLOT(attachAPI()));
49
+
50
+ _webPages[queryId] = webPage;
51
+
52
+ return webPage;
53
+ }
54
+
55
+ void SunscraperThread::attachAPI()
56
+ {
57
+ QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
58
+ QWebPage *page = origin->page();
59
+
60
+ unsigned queryId = _webPages.key(page, 0);
61
+ Q_ASSERT(queryId != 0);
62
+
63
+ SunscraperProxy *proxy = new SunscraperProxy(page, queryId);
64
+ connect(proxy, SIGNAL(finished(uint,QString)), this, SIGNAL(finished(uint,QString)));
65
+
66
+ origin->addToJavaScriptWindowObject("Sunscraper", proxy, QScriptEngine::QtOwnership);
67
+ }
@@ -0,0 +1,34 @@
1
+ #ifndef SUNSCRAPERTHREAD_H
2
+ #define SUNSCRAPERTHREAD_H
3
+
4
+ #include <QThread>
5
+ #include <QMap>
6
+
7
+ class QWebPage;
8
+
9
+ class SunscraperThread : public QThread
10
+ {
11
+ Q_OBJECT
12
+ public:
13
+ SunscraperThread();
14
+
15
+ void run();
16
+
17
+ signals:
18
+ void finished(unsigned queryId, QString result);
19
+
20
+ public slots:
21
+ void loadHtml(unsigned queryId, QString html);
22
+ void loadUrl(unsigned queryId, QString url);
23
+ void finalize(unsigned queryId);
24
+
25
+ private slots:
26
+ void attachAPI();
27
+
28
+ private:
29
+ QMap<unsigned, QWebPage *> _webPages;
30
+
31
+ QWebPage *initializeWebPage(unsigned queryId);
32
+ };
33
+
34
+ #endif // SUNSCRAPERTHREAD_H
@@ -0,0 +1,50 @@
1
+ require 'sunscraper/library'
2
+
3
+ # Sunscraper loads an HTML page in a headless browser and waits for `Sunscraper.finish()`
4
+ # method to be called. It blocks the calling thread, but is threadsafe, does
5
+ # not acquire GIL and thus can be called from multiple threads simultaneously.
6
+ module Sunscraper
7
+ # ScrapeTimeout error is raised when the page could not be loaded fast enough.
8
+ class ScrapeTimeout < StandardError; end
9
+
10
+ class << self
11
+ # Scrape inline HTML. The content is loaded without a particular base URL.
12
+ # If your application depends on base URL being available, use {scrape_url}.
13
+ #
14
+ # @param [Integer] timeout timeout in milliseconds
15
+ def scrape_html(html, timeout=5000)
16
+ scrape(timeout) do |context|
17
+ Library.load_html context, html
18
+ end
19
+ end
20
+
21
+ # Scrape an URL.
22
+ #
23
+ # @param [Integer] timeout timeout in milliseconds
24
+ def scrape_url(url, timeout=5000)
25
+ scrape(timeout) do |context|
26
+ Library.load_url context, url
27
+ end
28
+ end
29
+
30
+ private
31
+
32
+ def scrape(timeout)
33
+ context = Library.create
34
+
35
+ yield context
36
+
37
+ Library.wait(context, timeout)
38
+
39
+ data = Library.fetch(context)
40
+
41
+ if data == "!SUNSCRAPER_TIMEOUT"
42
+ raise ScrapeTimeout, "Sunscraper has timed out waiting for the callback"
43
+ else
44
+ data
45
+ end
46
+ ensure
47
+ Library.discard(context) if context
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,37 @@
1
+ if !defined?(RUBY_ENGINE) && RUBY_VERSION =~ /^1.8/
2
+ raise RuntimeError, "Sunscraper does not work on Ruby MRI 1.8.x."
3
+ end
4
+
5
+ require 'ffi'
6
+
7
+ # @private
8
+ module Sunscraper::Library
9
+ extend FFI::Library
10
+
11
+ # RbConfig sniffing does not work on JRuby.
12
+ if Gem.win_platform?
13
+ extension = 'dll'
14
+ elsif RUBY_PLATFORM =~ /darwin/i
15
+ extension = 'dylib'
16
+ else
17
+ extension = 'so'
18
+ end
19
+
20
+ ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
21
+ 'ext', "libsunscraper.#{extension}")
22
+
23
+ attach_function 'create', :sunscraper_create, [], :pointer
24
+ attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
25
+ attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
26
+ attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
27
+ attach_function 'discard', :sunscraper_discard, [:pointer], :void
28
+
29
+ if RUBY_ENGINE == 'ruby'
30
+ # MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
31
+ attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
32
+ else
33
+ # Rubinius has no GVL, nor does its attach_function accept options.
34
+ # Same for JRuby.
35
+ attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
36
+ end
37
+ end
@@ -0,0 +1,4 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'sunscraper'
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
+ require 'webrick'
4
+
5
+ HTML = <<HTML
6
+ <html>
7
+ <head>
8
+ <script type="text/javascript">
9
+ document.addEventListener("DOMContentLoaded", function() {
10
+ document.getElementById('fuga').textContent =
11
+ ("!skrow tI").split("").reverse().join("");
12
+ Sunscraper.finish();
13
+ }, true);
14
+ </script>
15
+ </head>
16
+ <body>
17
+ <div id='fuga'></div>
18
+ </body>
19
+ </html>
20
+ HTML
21
+
22
+ PORT = 45555
23
+
24
+ describe Sunscraper do
25
+ it "can scrape an HTML provided as a string" do
26
+ Sunscraper.scrape_html(HTML).should include('It works!')
27
+ end
28
+
29
+ it "can scrape an URL" do
30
+ server = WEBrick::HTTPServer.new :Port => PORT, :Logger => WEBrick::Log.new('/dev/null'), :AccessLog => []
31
+ server.mount_proc '/' do |req, res|
32
+ res.body = HTML
33
+ end
34
+ Thread.new { server.start }
35
+
36
+ Sunscraper.scrape_url("http://localhost:#{PORT}/").should include('It works!')
37
+
38
+ server.stop
39
+ end
40
+
41
+ it "should time out if callback is not called" do
42
+ lambda { Sunscraper.scrape_html("<!-- nothing. at least no callbacks -->", 1000) }.
43
+ should raise_exception(Sunscraper::ScrapeTimeout)
44
+ end
45
+ end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = "sunscraper"
6
+ s.version = "1.0.0"
7
+ s.authors = ["Peter Zotov"]
8
+ s.email = ["whitequark@whitequark.org"]
9
+ s.homepage = "http://github.com/roundlake/sunscraper"
10
+ s.summary = %q{A WebKit-based, JavaScript-capable HTML scraper.}
11
+ s.description = s.summary
12
+
13
+ s.rubyforge_project = "sunscraper"
14
+
15
+ s.files = `git ls-files`.split("\n")
16
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
17
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
18
+ s.extensions = ["ext/extconf.rb"]
19
+ s.require_paths = ["lib"]
20
+
21
+ s.add_development_dependency "rspec"
22
+ s.add_runtime_dependency "ffi", '>= 1.0.11'
23
+ end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sunscraper
3
+ version: !ruby/object:Gem::Version
4
+ hash: 540260530
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 0
10
+ version: 1.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Peter Zotov
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-02-18 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 881230260
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :development
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: ffi
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ hash: 716237343
43
+ segments:
44
+ - 1
45
+ - 0
46
+ - 11
47
+ version: 1.0.11
48
+ type: :runtime
49
+ version_requirements: *id002
50
+ description: A WebKit-based, JavaScript-capable HTML scraper.
51
+ email:
52
+ - whitequark@whitequark.org
53
+ executables: []
54
+
55
+ extensions:
56
+ - ext/extconf.rb
57
+ extra_rdoc_files: []
58
+
59
+ files:
60
+ - .gitignore
61
+ - .rspec
62
+ - .yardopts
63
+ - Gemfile
64
+ - LICENSE
65
+ - README.md
66
+ - Rakefile
67
+ - ext/.gitignore
68
+ - ext/Makefile
69
+ - ext/extconf.rb
70
+ - ext/sunscraper.cpp
71
+ - ext/sunscraper.h
72
+ - ext/sunscraper.pro
73
+ - ext/sunscraperexternal.cpp
74
+ - ext/sunscraperlibrary.cpp
75
+ - ext/sunscraperlibrary.h
76
+ - ext/sunscraperproxy.cpp
77
+ - ext/sunscraperproxy.h
78
+ - ext/sunscraperthread.cpp
79
+ - ext/sunscraperthread.h
80
+ - lib/sunscraper.rb
81
+ - lib/sunscraper/library.rb
82
+ - spec/spec_helper.rb
83
+ - spec/sunscraper_spec.rb
84
+ - sunscraper.gemspec
85
+ homepage: http://github.com/roundlake/sunscraper
86
+ licenses: []
87
+
88
+ post_install_message:
89
+ rdoc_options: []
90
+
91
+ require_paths:
92
+ - lib
93
+ required_ruby_version: !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ hash: 881230260
99
+ segments:
100
+ - 0
101
+ version: "0"
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ hash: 881230260
108
+ segments:
109
+ - 0
110
+ version: "0"
111
+ requirements: []
112
+
113
+ rubyforge_project: sunscraper
114
+ rubygems_version: 1.8.12
115
+ signing_key:
116
+ specification_version: 3
117
+ summary: A WebKit-based, JavaScript-capable HTML scraper.
118
+ test_files: []
119
+
120
+ has_rdoc: