sunscraper 1.0.0 → 1.1.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,183 @@
1
+ #include <QLocalSocket>
2
+ #include <QTimer>
3
+ #include <QDataStream>
4
+ #include <QApplication>
5
+ #include <QtDebug>
6
+ #include <arpa/inet.h>
7
+ #include "sunscraperrpc.h"
8
+ #include "sunscraperworker.h"
9
+
10
+ SunscraperRPC::SunscraperRPC(QString socketPath) :
11
+ m_state(StateHeader)
12
+ {
13
+ m_socket = new QLocalSocket(this);
14
+ m_socket->connectToServer(socketPath);
15
+ connect(m_socket, SIGNAL(readyRead()), this, SLOT(onInputReadable()));
16
+ connect(m_socket, SIGNAL(disconnected()), this, SLOT(onInputDisconnected()));
17
+
18
+ m_worker = new SunscraperWorker(this);
19
+ connect(m_worker, SIGNAL(finished(uint,QString)), this, SLOT(onPageRendered(uint,QString)));
20
+ }
21
+
22
+ SunscraperRPC::~SunscraperRPC()
23
+ {
24
+ delete m_worker;
25
+ }
26
+
27
+ void SunscraperRPC::onInputReadable()
28
+ {
29
+ m_buffer += m_socket->readAll();
30
+
31
+ bool moreData = true;
32
+ while(moreData) {
33
+ switch(m_state) {
34
+ case StateHeader:
35
+ if((unsigned) m_buffer.length() >= sizeof(Header)) {
36
+ memcpy((void*) &m_pendingHeader, m_buffer.constData(), sizeof(Header));
37
+ m_buffer.remove(0, sizeof(Header));
38
+ m_state = StateData;
39
+ } else {
40
+ moreData = false;
41
+ }
42
+
43
+ break;
44
+
45
+ case StateData:
46
+ unsigned length = ntohl(m_pendingHeader.dataLength);
47
+
48
+ if((unsigned) m_buffer.length() >= length) {
49
+ QByteArray data = m_buffer.left(length);
50
+ m_buffer.remove(0, length);
51
+ processRequest(m_pendingHeader, data);
52
+ m_state = StateHeader;
53
+ } else {
54
+ moreData = false;
55
+ }
56
+
57
+ break;
58
+ }
59
+ }
60
+ }
61
+
62
+ void SunscraperRPC::onInputDisconnected()
63
+ {
64
+ /* Magic value. */
65
+ QApplication::exit(42);
66
+ }
67
+
68
+ void SunscraperRPC::processRequest(Header header, QByteArray data)
69
+ {
70
+ unsigned queryId, requestType;
71
+
72
+ queryId = ntohl(header.queryId);
73
+ requestType = ntohl(header.requestType);
74
+
75
+ switch(requestType) {
76
+ case RPC_LOAD_HTML: {
77
+ m_worker->loadHtml(queryId, data);
78
+
79
+ break;
80
+ }
81
+
82
+ case RPC_LOAD_URL: {
83
+ m_worker->loadUrl(queryId, data);
84
+
85
+ break;
86
+ }
87
+
88
+ case RPC_WAIT: {
89
+ if(m_results.contains(queryId)) {
90
+ Header reply;
91
+ reply.queryId = htonl(queryId);
92
+ reply.requestType = htonl(RPC_WAIT);
93
+
94
+ sendReply(reply, QByteArray());
95
+ } else {
96
+ Q_ASSERT(!m_waitQueue.contains(queryId));
97
+ Q_ASSERT(!m_timers.contains(queryId));
98
+
99
+ m_waitQueue.append(queryId);
100
+
101
+ unsigned timeout;
102
+
103
+ QDataStream stream(data);
104
+ stream >> timeout;
105
+
106
+ QTimer *timer = new QTimer(this);
107
+ timer->setInterval(timeout);
108
+ timer->setSingleShot(true);
109
+ timer->start();
110
+ connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
111
+
112
+ m_timers[queryId] = timer;
113
+ }
114
+
115
+ break;
116
+ }
117
+
118
+ case RPC_FETCH: {
119
+ Header reply;
120
+ reply.queryId = htonl(queryId);
121
+ reply.requestType = htonl(RPC_FETCH);
122
+
123
+ if(m_results.contains(queryId)) {
124
+ sendReply(reply, m_results[queryId].toLocal8Bit());
125
+ } else {
126
+ sendReply(reply, "!SUNSCRAPER_TIMEOUT");
127
+ }
128
+
129
+ break;
130
+ }
131
+
132
+ case RPC_DISCARD: {
133
+ m_results.remove(queryId);
134
+ m_waitQueue.removeAll(queryId);
135
+
136
+ if(m_timers.contains(queryId)) {
137
+ QTimer *timer = m_timers[queryId];
138
+ delete timer;
139
+
140
+ m_timers.remove(queryId);
141
+ }
142
+
143
+ m_worker->finalize(queryId);
144
+
145
+ break;
146
+ }
147
+ }
148
+ }
149
+
150
+ void SunscraperRPC::onPageRendered(unsigned queryId, QString data)
151
+ {
152
+ m_results[queryId] = data;
153
+
154
+ if(m_waitQueue.contains(queryId)) {
155
+ Header reply;
156
+ reply.queryId = htonl(queryId);
157
+ reply.requestType = htonl(RPC_WAIT);
158
+
159
+ sendReply(reply, QByteArray());
160
+ }
161
+ }
162
+
163
+ void SunscraperRPC::onTimeout()
164
+ {
165
+ QTimer *timer = static_cast<QTimer*>(QObject::sender());
166
+ unsigned queryId = m_timers.key(timer);
167
+
168
+ Header reply;
169
+ reply.queryId = htonl(queryId);
170
+ reply.requestType = htonl(RPC_WAIT);
171
+
172
+ sendReply(reply, QByteArray());
173
+ }
174
+
175
+ void SunscraperRPC::sendReply(Header header, QByteArray data)
176
+ {
177
+ header.dataLength = htonl(data.length());
178
+
179
+ QByteArray serialized((const char*) &header, sizeof(Header));
180
+ serialized.append(data);
181
+
182
+ m_socket->write(serialized);
183
+ }
@@ -0,0 +1,64 @@
1
+ #ifndef SUNSCRAPERRPC_H
2
+ #define SUNSCRAPERRPC_H
3
+
4
+ #include <QObject>
5
+ #include <QVector>
6
+ #include <QMap>
7
+
8
+ class SunscraperWorker;
9
+ class QLocalSocket;
10
+ class QTimer;
11
+
12
+ class SunscraperRPC : public QObject
13
+ {
14
+ Q_OBJECT
15
+
16
+ enum State {
17
+ StateHeader = 0,
18
+ StateData,
19
+ };
20
+
21
+ struct Header {
22
+ quint32 queryId;
23
+ quint32 requestType;
24
+ quint32 dataLength;
25
+ };
26
+
27
+ enum Request {
28
+ RPC_LOAD_HTML = 1,
29
+ RPC_LOAD_URL = 2,
30
+ RPC_WAIT = 3,
31
+ RPC_FETCH = 4,
32
+ RPC_DISCARD = 5,
33
+ };
34
+
35
+ public:
36
+ SunscraperRPC(QString socketPath);
37
+ ~SunscraperRPC();
38
+
39
+ private slots:
40
+ void onInputReadable();
41
+ void onInputDisconnected();
42
+ void onPageRendered(unsigned queryId, QString data);
43
+ void onTimeout();
44
+
45
+ private:
46
+ QLocalSocket *m_socket;
47
+
48
+ State m_state;
49
+ Header m_pendingHeader;
50
+ QByteArray m_buffer;
51
+
52
+ SunscraperWorker *m_worker;
53
+
54
+ QList<unsigned> m_waitQueue;
55
+ QMap<unsigned, QTimer*> m_timers;
56
+ QMap<unsigned, QString> m_results;
57
+
58
+ SunscraperRPC();
59
+
60
+ void processRequest(Header header, QByteArray data);
61
+ void sendReply(Header header, QByteArray data);
62
+ };
63
+
64
+ #endif // SUNSCRAPERRPC_H
@@ -1,37 +1,28 @@
1
1
  #include <QApplication>
2
2
  #include <QWebPage>
3
3
  #include <QWebFrame>
4
- #include "sunscraperthread.h"
4
+ #include "sunscraperworker.h"
5
5
  #include "sunscraperproxy.h"
6
+ #include <QtDebug>
6
7
 
7
- SunscraperThread::SunscraperThread()
8
+ SunscraperWorker::SunscraperWorker(QObject *parent) :
9
+ QObject(parent)
8
10
  {
9
11
  }
10
12
 
11
- void SunscraperThread::run()
12
- {
13
- static int argc;
14
- static char **argv = {NULL};
15
-
16
- QApplication app(argc, argv);
17
- app.exec();
18
-
19
- qFatal("Sunscraper apartment thread event loop should never end");
20
- }
21
-
22
- void SunscraperThread::loadHtml(unsigned queryId, QString html)
13
+ void SunscraperWorker::loadHtml(unsigned queryId, QString html)
23
14
  {
24
15
  QWebPage *webPage = initializeWebPage(queryId);
25
16
  webPage->mainFrame()->setHtml(html);
26
17
  }
27
18
 
28
- void SunscraperThread::loadUrl(unsigned queryId, QString url)
19
+ void SunscraperWorker::loadUrl(unsigned queryId, QString url)
29
20
  {
30
21
  QWebPage *webPage = initializeWebPage(queryId);
31
22
  webPage->mainFrame()->load(url);
32
23
  }
33
24
 
34
- void SunscraperThread::finalize(unsigned queryId)
25
+ void SunscraperWorker::finalize(unsigned queryId)
35
26
  {
36
27
  Q_ASSERT(_webPages[queryId] != NULL);
37
28
 
@@ -39,7 +30,7 @@ void SunscraperThread::finalize(unsigned queryId)
39
30
  _webPages.remove(queryId);
40
31
  }
41
32
 
42
- QWebPage *SunscraperThread::initializeWebPage(unsigned queryId)
33
+ QWebPage *SunscraperWorker::initializeWebPage(unsigned queryId)
43
34
  {
44
35
  Q_ASSERT(_webPages[queryId] == NULL);
45
36
 
@@ -52,7 +43,7 @@ QWebPage *SunscraperThread::initializeWebPage(unsigned queryId)
52
43
  return webPage;
53
44
  }
54
45
 
55
- void SunscraperThread::attachAPI()
46
+ void SunscraperWorker::attachAPI()
56
47
  {
57
48
  QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
58
49
  QWebPage *page = origin->page();
@@ -1,18 +1,18 @@
1
- #ifndef SUNSCRAPERTHREAD_H
2
- #define SUNSCRAPERTHREAD_H
1
+ #ifndef SUNSCRAPERWORKER_H
2
+ #define SUNSCRAPERWORKER_H
3
3
 
4
- #include <QThread>
4
+ #include <QObject>
5
+ #include <QMutex>
5
6
  #include <QMap>
6
7
 
7
8
  class QWebPage;
8
9
 
9
- class SunscraperThread : public QThread
10
+ class SunscraperWorker : public QObject
10
11
  {
11
12
  Q_OBJECT
12
- public:
13
- SunscraperThread();
14
13
 
15
- void run();
14
+ public:
15
+ SunscraperWorker(QObject *parent = 0);
16
16
 
17
17
  signals:
18
18
  void finished(unsigned queryId, QString result);
@@ -31,4 +31,4 @@ private:
31
31
  QWebPage *initializeWebPage(unsigned queryId);
32
32
  };
33
33
 
34
- #endif // SUNSCRAPERTHREAD_H
34
+ #endif // SUNSCRAPERWORKER_H
@@ -1,37 +1,41 @@
1
- if !defined?(RUBY_ENGINE) && RUBY_VERSION =~ /^1.8/
2
- raise RuntimeError, "Sunscraper does not work on Ruby MRI 1.8.x."
1
+ if RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
2
+ raise RuntimeError, "Sunscraper/embed does not work on OS X. Use Sunscraper/standalone."
3
3
  end
4
4
 
5
5
  require 'ffi'
6
+ require 'rbconfig'
6
7
 
7
8
  # @private
8
- module Sunscraper::Library
9
- extend FFI::Library
10
-
11
- # RbConfig sniffing does not work on JRuby.
12
- if Gem.win_platform?
13
- extension = 'dll'
14
- elsif RUBY_PLATFORM =~ /darwin/i
15
- extension = 'dylib'
16
- else
17
- extension = 'so'
18
- end
9
+ module Sunscraper
10
+ module Library
11
+ extend FFI::Library
12
+
13
+ if Gem.win_platform?
14
+ extension = 'dll'
15
+ else
16
+ extension = 'so'
17
+ end
18
+
19
+ ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
20
+ 'ext', 'embed', "libsunscraper.#{extension}")
21
+
22
+ attach_function 'create', :sunscraper_create, [], :pointer
23
+ attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
24
+ attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
25
+ attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
26
+ attach_function 'discard', :sunscraper_discard, [:pointer], :void
27
+
28
+ if RUBY_ENGINE == 'ruby'
29
+ # MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
30
+ attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
31
+ else
32
+ # Rubinius does not have GVL neither it has options in attach_function.
33
+ # Same for JRuby.
34
+ attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
35
+ end
36
+
37
+ attach_function 'finalize', :sunscraper_finalize, [], :void
19
38
 
20
- ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
21
- 'ext', "libsunscraper.#{extension}")
22
-
23
- attach_function 'create', :sunscraper_create, [], :pointer
24
- attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
25
- attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
26
- attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
27
- attach_function 'discard', :sunscraper_discard, [:pointer], :void
28
-
29
- if RUBY_ENGINE == 'ruby'
30
- # MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
31
- attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
32
- else
33
- # Rubinius does not have GVL neither it has options in attach_function.
34
- # Same for JRuby.
35
- attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
39
+ at_exit { finalize }
36
40
  end
37
41
  end
@@ -0,0 +1,168 @@
1
+ require 'socket'
2
+
3
+ # @private
4
module Sunscraper
  # Worker backend that talks to a separate `sunscraper` helper process over
  # a UNIX domain socket.
  #
  # Every message, in both directions, is three big-endian 32-bit integers
  # (query id, request type, payload size in bytes) followed by the payload.
  module Standalone
    @last_query_id = 0

    @rpc_mutex   = Mutex.new
    @rpc_waiters = {}
    @rpc_results = {}
    @rpc_thread  = nil

    RPC_LOAD_HTML = 1
    RPC_LOAD_URL  = 2
    RPC_WAIT      = 3
    RPC_FETCH     = 4
    RPC_DISCARD   = 5

    class << self
      attr_reader :rpc_mutex, :rpc_waiters, :rpc_results

      # Allocates a new query id.
      #
      # @return [Integer] an id that is unique within this process
      def create
        @rpc_mutex.synchronize do
          @last_query_id += 1
          @last_query_id
        end
      end

      # Asks the helper to render inline HTML for the given query.
      def load_html(query_id, html)
        perform_rpc query_id,
                    request: RPC_LOAD_HTML,
                    data:    html
      end

      # Asks the helper to load and render a URL for the given query.
      def load_url(query_id, url)
        perform_rpc query_id,
                    request: RPC_LOAD_URL,
                    data:    url
      end

      # Blocks until the helper answers the wait request for this query.
      #
      # @param [Integer] timeout timeout value, sent as a 32-bit integer
      def wait(query_id, timeout)
        perform_rpc query_id,
                    request:     RPC_WAIT,
                    data:        [timeout].pack("N"),
                    want_result: true
      end

      # Blocks until the helper returns the stored result for this query.
      #
      # @return [String, nil] reply payload; nil when the reply is empty
      def fetch(query_id)
        perform_rpc query_id,
                    request:     RPC_FETCH,
                    want_result: true
      end

      # Tells the helper to drop everything it holds for this query.
      def discard(query_id)
        perform_rpc query_id,
                    request: RPC_DISCARD
      end

      private

      # Sends one request and, when options[:want_result] is set, sleeps
      # until the reader thread has stored the reply.
      #
      # NOTE(review): the Thread.stop / Thread#wakeup handshake can miss a
      # wakeup that arrives before the caller goes to sleep; a
      # ConditionVariable or Queue would close that window.
      def perform_rpc(query_id, options={})
        data  = options[:data] || ""
        block = options[:want_result]

        @rpc_mutex.synchronize do
          if @rpc_thread.nil?
            @rpc_thread = Standalone::Thread.new(::Thread.current)

            # A Mutex can only be unlocked by the thread that locked it, so
            # it cannot serve as a "helper is ready" signal. Sleep here
            # instead; the reader thread wakes us once the socket is up.
            ::Thread.stop
          end

          @rpc_thread.perform(query_id, options[:request], data)

          if block
            @rpc_waiters[query_id] = ::Thread.current
          end
        end

        if block
          ::Thread.stop
          @rpc_results[query_id]
        end
      ensure
        if block
          @rpc_waiters.delete query_id
          @rpc_results.delete query_id
        end
      end
    end

    # Background thread that starts the helper process, owns the socket and
    # dispatches replies to the threads sleeping in perform_rpc.
    class Thread < ::Thread
      # @param [::Thread] creator thread to wake once the socket is connected
      def initialize(creator)
        @creator = creator

        super do
          @parent = Sunscraper::Standalone
          work
        end
      end

      # Writes one request to the helper process.
      def perform(query_id, request, data)
        # The size field counts bytes, not characters: the two differ for
        # multibyte strings, and a short count would desynchronize the stream.
        @socket.write([query_id, request, data.bytesize, data].pack("NNNa*"))
      end

      private

      def work
        if ::Sunscraper.os_x?
          # On OS X the executable lives inside an application bundle.
          suffix = ".app/Contents/MacOS/sunscraper"
        else
          suffix = RbConfig::CONFIG["EXEEXT"]
        end

        executable = File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
                               'ext', 'standalone', "sunscraper#{suffix}")

        server_path = "/tmp/sunscraper.#{Process.pid}.sock"
        server = UNIXServer.new(server_path)

        if Kernel.respond_to? :spawn
          # Separate arguments bypass the shell, so paths containing spaces
          # or shell metacharacters work.
          pid = Kernel.spawn executable, server_path
        else
          # rbx does not have Kernel.spawn (yet).
          pid = fork { exec executable, server_path }
        end

        Process.detach pid

        @socket = server.accept

        server.close
        File.unlink server_path

        # Release the thread sleeping in perform_rpc.
        @creator.wakeup

        loop do
          header = @socket.read(4 * 3)
          if header.nil? || header.bytesize < 4 * 3
            raise EOFError, "helper process closed the connection"
          end

          query_id, request, data_length = header.unpack("NNN")
          data = @socket.read(data_length) if data_length > 0

          @parent.rpc_mutex.synchronize do
            if !@parent.rpc_waiters.include?(query_id)
              $stderr.puts "Sunscraper/standalone: no waiter for #{query_id}"
            else
              @parent.rpc_results[query_id] = data
              @parent.rpc_waiters[query_id].wakeup
            end
          end
        end
      rescue Exception => e
        $stderr.puts "Sunscraper error: #{e.class}: #{e.message}"
        e.backtrace.each do |line|
          $stderr.puts "  #{line}"
        end
      ensure
        # Either may still be unset if startup failed early.
        @socket.close if @socket && !@socket.closed?

        if pid
          begin
            Process.kill "TERM", pid
          rescue Errno::ESRCH
            # The helper has already exited.
          end
        end
      end
    end
  end
end
+ end
data/lib/sunscraper.rb CHANGED
@@ -1,4 +1,6 @@
1
- require 'sunscraper/library'
1
+ if !defined?(RUBY_ENGINE) && RUBY_VERSION =~ /^1.8/
2
+ raise RuntimeError, "Sunscraper does not work on Ruby MRI 1.8.x."
3
+ end
2
4
 
3
5
  # Sunscraper loads an HTML page in a headless browser and waits for `Sunscraper.finish()`
4
6
  # method to be called. It blocks the calling thread, but is threadsafe, does
@@ -8,13 +10,26 @@ module Sunscraper
8
10
  class ScrapeTimeout < StandardError; end
9
11
 
10
12
  class << self
13
+ def os_x?
14
+ RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
15
+ end
16
+
17
+ attr_reader :worker
18
+ def worker=(worker_type)
19
+ if [:embed, :standalone].include?(worker_type)
20
+ @worker = worker_type
21
+ else
22
+ raise RuntimeError, "Invalid Sunscraper worker type: #{worker_type.inspect}"
23
+ end
24
+ end
25
+
11
26
  # Scrape an inline HTML. The content is loaded without a particular base URL.
12
27
  # If your application depends on base URL being available, use {scrape_url}.
13
28
  #
14
29
  # @param [Integer] timeout timeout in milliseconds
15
30
  def scrape_html(html, timeout=5000)
16
- scrape(timeout) do |context|
17
- Library.load_html context, html
31
+ scrape(timeout) do |worker, context|
32
+ worker.load_html context, html
18
33
  end
19
34
  end
20
35
 
@@ -22,21 +37,21 @@ module Sunscraper
22
37
  #
23
38
  # @param [Integer] timeout timeout in milliseconds
24
39
  def scrape_url(url, timeout=5000)
25
- scrape(timeout) do |context|
26
- Library.load_url context, url
40
+ scrape(timeout) do |worker, context|
41
+ worker.load_url context, url
27
42
  end
28
43
  end
29
44
 
30
45
  private
31
46
 
32
47
  def scrape(timeout)
33
- context = Library.create
34
-
35
- yield context
48
+ worker = load_worker
36
49
 
37
- Library.wait(context, timeout)
50
+ context = worker.create
51
+ yield worker, context
52
+ worker.wait(context, timeout)
38
53
 
39
- data = Library.fetch(context)
54
+ data = worker.fetch(context)
40
55
 
41
56
  if data == "!SUNSCRAPER_TIMEOUT"
42
57
  raise ScrapeTimeout, "Sunscraper has timed out waiting for the callback"
@@ -44,7 +59,29 @@ module Sunscraper
44
59
  data
45
60
  end
46
61
  ensure
47
- Library.discard(context) if context
62
+ worker.discard(context) if context
63
+ end
64
+
65
+ def load_worker
66
+ case @worker
67
+ when :standalone
68
+ require 'sunscraper/standalone'
69
+
70
+ Sunscraper::Standalone
71
+
72
+ when :embed
73
+ require 'sunscraper/library'
74
+
75
+ Sunscraper::Library
76
+ end
48
77
  end
49
78
  end
50
79
  end
80
+
81
+ if Sunscraper.os_x?
82
+ # OS X is braindead
83
+ Sunscraper.worker = :standalone
84
+ else
85
+ # ... even Win32 is better.
86
+ Sunscraper.worker = :embed
87
+ end