sunscraper 1.0.0 → 1.1.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,183 @@
1
+ #include <QLocalSocket>
2
+ #include <QTimer>
3
+ #include <QDataStream>
4
+ #include <QApplication>
5
+ #include <QtDebug>
6
+ #include <arpa/inet.h>
7
+ #include "sunscraperrpc.h"
8
+ #include "sunscraperworker.h"
9
+
10
+ SunscraperRPC::SunscraperRPC(QString socketPath) :
11
+ m_state(StateHeader)
12
+ {
13
+ m_socket = new QLocalSocket(this);
14
+ m_socket->connectToServer(socketPath);
15
+ connect(m_socket, SIGNAL(readyRead()), this, SLOT(onInputReadable()));
16
+ connect(m_socket, SIGNAL(disconnected()), this, SLOT(onInputDisconnected()));
17
+
18
+ m_worker = new SunscraperWorker(this);
19
+ connect(m_worker, SIGNAL(finished(uint,QString)), this, SLOT(onPageRendered(uint,QString)));
20
+ }
21
+
22
+ SunscraperRPC::~SunscraperRPC()
23
+ {
24
+ delete m_worker;
25
+ }
26
+
27
+ void SunscraperRPC::onInputReadable()
28
+ {
29
+ m_buffer += m_socket->readAll();
30
+
31
+ bool moreData = true;
32
+ while(moreData) {
33
+ switch(m_state) {
34
+ case StateHeader:
35
+ if((unsigned) m_buffer.length() >= sizeof(Header)) {
36
+ memcpy((void*) &m_pendingHeader, m_buffer.constData(), sizeof(Header));
37
+ m_buffer.remove(0, sizeof(Header));
38
+ m_state = StateData;
39
+ } else {
40
+ moreData = false;
41
+ }
42
+
43
+ break;
44
+
45
+ case StateData:
46
+ unsigned length = ntohl(m_pendingHeader.dataLength);
47
+
48
+ if((unsigned) m_buffer.length() >= length) {
49
+ QByteArray data = m_buffer.left(length);
50
+ m_buffer.remove(0, length);
51
+ processRequest(m_pendingHeader, data);
52
+ m_state = StateHeader;
53
+ } else {
54
+ moreData = false;
55
+ }
56
+
57
+ break;
58
+ }
59
+ }
60
+ }
61
+
62
+ void SunscraperRPC::onInputDisconnected()
63
+ {
64
+ /* Exit code 42 is a magic value marking an RPC socket disconnect. */
65
+ QApplication::exit(42);
66
+ }
67
+
68
+ void SunscraperRPC::processRequest(Header header, QByteArray data)
69
+ {
70
+ unsigned queryId, requestType;
71
+
72
+ queryId = ntohl(header.queryId);
73
+ requestType = ntohl(header.requestType);
74
+
75
+ switch(requestType) {
76
+ case RPC_LOAD_HTML: {
77
+ m_worker->loadHtml(queryId, data);
78
+
79
+ break;
80
+ }
81
+
82
+ case RPC_LOAD_URL: {
83
+ m_worker->loadUrl(queryId, data);
84
+
85
+ break;
86
+ }
87
+
88
+ case RPC_WAIT: {
89
+ if(m_results.contains(queryId)) {
90
+ Header reply;
91
+ reply.queryId = htonl(queryId);
92
+ reply.requestType = htonl(RPC_WAIT);
93
+
94
+ sendReply(reply, QByteArray());
95
+ } else {
96
+ Q_ASSERT(!m_waitQueue.contains(queryId));
97
+ Q_ASSERT(!m_timers.contains(queryId));
98
+
99
+ m_waitQueue.append(queryId);
100
+
101
+ unsigned timeout;
102
+
103
+ QDataStream stream(data);
104
+ stream >> timeout;
105
+
106
+ QTimer *timer = new QTimer(this);
107
+ timer->setInterval(timeout);
108
+ timer->setSingleShot(true);
109
+ timer->start();
110
+ connect(timer, SIGNAL(timeout()), this, SLOT(onTimeout()));
111
+
112
+ m_timers[queryId] = timer;
113
+ }
114
+
115
+ break;
116
+ }
117
+
118
+ case RPC_FETCH: {
119
+ Header reply;
120
+ reply.queryId = htonl(queryId);
121
+ reply.requestType = htonl(RPC_FETCH);
122
+
123
+ if(m_results.contains(queryId)) {
124
+ sendReply(reply, m_results[queryId].toLocal8Bit());
125
+ } else {
126
+ sendReply(reply, "!SUNSCRAPER_TIMEOUT");
127
+ }
128
+
129
+ break;
130
+ }
131
+
132
+ case RPC_DISCARD: {
133
+ m_results.remove(queryId);
134
+ m_waitQueue.removeAll(queryId);
135
+
136
+ if(m_timers.contains(queryId)) {
137
+ QTimer *timer = m_timers[queryId];
138
+ delete timer;
139
+
140
+ m_timers.remove(queryId);
141
+ }
142
+
143
+ m_worker->finalize(queryId);
144
+
145
+ break;
146
+ }
147
+ }
148
+ }
149
+
150
+ void SunscraperRPC::onPageRendered(unsigned queryId, QString data)
151
+ {
152
+ m_results[queryId] = data;
153
+
154
+ if(m_waitQueue.contains(queryId)) {
155
+ Header reply;
156
+ reply.queryId = htonl(queryId);
157
+ reply.requestType = htonl(RPC_WAIT);
158
+
159
+ sendReply(reply, QByteArray());
160
+ }
161
+ }
162
+
163
+ void SunscraperRPC::onTimeout()
164
+ {
165
+ QTimer *timer = static_cast<QTimer*>(QObject::sender());
166
+ unsigned queryId = m_timers.key(timer);
167
+
168
+ Header reply;
169
+ reply.queryId = htonl(queryId);
170
+ reply.requestType = htonl(RPC_WAIT);
171
+
172
+ sendReply(reply, QByteArray());
173
+ }
174
+
175
+ void SunscraperRPC::sendReply(Header header, QByteArray data)
176
+ {
177
+ header.dataLength = htonl(data.length());
178
+
179
+ QByteArray serialized((const char*) &header, sizeof(Header));
180
+ serialized.append(data);
181
+
182
+ m_socket->write(serialized);
183
+ }
@@ -0,0 +1,64 @@
1
+ #ifndef SUNSCRAPERRPC_H
2
+ #define SUNSCRAPERRPC_H
3
+
4
+ #include <QObject>
5
+ #include <QVector>
6
+ #include <QMap>
7
+
8
+ class SunscraperWorker;
9
+ class QLocalSocket;
10
+ class QTimer;
11
+
12
+ class SunscraperRPC : public QObject
13
+ {
14
+ Q_OBJECT
15
+
16
+ enum State {
17
+ StateHeader = 0,
18
+ StateData,
19
+ };
20
+
21
+ struct Header {
22
+ quint32 queryId;
23
+ quint32 requestType;
24
+ quint32 dataLength;
25
+ };
26
+
27
+ enum Request {
28
+ RPC_LOAD_HTML = 1,
29
+ RPC_LOAD_URL = 2,
30
+ RPC_WAIT = 3,
31
+ RPC_FETCH = 4,
32
+ RPC_DISCARD = 5,
33
+ };
34
+
35
+ public:
36
+ SunscraperRPC(QString socketPath);
37
+ ~SunscraperRPC();
38
+
39
+ private slots:
40
+ void onInputReadable();
41
+ void onInputDisconnected();
42
+ void onPageRendered(unsigned queryId, QString data);
43
+ void onTimeout();
44
+
45
+ private:
46
+ QLocalSocket *m_socket;
47
+
48
+ State m_state;
49
+ Header m_pendingHeader;
50
+ QByteArray m_buffer;
51
+
52
+ SunscraperWorker *m_worker;
53
+
54
+ QList<unsigned> m_waitQueue;
55
+ QMap<unsigned, QTimer*> m_timers;
56
+ QMap<unsigned, QString> m_results;
57
+
58
+ SunscraperRPC();
59
+
60
+ void processRequest(Header header, QByteArray data);
61
+ void sendReply(Header header, QByteArray data);
62
+ };
63
+
64
+ #endif // SUNSCRAPERRPC_H
@@ -1,37 +1,28 @@
1
1
  #include <QApplication>
2
2
  #include <QWebPage>
3
3
  #include <QWebFrame>
4
- #include "sunscraperthread.h"
4
+ #include "sunscraperworker.h"
5
5
  #include "sunscraperproxy.h"
6
+ #include <QtDebug>
6
7
 
7
- SunscraperThread::SunscraperThread()
8
+ SunscraperWorker::SunscraperWorker(QObject *parent) :
9
+ QObject(parent)
8
10
  {
9
11
  }
10
12
 
11
- void SunscraperThread::run()
12
- {
13
- static int argc;
14
- static char **argv = {NULL};
15
-
16
- QApplication app(argc, argv);
17
- app.exec();
18
-
19
- qFatal("Sunscraper apartment thread event loop should never end");
20
- }
21
-
22
- void SunscraperThread::loadHtml(unsigned queryId, QString html)
13
+ void SunscraperWorker::loadHtml(unsigned queryId, QString html)
23
14
  {
24
15
  QWebPage *webPage = initializeWebPage(queryId);
25
16
  webPage->mainFrame()->setHtml(html);
26
17
  }
27
18
 
28
- void SunscraperThread::loadUrl(unsigned queryId, QString url)
19
+ void SunscraperWorker::loadUrl(unsigned queryId, QString url)
29
20
  {
30
21
  QWebPage *webPage = initializeWebPage(queryId);
31
22
  webPage->mainFrame()->load(url);
32
23
  }
33
24
 
34
- void SunscraperThread::finalize(unsigned queryId)
25
+ void SunscraperWorker::finalize(unsigned queryId)
35
26
  {
36
27
  Q_ASSERT(_webPages[queryId] != NULL);
37
28
 
@@ -39,7 +30,7 @@ void SunscraperThread::finalize(unsigned queryId)
39
30
  _webPages.remove(queryId);
40
31
  }
41
32
 
42
- QWebPage *SunscraperThread::initializeWebPage(unsigned queryId)
33
+ QWebPage *SunscraperWorker::initializeWebPage(unsigned queryId)
43
34
  {
44
35
  Q_ASSERT(_webPages[queryId] == NULL);
45
36
 
@@ -52,7 +43,7 @@ QWebPage *SunscraperThread::initializeWebPage(unsigned queryId)
52
43
  return webPage;
53
44
  }
54
45
 
55
- void SunscraperThread::attachAPI()
46
+ void SunscraperWorker::attachAPI()
56
47
  {
57
48
  QWebFrame *origin = static_cast<QWebFrame *>(QObject::sender());
58
49
  QWebPage *page = origin->page();
@@ -1,18 +1,18 @@
1
- #ifndef SUNSCRAPERTHREAD_H
2
- #define SUNSCRAPERTHREAD_H
1
+ #ifndef SUNSCRAPERWORKER_H
2
+ #define SUNSCRAPERWORKER_H
3
3
 
4
- #include <QThread>
4
+ #include <QObject>
5
+ #include <QMutex>
5
6
  #include <QMap>
6
7
 
7
8
  class QWebPage;
8
9
 
9
- class SunscraperThread : public QThread
10
+ class SunscraperWorker : public QObject
10
11
  {
11
12
  Q_OBJECT
12
- public:
13
- SunscraperThread();
14
13
 
15
- void run();
14
+ public:
15
+ SunscraperWorker(QObject *parent = 0);
16
16
 
17
17
  signals:
18
18
  void finished(unsigned queryId, QString result);
@@ -31,4 +31,4 @@ private:
31
31
  QWebPage *initializeWebPage(unsigned queryId);
32
32
  };
33
33
 
34
- #endif // SUNSCRAPERTHREAD_H
34
+ #endif // SUNSCRAPERWORKER_H
@@ -1,37 +1,41 @@
1
- if !defined?(RUBY_ENGINE) && RUBY_VERSION =~ /^1.8/
2
- raise RuntimeError, "Sunscraper does not work on Ruby MRI 1.8.x."
1
+ if RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
2
+ raise RuntimeError, "Sunscraper/embed does not work on OS X. Use Sunscraper/standalone."
3
3
  end
4
4
 
5
5
  require 'ffi'
6
+ require 'rbconfig'
6
7
 
7
8
  # @private
8
- module Sunscraper::Library
9
- extend FFI::Library
10
-
11
- # RbConfig sniffing does not work on JRuby.
12
- if Gem.win_platform?
13
- extension = 'dll'
14
- elsif RUBY_PLATFORM =~ /darwin/i
15
- extension = 'dylib'
16
- else
17
- extension = 'so'
18
- end
9
+ module Sunscraper
10
+ module Library
11
+ extend FFI::Library
12
+
13
+ if Gem.win_platform?
14
+ extension = 'dll'
15
+ else
16
+ extension = 'so'
17
+ end
18
+
19
+ ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
20
+ 'ext', 'embed', "libsunscraper.#{extension}")
21
+
22
+ attach_function 'create', :sunscraper_create, [], :pointer
23
+ attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
24
+ attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
25
+ attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
26
+ attach_function 'discard', :sunscraper_discard, [:pointer], :void
27
+
28
+ if RUBY_ENGINE == 'ruby'
29
+ # MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
30
+ attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
31
+ else
32
+ # Rubinius does not have GVL neither it has options in attach_function.
33
+ # Same for JRuby.
34
+ attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
35
+ end
36
+
37
+ attach_function 'finalize', :sunscraper_finalize, [], :void
19
38
 
20
- ffi_lib File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
21
- 'ext', "libsunscraper.#{extension}")
22
-
23
- attach_function 'create', :sunscraper_create, [], :pointer
24
- attach_function 'load_html', :sunscraper_load_html, [:pointer, :string], :void
25
- attach_function 'load_url', :sunscraper_load_url, [:pointer, :string], :void
26
- attach_function 'fetch', :sunscraper_fetch, [:pointer], :string
27
- attach_function 'discard', :sunscraper_discard, [:pointer], :void
28
-
29
- if RUBY_ENGINE == 'ruby'
30
- # MRI uses ffi gem and has GVL. Hence, it needs a rb_thread_blocking_region call.
31
- attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void, :blocking => true
32
- else
33
- # Rubinius does not have GVL neither it has options in attach_function.
34
- # Same for JRuby.
35
- attach_function 'wait', :sunscraper_wait, [:pointer, :uint], :void
39
+ at_exit { finalize }
36
40
  end
37
41
  end
@@ -0,0 +1,168 @@
1
+ require 'socket'
2
+
3
+ # @private
4
+ module Sunscraper
5
+ module Standalone
6
+ @last_query_id = 0
7
+
8
+ @rpc_mutex = Mutex.new
9
+ @rpc_waiters = {}
10
+ @rpc_results = {}
11
+ @rpc_thread = nil
12
+
13
+ RPC_LOAD_HTML = 1
14
+ RPC_LOAD_URL = 2
15
+ RPC_WAIT = 3
16
+ RPC_FETCH = 4
17
+ RPC_DISCARD = 5
18
+
19
+ class << self
20
+ attr_reader :rpc_mutex, :rpc_waiters, :rpc_results
21
+
22
+ def create
23
+ @rpc_mutex.synchronize do
24
+ @last_query_id += 1
25
+ @last_query_id
26
+ end
27
+ end
28
+
29
+ def load_html(query_id, html)
30
+ perform_rpc query_id,
31
+ request: RPC_LOAD_HTML,
32
+ data: html
33
+ end
34
+
35
+ def load_url(query_id, url)
36
+ perform_rpc query_id,
37
+ request: RPC_LOAD_URL,
38
+ data: url
39
+ end
40
+
41
+ def wait(query_id, timeout)
42
+ perform_rpc query_id,
43
+ request: RPC_WAIT,
44
+ data: [timeout].pack("N"),
45
+ want_result: true
46
+ end
47
+
48
+ def fetch(query_id)
49
+ perform_rpc query_id,
50
+ request: RPC_FETCH,
51
+ want_result: true
52
+ end
53
+
54
+ def discard(query_id)
55
+ perform_rpc query_id,
56
+ request: RPC_DISCARD
57
+ end
58
+
59
+ private
60
+
61
+ def perform_rpc(query_id, options={})
62
+ data = options[:data] || ""
63
+ block = options[:want_result]
64
+
65
+ @rpc_mutex.synchronize do
66
+ if @rpc_thread.nil?
67
+ @rpc_thread = Standalone::Thread.new(::Thread.current)
68
+
69
+ # Ruby has no semaphores, and a Mutex can be unlocked only by the
70
+ # thread that locked it, so neither can signal readiness here.
71
+ # Instead, this thread stops itself below and the RPC thread
72
+ # wakes it up (see `@creator.wakeup`) once the worker process
73
+ # has connected.
74
+ ::Thread.stop
75
+ end
76
+
77
+ @rpc_thread.perform(query_id, options[:request], data)
78
+
79
+ if block
80
+ @rpc_waiters[query_id] = Thread.current
81
+ end
82
+ end
83
+
84
+ if block
85
+ Thread.stop
86
+ @rpc_results[query_id]
87
+ end
88
+ ensure
89
+ if block
90
+ @rpc_waiters.delete query_id
91
+ @rpc_results.delete query_id
92
+ end
93
+ end
94
+ end
95
+
96
+ class Thread < ::Thread
97
+ def initialize(creator)
98
+ @creator = creator
99
+
100
+ super do
101
+ @parent = Sunscraper::Standalone
102
+ work
103
+ end
104
+ end
105
+
106
+ def perform(query_id, request, data)
107
+ @socket.write([query_id, request, data.length, data].pack("NNNa*"))
108
+ end
109
+
110
+ private
111
+
112
+ def work
113
+ if ::Sunscraper.os_x?
114
+ # On OS X the executable lives inside an application bundle.
115
+ suffix = ".app/Contents/MacOS/sunscraper"
116
+ else
117
+ suffix = RbConfig::CONFIG["EXEEXT"]
118
+ end
119
+
120
+ executable = File.join(Gem.loaded_specs['sunscraper'].full_gem_path,
121
+ 'ext', 'standalone', "sunscraper#{suffix}")
122
+
123
+ server_path = "/tmp/sunscraper.#{Process.pid}.sock"
124
+ server = UNIXServer.new(server_path)
125
+
126
+ if Kernel.respond_to? :spawn
127
+ pid = Kernel.spawn "#{executable} #{server_path}"
128
+ else
129
+ # rbx does not have Kernel.spawn (yet). Sigh...
130
+ pid = fork { exec executable, server_path }
131
+ end
132
+
133
+ Process.detach pid
134
+
135
+ @socket = server.accept
136
+
137
+ server.close
138
+ FileUtils.rm server_path
139
+
140
+ # Resume the thread that stopped itself in perform_rpc.
141
+ @creator.wakeup
142
+
143
+ loop do
144
+ header = @socket.read(4 * 3)
145
+ query_id, request, data_length = header.unpack("NNN")
146
+ data = @socket.read(data_length) if data_length > 0
147
+
148
+ @parent.rpc_mutex.synchronize do
149
+ if !@parent.rpc_waiters.include?(query_id)
150
+ $stderr.puts "Sunscraper/standalone: no waiter for #{query_id}"
151
+ else
152
+ @parent.rpc_results[query_id] = data
153
+ @parent.rpc_waiters[query_id].wakeup
154
+ end
155
+ end
156
+ end
157
+ rescue Exception => e
158
+ $stderr.puts "Sunscraper error: #{e.class}: #{e.message}"
159
+ e.backtrace.each do |line|
160
+ $stderr.puts " #{line}"
161
+ end
162
+ ensure
163
+ @socket.close
164
+ Process.kill pid
165
+ end
166
+ end
167
+ end
168
+ end
data/lib/sunscraper.rb CHANGED
@@ -1,4 +1,6 @@
1
- require 'sunscraper/library'
1
+ if !defined?(RUBY_ENGINE) && RUBY_VERSION =~ /^1.8/
2
+ raise RuntimeError, "Sunscraper does not work on Ruby MRI 1.8.x."
3
+ end
2
4
 
3
5
  # Sunscraper loads an HTML page in a headless browser and waits for `Sunscraper.finish()`
4
6
  # method to be called. It blocks the calling thread, but is threadsafe, does
@@ -8,13 +10,26 @@ module Sunscraper
8
10
  class ScrapeTimeout < StandardError; end
9
11
 
10
12
  class << self
13
+ def os_x?
14
+ RUBY_PLATFORM =~ /darwin/i || RbConfig::CONFIG['target_os'] == 'darwin'
15
+ end
16
+
17
+ attr_reader :worker
18
+ def worker=(worker_type)
19
+ if [:embed, :standalone].include?(worker_type)
20
+ @worker = worker_type
21
+ else
22
+ raise RuntimeError, "Invalid Sunscraper worker type: #{worker_type.inspect}"
23
+ end
24
+ end
25
+
11
26
  # Scrape an inline HTML. The content is loaded without a particular base URL.
12
27
  # If your application depends on base URL being available, use {scrape_url}.
13
28
  #
14
29
  # @param [Integer] timeout timeout in milliseconds
15
30
  def scrape_html(html, timeout=5000)
16
- scrape(timeout) do |context|
17
- Library.load_html context, html
31
+ scrape(timeout) do |worker, context|
32
+ worker.load_html context, html
18
33
  end
19
34
  end
20
35
 
@@ -22,21 +37,21 @@ module Sunscraper
22
37
  #
23
38
  # @param [Integer] timeout timeout in milliseconds
24
39
  def scrape_url(url, timeout=5000)
25
- scrape(timeout) do |context|
26
- Library.load_url context, url
40
+ scrape(timeout) do |worker, context|
41
+ worker.load_url context, url
27
42
  end
28
43
  end
29
44
 
30
45
  private
31
46
 
32
47
  def scrape(timeout)
33
- context = Library.create
34
-
35
- yield context
48
+ worker = load_worker
36
49
 
37
- Library.wait(context, timeout)
50
+ context = worker.create
51
+ yield worker, context
52
+ worker.wait(context, timeout)
38
53
 
39
- data = Library.fetch(context)
54
+ data = worker.fetch(context)
40
55
 
41
56
  if data == "!SUNSCRAPER_TIMEOUT"
42
57
  raise ScrapeTimeout, "Sunscraper has timed out waiting for the callback"
@@ -44,7 +59,29 @@ module Sunscraper
44
59
  data
45
60
  end
46
61
  ensure
47
- Library.discard(context) if context
62
+ worker.discard(context) if context
63
+ end
64
+
65
+ def load_worker
66
+ case @worker
67
+ when :standalone
68
+ require 'sunscraper/standalone'
69
+
70
+ Sunscraper::Standalone
71
+
72
+ when :embed
73
+ require 'sunscraper/library'
74
+
75
+ Sunscraper::Library
76
+ end
48
77
  end
49
78
  end
50
79
  end
80
+
81
+ if Sunscraper.os_x?
82
+ # The embedded worker does not work on OS X.
83
+ Sunscraper.worker = :standalone
84
+ else
85
+ # Every other platform, including Win32, defaults to the embedded worker.
86
+ Sunscraper.worker = :embed
87
+ end