rcrawler 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 06cb51462ed7694164d649bc33f28102ae5ed66a
4
- data.tar.gz: fc4017fd01b7140517b1e13bcbda221ac672eb77
3
+ metadata.gz: ff1eee821a0d9e416d60245d49c28fb8a871b331
4
+ data.tar.gz: 74e6cd410b5c632ccc4249f61564843579654adc
5
5
  SHA512:
6
- metadata.gz: 2408c89221e3c7d4b9e6f11f1d2015d4471efe3492369ded48ba4a202fce1c30b74894201ccdd7af5cd6e94cfe8ea9090a1a11c38f3352f7929804884a7eb2a2
7
- data.tar.gz: 6c77a1d1fab7789c3920524c746e8f1f7070a78df0b27f02a6db059eb1695a4c616389eb40939b9d6dc88d95e3ff4c7dd065b2aa01cb097ec326143612275cbe
6
+ metadata.gz: d6db4bf8b44933b5dc99d0932dee1d6a19180ade6751ad5689f6aef3868d26cc0cbb0320b18b5b41c34641c376483ac4c413d7380a68b19f990f84e8e50827be
7
+ data.tar.gz: f6f7b46570e5af00e157cf2fb5ad1a81db2f09b2097e8db4ca79e681b849a24d73055c24f1b94c202be61b9e3e704cf63a068214066e70d2190250000c6d9c99
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ repo_token: OFZ4IKBbbaOGDyQwEB3LNlGDoV9dQearl
data/.travis.yml ADDED
@@ -0,0 +1,13 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
5
+ gemfile:
6
+ - Gemfile
7
+ script: bundle exec rake spec
8
+ branches:
9
+ only:
10
+ - master
11
+ notifications:
12
+ mails:
13
+ - i2bskn@gmail.com
data/README.md CHANGED
@@ -1,7 +1,17 @@
1
- # Rcrawler
1
+ # RCrawler
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/rcrawler.png)](http://badge.fury.io/rb/rcrawler)
4
+ [![Build Status](https://travis-ci.org/i2bskn/rcrawler.png?branch=master)](https://travis-ci.org/i2bskn/rcrawler)
5
+ [![Coverage Status](https://coveralls.io/repos/i2bskn/rcrawler/badge.png)](https://coveralls.io/r/i2bskn/rcrawler)
6
+ [![Code Climate](https://codeclimate.com/github/i2bskn/rcrawler.png)](https://codeclimate.com/github/i2bskn/rcrawler)
2
7
 
3
8
  The wrapper of capybara for crawler.
4
9
 
10
+ ## Dependencies
11
+
12
+ * nokogiri requires libxml2.
13
+ * capybara-webkit requires qt. [capybara-webkit wiki](https://github.com/thoughtbot/capybara-webkit/wiki/Installing-Qt-and-compiling-capybara-webkit)
14
+
5
15
  ## Installation
6
16
 
7
17
  Add this line to your application's Gemfile:
@@ -18,6 +28,8 @@ Or install it yourself as:
18
28
 
19
29
  ## Usage
20
30
 
31
+ #### Crawl
32
+
21
33
  ```ruby
22
34
  require "rcrawler"
23
35
 
@@ -40,6 +52,33 @@ RCrawler.crawl do
40
52
  end
41
53
  ```
42
54
 
55
+ #### Configuration
56
+
57
+ ```ruby
58
+ RCrawler.configure do |c|
59
+ c.threads = 10 # => default is 8
60
+ c.timeout = 20 # => default is 10
61
+ c.timeout_proc = :ignore # => default is :raise
62
+ end
63
+ ```
64
+
65
+ #### Async processing
66
+
67
+ ```ruby
68
+ RCrawler.async do
69
+ crawl do
70
+ # do something
71
+ end
72
+
73
+ crawl do
74
+ # do something
75
+ end
76
+
77
+ crawl do
78
+ # do something
79
+ end
80
+ end
81
+ ```
43
82
  ## Contributing
44
83
 
45
84
  1. Fork it
data/lib/rcrawler.rb CHANGED
@@ -7,14 +7,35 @@ require "nokogiri"
7
7
  require "headless"
8
8
 
9
9
  require "rcrawler/version"
10
+ require "rcrawler/configuration"
10
11
  require "rcrawler/driver"
11
12
  require "rcrawler/crawl"
13
+ require "rcrawler/async"
12
14
 
13
15
  module RCrawler
16
+ @config = Configuration.instance
17
+
14
18
  class << self
15
19
  def crawl(&block)
16
- c = Crawl.new
17
- c.instance_eval &block
20
+ begin
21
+ Timeout::timeout(@config.timeout) {Crawl.new.instance_eval &block}
22
+ rescue Timeout::Error => e
23
+ raise if @config.timeout_proc == :raise
24
+ end
25
+ end
26
+
27
+ def configure(&block)
28
+ if block_given?
29
+ @config.configure &block
30
+ else
31
+ @config
32
+ end
33
+ end
34
+
35
+ def async(&block)
36
+ async_threads = Async.new
37
+ async_threads.instance_eval &block
38
+ async_threads.execute
18
39
  end
19
40
  end
20
41
  end
data/lib/rcrawler/async.rb ADDED
@@ -0,0 +1,44 @@
1
+ # coding: utf-8
2
+
3
+ require "thread"
4
+ require "timeout"
5
+
6
+ module RCrawler
7
+ class Async
8
+ def initialize
9
+ @queue = ::Queue.new
10
+ @config = ::RCrawler::Configuration.instance
11
+ end
12
+
13
+ def crawl(&block)
14
+ raise ArgumentError, "crawl method is required block" unless block_given?
15
+ @queue.push block
16
+ end
17
+
18
+ def execute
19
+ threads = []
20
+ @config.threads.times do
21
+ threads << create_thread
22
+ end
23
+ threads.each {|thread| thread.join}
24
+ end
25
+
26
+ private
27
+ def create_thread
28
+ ::Thread.start do
29
+ while !@queue.empty?
30
+ begin
31
+ Timeout::timeout(@config.timeout) {exec_crawl(@queue.pop)}
32
+ rescue Timeout::Error => e
33
+ raise if @config.timeout_proc == :raise
34
+ end
35
+ end
36
+ end
37
+ end
38
+
39
+ def exec_crawl(block)
40
+ crwl = Crawl.new
41
+ crwl.instance_eval &block
42
+ end
43
+ end
44
+ end
data/lib/rcrawler/configuration.rb ADDED
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
+
3
+ require "singleton"
4
+
5
+ module RCrawler
6
+ class Configuration
7
+ VALID_OPTIONS_KEYS = [
8
+ :threads,
9
+ :timeout,
10
+ :timeout_proc
11
+ ].freeze
12
+
13
+ attr_accessor *VALID_OPTIONS_KEYS
14
+
15
+ include Singleton
16
+
17
+ def initialize
18
+ reset
19
+ end
20
+
21
+ def configure
22
+ yield self
23
+ end
24
+
25
+ def reset
26
+ self.threads = 8
27
+ self.timeout = 10
28
+ self.timeout_proc = :raise
29
+ end
30
+ end
31
+ end
data/lib/rcrawler/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
2
 
3
3
  module RCrawler
4
- VERSION = "0.0.1"
4
+ VERSION = "0.0.2"
5
5
  end
data/spec/rcrawler/async_spec.rb ADDED
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler::Async do
6
+ let(:async) {RCrawler::Async.new}
7
+
8
+ describe "#initialize" do
9
+ it "Queue should be created" do
10
+ mock = double("queue mock")
11
+ Queue.should_receive(:new).and_return(mock)
12
+ expect(async.instance_eval {@queue}).to eq(mock)
13
+ end
14
+
15
+ it "Configuration should be setted" do
16
+ mock = double("configuration mock").as_null_object
17
+ RCrawler::Configuration.should_receive(:instance).exactly(2).and_return(mock)
18
+ expect(async.instance_eval {@config}).to eq(mock)
19
+ end
20
+ end
21
+
22
+ describe "#crawl" do
23
+ it "crawl task should be registered" do
24
+ async
25
+ expect {
26
+ async.crawl {visit "http://example.com"}
27
+ }.to change(async.instance_eval {@queue}, :size).by(1)
28
+ end
29
+
30
+ it "exception should be generated if not argument" do
31
+ expect {async.crawl}.to raise_error(ArgumentError)
32
+ end
33
+ end
34
+
35
+ describe "#execute" do
36
+ it "thread should be created 8" do
37
+ mock = double("thread mock")
38
+ mock.should_receive(:join).exactly(8)
39
+ RCrawler::Async.any_instance.should_receive(:create_thread).exactly(8).and_return(mock)
40
+ expect{async.execute}.not_to raise_error
41
+ end
42
+ end
43
+
44
+ describe "#create_thread" do
45
+ before {RCrawler::Crawl.stub(:new).and_return(double.as_null_object)}
46
+
47
+ it "Thread object should be returned" do
48
+ expect(async.send(:create_thread).is_a? Thread).to be_true
49
+ end
50
+
51
+ it "exec_crawl method should be called" do
52
+ async.crawl {}
53
+ expect{async.send(:create_thread)}.not_to raise_error
54
+ end
55
+ end
56
+
57
+ describe "#exec_crawl" do
58
+ it "Crawl object should be created" do
59
+ mock = double("crawl mock")
60
+ mock.should_receive(:instance_eval)
61
+ RCrawler::Crawl.should_receive(:new).and_return(mock)
62
+ expect(async.send(:exec_crawl, Proc.new{})).not_to raise_error
63
+ end
64
+ end
65
+ end
data/spec/rcrawler/configuration_spec.rb ADDED
@@ -0,0 +1,88 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler::Configuration do
6
+ let(:config) {RCrawler::Configuration.instance}
7
+
8
+ describe "defined accessors" do
9
+ it "defined threads" do
10
+ expect(config.respond_to? :threads).to be_true
11
+ end
12
+
13
+ it "defined timeout" do
14
+ expect(config.respond_to? :timeout).to be_true
15
+ end
16
+
17
+ it "defined timeout_proc" do
18
+ expect(config.respond_to? :timeout_proc).to be_true
19
+ end
20
+ end
21
+
22
+ describe "#initialize" do
23
+ it "threads should be a default" do
24
+ expect(config.threads).to eq(8)
25
+ end
26
+
27
+ it "timeout should be a default" do
28
+ expect(config.timeout).to eq(10)
29
+ end
30
+
31
+ it "timeout_proc should be a default" do
32
+ expect(config.timeout_proc).to eq(:raise)
33
+ end
34
+ end
35
+
36
+ describe "#configure" do
37
+ before do
38
+ config.configure do |c|
39
+ c.threads = 20
40
+ c.timeout = 30
41
+ c.timeout_proc = :ignore
42
+ end
43
+ end
44
+
45
+ it "threads should be a specified parameter" do
46
+ expect(config.threads).to eq(20)
47
+ end
48
+
49
+ it "timeout should be a specified parameter" do
50
+ expect(config.timeout).to eq(30)
51
+ end
52
+
53
+ it "timeout_proc should be a specified parameter" do
54
+ expect(config.timeout_proc).to eq(:ignore)
55
+ end
56
+
57
+ it "raise error unknown setting" do
58
+ expect {
59
+ config.configure do |c|
60
+ c.unknown = true
61
+ end
62
+ }.to raise_error
63
+ end
64
+ end
65
+
66
+ describe "#reset" do
67
+ before do
68
+ config.configure do |c|
69
+ c.threads = 20
70
+ c.timeout = 30
71
+ c.timeout_proc = :ignore
72
+ end
73
+ config.reset
74
+ end
75
+
76
+ it "threads should be a default" do
77
+ expect(config.threads).to eq(8)
78
+ end
79
+
80
+ it "timeout should be a default" do
81
+ expect(config.timeout).to eq(10)
82
+ end
83
+
84
+ it "timeout_proc should be a default" do
85
+ expect(config.timeout_proc).to eq(:raise)
86
+ end
87
+ end
88
+ end
data/spec/rcrawler/crawl_spec.rb ADDED
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler::Crawl do
6
+ describe "#initialize" do
7
+ it "Driver should be created" do
8
+ mock = double("driver mock")
9
+ RCrawler::Driver.should_receive(:new).and_return(mock)
10
+ c = RCrawler::Crawl.new
11
+ expect(c.instance_eval {@driver}).to eq(mock)
12
+ end
13
+ end
14
+
15
+ describe "#method_missing" do
16
+ it "method in Driver should be called" do
17
+ RCrawler::Driver.any_instance.should_receive(:visit).and_return(true)
18
+ expect {
19
+ c = RCrawler::Crawl.new
20
+ c.instance_eval {visit "http://example.com"}
21
+ }.not_to raise_error
22
+ end
23
+
24
+ it "exception should be thrown if method is not defined in the Driver" do
25
+ expect {
26
+ c = RCrawler::Crawl.new
27
+ c.instance_eval {unknown_method "arg"}
28
+ }.to raise_error
29
+ end
30
+ end
31
+ end
data/spec/rcrawler/driver_spec.rb ADDED
@@ -0,0 +1,56 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler::Driver do
6
+ it "default_driver should be a webkit" do
7
+ d = RCrawler::Driver.new
8
+ expect(d.instance_eval {Capybara.default_driver}).to eq(:webkit)
9
+ end
10
+
11
+ it "should include Capybara::DSL" do
12
+ d = RCrawler::Driver.new
13
+ expect(d.respond_to? :visit).to be_true
14
+ end
15
+
16
+ describe "#initialize" do
17
+ it "Headless object should be created if no argument" do
18
+ mock = double("Headless mock")
19
+ mock.should_receive(:start)
20
+ Headless.should_receive(:new).and_return(mock)
21
+ RCrawler::Driver.new
22
+ end
23
+
24
+ it "Headless should not start if headless is false" do
25
+ Headless.any_instance.should_not_receive(:start)
26
+ RCrawler::Driver.new(headless: false)
27
+ end
28
+
29
+ it "@headless should be specified object" do
30
+ mock = double("Headless mock")
31
+ mock.should_receive(:start)
32
+ RCrawler::Driver.new(headless: mock)
33
+ end
34
+ end
35
+
36
+ describe "#screenshot" do
37
+ it "exception should be thrown if no argument" do
38
+ expect{RCrawler::Driver.new.screenshot}.to raise_error(ArgumentError)
39
+ end
40
+
41
+ it "visit and save_screenshot method should be called" do
42
+ d = RCrawler::Driver.new
43
+ d.should_receive(:visit)
44
+ d.should_receive(:page).and_return(double("page mock").as_null_object)
45
+ d.screenshot("http://example.com", "/tmp/example.png")
46
+ end
47
+ end
48
+
49
+ describe "#doc" do
50
+ it "should return Nokogiri::HTML::Document object" do
51
+ d = RCrawler::Driver.new
52
+ d.page.should_receive(:html).and_return("<html></html>")
53
+ expect(d.doc.is_a? Nokogiri::HTML::Document).to be_true
54
+ end
55
+ end
56
+ end
data/spec/rcrawler_spec.rb ADDED
@@ -0,0 +1,54 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler do
6
+ describe ".crawl" do
7
+ it "method should be defined" do
8
+ expect(RCrawler.respond_to? :crawl).to be_true
9
+ end
10
+
11
+ it "exception should be thrown if no argument" do
12
+ expect{RCrawler.crawl}.to raise_error(ArgumentError)
13
+ end
14
+
15
+ it "Crawl object should be created" do
16
+ mock = double("Crawl mock")
17
+ RCrawler::Crawl.should_receive(:new).and_return(mock)
18
+ mock.should_receive(:instance_eval)
19
+ RCrawler.crawl {}
20
+ end
21
+ end
22
+
23
+ describe ".async" do
24
+ it "method should be defined" do
25
+ expect(RCrawler.respond_to? :async).to be_true
26
+ end
27
+
28
+ it "exception should be thrown if no argument" do
29
+ expect{RCrawler.async}.to raise_error(ArgumentError)
30
+ end
31
+
32
+ it "Async object should be created" do
33
+ mock = double("Async mock")
34
+ RCrawler::Async.should_receive(:new).and_return(mock)
35
+ mock.should_receive(:instance_eval)
36
+ mock.should_receive(:execute)
37
+ RCrawler.async {}
38
+ end
39
+ end
40
+
41
+ describe ".configure" do
42
+ it "return configuration object" do
43
+ expect(RCrawler.configure.is_a? RCrawler::Configuration).to be_true
44
+ end
45
+
46
+ it "set config value from block" do
47
+ RCrawler.configure do |c|
48
+ c.threads = 10
49
+ end
50
+ expect(RCrawler.configure.threads).not_to eq(8)
51
+ expect(RCrawler.configure.threads).to eq(10)
52
+ end
53
+ end
54
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,15 +1,16 @@
1
- # require "simplecov"
2
- # require "coveralls"
3
- # Coveralls.wear!
1
+ require "simplecov"
2
+ require "coveralls"
3
+ Coveralls.wear!
4
4
 
5
5
  # SimpleCov.formatter = SimpleCov::Formatter::HTMLFormatter
6
- # SimpleCov.start do
7
- # add_filter "spec"
8
- # add_filter ".bundle"
9
- # end
6
+ SimpleCov.start do
7
+ add_filter "spec"
8
+ add_filter ".bundle"
9
+ end
10
10
 
11
11
  require "rcrawler"
12
12
 
13
13
  RSpec.configure do |config|
14
14
  config.order = "random"
15
+ config.after {RCrawler::Configuration.instance.reset}
15
16
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - i2bskn
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-02 00:00:00.000000000 Z
11
+ date: 2013-07-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -115,16 +115,25 @@ executables: []
115
115
  extensions: []
116
116
  extra_rdoc_files: []
117
117
  files:
118
+ - .coveralls.yml
118
119
  - .gitignore
120
+ - .travis.yml
119
121
  - Gemfile
120
122
  - LICENSE.txt
121
123
  - README.md
122
124
  - Rakefile
123
125
  - lib/rcrawler.rb
126
+ - lib/rcrawler/async.rb
127
+ - lib/rcrawler/configuration.rb
124
128
  - lib/rcrawler/crawl.rb
125
129
  - lib/rcrawler/driver.rb
126
130
  - lib/rcrawler/version.rb
127
131
  - rcrawler.gemspec
132
+ - spec/rcrawler/async_spec.rb
133
+ - spec/rcrawler/configuration_spec.rb
134
+ - spec/rcrawler/crawl_spec.rb
135
+ - spec/rcrawler/driver_spec.rb
136
+ - spec/rcrawler_spec.rb
128
137
  - spec/spec_helper.rb
129
138
  homepage: https://github.com/i2bskn/rcrawler
130
139
  licenses:
@@ -146,9 +155,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
146
155
  version: '0'
147
156
  requirements: []
148
157
  rubyforge_project:
149
- rubygems_version: 2.0.2
158
+ rubygems_version: 2.0.0
150
159
  signing_key:
151
160
  specification_version: 4
152
161
  summary: The wrapper of capybara for crawler
153
162
  test_files:
163
+ - spec/rcrawler/async_spec.rb
164
+ - spec/rcrawler/configuration_spec.rb
165
+ - spec/rcrawler/crawl_spec.rb
166
+ - spec/rcrawler/driver_spec.rb
167
+ - spec/rcrawler_spec.rb
154
168
  - spec/spec_helper.rb