rcrawler 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 06cb51462ed7694164d649bc33f28102ae5ed66a
4
- data.tar.gz: fc4017fd01b7140517b1e13bcbda221ac672eb77
3
+ metadata.gz: ff1eee821a0d9e416d60245d49c28fb8a871b331
4
+ data.tar.gz: 74e6cd410b5c632ccc4249f61564843579654adc
5
5
  SHA512:
6
- metadata.gz: 2408c89221e3c7d4b9e6f11f1d2015d4471efe3492369ded48ba4a202fce1c30b74894201ccdd7af5cd6e94cfe8ea9090a1a11c38f3352f7929804884a7eb2a2
7
- data.tar.gz: 6c77a1d1fab7789c3920524c746e8f1f7070a78df0b27f02a6db059eb1695a4c616389eb40939b9d6dc88d95e3ff4c7dd065b2aa01cb097ec326143612275cbe
6
+ metadata.gz: d6db4bf8b44933b5dc99d0932dee1d6a19180ade6751ad5689f6aef3868d26cc0cbb0320b18b5b41c34641c376483ac4c413d7380a68b19f990f84e8e50827be
7
+ data.tar.gz: f6f7b46570e5af00e157cf2fb5ad1a81db2f09b2097e8db4ca79e681b849a24d73055c24f1b94c202be61b9e3e704cf63a068214066e70d2190250000c6d9c99
@@ -0,0 +1 @@
1
+ repo_token: OFZ4IKBbbaOGDyQwEB3LNlGDoV9dQearl
@@ -0,0 +1,13 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
5
+ gemfile:
6
+ - Gemfile
7
+ script: bundle exec rake spec
8
+ branches:
9
+ only:
10
+ - master
11
+ notifications:
12
+ mails:
13
+ - i2bskn@gmail.com
data/README.md CHANGED
@@ -1,7 +1,17 @@
1
- # Rcrawler
1
+ # RCrawler
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/rcrawler.png)](http://badge.fury.io/rb/rcrawler)
4
+ [![Build Status](https://travis-ci.org/i2bskn/rcrawler.png?branch=master)](https://travis-ci.org/i2bskn/rcrawler)
5
+ [![Coverage Status](https://coveralls.io/repos/i2bskn/rcrawler/badge.png)](https://coveralls.io/r/i2bskn/rcrawler)
6
+ [![Code Climate](https://codeclimate.com/github/i2bskn/rcrawler.png)](https://codeclimate.com/github/i2bskn/rcrawler)
2
7
 
3
8
  A Capybara wrapper for building crawlers.
4
9
 
10
+ ## Dependencies
11
+
12
+ * nokogiri requires libxml2.
13
+ * capybara-webkit requires Qt. See the [capybara-webkit wiki](https://github.com/thoughtbot/capybara-webkit/wiki/Installing-Qt-and-compiling-capybara-webkit) for installation instructions.
14
+
5
15
  ## Installation
6
16
 
7
17
  Add this line to your application's Gemfile:
@@ -18,6 +28,8 @@ Or install it yourself as:
18
28
 
19
29
  ## Usage
20
30
 
31
+ #### Crawl
32
+
21
33
  ```ruby
22
34
  require "rcrawler"
23
35
 
@@ -40,6 +52,33 @@ RCrawler.crawl do
40
52
  end
41
53
  ```
42
54
 
55
+ #### Configuration
56
+
57
+ ```ruby
58
+ RCrawler.configure do |c|
59
+ c.threads = 10 # => default is 8
60
+ c.timeout = 20 # => default is 10
61
+ c.timeout_proc = :ignore # => default is :raise
62
+ end
63
+ ```
64
+
65
+ #### Async processing
66
+
67
+ ```ruby
68
+ RCrawler.async do
69
+ crawl do
70
+ # do something
71
+ end
72
+
73
+ crawl do
74
+ # do something
75
+ end
76
+
77
+ crawl do
78
+ # do something
79
+ end
80
+ end
81
+ ```
43
82
  ## Contributing
44
83
 
45
84
  1. Fork it
@@ -7,14 +7,35 @@ require "nokogiri"
7
7
  require "headless"
8
8
 
9
9
  require "rcrawler/version"
10
+ require "rcrawler/configuration"
10
11
  require "rcrawler/driver"
11
12
  require "rcrawler/crawl"
13
+ require "rcrawler/async"
12
14
 
13
15
  module RCrawler
16
+ @config = Configuration.instance
17
+
14
18
  class << self
15
19
  def crawl(&block)
16
- c = Crawl.new
17
- c.instance_eval &block
20
+ begin
21
+ Timeout::timeout(@config.timeout) {Crawl.new.instance_eval &block}
22
+ rescue Timeout::Error => e
23
+ raise if @config.timeout_proc == :raise
24
+ end
25
+ end
26
+
27
+ def configure(&block)
28
+ if block_given?
29
+ @config.configure &block
30
+ else
31
+ @config
32
+ end
33
+ end
34
+
35
+ def async(&block)
36
+ async_threads = Async.new
37
+ async_threads.instance_eval &block
38
+ async_threads.execute
18
39
  end
19
40
  end
20
41
  end
@@ -0,0 +1,44 @@
1
+ # coding: utf-8
2
+
3
+ require "thread"
4
+ require "timeout"
5
+
6
+ module RCrawler
7
+ class Async
8
+ def initialize
9
+ @queue = ::Queue.new
10
+ @config = ::RCrawler::Configuration.instance
11
+ end
12
+
13
+ def crawl(&block)
14
+ raise ArgumentError, "crawl method is required block" unless block_given?
15
+ @queue.push block
16
+ end
17
+
18
+ def execute
19
+ threads = []
20
+ @config.threads.times do
21
+ threads << create_thread
22
+ end
23
+ threads.each {|thread| thread.join}
24
+ end
25
+
26
+ private
27
+ def create_thread
28
+ ::Thread.start do
29
+ while !@queue.empty?
30
+ begin
31
+ Timeout::timeout(@config.timeout) {exec_crawl(@queue.pop)}
32
+ rescue Timeout::Error => e
33
+ raise if @config.timeout_proc == :raise
34
+ end
35
+ end
36
+ end
37
+ end
38
+
39
+ def exec_crawl(block)
40
+ crwl = Crawl.new
41
+ crwl.instance_eval &block
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
+
3
+ require "singleton"
4
+
5
+ module RCrawler
6
+ class Configuration
7
+ VALID_OPTIONS_KEYS = [
8
+ :threads,
9
+ :timeout,
10
+ :timeout_proc
11
+ ].freeze
12
+
13
+ attr_accessor *VALID_OPTIONS_KEYS
14
+
15
+ include Singleton
16
+
17
+ def initialize
18
+ reset
19
+ end
20
+
21
+ def configure
22
+ yield self
23
+ end
24
+
25
+ def reset
26
+ self.threads = 8
27
+ self.timeout = 10
28
+ self.timeout_proc = :raise
29
+ end
30
+ end
31
+ end
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
2
 
3
3
  module RCrawler
4
- VERSION = "0.0.1"
4
+ VERSION = "0.0.2"
5
5
  end
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler::Async do
6
+ let(:async) {RCrawler::Async.new}
7
+
8
+ describe "#initialize" do
9
+ it "Queue should be created" do
10
+ mock = double("queue mock")
11
+ Queue.should_receive(:new).and_return(mock)
12
+ expect(async.instance_eval {@queue}).to eq(mock)
13
+ end
14
+
15
+ it "Configuration should be setted" do
16
+ mock = double("configuration mock").as_null_object
17
+ RCrawler::Configuration.should_receive(:instance).exactly(2).and_return(mock)
18
+ expect(async.instance_eval {@config}).to eq(mock)
19
+ end
20
+ end
21
+
22
+ describe "#crawl" do
23
+ it "crawl task should be registered" do
24
+ async
25
+ expect {
26
+ async.crawl {visit "http://example.com"}
27
+ }.to change(async.instance_eval {@queue}, :size).by(1)
28
+ end
29
+
30
+ it "exception should be generated if not argument" do
31
+ expect {async.crawl}.to raise_error(ArgumentError)
32
+ end
33
+ end
34
+
35
+ describe "#execute" do
36
+ it "thread should be created 8" do
37
+ mock = double("thread mock")
38
+ mock.should_receive(:join).exactly(8)
39
+ RCrawler::Async.any_instance.should_receive(:create_thread).exactly(8).and_return(mock)
40
+ expect{async.execute}.not_to raise_error
41
+ end
42
+ end
43
+
44
+ describe "#create_thread" do
45
+ before {RCrawler::Crawl.stub(:new).and_return(double.as_null_object)}
46
+
47
+ it "Thread object should be returned" do
48
+ expect(async.send(:create_thread).is_a? Thread).to be_true
49
+ end
50
+
51
+ it "exec_crawl method should be called" do
52
+ async.crawl {}
53
+ expect{async.send(:create_thread)}.not_to raise_error
54
+ end
55
+ end
56
+
57
+ describe "#exec_crawl" do
58
+ it "Crawl object should be created" do
59
+ mock = double("crawl mock")
60
+ mock.should_receive(:instance_eval)
61
+ RCrawler::Crawl.should_receive(:new).and_return(mock)
62
+ expect(async.send(:exec_crawl, Proc.new{})).not_to raise_error
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,88 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler::Configuration do
6
+ let(:config) {RCrawler::Configuration.instance}
7
+
8
+ describe "defined accessors" do
9
+ it "defined threads" do
10
+ expect(config.respond_to? :threads).to be_true
11
+ end
12
+
13
+ it "defined timeout" do
14
+ expect(config.respond_to? :timeout).to be_true
15
+ end
16
+
17
+ it "defined timeout_proc" do
18
+ expect(config.respond_to? :timeout_proc).to be_true
19
+ end
20
+ end
21
+
22
+ describe "#initialize" do
23
+ it "threads should be a default" do
24
+ expect(config.threads).to eq(8)
25
+ end
26
+
27
+ it "timeout should be a default" do
28
+ expect(config.timeout).to eq(10)
29
+ end
30
+
31
+ it "timeout_proc should be a default" do
32
+ expect(config.timeout_proc).to eq(:raise)
33
+ end
34
+ end
35
+
36
+ describe "#configure" do
37
+ before do
38
+ config.configure do |c|
39
+ c.threads = 20
40
+ c.timeout = 30
41
+ c.timeout_proc = :ignore
42
+ end
43
+ end
44
+
45
+ it "threads should be a specified parameter" do
46
+ expect(config.threads).to eq(20)
47
+ end
48
+
49
+ it "timeout should be a specified parameter" do
50
+ expect(config.timeout).to eq(30)
51
+ end
52
+
53
+ it "timeout_proc should be a specified parameter" do
54
+ expect(config.timeout_proc).to eq(:ignore)
55
+ end
56
+
57
+ it "raise error unknown setting" do
58
+ expect {
59
+ config.configure do |c|
60
+ c.unknown = true
61
+ end
62
+ }.to raise_error
63
+ end
64
+ end
65
+
66
+ describe "#reset" do
67
+ before do
68
+ config.configure do |c|
69
+ c.threads = 20
70
+ c.timeout = 30
71
+ c.timeout_proc = :ignore
72
+ end
73
+ config.reset
74
+ end
75
+
76
+ it "threads should be a default" do
77
+ expect(config.threads).to eq(8)
78
+ end
79
+
80
+ it "timeout should be a default" do
81
+ expect(config.timeout).to eq(10)
82
+ end
83
+
84
+ it "timeout_proc should be a default" do
85
+ expect(config.timeout_proc).to eq(:raise)
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler::Crawl do
6
+ describe "#initialize" do
7
+ it "Driver should be created" do
8
+ mock = double("driver mock")
9
+ RCrawler::Driver.should_receive(:new).and_return(mock)
10
+ c = RCrawler::Crawl.new
11
+ expect(c.instance_eval {@driver}).to eq(mock)
12
+ end
13
+ end
14
+
15
+ describe "#method_missing" do
16
+ it "method in Driver should be called" do
17
+ RCrawler::Driver.any_instance.should_receive(:visit).and_return(true)
18
+ expect {
19
+ c = RCrawler::Crawl.new
20
+ c.instance_eval {visit "http://example.com"}
21
+ }.not_to raise_error
22
+ end
23
+
24
+ it "exception should be thrown if method is not defined in the Driver" do
25
+ expect {
26
+ c = RCrawler::Crawl.new
27
+ c.instance_eval {unknown_method "arg"}
28
+ }.to raise_error
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,56 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler::Driver do
6
+ it "default_driver should be a webkit" do
7
+ d = RCrawler::Driver.new
8
+ expect(d.instance_eval {Capybara.default_driver}).to eq(:webkit)
9
+ end
10
+
11
+ it "should include Capybara::DSL" do
12
+ d = RCrawler::Driver.new
13
+ expect(d.respond_to? :visit).to be_true
14
+ end
15
+
16
+ describe "#initialize" do
17
+ it "Headless object should be created if no argument" do
18
+ mock = double("Headless mock")
19
+ mock.should_receive(:start)
20
+ Headless.should_receive(:new).and_return(mock)
21
+ RCrawler::Driver.new
22
+ end
23
+
24
+ it "Headless should not start if headless is false" do
25
+ Headless.any_instance.should_not_receive(:start)
26
+ RCrawler::Driver.new(headless: false)
27
+ end
28
+
29
+ it "@headless should be specified object" do
30
+ mock = double("Headless mock")
31
+ mock.should_receive(:start)
32
+ RCrawler::Driver.new(headless: mock)
33
+ end
34
+ end
35
+
36
+ describe "#screenshot" do
37
+ it "exception should be thrown if no argument" do
38
+ expect{RCrawler::Driver.new.screenshot}.to raise_error(ArgumentError)
39
+ end
40
+
41
+ it "visit and save_screenshot method should be called" do
42
+ d = RCrawler::Driver.new
43
+ d.should_receive(:visit)
44
+ d.should_receive(:page).and_return(double("page mock").as_null_object)
45
+ d.screenshot("http://example.com", "/tmp/example.png")
46
+ end
47
+ end
48
+
49
+ describe "#doc" do
50
+ it "should return Nokogiri::HTML::Document object" do
51
+ d = RCrawler::Driver.new
52
+ d.page.should_receive(:html).and_return("<html></html>")
53
+ expect(d.doc.is_a? Nokogiri::HTML::Document).to be_true
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,54 @@
1
+ # coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe RCrawler do
6
+ describe ".crawl" do
7
+ it "method should be defined" do
8
+ expect(RCrawler.respond_to? :crawl).to be_true
9
+ end
10
+
11
+ it "exception should be thrown if no argument" do
12
+ expect{RCrawler.crawl}.to raise_error(ArgumentError)
13
+ end
14
+
15
+ it "Crawl object should be created" do
16
+ mock = double("Crawl mock")
17
+ RCrawler::Crawl.should_receive(:new).and_return(mock)
18
+ mock.should_receive(:instance_eval)
19
+ RCrawler.crawl {}
20
+ end
21
+ end
22
+
23
+ describe ".async" do
24
+ it "method should be defined" do
25
+ expect(RCrawler.respond_to? :async).to be_true
26
+ end
27
+
28
+ it "exception should be thrown if no argument" do
29
+ expect{RCrawler.async}.to raise_error(ArgumentError)
30
+ end
31
+
32
+ it "Async object should be created" do
33
+ mock = double("Async mock")
34
+ RCrawler::Async.should_receive(:new).and_return(mock)
35
+ mock.should_receive(:instance_eval)
36
+ mock.should_receive(:execute)
37
+ RCrawler.async {}
38
+ end
39
+ end
40
+
41
+ describe ".configure" do
42
+ it "return configuration object" do
43
+ expect(RCrawler.configure.is_a? RCrawler::Configuration).to be_true
44
+ end
45
+
46
+ it "set config value from block" do
47
+ RCrawler.configure do |c|
48
+ c.threads = 10
49
+ end
50
+ expect(RCrawler.configure.threads).not_to eq(8)
51
+ expect(RCrawler.configure.threads).to eq(10)
52
+ end
53
+ end
54
+ end
@@ -1,15 +1,16 @@
1
- # require "simplecov"
2
- # require "coveralls"
3
- # Coveralls.wear!
1
+ require "simplecov"
2
+ require "coveralls"
3
+ Coveralls.wear!
4
4
 
5
5
  # SimpleCov.formatter = SimpleCov::Formatter::HTMLFormatter
6
- # SimpleCov.start do
7
- # add_filter "spec"
8
- # add_filter ".bundle"
9
- # end
6
+ SimpleCov.start do
7
+ add_filter "spec"
8
+ add_filter ".bundle"
9
+ end
10
10
 
11
11
  require "rcrawler"
12
12
 
13
13
  RSpec.configure do |config|
14
14
  config.order = "random"
15
+ config.after {RCrawler::Configuration.instance.reset}
15
16
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - i2bskn
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-02 00:00:00.000000000 Z
11
+ date: 2013-07-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -115,16 +115,25 @@ executables: []
115
115
  extensions: []
116
116
  extra_rdoc_files: []
117
117
  files:
118
+ - .coveralls.yml
118
119
  - .gitignore
120
+ - .travis.yml
119
121
  - Gemfile
120
122
  - LICENSE.txt
121
123
  - README.md
122
124
  - Rakefile
123
125
  - lib/rcrawler.rb
126
+ - lib/rcrawler/async.rb
127
+ - lib/rcrawler/configuration.rb
124
128
  - lib/rcrawler/crawl.rb
125
129
  - lib/rcrawler/driver.rb
126
130
  - lib/rcrawler/version.rb
127
131
  - rcrawler.gemspec
132
+ - spec/rcrawler/async_spec.rb
133
+ - spec/rcrawler/configuration_spec.rb
134
+ - spec/rcrawler/crawl_spec.rb
135
+ - spec/rcrawler/driver_spec.rb
136
+ - spec/rcrawler_spec.rb
128
137
  - spec/spec_helper.rb
129
138
  homepage: https://github.com/i2bskn/rcrawler
130
139
  licenses:
@@ -146,9 +155,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
146
155
  version: '0'
147
156
  requirements: []
148
157
  rubyforge_project:
149
- rubygems_version: 2.0.2
158
+ rubygems_version: 2.0.0
150
159
  signing_key:
151
160
  specification_version: 4
152
161
  summary: The wrapper of capybara for crawler
153
162
  test_files:
163
+ - spec/rcrawler/async_spec.rb
164
+ - spec/rcrawler/configuration_spec.rb
165
+ - spec/rcrawler/crawl_spec.rb
166
+ - spec/rcrawler/driver_spec.rb
167
+ - spec/rcrawler_spec.rb
154
168
  - spec/spec_helper.rb