botch 0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8ef71ba47deb27d75990a88721b652a6974b0ac4
4
+ data.tar.gz: 6e2db20439b70a7f34285b44ee79246cf88ccf05
5
+ SHA512:
6
+ metadata.gz: f7d6b4d3cd2f94ff018fb70be5d8e71237f52bc82404167aafd34f08b4f473a830b3cda4d52aefe792d65bef5659fa3df0054b2e9d9554d62da2c77b1d39ec9c
7
+ data.tar.gz: 09725780996179fdd914675a15f06bcf792819f02e0a20bcc5a9403bfcde08a859e8ff8536a5f1cba006c2d49dfce23630249a2e540da0c4237fe712a29bd2fa
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # Botch
2
+
3
+ Botch is a simple DSL for quickly creating web crawlers.
4
+
5
+ Inspired by Sinatra.
6
+
7
+ ## Usage
8
+
9
+ ```ruby
10
+ require 'botch'
11
+ require 'kconv'
12
+
13
+ class SampleBotch < Botch::Base
14
+ set :user_agent, "SampleBotch"
15
+
16
+ filter(:all) { status == 200 }
17
+ rule(:all) { |response| body.toutf8 }
18
+ end
19
+
20
+ if $0 == __FILE__
21
+ SampleBotch.run("http://namusyaka.info/") do |response|
22
+ puts response
23
+ end
24
+ end
25
+ ```
26
+
27
+ ## TODO
28
+
29
+ - RSpec
30
+ - GET/POST method
31
+ - Documentation
32
+ - Classic style
33
+
34
+ ## Contributing to Botch
35
+
36
+ 1. Fork the project.
37
+ 2. Create your feature branch. (`git checkout -b my-feature`)
38
+ 3. Commit your changes. (`git commit -am 'commit message.'`)
39
+ 4. Push to the branch. (`git push origin my-feature`)
40
+ 5. Send a pull request.
41
+
42
+ ## License
43
+
44
+ MIT
45
+
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'rspec/core/rake_task'
2
+
3
# Defines the `rspec` rake task, which runs every spec/*_spec.rb file with the
# progress formatter and coloured output.
desc "Run all specs."
RSpec::Core::RakeTask.new(:rspec) do |spec|
  spec.pattern = 'spec/*_spec.rb'
  spec.rspec_opts = %w(--format p --color)
end

# Let a bare `rake` run the spec suite instead of doing nothing.
task :default => :rspec
data/botch.gemspec ADDED
@@ -0,0 +1,16 @@
1
+ require File.expand_path("../lib/botch", __FILE__)
2
+
3
# Gem specification for botch. The version is read from Botch::VERSION, which
# the require at the top of this file loads from lib/botch.
Gem::Specification.new "botch", Botch::VERSION do |s|
  s.description = "Botch is a DSL for quickly creating web crawlers. Inspired by Sinatra."
  s.summary     = "A DSL for web crawlers."
  s.authors     = ["namusyaka"]
  s.email       = "namusyaka@gmail.com"
  s.homepage    = "https://github.com/namusyaka/botch"
  # The README declares the project MIT-licensed; expose that in the gem metadata.
  s.license     = "MIT"
  # Package everything tracked by git except the ignore file itself.
  s.files       = `git ls-files`.split("\n") - %w(.gitignore)
  s.test_files  = s.files.select { |path| path =~ /^spec\/.*_spec\.rb/ }

  s.add_dependency "faraday"
  s.add_dependency "mechanize"
  s.add_development_dependency "rspec"
  s.add_development_dependency "fakeweb", ["~> 1.3"]
end
data/lib/botch.rb ADDED
@@ -0,0 +1,5 @@
1
+ require File.expand_path('../botch/base', __FILE__)
2
+
3
module Botch
  # Version of the gem; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1".freeze
end
data/lib/botch/base.rb ADDED
@@ -0,0 +1,204 @@
1
+ require 'rubygems' unless defined?(Gem)
2
+ require 'faraday'
3
+ require 'mechanize'
4
+
5
+ %w(
6
+ clients/abstract_client
7
+ clients/faraday_client
8
+ clients/mechanize_client
9
+ ).each{ |path| require File.expand_path("../#{path}", __FILE__) }
10
+
11
+ module Botch
12
# Ordered registry of labelled route entries.
#
# Each entry is a Hash holding the options passed to #add plus two keys set by
# the registry itself: :label (the identifier) and :block (the callable). The
# optional :map option restricts the URLs an entry applies to (see #inject).
class Route
  attr_accessor :routes

  def initialize
    @routes = []
  end

  # Registers +block+ under +label+, or updates the entry already stored
  # under that label (its position in the list is kept).
  #
  # @param label [Object] identifier of the entry, compared with ==
  # @param options [Hash] extra entry data such as :map; never mutated
  # @raise [ArgumentError] when no block is given
  # @return [Array<Hash>, Object] the routes list when a new entry was
  #   appended, otherwise the label of the updated entry
  def add(label, options = {}, &block)
    raise ArgumentError, "a block is required" unless block_given?

    if (position = index(label))
      route = @routes[position]
      # Honour options given on redefinition instead of silently dropping them.
      route.update(options)
      route[:block] = block
      route[:label] = label
    else
      # Store a copy: writing :block/:label into the caller's hash would make
      # two labels registered with the same options hash share one entry.
      @routes << options.merge(:block => block, :label => label)
    end
  end

  # Removes every entry stored under +label+ and returns the routes list.
  def del(label)
    @routes.delete_if { |route| route[:label] == label }
  end

  # @return [Boolean] whether an entry is stored under +label+
  def exist?(label)
    !index(label).nil?
  end

  alias_method :exists?, :exist?

  # @return [Integer, nil] position of the entry stored under +label+
  def index(label)
    # Plain equality, matching #del; === would pattern-match Regexp/Class labels.
    @routes.index { |route| route[:label] == label }
  end

  # Returns, in registration order, the entries whose :map option accepts +url+.
  def inject(url)
    @routes.select { |route| map_validation(url, route[:map]) }
  end

  private

  # A Regexp map must match the URL, a String map must be contained in it;
  # any other map value (including nil, i.e. no :map option) accepts every URL.
  def map_validation(url, map)
    case map
    when Regexp then url =~ map
    when String then url.include?(map)
    else true
    end
  end
end
64
+
65
# Define the Filter and Rule registries (both plain subclasses of Route) inside
# the enclosing module rather than on Object, so an application's own
# top-level Filter/Rule constants are never clobbered. The top-level names are
# still provided as aliases when they are free, for code that relied on them.
%w( Filter Rule ).each do |klass|
  route_class = const_set(klass, Class.new(Route))
  Object.const_set(klass, route_class) unless Object.const_defined?(klass)
end
66
+
67
# Base class of a crawler. Subclasses configure themselves with the class-level
# DSL (set / filter / rule / helpers) and are executed with Base.run.
#
# Filter, rule and run blocks are evaluated as methods of one memoized instance
# of the class, so inside them `header`, `body` and `status` return the data of
# the response currently being processed.
class Base
  # Per-request state exposed to the blocks, mapped to the values it is reset to.
  DEFAULT_INSTANCE_VARIABLES = { :header => nil, :body => nil, :status => nil }.freeze
  attr_reader(*DEFAULT_INSTANCE_VARIABLES.keys)

  def initialize
    @header = @body = @status = nil
  end

  # The HTTP client object configured on the class (nil until one is set).
  def client
    self.class.client
  end

  # Class-level options hash (handed to the client on every request).
  def options
    self.class.options
  end

  # Class-level settings hash.
  def settings
    self.class.settings
  end

  class << self
    attr_reader :client

    # The single instance on which all blocks of this class are evaluated.
    def instance
      @instance ||= new
    end

    # Adds helper methods: evaluates the block in the class body and/or
    # includes the given modules.
    def helpers(*extensions, &block)
      class_eval(&block) if block_given?
      include(*extensions) if extensions.any?
    end

    # Stores a configuration value. Resolution order: a class-level writer
    # named "<key>=" (e.g. client=), then an existing settings key, and
    # finally the free-form options hash.
    def set(key, value = nil)
      writer = "#{key}="
      return __send__(writer, value) if respond_to?(writer)

      name = key.to_sym
      return settings[name] = value if settings.key?(name)

      options[name] = value
    end

    # Registers +block+ as a route of +type+ (:filter or :rule) under +label+.
    # The block is turned into a method bound to #instance so that it can use
    # the instance's readers and instance variables.
    def route(type, label, options = {}, &block)
      bound = generate_method("#{type} #{label}", &block).bind(instance)
      routes[type.to_sym].add(label, options, &generate_wrapper(&bound))
    end

    # Registers a filter; its block's result decides whether a response is valid.
    def filter(label, options = {}, &block)
      route(:filter, label, options, &block)
    end

    # Registers a rule; its block's result replaces the response.
    def rule(label, options = {}, &block)
      route(:rule, label, options, &block)
    end

    # Wraps +method+ in a proc that is always called with one argument and
    # forwards only what the method accepts. A one-argument method receives
    # the value untouched: splatting it would break up Arrays, mangle Hashes
    # and drop nil.
    def generate_wrapper(&method)
      case method.arity
      when 0 then proc { |_arg| method.call }
      when 1 then proc { |arg| method.call(arg) }
      else        proc { |args| method.call(*args) }
      end
    end

    # Converts +block+ into an UnboundMethod of this class by defining it
    # under +method_name+ and removing that method again straight away.
    def generate_method(method_name, &block)
      define_method(method_name, &block)
      unbound_method = instance_method(method_name)
      remove_method(method_name)
      unbound_method
    end

    # Restores the default settings and returns them. (This used to assign a
    # local variable and therefore did nothing.)
    def reset!
      @settings = nil
      settings
    end

    # Fetches every URL in order and returns one result per URL.
    #
    # For each URL the matching filters and rules are looked up, the page is
    # fetched, all filters run (the response is valid when every one returns
    # a truthy value), the rules run as a chain, and finally the optional
    # block is called with the response and the validity flag. The result is
    # nil for a URL skipped because it is invalid and :disabled_invalid is set.
    def run(*urls, &block)
      callback = block_given? ? generate_callback(&block) : nil
      set_default_options! unless client

      urls.map { |url| crawl(url, callback) }
    end

    # Selects the client by name (:faraday or :mechanize, Symbol or String)
    # and builds it with the current settings. Unknown names are ignored.
    def client=(name)
      key = name.to_s.to_sym
      @client = Client.const_get("#{key.to_s.capitalize}Client").new(settings) if clients.include?(key)
    end

    def settings
      @settings ||= { :disabled_invalid => false }
    end

    def options
      @options ||= {}
    end

    private

    # Route tables of this class. Kept per class: the registered blocks are
    # bound to this class's own instance, so sharing them between classes
    # would run them against an instance that never receives response data.
    def routes
      @routes ||= { :filter => Filter.new, :rule => Rule.new }
    end

    # Binds the block given to .run to the instance and adapts it so it can
    # always be called with (response, valid), whatever its arity.
    def generate_callback(&block)
      bound = generate_method(:main_unbound_method, &block).bind(instance)
      case bound.arity
      when 2 then proc { |response, valid| bound.call(response, valid) }
      when 1 then proc { |response, _valid| bound.call(response) }
      else        proc { |_response, _valid| bound.call }
      end
    end

    # Processes a single URL; see .run for the sequence of steps.
    def crawl(url, callback)
      filters = routes[:filter].inject(url)
      rules   = routes[:rule].inject(url)
      fetched = client.get(url, options)

      set_instance_variables(:header => fetched[:header],
                             :body   => fetched[:body],
                             :status => fetched[:status])
      response = fetched[:response]

      valid = nil
      unless filters.empty?
        # Run every filter (no short-circuit) because filters may have side effects.
        valid = filters.map { |entry| entry[:block].call(response) }.all?
        return nil if settings[:disabled_invalid] && !valid
      end

      unless rules.empty?
        # Each rule receives the previous rule's result, or the original
        # response while no rule has produced a truthy value yet.
        response = rules.inject(nil) { |result, entry| entry[:block].call(result || response) }
      end

      callback ? callback.call(response, valid) : response
    ensure
      # Always clear the per-request state, also for skipped URLs.
      set_instance_variables(DEFAULT_INSTANCE_VARIABLES)
    end

    def set_instance_variables(pairs = {})
      pairs.each_pair { |name, value| instance.instance_variable_set("@#{name}", value) }
    end

    # Names accepted by client=.
    def clients
      @_clients ||= [:faraday, :mechanize]
    end

    def set_default_options!
      self.client = :faraday
    end
  end
end
204
+ end
@@ -0,0 +1,15 @@
1
+ module Botch
2
+ module Client
3
# Common interface of the HTTP clients. Concrete clients set @client to their
# identifier and implement #get.
class AbstractClient
  attr_reader :client

  # Accepts (and ignores) an optional settings hash so that every client can
  # be constructed the same way as the concrete subclasses.
  def initialize(settings = {})
    @client = nil
  end

  # Fetches +url+ and returns a response object; must be overridden.
  #
  # @raise [NotImplementedError] always, instead of silently returning nil
  def get(url, options = {})
    raise NotImplementedError, "#{self.class}#get must be implemented by a concrete client"
  end
end
14
+ end
15
+ end
@@ -0,0 +1,28 @@
1
+ module Botch
2
+ module Client
3
# HTTP client backed by a Faraday connection.
class FaradayClient < AbstractClient
  # Builds the Faraday connection from +settings+.
  def initialize(settings = {})
    @client  = :faraday
    @handler = Faraday.new(settings) do |builder|
      # Request middleware first: Faraday requires the adapter to be the
      # last entry of the stack.
      builder.request :url_encoded
      builder.adapter :net_http
    end
  end

  # Copies every option onto the connection's headers, performs a GET request
  # for +url+ and returns the parsed response (see #parse_response).
  def get(url, options = {})
    options.each_pair { |key, value| @handler.headers[key] = value }
    parse_response(@handler.get(url))
  end

  # Converts a Faraday response into a hash with the keys :status, :header,
  # :body and :response (the response object itself).
  def parse_response(response)
    {
      :status   => response.status,
      :header   => response.headers,
      :body     => response.body,
      :response => response
    }
  end
end
27
+ end
28
+ end
@@ -0,0 +1,36 @@
1
+ module Botch
2
+ module Client
3
# Stand-in for a page when a request failed: wraps an error that carries a
# response code and exposes the readers a response needs (code, header, body).
# The body is always empty and the header set is always a new, empty one.
class MechanizeResponseError
  attr_accessor :code, :header, :body
  # The wrapped error object (previously stored but not readable).
  attr_reader :response

  # @param response_error [#response_code] the error raised by the request
  def initialize(response_error)
    @code     = response_error.response_code
    @body     = ""
    @header   = Mechanize::Headers.new
    @response = response_error
  end
end
13
+
14
# HTTP client backed by a Mechanize agent.
class MechanizeClient < AbstractClient
  # +options+ is accepted for interface compatibility and is not used.
  def initialize(options = {})
    @client  = :mechanize
    @handler = Mechanize.new
  end

  # Fetches +url+ (using options[:user_agent] as the user agent when given)
  # and returns the parsed response (see #parse_response).
  def get(url, options = {})
    @handler.user_agent = options[:user_agent] if options[:user_agent]
    parse_response(fetch(url))
  end

  # Converts a page (or a MechanizeResponseError) into a hash with the keys
  # :header, :status (as an Integer), :body and :response.
  def parse_response(response)
    {
      :header   => response.header,
      :status   => response.code.to_i,
      :body     => response.body,
      :response => response
    }
  end

  private

  # Performs the request. An error that carries a response code is wrapped
  # in a MechanizeResponseError so it can be processed like a page; any other
  # error is re-raised unchanged instead of being masked by a NoMethodError.
  def fetch(url)
    @handler.get(url)
  rescue StandardError => error
    raise unless error.respond_to?(:response_code)

    MechanizeResponseError.new(error)
  end
end
35
+ end
36
+ end
@@ -0,0 +1,67 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+
4
# Specs for the Botch::Base DSL (settings, options, client selection and
# instance-variable sharing between blocks). Boolean expectations use eq(true)
# because be_true is not available in RSpec 3.
module Botch
  describe do
    it "Should have a version." do
      expect(Botch.const_defined?("VERSION")).to eq(true)
    end
  end

  describe Base do
    before(:all) do
      @fake = Fake.new
      class SampleBotch < Botch::Base; end
      # Running once makes the class pick its default client.
      SampleBotch.run(@fake.url) {}
    end

    it 'Default client should be faraday.' do
      expect(SampleBotch.client).to be_an_instance_of(Botch::Client::FaradayClient)
    end

    describe "settings and options" do
      before(:all) do
        class SampleBotch < Botch::Base
          set :user_agent, "SampleBotch User-Agent"
          set :client, :mechanize
          set :disabled_invalid, true
          set :original_option, "foobar"
        end
        @options = SampleBotch.options
        @settings = SampleBotch.settings
      end

      it "Original options should be stored in options." do
        expect(@options[:original_option]).to eq("foobar")
      end

      it ":user_agent should be stored in options." do
        expect(@options[:user_agent]).to eq("SampleBotch User-Agent")
      end

      it ":disabled_invalid should be stored in settings." do
        expect(@settings[:disabled_invalid]).to eq(true)
      end

      it "Client setter should be valid." do
        expect(SampleBotch.client).to be_an_instance_of(Botch::Client::MechanizeClient)
      end
    end

    describe "instance variable" do
      before(:all) do
        class SampleBotch < Botch::Base
          set :user_agent, "SampleBotch User-Agent"
          set :disabled_invalid, false

          filter(:all) { @test = "test" }
          rule(:all) { @test }
        end
      end

      it "should be able to use instance variable." do
        expect(SampleBotch.run(@fake.url)[0]).to eq("test")
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+
4
# Specs for crawling through the Faraday client against three faked pages
# (200, 404 and 500). Boolean expectations use eq(true)/eq(false) because
# be_true/be_false are not available in RSpec 3.
module Botch
  describe Client::FaradayClient do
    before(:all) do
      FakeWeb.clean_registry
      @fakes = []
      @fakes << Fake.new("/", :status => [200, "OK"], :content_type => "text/html")
      @fakes << Fake.new("/test1", :status => [404, "Not Found"], :content_type => "text/html")
      @fakes << Fake.new("/test2", :status => [500, "Internal Server Error"], :content_type => "text/html")

      class SampleBotch < Botch::Base
        set :user_agent, "SampleBotch User-Agent"
        set :client, :faraday
        set :disabled_invalid, nil
        filter(:all){ status == 200 }
        rule(:all){ status }
      end
    end

    it 'client should be faraday if set :faraday to :client.' do
      expect(SampleBotch.run(@fakes[0].url) { client }[0]).to be_an_instance_of(Botch::Client::FaradayClient)
    end

    it 'helpers should return valid data.' do
      result = SampleBotch.run(@fakes[0].url) do
        { :status => status, :header => header, :body => body }
      end
      result = result[0]
      expect(result[:status]).to eq(200)
      expect(result[:header]).to be_an_instance_of(Faraday::Utils::Headers)
      expect(result[:body]).to be_an_instance_of(String)
    end

    it 'block argument of #rule should replace last expression.' do
      result = SampleBotch.run(*@fakes.map(&:url))
      expect(result[0]).to eq(200)
      expect(result[1]).to eq(404)
      expect(result[2]).to eq(500)
    end

    it 'block argument of #run should replace last expression.' do
      result = SampleBotch.run(*@fakes.map(&:url)){ "Foo" }
      expect(result[0]).to eq("Foo")
      expect(result[1]).to eq("Foo")
      expect(result[2]).to eq("Foo")
    end

    it 'the second argument should be boolean.' do
      result = SampleBotch.run(*@fakes.map(&:url)){ |response, valid| valid }
      expect(result[0]).to eq(true)
      expect(result[1]).to eq(false)
      expect(result[2]).to eq(false)
    end
  end
end
@@ -0,0 +1,57 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+
4
# Specs for crawling through the Mechanize client against three faked pages
# (200, 404 and 500). Boolean expectations use eq(true)/eq(false) because
# be_true/be_false are not available in RSpec 3.
module Botch
  describe Client::MechanizeClient do
    before(:all) do
      FakeWeb.clean_registry
      @fakes = []
      @fakes << Fake.new("/", :status => [200, "OK"], :content_type => "text/html")
      @fakes << Fake.new("/test1", :status => [404, "Not Found"], :content_type => "text/html")
      @fakes << Fake.new("/test2", :status => [500, "Internal Server Error"], :content_type => "text/html")

      class SampleBotch < Botch::Base
        set :user_agent, "SampleBotch User-Agent"
        set :client, :mechanize
        set :disabled_invalid, nil
        filter(:all){ status == 200 }
        rule(:all){ status }
      end
    end

    it 'client should be mechanize if set :mechanize to :client.' do
      expect(SampleBotch.run(@fakes[0].url) { client }[0]).to be_an_instance_of(Botch::Client::MechanizeClient)
    end

    it 'helpers should return valid data.' do
      result = SampleBotch.run(@fakes[0].url) do
        { :status => status, :header => header, :body => body }
      end
      result = result[0]
      expect(result[:status]).to eq(200)
      expect(result[:header]).to be_an_instance_of(Mechanize::Headers)
      expect(result[:body]).to be_an_instance_of(String)
    end

    it 'block argument of #rule should replace last expression.' do
      result = SampleBotch.run(*@fakes.map(&:url))
      expect(result[0]).to eq(200)
      expect(result[1]).to eq(404)
      expect(result[2]).to eq(500)
    end

    it 'block argument of #run should replace last expression.' do
      result = SampleBotch.run(*@fakes.map(&:url)){ "Foo" }
      expect(result[0]).to eq("Foo")
      expect(result[1]).to eq("Foo")
      expect(result[2]).to eq("Foo")
    end

    it 'the second argument should be boolean.' do
      result = SampleBotch.run(*@fakes.map(&:url)){ |response, valid| valid }
      expect(result[0]).to eq(true)
      expect(result[1]).to eq(false)
      expect(result[2]).to eq(false)
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'botch'
2
+ require 'fakeweb'
3
+
4
+ FakeWeb.allow_net_connect = false
5
+
6
+ module Botch
7
+ SPEC_DOMAIN = 'example.com'
8
+
9
# A fake web page: on construction it registers a GET response for its URL
# with FakeWeb, so specs can crawl it without network access.
class Fake
  # @param path [String] path of the page below SPEC_DOMAIN
  # @param options [Hash] :scheme (default "http"), :content_type
  #   (default "text/html") and :status (default [200, "OK"])
  def initialize(path = "/", options = {})
    @path         = path
    @scheme       = options[:scheme] || "http"
    @content_type = options[:content_type] || "text/html"
    @status       = options[:status] || [200, "OK"]
    add_to_fakeweb
  end

  # Absolute URL of the page.
  def url
    "#{@scheme}://#{SPEC_DOMAIN}#{@path}"
  end

  # HTML document served for the page; built on first use.
  def body
    @body ||= <<-HTML
      <html>
        <head>
          <title>Fake page #{@path}</title>
        </head>
        <body>
        </body>
      </html>
    HTML
  end

  # Registers the page with FakeWeb.
  def add_to_fakeweb
    # Call #body rather than reading @body: the document is built lazily, so
    # the instance variable is still nil when this runs from #initialize.
    options = { :body => body, :content_type => @content_type, :status => @status }
    FakeWeb.register_uri(:get, url, options)
  end
end
39
+ end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: botch
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - namusyaka
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: faraday
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: mechanize
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fakeweb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: '1.3'
69
+ description: Botch is a DSL for quickly creating web crawlers. Inspired by Sinatra.
70
+ email: namusyaka@gmail.com
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - Gemfile
76
+ - README.md
77
+ - Rakefile
78
+ - botch.gemspec
79
+ - lib/botch.rb
80
+ - lib/botch/base.rb
81
+ - lib/botch/clients/abstract_client.rb
82
+ - lib/botch/clients/faraday_client.rb
83
+ - lib/botch/clients/mechanize_client.rb
84
+ - spec/botch_spec.rb
85
+ - spec/faraday_spec.rb
86
+ - spec/mechanize_spec.rb
87
+ - spec/spec_helper.rb
88
+ homepage: https://github.com/namusyaka/botch
89
+ licenses: []
90
+ metadata: {}
91
+ post_install_message:
92
+ rdoc_options: []
93
+ require_paths:
94
+ - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - '>='
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ requirements: []
106
+ rubyforge_project:
107
+ rubygems_version: 2.0.2
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: A DSL for web clawler.
111
+ test_files:
112
+ - spec/botch_spec.rb
113
+ - spec/faraday_spec.rb
114
+ - spec/mechanize_spec.rb
115
+ has_rdoc: