botch 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8ef71ba47deb27d75990a88721b652a6974b0ac4
4
+ data.tar.gz: 6e2db20439b70a7f34285b44ee79246cf88ccf05
5
+ SHA512:
6
+ metadata.gz: f7d6b4d3cd2f94ff018fb70be5d8e71237f52bc82404167aafd34f08b4f473a830b3cda4d52aefe792d65bef5659fa3df0054b2e9d9554d62da2c77b1d39ec9c
7
+ data.tar.gz: 09725780996179fdd914675a15f06bcf792819f02e0a20bcc5a9403bfcde08a859e8ff8536a5f1cba006c2d49dfce23630249a2e540da0c4237fe712a29bd2fa
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # Botch
2
+
3
+ Botch is a simple DSL for quickly creating web crawlers.
4
+
5
+ Inspired by Sinatra.
6
+
7
+ ## Usage
8
+
9
+ ```ruby
10
+ require 'botch'
11
+ require 'kconv'
12
+
13
+ class SampleBotch < Botch::Base
14
+ set :user_agent, "SampleBotch"
15
+
16
+ filter(:all) { status == 200 }
17
+ rule(:all) { |response| body.toutf8 }
18
+ end
19
+
20
+ if $0 == __FILE__
21
+ SampleBotch.run("http://namusyaka.info/") do |response|
22
+ puts response
23
+ end
24
+ end
25
+ ```
26
+
27
+ ## TODO
28
+
29
+ - RSpec
30
+ - GET/POST method
31
+ - Documentation
32
+ - Classic style
33
+
34
+ ## Contributing to Botch
35
+
36
+ 1. fork the project.
37
+ 2. create your feature branch. (`git checkout -b my-feature`)
38
+ 3. commit your changes. (`git commit -am 'commit message.'`)
39
+ 4. push to the branch. (`git push origin my-feature`)
40
+ 5. send pull request.
41
+
42
+ ## License
43
+
44
+ MIT
45
+
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'rspec/core/rake_task'
2
+
3
+ desc "Run all specs."
4
+ RSpec::Core::RakeTask.new(:rspec) do |spec|
5
+ spec.pattern = 'spec/*_spec.rb'
6
+ spec.rspec_opts = %w(--format p --color)
7
+ end
data/botch.gemspec ADDED
@@ -0,0 +1,16 @@
1
+ require File.expand_path("../lib/botch", __FILE__)
2
+
3
# Gem specification for botch. The version comes from Botch::VERSION, which is
# loaded by the require at the top of this file.
Gem::Specification.new "botch", Botch::VERSION do |s|
  s.description = "Botch is a DSL for quickly creating web crawlers. Inspired by Sinatra."
  s.summary     = "A DSL for web crawlers."
  s.authors     = ["namusyaka"]
  s.email       = "namusyaka@gmail.com"
  s.homepage    = "https://github.com/namusyaka/botch"
  # The README declares the project as MIT licensed.
  s.license     = "MIT"

  # Package everything tracked by git except the ignore file.
  s.files       = `git ls-files`.split("\n") - %w(.gitignore)
  s.test_files  = s.files.select { |path| path =~ /^spec\/.*_spec\.rb/ }

  s.add_dependency "faraday"
  s.add_dependency "mechanize"
  s.add_development_dependency "rspec"
  s.add_development_dependency "fakeweb", ["~> 1.3"]
end
data/lib/botch.rb ADDED
@@ -0,0 +1,5 @@
1
+ require File.expand_path('../botch/base', __FILE__)
2
+
3
module Botch
  # Gem version; also read by botch.gemspec. Frozen so it cannot be mutated
  # in place at runtime.
  VERSION = "0.1".freeze
end
data/lib/botch/base.rb ADDED
@@ -0,0 +1,204 @@
1
+ require 'rubygems' unless defined?(Gem)
2
+ require 'faraday'
3
+ require 'mechanize'
4
+
5
+ %w(
6
+ clients/abstract_client
7
+ clients/faraday_client
8
+ clients/mechanize_client
9
+ ).each{ |path| require File.expand_path("../#{path}", __FILE__) }
10
+
11
+ module Botch
12
+ class Route
13
+ attr_accessor :routes
14
+
15
+ def initialize
16
+ @routes = []
17
+ self
18
+ end
19
+
20
+ def add(label, options = {}, &block)
21
+ raise ArgumentError unless block_given?
22
+ if position = index(label)
23
+ route = @routes[position]
24
+ route[:block] = block
25
+ route[:label] = label
26
+ else
27
+ options[:block] = block
28
+ options[:label] = label
29
+ @routes << options
30
+ end
31
+ end
32
+
33
+ def del(label)
34
+ @routes.delete_if{ |route| route[:label] == label }
35
+ end
36
+
37
+ def exist?(label)
38
+ !!index(label)
39
+ end
40
+
41
+ alias :exists? :exist?
42
+
43
+ def index(label)
44
+ @routes.index{ |route| route[:label] === label }
45
+ end
46
+
47
+ def inject(url)
48
+ @routes.inject([]) do |result, route|
49
+ result << route if map_validation(url, route[:map])
50
+ result
51
+ end
52
+ end
53
+
54
+ private
55
+
56
+ def map_validation(url, map)
57
+ case map.class.to_s
58
+ when "Regexp" then url =~ map
59
+ when "String" then url.include?(map)
60
+ else true
61
+ end
62
+ end
63
+ end
64
+
65
+ %w( Filter Rule ).each { |klass| Object.const_set(klass, Class.new(Route)) }
66
+
67
+ class Base
68
+ DEFAULT_INSTANCE_VARIABLES = { :header => nil, :body => nil, :status => nil }
69
+ attr_reader(*DEFAULT_INSTANCE_VARIABLES.keys)
70
+
71
+ def initialize
72
+ @header, @body = nil, nil
73
+ end
74
+
75
+ def client
76
+ self.class.client
77
+ end
78
+
79
+ def options
80
+ self.class.options
81
+ end
82
+
83
+ def settings
84
+ self.class.settings
85
+ end
86
+
87
+ class << self
88
+ @@routes = { :filter => Filter.new, :rule => Rule.new }
89
+
90
+ attr_reader :client
91
+
92
+ def instance
93
+ @instance ||= self.new
94
+ end
95
+
96
+ def helpers(*extensions, &block)
97
+ class_eval(&block) if block_given?
98
+ include(*extensions) if extensions.any?
99
+ end
100
+
101
+ def set(key, value = nil)
102
+ return __send__("#{key}=", value) if respond_to?("#{key}=")
103
+
104
+ key_symbol = key.to_sym
105
+ return settings[key_symbol] = value if settings.has_key?(key_symbol)
106
+
107
+ options[key_symbol] = value
108
+ end
109
+
110
+ def route(type, label, options = {}, &block)
111
+ unbound_method = generate_method("#{type} #{label}", &block).bind(instance)
112
+ wrapper = generate_wrapper(&unbound_method)
113
+
114
+ @@routes[type.to_sym].add(label, options, &wrapper)
115
+ end
116
+
117
+ def filter(label, options = {}, &block)
118
+ route(:filter, label, options, &block)
119
+ end
120
+
121
+ def rule(label, options = {}, &block)
122
+ route(:rule, label, options, &block)
123
+ end
124
+
125
+ def generate_wrapper(&method)
126
+ method.arity != 0 ? proc {|args| method.call(*args) } :
127
+ proc {|args| method.call }
128
+ end
129
+
130
+ def generate_method(method_name, &block)
131
+ define_method(method_name, &block)
132
+ unbound_method = instance_method(method_name)
133
+ remove_method(method_name)
134
+ unbound_method
135
+ end
136
+
137
+ def reset!
138
+ settings = {}
139
+ end
140
+
141
+ def run(*urls, &block)
142
+ if block_given?
143
+ unbound_method = generate_method(:main_unbound_method, &block).bind(instance)
144
+ block = case unbound_method.arity
145
+ when 2 then proc{|r,v| unbound_method.call(r, v) }
146
+ when 1 then proc{|r,v| unbound_method.call(r) }
147
+ else proc{|r,v| unbound_method.call }
148
+ end
149
+ end
150
+ set_default_options! unless self.client
151
+
152
+ urls.map do |url|
153
+ filters, rules = @@routes.map{ |k, v| v.inject(url) }
154
+ response = self.client.get(url, options)
155
+
156
+ set_instance_variables(:header => response[:header],
157
+ :body => response[:body],
158
+ :status => response[:status])
159
+
160
+ response = response[:response]
161
+
162
+ unless filters.empty?
163
+ valid = filters.map{ |_filter| _filter[:block].call(response) }.all?
164
+ next if settings[:disabled_invalid] && !valid
165
+ end
166
+
167
+ response = rules.inject(nil) { |result, _rule|
168
+ _rule[:block].call((result || response))
169
+ } unless rules.empty?
170
+
171
+ response = block.call(response, valid) if block_given?
172
+ set_instance_variables(DEFAULT_INSTANCE_VARIABLES)
173
+ response
174
+ end
175
+ end
176
+
177
+ def client=(name)
178
+ @client = Client.const_get("#{name.to_s.capitalize}Client").new(settings) if clients.include?(name)
179
+ end
180
+
181
+ def settings
182
+ @settings ||= { :disabled_invalid => false }
183
+ end
184
+
185
+ def options
186
+ @options ||= {}
187
+ end
188
+
189
+ private
190
+
191
+ def set_instance_variables(pairs = {})
192
+ pairs.each_pair { |name, value| instance.instance_variable_set("@#{name}".to_sym, value) }
193
+ end
194
+
195
+ def clients
196
+ @_clients ||= [:faraday, :mechanize]
197
+ end
198
+
199
+ def set_default_options!
200
+ self.client = :faraday
201
+ end
202
+ end
203
+ end
204
+ end
@@ -0,0 +1,15 @@
1
module Botch
  module Client
    # Common interface of the HTTP clients used by Botch::Base.
    #
    # Concrete clients set @client to their own name and implement #get so
    # that it returns a Hash with :status, :header, :body and :response.
    class AbstractClient
      attr_reader :client

      # Accepts (and ignores) an optional settings hash so that every client
      # class can be built the same way, i.e. with +new(settings)+.
      def initialize(_settings = {})
        @client = nil
      end

      # Placeholder to be overridden by subclasses; this implementation
      # performs no request and returns nil.
      def get(url, options = {})
        # return a response object
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module Botch
  module Client
    # HTTP client backed by a Faraday connection.
    class FaradayClient < AbstractClient
      # settings - Hash handed straight to Faraday.new.
      #
      # NOTE(review): Botch::Base passes its own settings hash here, which
      # contains Botch-specific keys such as :disabled_invalid; confirm that
      # the Faraday version in use tolerates unknown connection options.
      def initialize(settings = {})
        @client  = :faraday
        @handler = Faraday.new(settings) do |builder|
          # Request middleware goes first; the adapter performs the actual
          # HTTP call and therefore has to be the last entry of the stack.
          builder.request :url_encoded
          builder.adapter :net_http
        end
      end

      # Performs a GET request for +url+.
      #
      # options - Hash whose pairs are copied into the connection's request
      #           headers before the request is sent (they stay set on the
      #           connection for later requests).
      #
      # Returns the Hash built by #parse_response.
      def get(url, options = {})
        options.each_pair { |key, value| @handler.headers[key] = value }
        parse_response(@handler.get(url))
      end

      # Normalizes a Faraday response into the Hash shape Botch::Base expects.
      def parse_response(response)
        {
          :status   => response.status,
          :header   => response.headers,
          :body     => response.body,
          :response => response
        }
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module Botch
  module Client
    # Stand-in "page" built from a Mechanize error that carries an HTTP
    # response code, so that failed requests can flow through
    # MechanizeClient#parse_response like successful ones.
    class MechanizeResponseError
      attr_accessor :code, :header, :body

      # response_error - the rescued error; must respond to #response_code.
      def initialize(response_error)
        @code     = response_error.response_code
        @body     = ""
        @header   = Mechanize::Headers.new
        @response = response_error
      end
    end

    # HTTP client backed by a Mechanize agent.
    class MechanizeClient < AbstractClient
      # options is accepted for interface compatibility with the other
      # clients and is currently unused.
      def initialize(options = {})
        @client  = :mechanize
        @handler = Mechanize.new
      end

      # Performs a GET request for +url+.
      #
      # options - Hash; only :user_agent is used, and when present it is set
      #           on the agent before the request.
      #
      # Errors that expose an HTTP response code are converted into a
      # MechanizeResponseError page; any other error (for example a network
      # failure) is re-raised unchanged instead of being masked.
      #
      # Returns the Hash built by #parse_response.
      def get(url, options = {})
        @handler.user_agent = options[:user_agent] if options[:user_agent]

        page =
          begin
            @handler.get(url)
          rescue StandardError => error
            raise unless error.respond_to?(:response_code)
            MechanizeResponseError.new(error)
          end

        parse_response(page)
      end

      # Normalizes a Mechanize page (or MechanizeResponseError) into the Hash
      # shape Botch::Base expects.
      def parse_response(response)
        {
          :header   => response.header,
          :status   => response.code.to_i,
          :body     => response.body,
          :response => response
        }
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+
4
module Botch
  describe do
    it "Should have a version." do
      # eq(true) instead of be_true: be_true no longer exists in RSpec 3.
      expect(Botch.const_defined?("VERSION")).to eq(true)
    end
  end

  describe Base do
    before(:all) do
      @fake = Fake.new
      class SampleBotch < Botch::Base; end
      # Running once without an explicit client triggers the default client.
      SampleBotch.run(@fake.url) {}
    end

    it 'Default client should be faraday.' do
      expect(SampleBotch.client).to be_an_instance_of(Botch::Client::FaradayClient)
    end

    describe "settings and options" do
      before(:all) do
        class SampleBotch < Botch::Base
          set :user_agent, "SampleBotch User-Agent"
          set :client, :mechanize
          set :disabled_invalid, true
          set :original_option, "foobar"
        end
        @options  = SampleBotch.options
        @settings = SampleBotch.settings
      end

      it "Original options should be stored in options." do
        expect(@options[:original_option]).to eq("foobar")
      end

      it ":user_agent should be stored in options." do
        expect(@options[:user_agent]).to eq("SampleBotch User-Agent")
      end

      it ":disabled_invalid should be stored in settings." do
        expect(@settings[:disabled_invalid]).to eq(true)
      end

      it "Client setter should be valid." do
        expect(SampleBotch.client).to be_an_instance_of(Botch::Client::MechanizeClient)
      end
    end

    describe "instance variable" do
      before(:all) do
        class SampleBotch < Botch::Base
          set :user_agent, "SampleBotch User-Agent"
          set :disabled_invalid, false

          # The filter stores a value that the rule reads back, proving both
          # blocks run against the same instance.
          filter(:all) { @test = "test" }
          rule(:all) { @test }
        end
      end

      it "should be able to use instance variable." do
        expect(SampleBotch.run(@fake.url)[0]).to eq("test")
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+
4
module Botch
  describe Client::FaradayClient do
    before(:all) do
      FakeWeb.clean_registry
      # One fake page per status code exercised below: 200, 404 and 500.
      @fakes = []
      @fakes << Fake.new("/", :status => [200, "OK"], :content_type => "text/html")
      @fakes << Fake.new("/test1", :status => [404, "Not Found"], :content_type => "text/html")
      @fakes << Fake.new("/test2", :status => [500, "Internal Server Error"], :content_type => "text/html")

      class SampleBotch < Botch::Base
        set :user_agent, "SampleBotch User-Agent"
        set :client, :faraday
        set :disabled_invalid, nil
        filter(:all){ status == 200 }
        rule(:all){ status }
      end
    end

    it 'client should be faraday if set :faraday to :client.' do
      expect(SampleBotch.run(@fakes[0].url) { client }[0]).to be_an_instance_of(Botch::Client::FaradayClient)
    end

    it 'helpers should return valid data.' do
      result = SampleBotch.run(@fakes[0].url) do
        { :status => status, :header => header, :body => body }
      end
      result = result[0]
      expect(result[:status]).to eq(200)
      expect(result[:header]).to be_an_instance_of(Faraday::Utils::Headers)
      expect(result[:body]).to be_an_instance_of(String)
    end

    it 'block argument of #rule should replace last expression.' do
      result = SampleBotch.run(*@fakes.map(&:url))
      expect(result[0]).to eq(200)
      expect(result[1]).to eq(404)
      expect(result[2]).to eq(500)
    end

    it 'block argument of #run should replace last expression.' do
      result = SampleBotch.run(*@fakes.map(&:url)){ "Foo" }
      expect(result[0]).to eq("Foo")
      expect(result[1]).to eq("Foo")
      expect(result[2]).to eq("Foo")
    end

    it 'the second argument should be boolean.' do
      result = SampleBotch.run(*@fakes.map(&:url)){ |response, valid| valid }
      # eq(true)/eq(false) instead of be_true/be_false, which no longer
      # exist in RSpec 3; this also pins the values to real booleans.
      expect(result[0]).to eq(true)
      expect(result[1]).to eq(false)
      expect(result[2]).to eq(false)
    end
  end
end
@@ -0,0 +1,57 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+
4
module Botch
  describe Client::MechanizeClient do
    before(:all) do
      FakeWeb.clean_registry
      # One fake page per status code exercised below: 200, 404 and 500.
      @fakes = []
      @fakes << Fake.new("/", :status => [200, "OK"], :content_type => "text/html")
      @fakes << Fake.new("/test1", :status => [404, "Not Found"], :content_type => "text/html")
      @fakes << Fake.new("/test2", :status => [500, "Internal Server Error"], :content_type => "text/html")

      class SampleBotch < Botch::Base
        set :user_agent, "SampleBotch User-Agent"
        set :client, :mechanize
        set :disabled_invalid, nil
        filter(:all){ status == 200 }
        rule(:all){ status }
      end
    end

    it 'client should be mechanize if set :mechanize to :client.' do
      expect(SampleBotch.run(@fakes[0].url) { client }[0]).to be_an_instance_of(Botch::Client::MechanizeClient)
    end

    it 'helpers should return valid data.' do
      result = SampleBotch.run(@fakes[0].url) do
        { :status => status, :header => header, :body => body }
      end
      result = result[0]
      expect(result[:status]).to eq(200)
      expect(result[:header]).to be_an_instance_of(Mechanize::Headers)
      expect(result[:body]).to be_an_instance_of(String)
    end

    it 'block argument of #rule should replace last expression.' do
      result = SampleBotch.run(*@fakes.map(&:url))
      expect(result[0]).to eq(200)
      expect(result[1]).to eq(404)
      expect(result[2]).to eq(500)
    end

    it 'block argument of #run should replace last expression.' do
      result = SampleBotch.run(*@fakes.map(&:url)){ "Foo" }
      expect(result[0]).to eq("Foo")
      expect(result[1]).to eq("Foo")
      expect(result[2]).to eq("Foo")
    end

    it 'the second argument should be boolean.' do
      result = SampleBotch.run(*@fakes.map(&:url)){ |response, valid| valid }
      # eq(true)/eq(false) instead of be_true/be_false, which no longer
      # exist in RSpec 3; this also pins the values to real booleans.
      expect(result[0]).to eq(true)
      expect(result[1]).to eq(false)
      expect(result[2]).to eq(false)
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'botch'
2
+ require 'fakeweb'
3
+
4
+ FakeWeb.allow_net_connect = false
5
+
6
module Botch
  SPEC_DOMAIN = 'example.com'

  # Test double for a remote page. Creating one registers a GET response for
  # its URL with FakeWeb.
  class Fake
    # path    - request path appended to SPEC_DOMAIN (default "/").
    # options - Hash with optional :scheme (default "http"), :content_type
    #           (default "text/html") and :status (default [200, "OK"]).
    def initialize(path = "/", options = {})
      @path         = path
      @scheme       = options[:scheme]       || "http"
      @content_type = options[:content_type] || "text/html"
      @status       = options[:status]       || [200, "OK"]
      add_to_fakeweb
    end

    # Absolute URL of the fake page.
    def url
      @scheme + "://" + SPEC_DOMAIN + @path
    end

    # HTML document served for the page; built once and memoized.
    def body
      @body ||= <<-HTML
        <html>
          <head>
            <title>Fake page #{@path}</title>
          </head>
          <body>
          </body>
        </html>
      HTML
    end

    # Registers the page with FakeWeb. The body is obtained through #body
    # rather than @body, because @body has not been built yet when this runs
    # from the constructor and would register a nil body.
    def add_to_fakeweb
      options = { :body => body, :content_type => @content_type, :status => @status }
      FakeWeb.register_uri(:get, url, options)
    end
  end
end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: botch
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - namusyaka
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: faraday
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: mechanize
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fakeweb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: '1.3'
69
+ description: Botch is a DSL for quickly creating web crawlers. Inspired by Sinatra.
70
+ email: namusyaka@gmail.com
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - Gemfile
76
+ - README.md
77
+ - Rakefile
78
+ - botch.gemspec
79
+ - lib/botch.rb
80
+ - lib/botch/base.rb
81
+ - lib/botch/clients/abstract_client.rb
82
+ - lib/botch/clients/faraday_client.rb
83
+ - lib/botch/clients/mechanize_client.rb
84
+ - spec/botch_spec.rb
85
+ - spec/faraday_spec.rb
86
+ - spec/mechanize_spec.rb
87
+ - spec/spec_helper.rb
88
+ homepage: https://github.com/namusyaka/botch
89
+ licenses: []
90
+ metadata: {}
91
+ post_install_message:
92
+ rdoc_options: []
93
+ require_paths:
94
+ - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - '>='
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ requirements: []
106
+ rubyforge_project:
107
+ rubygems_version: 2.0.2
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: A DSL for web crawlers.
111
+ test_files:
112
+ - spec/botch_spec.rb
113
+ - spec/faraday_spec.rb
114
+ - spec/mechanize_spec.rb
115
+ has_rdoc: