ucnv-chainsaw 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +42 -0
- data/Rakefile +149 -0
- data/examples/01_google.rb +17 -0
- data/examples/02_twitter.rb +15 -0
- data/lib/chainsaw.rb +40 -0
- data/lib/chainsaw/browser.rb +262 -0
- data/lib/chainsaw/common.rb +87 -0
- data/lib/chainsaw/element.rb +62 -0
- data/lib/chainsaw/ext/httpclient.rb +27 -0
- data/lib/chainsaw/ext/nokogiri.rb +29 -0
- data/test/helper.rb +15 -0
- data/test/htdocs/01.html +17 -0
- data/test/htdocs/02.html +9 -0
- data/test/htdocs/03.html +25 -0
- data/test/htdocs/04.html +14 -0
- data/test/htdocs/05.html +14 -0
- data/test/htdocs/06.html +14 -0
- data/test/htdocs/cgi.rb +50 -0
- data/test/htdocs/img.gif +0 -0
- data/test/server.rb +20 -0
- data/test/test_browser.rb +274 -0
- data/test/test_chainsaw.rb +20 -0
- data/test/test_element.rb +32 -0
- data/test/test_ext.rb +31 -0
- data/test/test_m17n.rb +23 -0
- metadata +116 -0
data/README.rdoc
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
= Chainsaw
|
2
|
+
|
3
|
+
* http://github.com/ucnv/chainsaw/tree/master
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
A Ruby library for spidering web resources.
|
8
|
+
|
9
|
+
== Features/Problems
|
10
|
+
|
11
|
+
|
12
|
+
== Synopsis
|
13
|
+
|
14
|
+
Chainsaw.launch('http://example.com/').open { |cs|
|
15
|
+
cs.doc.css('#navi a')[2]
|
16
|
+
}.open { |cs|
|
17
|
+
form = cs.doc.xpath('//form')[0]
|
18
|
+
input = form.xpath('.//input[@type="text"]')[0]
|
19
|
+
input.set_attribute('value', 'blurblur')
|
20
|
+
cs.set_next form
|
21
|
+
}.submit { |cs|
|
22
|
+
puts cs.res.status
|
23
|
+
cs.doc.search('.//a') do |link|
|
24
|
+
puts link.content
|
25
|
+
end
|
26
|
+
}
|
27
|
+
|
28
|
+
== Requirements
|
29
|
+
|
30
|
+
* nokogiri
|
31
|
+
* httpclient
|
32
|
+
|
33
|
+
== Installation
|
34
|
+
|
35
|
+
* gem install ucnv-chainsaw
|
36
|
+
|
37
|
+
== Copyright
|
38
|
+
|
39
|
+
Author:: ucnv <ucnvvv at gmail.com>
|
40
|
+
Copyright:: Copyright (c) 2009 ucnv
|
41
|
+
License:: MIT
|
42
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/clean'
|
4
|
+
require 'rake/testtask'
|
5
|
+
require 'rake/packagetask'
|
6
|
+
require 'rake/gempackagetask'
|
7
|
+
require 'rake/rdoctask'
|
8
|
+
require 'rake/contrib/rubyforgepublisher'
|
9
|
+
require 'rake/contrib/sshpublisher'
|
10
|
+
require 'fileutils'
|
11
|
+
include FileUtils
|
12
|
+
|
13
|
+
# Project metadata shared by the gem specification and the tasks below.
NAME              = "chainsaw"
AUTHOR            = "ucnv"
EMAIL             = "ucnvvv at gmail.com"
DESCRIPTION       = "A Ruby library for spidering web resources."
RUBYFORGE_PROJECT = "chainsaw"
HOMEPATH          = "http://github.com/ucnv/chainsaw/tree/master"
BIN_FILES         = %w( )
VERS              = "0.0.1"

# Last committed Subversion revision, or nil when .svn/entries cannot be read
# or carries no committed-rev attribute. The capture must use \d: a bare "d+"
# only matches literal "d" characters and would never capture a revision.
REV = File.read(".svn/entries")[/committed-rev="(\d+)"/, 1] rescue nil
|
23
|
+
# Editor swap files, built gems and the setup config are removed by `rake clean`.
CLEAN.include('**/.*.sw?', '*.gem', '.config')

# Command-line options handed to RDoc; written as option/value groups and
# flattened into the plain argument list RDoc expects.
RDOC_OPTS = [
  ['--title', "#{NAME} documentation"],
  ["--charset", "utf-8"],
  ["--opname", "index.html"],
  ["--line-numbers"],
  ["--main", "README.rdoc"],
  ["--inline-source"]
].flatten
|
32
|
+
|
33
|
+
# Running `rake` with no arguments runs the test suite.
task :default => [:test]

# Packaging always starts from a clean tree.
task :package => [:clean]

# Runs every test/**/test_*.rb file with the test directory on the load path.
Rake::TestTask.new("test") do |runner|
  runner.verbose = true
  runner.pattern = "test/**/test_*.rb"
  runner.libs.push("test")
end
|
41
|
+
|
42
|
+
# Gem specification assembled from the metadata constants defined above.
spec = Gem::Specification.new do |gs|
  # Identity
  gs.name              = NAME
  gs.version           = VERS
  gs.platform          = Gem::Platform::RUBY
  gs.summary           = DESCRIPTION
  gs.description       = DESCRIPTION
  gs.author            = AUTHOR
  gs.email             = EMAIL
  gs.homepage          = HOMEPATH
  gs.rubyforge_project = RUBYFORGE_PROJECT

  # Documentation
  gs.has_rdoc         = true
  gs.extra_rdoc_files = ["README.rdoc", "ChangeLog"]
  gs.rdoc_options     = gs.rdoc_options + RDOC_OPTS + ['--exclude', '^(examples|extras)/']

  # Layout
  gs.executables  = BIN_FILES
  gs.bindir       = "bin"
  gs.require_path = "lib"
  gs.autorequire  = ""
  gs.test_files   = Dir["test/test_*.rb"]

  # Runtime requirements
  gs.add_dependency('nokogiri', '>=1.2.1')
  gs.add_dependency('httpclient', '>=2.1.4')
  gs.required_ruby_version = '>= 1.8.6'

  # Packaged files: the fixed top-level files followed by each glob, in order.
  globs = [
    "{bin,doc,test,lib,templates,generator,extras,website,script}/**/*",
    "ext/**/*.{h,c,rb}",
    "examples/**/*.rb",
    "tools/*.rb"
  ]
  gs.files = %w(README.rdoc ChangeLog Rakefile) + globs.map { |glob| Dir.glob(glob) }.flatten

  gs.extensions = FileList["ext/**/extconf.rb"].to_a
end
|
73
|
+
|
74
|
+
# Defines the packaging tasks for the gem; a tarball is requested as well.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.gem_spec = spec
  pkg.need_tar = true
end
|
78
|
+
|
79
|
+
# Builds the gem and installs it, trying sudo first and retrying without it
# when the privileged install fails.
task :install do
  gem_path = "pkg/#{NAME}-#{VERS}.gem"
  sh %{rake package}
  begin
    sh "sudo gem install #{gem_path}"
  rescue
    sh "gem install #{gem_path}"
  end
end
|
88
|
+
|
89
|
+
# Uninstalls the gem, trying sudo first and retrying without it on failure.
task :uninstall => [:clean] do
  command = "gem uninstall #{NAME}"
  begin
    sh "sudo #{command}"
  rescue
    sh command
  end
end
|
96
|
+
|
97
|
+
# Generates HTML documentation into html/. The DOC_FILES environment variable
# (comma separated) overrides the default set of documented files.
Rake::RDocTask.new do |doc|
  doc.rdoc_dir = 'html'
  doc.options += RDOC_OPTS
  doc.template = "resh"
  #doc.template = "#{ENV['template']}.rb" if ENV['template']

  requested = ENV['DOC_FILES']
  sources =
    if requested
      requested.split(/,\s*/)
    else
      ['README.rdoc', 'ChangeLog', 'lib/**/*.rb', 'ext/**/*.c']
    end
  doc.rdoc_files.include(sources)
end
|
110
|
+
|
111
|
+
desc "Publish to RubyForge"
task :rubyforge => [:rdoc, :package] do
  # Loaded lazily so the rubyforge gem is only needed when publishing.
  require 'rubyforge'
  publisher = Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'comelade')
  publisher.upload
end
|
116
|
+
|
117
|
+
desc 'Package and upload the release to rubyforge.'
task :release => [:clean, :package] do |t|
  # The RubyForge client is not required at the top of this Rakefile, so load
  # it here; otherwise RubyForge.new below raises NameError when this task is
  # run on its own.
  require 'rubyforge'

  # The caller must confirm the version being released.
  v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
  abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
  pkg = "pkg/#{NAME}-#{VERS}"

  rf = RubyForge.new
  puts "Logging in"
  rf.login

  c = rf.userconfig
  # c["release_notes"] = description if description
  # c["release_changes"] = changes if changes
  c["preformatted"] = true

  # Both artifacts produced by the package task are uploaded.
  files = ["#{pkg}.tgz", "#{pkg}.gem"]

  puts "Releasing #{NAME} v. #{VERS}"
  rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
end
|
140
|
+
|
141
|
+
desc 'Show information about the gem.'
task :debug_gem do
  # Print the specification as the Ruby source RubyGems generates for it.
  $stdout.puts(spec.to_ruby)
end
|
145
|
+
|
146
|
+
desc 'Update gem spec'
task :gemspec do
  # The block form closes (and therefore flushes) the file as soon as the
  # write finishes instead of leaving the handle open until garbage collection.
  File.open("#{NAME}.gemspec", 'w') { |f| f.write spec.to_ruby }
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'chainsaw'
|
3
|
+
|
4
|
+
# Search term comes from the first command-line argument.
query = ARGV.shift

# Open the search page, fill in the query field of the search form, submit it
# and print the text of every link on the result page.
Chainsaw.launch('http://google.com/').
  open { |a|
    f = a.doc.xpath('//form[@name="f"]').first
    # ".//" keeps the lookup inside the form; a leading "//" would search the
    # whole document and could pick up an input that belongs to another form.
    q = f.xpath('.//input[@name="q"]').first
    q['value'] = query || 'Hello world'
    a.set_next f
  }.
  submit { |a|
    a.doc.search('//a').each do |l|
      puts l.content
    end
  }
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'chainsaw'
|
3
|
+
|
4
|
+
# Credentials and the status text come from the command line, in that order.
username, password, tweet = ARGV

# Each ".>" processes whatever was reserved by the previous step: first the
# start page is fetched, then the sign-in form and the status form are sent.
Chainsaw.launch('https://twitter.com/home').> { |browser|
  signin = browser.doc.css('.signin').first
  signin.xpath('id("username_or_email")').first['value'] = username
  signin.xpath('id("session[password]")').first['value'] = password
  browser.set_next signin
}.> { |browser|
  status_form = browser.doc.css('#doingForm').first
  status_form.xpath('id("status")').first.content = tweet || "(o'-')-o fwip fwip"
  browser.set_next status_form
}.>
|
data/lib/chainsaw.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'httpclient'
|
4
|
+
require 'nkf'
|
5
|
+
|
6
|
+
require 'chainsaw/ext/httpclient'
|
7
|
+
require 'chainsaw/ext/nokogiri'
|
8
|
+
|
9
|
+
require 'chainsaw/common'
|
10
|
+
require 'chainsaw/element'
|
11
|
+
|
12
|
+
require 'chainsaw/browser'
|
13
|
+
|
14
|
+
|
15
|
+
# Mix the Chainsaw element helpers into every Nokogiri element.
Nokogiri::XML::Element.send(:include, Chainsaw::Element)
|
18
|
+
|
19
|
+
module Chainsaw
  VERSION = '0.0.1'

  #
  # Return an instance of the Chainsaw::Browser class.
  #
  # All arguments are passed to Chainsaw::Browser.new, except that a trailing
  # Proc argument is dropped first. When a block is given, the new browser is
  # yielded to it before being returned.
  #
  def self.launch(*args)
    # Array has no #pop! method; #pop is the call that removes the last element.
    args.pop if args.last.is_a? Proc
    cs = Chainsaw::Browser.new(*args)
    yield cs if block_given?
    cs
  end
end
|
31
|
+
|
32
|
+
module Kernel
  #
  # alias for Chainsaw.launch
  #
  # Arguments and an optional block are forwarded unchanged, so
  # Chainsaw(url) { |cs| ... } behaves like Chainsaw.launch(url) { |cs| ... }.
  #
  def Chainsaw(*args, &block)
    Chainsaw.launch(*args, &block)
  end

  module_function :Chainsaw
end
|
@@ -0,0 +1,262 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Chainsaw
|
4
|
+
|
5
|
+
class Browser
|
6
|
+
include Chainsaw::Util
|
7
|
+
|
8
|
+
DEFAULT_USER_AGENT = '' # TODO: set value
|
9
|
+
|
10
|
+
attr_accessor :user_agent, :ignore_redirect, :hide_referer, :max_history_count, :encoding # configurables
|
11
|
+
attr_accessor :request_headers, :url_base, :results
|
12
|
+
attr_reader :engine, :response, :history, :request_count # session
|
13
|
+
|
14
|
+
# Creates a browser session and reserves +location+ as the first target.
#
# location:: value reserved via set_next (defaults to an empty string).
# options::  Hash of settings; the keys read here are :user_agent,
#            :max_history_count, :ignore_redirect, :hide_referer, :encoding,
#            :url_base and :request_headers.
def initialize(location = '', options = {})
  # Session state.
  @engine        = HTTPClient.new
  @request_count = 0
  @history       = []
  @results       = []

  # Configurable settings, each falling back to its default when not given.
  @request_headers   = options[:request_headers] || {}
  @url_base          = options[:url_base] || ''
  @encoding          = options[:encoding]
  @hide_referer      = options[:hide_referer] || false
  @ignore_redirect   = options[:ignore_redirect] || false
  @max_history_count = options[:max_history_count] || 20
  @user_agent        = options[:user_agent] || DEFAULT_USER_AGENT

  set_next(location)
  self
end
|
33
|
+
|
34
|
+
# Stores a request header in @request_headers.
#
# key::                 header name.
# value::               header value.
# overwrite_if_exists:: when false, a value already stored for +key+ is kept.
#
# Returns self so calls can be chained.
def add_header(key, value, overwrite_if_exists = true)
  # Create the hash only when it is missing. Headers stored by earlier calls
  # (or passed in through the constructor options) must not be discarded.
  @request_headers ||= {}
  if overwrite_if_exists || !@request_headers.has_key?(key)
    @request_headers[key] = value
  end
  self
end
|
41
|
+
|
42
|
+
# Reserves +obj+ as the target of the next action and returns self.
def set_next(obj)
  tap { @reserved = obj }
end
|
46
|
+
|
47
|
+
# Registers credentials with the HTTP engine for the currently reserved
# target (converted with to_s) and returns self.
def set_auth(username, password)
  target = @reserved.to_s
  @engine.set_auth(target, username, password)
  self
end
|
51
|
+
|
52
|
+
# Performs the reserved action: a reserved form element is submitted, any
# other reserved value is opened. When submit raises TypeError the target is
# opened instead. The block, if any, is handed on to submit/open.
def process(&block)
  form_reserved = @reserved.is_a?(Nokogiri::XML::Element) && @reserved.is_form?
  return open(&block) unless form_reserved

  begin
    submit(&block)
  rescue TypeError
    open(&block)
  end
end

alias > process
|
65
|
+
|
66
|
+
# Sends a GET request for the reserved target and stores the response.
#
# When a block is given it is run through process_chain after the request.
# Returns self.
def open(&block)
  uri = prepare_next
  @response = request(:get, uri)
  # The block is captured explicitly: Proc.new without a block raises
  # ArgumentError on Ruby 3.0 and later.
  process_chain(&block) if block
  self
end
|
72
|
+
|
73
|
+
# Submits the reserved form via submit_with(nil), i.e. without naming a
# button.
#
# When a block is given it is run through process_chain after the submission.
# Returns self.
def submit(&block)
  submit_with(nil)
  # The block is captured explicitly: Proc.new without a block raises
  # ArgumentError on Ruby 3.0 and later.
  process_chain(&block) if block
  self
end
|
78
|
+
|
79
|
+
# Submits the reserved form.
#
# button_name:: name passed to the form serializer; when nil, the name of the
#               form's first input of type "submit" is used if there is one.
# image_x, image_y:: passed through to the form serializer.
#
# A form whose method is "get" (the default) is sent with a GET request. For
# "post", values that are Pathname instances are opened as binary files and
# sent in place of the path; the files are closed once the request is done.
# When a block is given it is run through process_chain afterwards.
#
# Raises TypeError when the reserved target does not yield a form.
# Returns self.
def submit_with(button_name, image_x = -1, image_y = -1, &block)
  uri, form = prepare_next
  raise TypeError, 'No form found.' unless form
  unless button_name
    # A length of 0 is truthy in Ruby, so look at the first node itself:
    # a form without a submit button simply has no default button name.
    first_submit = form.xpath('.//input[@type="submit"]').first
    button_name = first_submit['name'] if first_submit
  end
  query = form.serialize_form(button_name, image_x, image_y)
  method = (form['method'] || 'get').downcase
  if method == 'get'
    @response = request(:get, uri, query)
  elsif method == 'post'
    paths, q = query.partition do |name, value|
      value.instance_of? Pathname
    end
    files = []
    begin
      paths.each do |name, path|
        f = File.open(path, 'rb')
        # Register the handle first so it is closed even if a later step fails.
        files.push f
        q.push [name, f]
      end
      @response = request(:post, uri, q)
    ensure
      files.each { |f| f.close unless f.closed? }
    end
  end
  # The block is captured explicitly: Proc.new without a block raises
  # ArgumentError on Ruby 3.0 and later.
  process_chain(&block) if block
  self
end
|
109
|
+
|
110
|
+
# Steps one entry back in the history.
#
# The current state is cleaned up, the newest history entry is dropped and
# the entry before it becomes the current response (as an OpenStruct exposing
# uri, contenttype and content). When no earlier entry exists, self is
# returned without running the block. When a block is given it is run through
# process_chain. Returns self.
def back(&block)
  require 'ostruct'
  cleanup
  @history.shift
  previous = @history.first
  return self if previous.nil?
  @response = OpenStruct.new(
    'uri' => previous[:location],
    'contenttype' => previous[:content_type],
    'content' => previous[:document]
  )
  @reserved = nil
  # The block is captured explicitly: Proc.new without a block raises
  # ArgumentError on Ruby 3.0 and later.
  process_chain(&block) if block
  self
end
|
125
|
+
|
126
|
+
# URI of the current response, or nil when there is no response.
def uri
  @response.nil? ? nil : @response.uri
end
|
130
|
+
|
131
|
+
# Iterates over the reserved collection without passing an index to the block.
def each(&visitor)
  iterate(false, &visitor)
end
|
134
|
+
|
135
|
+
# Iterates over the reserved collection, passing the index to the block too.
def each_with_index(&visitor)
  iterate(true, &visitor)
end
|
138
|
+
|
139
|
+
# Parsed document for the current response, memoized in @document.
#
# Returns nil when the response has no content or its content type is not
# XML-parsable. On first parse, the href of the first <base> element (or an
# empty string when there is none) is recorded in the newest history entry.
def document
  if @document.nil?
    body = res.content
    return nil if body.nil?
    return nil unless is_xml_parsable?(res.contenttype)

    # An explicitly configured encoding wins over the guessed one.
    charset = @encoding || Encoding.guess(body)
    @document = Nokogiri.parse(body, nil, charset)

    base_tags = @document.xpath('//base')
    href = base_tags.empty? ? '' : base_tags[0]['href']
    @history.first.update(:base => href.to_s)
  end

  @document
end
|
152
|
+
|
153
|
+
# Tempfile holding the current response body, memoized in @file.
#
# The body is written in binary mode inside a Tempfile.open block; the
# temporary file name is derived from the last path segment of the current
# URI. Returns nil when the response has no content.
def file
  if @file.nil?
    body = @response.content
    return nil if body.nil?

    basename = uri.to_s.split('/').last
    Tempfile.open(basename) do |tmp|
      tmp.binmode.write body
      @file = tmp
    end
  end

  @file
end
|
164
|
+
|
165
|
+
alias res response
|
166
|
+
alias doc document
|
167
|
+
|
168
|
+
private
|
169
|
+
|
170
|
+
# Runs the given block and appends its return value to results. A block that
# takes no parameters is called bare; any other block receives self.
def process_chain(&block)
  results.push(block.arity == 0 ? yield : yield(self))
end
|
177
|
+
|
178
|
+
# Sends one HTTP request through the engine and records it in the history.
#
# method:: :get or :post; "_r" is appended to the engine method name unless
#          @ignore_redirect is set.
# uri::    must be a URI::HTTP instance.
# query::  passed through to the engine.
#
# Raises TypeError for any other kind of +uri+ and Chainsaw::RequestError when
# the engine call fails. Returns the engine's response.
def request(method, uri, query = nil)
  raise TypeError, "Invalid URI: #{qq uri.to_s}" unless uri.kind_of?(URI::HTTP)

  call = method.to_s
  call += '_r' unless @ignore_redirect

  # Headers are assembled for this request only. Writing the Referer into
  # @request_headers would make it persist across requests, so it would either
  # go stale or displace headers configured by the caller. A Referer set
  # explicitly by the caller is left untouched.
  headers = (@request_headers || {}).dup
  previous = @history.first
  if !@hide_referer && previous && !headers.has_key?('Referer')
    headers['Referer'] = previous[:location]
  end

  @request_count += 1
  r = begin
    @engine.send(call, uri, query, headers)
  rescue
    raise(
      Chainsaw::RequestError,
      "Error occured during #{to_nth(@request_count)} request; url: #{qq uri.to_s}"
    )
  end

  # history
  set_history uri.to_s, r.contenttype, r.content
  r
end
|
198
|
+
|
199
|
+
# Walks the reserved collection: each element is reserved in turn, the block
# is called with self (plus the element's index when +with_index+ is true),
# and back is invoked after every step. A warning is printed instead when
# prepare_next yields nothing to iterate. Returns self.
def iterate(with_index, &block)
  collection = prepare_next
  if collection
    collection.each_with_index do |item, position|
      set_next(item)
      if with_index
        yield(self, position)
      else
        yield(self)
      end
      back
    end
  else
    warn 'Iteration called without Enumerable.'
  end
  self
end
|
213
|
+
|
214
|
+
# Resolves the reserved value (or the +reserved+ argument) into what the next
# action needs, clearing the per-request state first.
#
# * String - parsed as a URI and resolved again.
# * URI - returned when it is an HTTP URI; otherwise joined onto the
#   newest history entry's base (or its location when the base is empty) and
#   resolved again.
# * Nokogiri::XML::Element - resolved through its href, action or value
#   attribute; a form element yields [uri, element].
# * Enumerable - returned as is.
#
# Raises TypeError for anything else.
def prepare_next(reserved = nil)
  reserved ||= @reserved
  cleanup
  case reserved
  when String # expecting a url
    return prepare_next(URI.parse(reserved))
  when URI
    if reserved.is_a?(URI::HTTP)
      return reserved
    elsif @history.size > 0
      base = @history[0][:base] || ''
      base = @history[0][:location] if base.empty?
      l = URI.join base, reserved
      return prepare_next(l) if l.is_a?(URI::HTTP)
    end
  when Nokogiri::XML::Element
    url = reserved['href'] || reserved['action'] || reserved['value']
    if reserved.is_form?
      return [prepare_next(url), reserved]
    else
      return prepare_next(url)
    end
  when Enumerable
    return reserved
  end

  # Report the class of the local value: cleanup has already reset @reserved
  # to nil, so reading it here would always report NilClass.
  raise TypeError, "Unexpected value is set for next before #{to_nth(@request_count+1)} request: #{reserved.class.name}"
end
|
242
|
+
|
243
|
+
# Clears the per-request state: cached file, response, parsed document and
# the reserved target.
def cleanup
  @file = nil
  @response = nil
  @document = nil
  @reserved = nil
end
|
246
|
+
|
247
|
+
# Puts a new entry at the front of the history and trims the history to
# @max_history_count entries. The document is only kept when the content type
# is XML-parsable.
def set_history(location, content_type, document, base = nil)
  kept_document = is_xml_parsable?(content_type) ? document : nil
  entry = {
    :location => location,
    :content_type => content_type,
    :document => kept_document,
    :base => base
  }
  @history.unshift(entry)

  # Drop whatever now lies beyond the configured limit.
  overflow = @history.size - @max_history_count
  @history.slice!(@max_history_count, overflow)
end
|
258
|
+
|
259
|
+
|
260
|
+
|
261
|
+
end
|
262
|
+
end
|