ucnv-chainsaw 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +42 -0
- data/Rakefile +149 -0
- data/examples/01_google.rb +17 -0
- data/examples/02_twitter.rb +15 -0
- data/lib/chainsaw.rb +40 -0
- data/lib/chainsaw/browser.rb +262 -0
- data/lib/chainsaw/common.rb +87 -0
- data/lib/chainsaw/element.rb +62 -0
- data/lib/chainsaw/ext/httpclient.rb +27 -0
- data/lib/chainsaw/ext/nokogiri.rb +29 -0
- data/test/helper.rb +15 -0
- data/test/htdocs/01.html +17 -0
- data/test/htdocs/02.html +9 -0
- data/test/htdocs/03.html +25 -0
- data/test/htdocs/04.html +14 -0
- data/test/htdocs/05.html +14 -0
- data/test/htdocs/06.html +14 -0
- data/test/htdocs/cgi.rb +50 -0
- data/test/htdocs/img.gif +0 -0
- data/test/server.rb +20 -0
- data/test/test_browser.rb +274 -0
- data/test/test_chainsaw.rb +20 -0
- data/test/test_element.rb +32 -0
- data/test/test_ext.rb +31 -0
- data/test/test_m17n.rb +23 -0
- metadata +116 -0
data/README.rdoc
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
= Chainsaw
|
2
|
+
|
3
|
+
* http://github.com/ucnv/chainsaw/tree/master
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
A Ruby library for spidering web resources.
|
8
|
+
|
9
|
+
== Features/Problems
|
10
|
+
|
11
|
+
|
12
|
+
== Synopsis
|
13
|
+
|
14
|
+
Chainsaw.launch('http://example.com/').open { |cs|
|
15
|
+
cs.doc.css('#navi a')[2]
|
16
|
+
}.open { |cs|
|
17
|
+
form = cs.doc.xpath('//form')[0]
|
18
|
+
input = form.xpath('.//input[@type="text"]')[0]
|
19
|
+
input.set_attribute('value', 'blurblur')
|
20
|
+
cs.set_next form
|
21
|
+
}.submit { |cs|
|
22
|
+
puts cs.res.status
|
23
|
+
cs.doc.search('.//a').each do |link|
|
24
|
+
puts link.content
|
25
|
+
end
|
26
|
+
}
|
27
|
+
|
28
|
+
== Requirements
|
29
|
+
|
30
|
+
* nokogiri
|
31
|
+
* httpclient
|
32
|
+
|
33
|
+
== Installation
|
34
|
+
|
35
|
+
* gem install ucnv-chainsaw
|
36
|
+
|
37
|
+
== Copyright
|
38
|
+
|
39
|
+
Author:: ucnv <ucnvvv at gmail.com>
|
40
|
+
Copyright:: Copyright (c) 2009 ucnv
|
41
|
+
License:: MIT
|
42
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/clean'
|
4
|
+
require 'rake/testtask'
|
5
|
+
require 'rake/packagetask'
|
6
|
+
require 'rake/gempackagetask'
|
7
|
+
require 'rake/rdoctask'
|
8
|
+
require 'rake/contrib/rubyforgepublisher'
|
9
|
+
require 'rake/contrib/sshpublisher'
|
10
|
+
require 'fileutils'
|
11
|
+
include FileUtils
|
12
|
+
|
13
|
+
NAME = "chainsaw"
|
14
|
+
AUTHOR = "ucnv"
|
15
|
+
EMAIL = "ucnvvv at gmail.com"
|
16
|
+
DESCRIPTION = "A Ruby library for spidering web resources."
|
17
|
+
RUBYFORGE_PROJECT = "chainsaw"
|
18
|
+
HOMEPATH = "http://github.com/ucnv/chainsaw/tree/master"
|
19
|
+
BIN_FILES = %w( )
|
20
|
+
VERS = "0.0.1"
|
21
|
+
|
22
|
+
# Last committed Subversion revision read from .svn/entries, or nil when the
# file is missing or holds no committed-rev attribute. The digits must be
# matched with \d; a bare "d" only matches the literal letter.
REV = File.read(".svn/entries")[/committed-rev="(\d+)"/, 1] rescue nil
|
23
|
+
CLEAN.include ['**/.*.sw?', '*.gem', '.config']
|
24
|
+
RDOC_OPTS = [
|
25
|
+
'--title', "#{NAME} documentation",
|
26
|
+
"--charset", "utf-8",
|
27
|
+
"--opname", "index.html",
|
28
|
+
"--line-numbers",
|
29
|
+
"--main", "README.rdoc",
|
30
|
+
"--inline-source",
|
31
|
+
]
|
32
|
+
|
33
|
+
task :default => [:test]
|
34
|
+
task :package => [:clean]
|
35
|
+
|
36
|
+
Rake::TestTask.new("test") do |t|
|
37
|
+
t.libs << "test"
|
38
|
+
t.pattern = "test/**/test_*.rb"
|
39
|
+
t.verbose = true
|
40
|
+
end
|
41
|
+
|
42
|
+
spec = Gem::Specification.new do |s|
|
43
|
+
s.name = NAME
|
44
|
+
s.version = VERS
|
45
|
+
s.platform = Gem::Platform::RUBY
|
46
|
+
s.has_rdoc = true
|
47
|
+
s.extra_rdoc_files = ["README.rdoc", "ChangeLog"]
|
48
|
+
s.rdoc_options += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
|
49
|
+
s.summary = DESCRIPTION
|
50
|
+
s.description = DESCRIPTION
|
51
|
+
s.author = AUTHOR
|
52
|
+
s.email = EMAIL
|
53
|
+
s.homepage = HOMEPATH
|
54
|
+
s.executables = BIN_FILES
|
55
|
+
s.rubyforge_project = RUBYFORGE_PROJECT
|
56
|
+
s.bindir = "bin"
|
57
|
+
s.require_path = "lib"
|
58
|
+
s.autorequire = ""
|
59
|
+
s.test_files = Dir["test/test_*.rb"]
|
60
|
+
|
61
|
+
s.add_dependency('nokogiri', '>=1.2.1')
|
62
|
+
s.add_dependency('httpclient', '>=2.1.4')
|
63
|
+
s.required_ruby_version = '>= 1.8.6'
|
64
|
+
|
65
|
+
s.files = %w(README.rdoc ChangeLog Rakefile) +
|
66
|
+
Dir.glob("{bin,doc,test,lib,templates,generator,extras,website,script}/**/*") +
|
67
|
+
Dir.glob("ext/**/*.{h,c,rb}") +
|
68
|
+
Dir.glob("examples/**/*.rb") +
|
69
|
+
Dir.glob("tools/*.rb")
|
70
|
+
|
71
|
+
s.extensions = FileList["ext/**/extconf.rb"].to_a
|
72
|
+
end
|
73
|
+
|
74
|
+
Rake::GemPackageTask.new(spec) do |p|
|
75
|
+
p.need_tar = true
|
76
|
+
p.gem_spec = spec
|
77
|
+
end
|
78
|
+
|
79
|
+
task :install do
|
80
|
+
name = "#{NAME}-#{VERS}.gem"
|
81
|
+
sh %{rake package}
|
82
|
+
begin
|
83
|
+
sh %{sudo gem install pkg/#{name}}
|
84
|
+
rescue
|
85
|
+
sh %{gem install pkg/#{name}}
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
task :uninstall => [:clean] do
|
90
|
+
begin
|
91
|
+
sh %{sudo gem uninstall #{NAME}}
|
92
|
+
rescue
|
93
|
+
sh %{gem uninstall #{NAME}}
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
Rake::RDocTask.new do |rdoc|
|
98
|
+
rdoc.rdoc_dir = 'html'
|
99
|
+
rdoc.options += RDOC_OPTS
|
100
|
+
rdoc.template = "resh"
|
101
|
+
#rdoc.template = "#{ENV['template']}.rb" if ENV['template']
|
102
|
+
if ENV['DOC_FILES']
|
103
|
+
rdoc.rdoc_files.include(ENV['DOC_FILES'].split(/,\s*/))
|
104
|
+
else
|
105
|
+
rdoc.rdoc_files.include('README.rdoc', 'ChangeLog')
|
106
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
107
|
+
rdoc.rdoc_files.include('ext/**/*.c')
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
desc "Publish to RubyForge"
|
112
|
+
task :rubyforge => [:rdoc, :package] do
|
113
|
+
require 'rubyforge'
|
114
|
+
Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'comelade').upload
|
115
|
+
end
|
116
|
+
|
117
|
+
desc 'Package and upload the release to rubyforge.'
|
118
|
+
task :release => [:clean, :package] do |t|
  # RubyForge is only loaded inside the :rubyforge task, so load it here as
  # well; otherwise running `rake release` on its own fails with a NameError.
  require 'rubyforge'

  # The caller must confirm the version being released via VERSION=x.y.z.
  v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
  abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
  pkg = "pkg/#{NAME}-#{VERS}"

  rf = RubyForge.new
  puts "Logging in"
  rf.login

  c = rf.userconfig
  # c["release_notes"] = description if description
  # c["release_changes"] = changes if changes
  c["preformatted"] = true

  # Artifacts produced by the :package prerequisite.
  files = [
    "#{pkg}.tgz",
    "#{pkg}.gem"
  ].compact

  puts "Releasing #{NAME} v. #{VERS}"
  rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
end
|
140
|
+
|
141
|
+
desc 'Show information about the gem.'
|
142
|
+
task :debug_gem do
|
143
|
+
puts spec.to_ruby
|
144
|
+
end
|
145
|
+
|
146
|
+
desc 'Update gem spec'
|
147
|
+
task :gemspec do
  # Block form guarantees the gemspec is flushed and the handle closed as
  # soon as the spec has been written.
  File.open("#{NAME}.gemspec", 'w') { |f| f.write spec.to_ruby }
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'chainsaw'
|
3
|
+
|
4
|
+
query = ARGV.shift
|
5
|
+
|
6
|
+
Chainsaw.launch('http://google.com/').
|
7
|
+
open { |a|
|
8
|
+
f = a.doc.xpath('//form[@name="f"]').first
|
9
|
+
q = f.xpath('//input[@name="q"]').first
|
10
|
+
q['value'] = query || 'Hello world'
|
11
|
+
a.set_next f
|
12
|
+
}.
|
13
|
+
submit { |a|
|
14
|
+
a.doc.search('//a').each do |l|
|
15
|
+
puts l.content
|
16
|
+
end
|
17
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'chainsaw'
|
3
|
+
|
4
|
+
username, password, tweet = ARGV
|
5
|
+
|
6
|
+
Chainsaw.launch('https://twitter.com/home').> { |a|
|
7
|
+
f = a.doc.css('.signin').first
|
8
|
+
f.xpath('id("username_or_email")').first['value'] = username
|
9
|
+
f.xpath('id("session[password]")').first['value'] = password
|
10
|
+
a.set_next f
|
11
|
+
}.> { |a|
|
12
|
+
f = a.doc.css('#doingForm').first
|
13
|
+
f.xpath('id("status")').first.content = tweet || "(o'-')-o fwip fwip"
|
14
|
+
a.set_next f
|
15
|
+
}.>
|
data/lib/chainsaw.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'httpclient'
|
4
|
+
require 'nkf'
|
5
|
+
|
6
|
+
require 'chainsaw/ext/httpclient'
|
7
|
+
require 'chainsaw/ext/nokogiri'
|
8
|
+
|
9
|
+
require 'chainsaw/common'
|
10
|
+
require 'chainsaw/element'
|
11
|
+
|
12
|
+
require 'chainsaw/browser'
|
13
|
+
|
14
|
+
|
15
|
+
Nokogiri::XML::Element.module_eval do
|
16
|
+
include Chainsaw::Element
|
17
|
+
end
|
18
|
+
|
19
|
+
module Chainsaw
  VERSION = '0.0.1'

  #
  # Return an instance of the Chainsaw::Browser class.
  #
  # All arguments are passed on to Chainsaw::Browser.new, except a trailing
  # Proc, which is discarded. When a block is given, the new browser is
  # yielded to it before being returned.
  def self.launch(*args)
    # Array has no pop!; plain pop already removes the element in place.
    args.pop if args.last.is_a? Proc
    cs = Chainsaw::Browser.new(*args)
    yield cs if block_given?
    cs
  end
end
|
31
|
+
|
32
|
+
module Kernel
  #
  # alias for Chainsaw.launch
  #
  # Arguments and the optional block are both handed to Chainsaw.launch, so
  # Chainsaw(url) { |cs| ... } behaves like Chainsaw.launch(url) { |cs| ... }.
  def Chainsaw(*args, &block)
    Chainsaw.launch(*args, &block)
  end

  module_function :Chainsaw
end
|
@@ -0,0 +1,262 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Chainsaw
|
4
|
+
|
5
|
+
class Browser
|
6
|
+
include Chainsaw::Util
|
7
|
+
|
8
|
+
DEFAULT_USER_AGENT = '' # TODO: set value
|
9
|
+
|
10
|
+
attr_accessor :user_agent, :ignore_redirect, :hide_referer, :max_history_count, :encoding # configurables
|
11
|
+
attr_accessor :request_headers, :url_base, :results
|
12
|
+
attr_reader :engine, :response, :history, :request_count # session
|
13
|
+
|
14
|
+
def initialize(location = '', options = {})
|
15
|
+
@user_agent = options[:user_agent] || DEFAULT_USER_AGENT
|
16
|
+
@max_history_count = options[:max_history_count] || 20
|
17
|
+
@ignore_redirect = options[:ignore_redirect] || false
|
18
|
+
@hide_referer = options[:hide_referer] || false
|
19
|
+
@encoding = options[:encoding]
|
20
|
+
@url_base = options[:url_base] || ''
|
21
|
+
@request_headers = options[:request_headers] || {}
|
22
|
+
|
23
|
+
@history = []
|
24
|
+
@results = []
|
25
|
+
|
26
|
+
@request_count = 0
|
27
|
+
|
28
|
+
@engine = HTTPClient.new
|
29
|
+
|
30
|
+
set_next(location)
|
31
|
+
self
|
32
|
+
end
|
33
|
+
|
34
|
+
#
# Set a request header and return self so calls can be chained.
#
# key::                 header name
# value::               header value
# overwrite_if_exists:: when false, an already present header is kept
def add_header(key, value, overwrite_if_exists = true)
  # Only create the hash when it is missing; headers added earlier must
  # survive later calls.
  @request_headers ||= {}
  if overwrite_if_exists || !@request_headers.has_key?(key)
    @request_headers[key] = value
  end
  self
end
|
41
|
+
|
42
|
+
def set_next(obj)
|
43
|
+
@reserved = obj
|
44
|
+
self
|
45
|
+
end
|
46
|
+
|
47
|
+
def set_auth(username, password)
|
48
|
+
@engine.set_auth(@reserved.to_s, username, password)
|
49
|
+
self
|
50
|
+
end
|
51
|
+
|
52
|
+
#
# Run the next step of the chain: submit when the reserved value is a form
# element, otherwise open it. A TypeError raised while submitting falls back
# to open.
def process(&block)
  form_reserved = @reserved.is_a?(Nokogiri::XML::Element) && @reserved.is_form?
  return open(&block) unless form_reserved

  begin
    submit(&block)
  rescue TypeError
    open(&block)
  end
end
|
63
|
+
|
64
|
+
alias > process
|
65
|
+
|
66
|
+
#
# Send a GET request to the location resolved by prepare_next and store the
# result in @response. When a block is given it is run through
# process_chain. Returns self.
def open(&block)
  uri = prepare_next
  @response = request(:get, uri)
  # An explicit block parameter replaces Proc.new, which raises
  # ArgumentError without a literal block on Ruby 3.0 and later.
  process_chain(&block) if block
  self
end
|
72
|
+
|
73
|
+
#
# Submit the reserved form without naming a button (submit_with picks one),
# then run the given block, if any, through process_chain. Returns self.
def submit(&block)
  submit_with(nil)
  # An explicit block parameter replaces Proc.new, which raises
  # ArgumentError without a literal block on Ruby 3.0 and later.
  process_chain(&block) if block
  self
end
|
78
|
+
|
79
|
+
#
# Submit the reserved form and store the result in @response.
#
# button_name::      name passed to serialize_form; when nil, the name of
#                    the form's first input[type=submit] is used, if any
# image_x, image_y:: passed through to serialize_form
#
# Raises TypeError when the reserved value does not resolve to a form.
# Only the "get" and "post" form methods are sent. Returns self.
def submit_with(button_name, image_x = -1, image_y = -1, &block)
  uri, form = prepare_next
  raise TypeError, 'No form found.' unless form
  unless button_name
    submits = form.xpath('.//input[@type="submit"]')
    # 0 is truthy in Ruby, so check emptiness explicitly; a form without a
    # submit input keeps button_name nil.
    button_name = submits.first['name'] unless submits.empty?
  end
  query = form.serialize_form(button_name, image_x, image_y)
  method = (form['method'] || 'get').downcase
  if method == 'get'
    @response = request(:get, uri, query)
  elsif method == 'post'
    # Pathname values are file uploads: send them as open File objects.
    paths, q = query.partition do |_name, value|
      value.instance_of? Pathname
    end
    files = []
    begin
      paths.each do |name, path|
        f = File.open(path, 'rb')
        q.push [name, f]
        files.push f
      end
      @response = request(:post, uri, q)
    ensure
      # Close every upload handle even when the request fails.
      files.each { |f| f.close unless f.nil? }
    end
  end
  # An explicit block parameter replaces Proc.new, which raises
  # ArgumentError without a literal block on Ruby 3.0 and later.
  process_chain(&block) if block
  self
end
|
109
|
+
|
110
|
+
#
# Step back one entry in the history.
#
# The current state is cleared and the newest history entry dropped. If an
# older entry exists, @response is rebuilt from it as an OpenStruct exposing
# uri, contenttype and content, and the given block (if any) is run through
# process_chain. Returns self.
def back(&block)
  require 'ostruct'
  cleanup
  @history.shift
  previous = @history.first
  return self if previous.nil?
  @response = OpenStruct.new(
    'uri' => previous[:location],
    'contenttype' => previous[:content_type],
    'content' => previous[:document]
  )
  @reserved = nil
  # An explicit block parameter replaces Proc.new, which raises
  # ArgumentError without a literal block on Ruby 3.0 and later.
  process_chain(&block) if block
  self
end
|
125
|
+
|
126
|
+
def uri
|
127
|
+
return nil if @response.nil?
|
128
|
+
@response.uri
|
129
|
+
end
|
130
|
+
|
131
|
+
def each(&block)
|
132
|
+
iterate(false, &block)
|
133
|
+
end
|
134
|
+
|
135
|
+
def each_with_index(&block)
|
136
|
+
iterate(true, &block)
|
137
|
+
end
|
138
|
+
|
139
|
+
def document
|
140
|
+
return @document unless @document.nil?
|
141
|
+
return nil if res.content.nil?
|
142
|
+
return nil unless is_xml_parsable?(res.contenttype)
|
143
|
+
enc = @encoding || Encoding.guess(res.content)
|
144
|
+
@document = Nokogiri.parse(res.content, nil, enc)
|
145
|
+
|
146
|
+
b = @document.xpath('//base')
|
147
|
+
base = b.empty? ? '' : b[0]['href']
|
148
|
+
@history.first.update(:base => base.to_s)
|
149
|
+
|
150
|
+
@document
|
151
|
+
end
|
152
|
+
|
153
|
+
def file
|
154
|
+
return @file unless @file.nil?
|
155
|
+
return nil if @response.content.nil?
|
156
|
+
|
157
|
+
Tempfile.open(uri.to_s.split('/').last) do |f|
|
158
|
+
f.binmode.write @response.content
|
159
|
+
@file = f
|
160
|
+
end
|
161
|
+
|
162
|
+
@file
|
163
|
+
end
|
164
|
+
|
165
|
+
alias res response
|
166
|
+
alias doc document
|
167
|
+
|
168
|
+
private
|
169
|
+
|
170
|
+
#
# Call the block and append its return value to results. A block declaring
# no parameters is called bare; any other block receives this object.
def process_chain(&block)
  outcome = block.arity == 0 ? block.call : block.call(self)
  results.push outcome
end
|
177
|
+
|
178
|
+
#
# Send an HTTP request through @engine and record it in the history.
#
# method:: :get or :post; "_r" is appended to the engine method name unless
#          @ignore_redirect is set
# uri::    must be a URI::HTTP, otherwise TypeError is raised
# query::  passed to the engine unchanged
#
# Any error raised by the engine is re-raised as Chainsaw::RequestError.
# Returns the engine's response object.
def request(method, uri, query = nil)
  raise TypeError, "Invalid URI: #{qq uri.to_s}" unless uri.kind_of?(URI::HTTP)

  call = method.to_s
  call += '_r' unless @ignore_redirect

  # Build the headers for this request only: the Referer always reflects the
  # newest history entry and the configured @request_headers stay untouched.
  # A Referer supplied by the caller takes precedence.
  headers = (@request_headers || {}).dup
  previous = @history.first
  if !@hide_referer && previous && !headers.has_key?('Referer')
    headers['Referer'] = previous[:location]
  end

  @request_count += 1
  r = begin
    @engine.send(call, uri, query, headers)
  rescue
    raise(
      Chainsaw::RequestError,
      "Error occured during #{to_nth(@request_count)} request; url: #{qq uri.to_s}"
    )
  end

  # history
  set_history uri.to_s, r.contenttype, r.content
  r
end
|
198
|
+
|
199
|
+
def iterate(with_index, &block)
|
200
|
+
set = prepare_next
|
201
|
+
unless set
|
202
|
+
warn 'Iteration called without Enumerable.'
|
203
|
+
return self
|
204
|
+
end
|
205
|
+
|
206
|
+
set.each_with_index do |e, index|
|
207
|
+
set_next(e)
|
208
|
+
with_index ? yield(self, index) : yield(self)
|
209
|
+
back
|
210
|
+
end
|
211
|
+
self
|
212
|
+
end
|
213
|
+
|
214
|
+
#
# Resolve the value reserved via set_next (or the given +reserved+) into
# what the next step needs, clearing the current state first.
#
# * String            - parsed as a URI and resolved again
# * URI::HTTP         - returned as is
# * other URI         - joined with the newest history entry's base (or its
#                       location when the base is empty) and resolved again
# * form element      - returns [resolved url, element]
# * other element     - its href, action or value attribute is resolved
# * Enumerable        - returned as is
#
# Raises TypeError for anything that cannot be resolved.
def prepare_next(reserved = nil)
  reserved ||= @reserved
  cleanup
  case reserved
  when String # expecting a url
    return prepare_next(URI.parse(reserved))
  when URI
    if reserved.is_a?(URI::HTTP)
      return reserved
    elsif @history.size > 0
      base = @history[0][:base] || ''
      base = @history[0][:location] if base.empty?
      l = URI.join base, reserved
      return prepare_next(l) if l.is_a?(URI::HTTP)
    end
  when Nokogiri::XML::Element
    url = reserved['href'] || reserved['action'] || reserved['value']
    if reserved.is_form?
      return [prepare_next(url), reserved]
    else
      return prepare_next(url)
    end
  when Enumerable
    return reserved
  end

  # cleanup has already reset @reserved to nil, so report the class of the
  # local value that was actually rejected.
  raise TypeError, "Unexpected value is set for next before #{to_nth(@request_count+1)} request: #{reserved.class.name}"
end
|
242
|
+
|
243
|
+
def cleanup
|
244
|
+
@reserved = @document = @response = @file = nil
|
245
|
+
end
|
246
|
+
|
247
|
+
#
# Put a new entry at the front of the history and drop entries beyond
# @max_history_count. The document is stored only when the content type is
# XML-parsable; otherwise nil is stored in its place.
def set_history(location, content_type, document, base = nil)
  stored_document = is_xml_parsable?(content_type) ? document : nil
  entry = {
    :location => location,
    :content_type => content_type,
    :document => stored_document,
    :base => base
  }
  @history.unshift(entry)
  overflow = @history.size - @max_history_count
  @history.slice!(@max_history_count, overflow)
end
|
258
|
+
|
259
|
+
|
260
|
+
|
261
|
+
end
|
262
|
+
end
|