ucnv-chainsaw 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc ADDED
@@ -0,0 +1,42 @@
1
+ = Chainsaw
2
+
3
+ * http://github.com/ucnv/chainsaw/tree/master
4
+
5
+ == Description
6
+
7
+ A Ruby library for spidering web resources.
8
+
9
+ == Features/Problems
10
+
11
+
12
+ == Synopsis
13
+
14
+ Chainsaw.launch('http://example.com/').open { |cs|
15
+ cs.doc.css('#navi a')[2]
16
+ }.open { |cs|
17
+ form = cs.doc.xpath('//form')[0]
18
+ input = form.xpath('.//input[@type="text"]')[0]
19
+ input.set_attribute('value', 'blurblur')
20
+ cs.set_next form
21
+ }.submit { |cs|
22
+ puts cs.res.status
23
+ cs.doc.search('.//a').each do |link|
24
+ puts link.content
25
+ end
26
+ }
27
+
28
+ == Requirements
29
+
30
+ * nokogiri
31
+ * httpclient
32
+
33
+ == Installation
34
+
35
+ * gem install ucnv-chainsaw
36
+
37
+ == Copyright
38
+
39
+ Author:: ucnv <ucnvvv at gmail.com>
40
+ Copyright:: Copyright (c) 2009 ucnv
41
+ License:: MIT
42
+
data/Rakefile ADDED
@@ -0,0 +1,149 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/clean'
4
+ require 'rake/testtask'
5
+ require 'rake/packagetask'
6
+ require 'rake/gempackagetask'
7
+ require 'rake/rdoctask'
8
+ require 'rake/contrib/rubyforgepublisher'
9
+ require 'rake/contrib/sshpublisher'
10
+ require 'fileutils'
11
+ include FileUtils
12
+
13
# Project metadata shared by the gem specification and the tasks below.
NAME              = "chainsaw"
AUTHOR            = "ucnv"
EMAIL             = "ucnvvv at gmail.com"
DESCRIPTION       = "A Ruby library for spidering web resources."
RUBYFORGE_PROJECT = "chainsaw"
HOMEPATH          = "http://github.com/ucnv/chainsaw/tree/master"
BIN_FILES         = %w( )
VERS              = "0.0.1"

# Committed revision read from .svn/entries, or nil when the file cannot be
# read or carries no committed-rev attribute. The capture must be \d+ (digits);
# a bare "d+" only ever matched runs of the letter "d".
REV = begin
  File.read(".svn/entries")[/committed-rev="(\d+)"/, 1]
rescue StandardError
  nil
end
23
# Patterns added to Rake's CLEAN list: editor swap files, built gems, .config.
CLEAN.include ['**/.*.sw?', '*.gem', '.config']

# RDoc command-line options used by the gem specification and the rdoc task.
RDOC_OPTS = [
  '--title',   "#{NAME} documentation",
  '--charset', 'utf-8',
  '--opname',  'index.html',
  '--line-numbers',
  '--main',    'README.rdoc',
  '--inline-source'
]
32
+
33
# Bare `rake` runs the tests; packaging depends on :clean.
task :default => [:test]
task :package => [:clean]

# Test task: adds test/ to the load path and runs test/**/test_*.rb verbosely.
Rake::TestTask.new("test") do |test_task|
  test_task.libs.push "test"
  test_task.pattern = "test/**/test_*.rb"
  test_task.verbose = true
end
41
+
42
# Gem specification built from the metadata constants defined above.
# The local is named `spec` because later tasks in this file read it.
spec = Gem::Specification.new do |gem|
  # Identity
  gem.name     = NAME
  gem.version  = VERS
  gem.platform = Gem::Platform::RUBY

  # Documentation
  gem.has_rdoc         = true
  gem.extra_rdoc_files = ["README.rdoc", "ChangeLog"]
  gem.rdoc_options    += RDOC_OPTS + ['--exclude', '^(examples|extras)/']

  # Description and ownership
  gem.summary           = DESCRIPTION
  gem.description       = DESCRIPTION
  gem.author            = AUTHOR
  gem.email             = EMAIL
  gem.homepage          = HOMEPATH
  gem.executables       = BIN_FILES
  gem.rubyforge_project = RUBYFORGE_PROJECT

  # Layout
  gem.bindir       = "bin"
  gem.require_path = "lib"
  gem.autorequire  = ""
  gem.test_files   = Dir["test/test_*.rb"]

  # Runtime requirements
  gem.add_dependency('nokogiri', '>=1.2.1')
  gem.add_dependency('httpclient', '>=2.1.4')
  gem.required_ruby_version = '>= 1.8.6'

  # Packaged files: the fixed top-level files followed by each glob, in order.
  packaged_globs = [
    "{bin,doc,test,lib,templates,generator,extras,website,script}/**/*",
    "ext/**/*.{h,c,rb}",
    "examples/**/*.rb",
    "tools/*.rb"
  ]
  gem.files = %w(README.rdoc ChangeLog Rakefile) +
              packaged_globs.map { |pattern| Dir.glob(pattern) }.flatten

  gem.extensions = FileList["ext/**/extconf.rb"].to_a
end

# Package task for the specification above; a tar archive is requested as well.
Rake::GemPackageTask.new(spec) do |package|
  package.need_tar = true
  package.gem_spec = spec
end
78
+
79
# Runs `rake package`, then installs the built gem: first through sudo and,
# if that attempt raises, again without sudo.
task :install do
  gem_path = "pkg/#{NAME}-#{VERS}.gem"
  sh %{rake package}
  begin
    sh %{sudo gem install #{gem_path}}
  rescue
    sh %{gem install #{gem_path}}
  end
end
88
+
89
# Uninstalls the gem after :clean: first through sudo and, if that attempt
# raises, again without sudo.
task :uninstall => [:clean] do
  uninstall_command = "gem uninstall #{NAME}"
  begin
    sh "sudo #{uninstall_command}"
  rescue
    sh uninstall_command
  end
end
96
+
97
# RDoc task writing to html/. The documented files come from the
# comma-separated DOC_FILES environment variable when it is set, otherwise
# from the README, ChangeLog, lib/ and ext/ defaults.
Rake::RDocTask.new do |doc|
  doc.rdoc_dir = 'html'
  doc.options += RDOC_OPTS
  doc.template = "resh"
  #doc.template = "#{ENV['template']}.rb" if ENV['template']

  requested = ENV['DOC_FILES']
  if requested
    doc.rdoc_files.include(requested.split(/,\s*/))
  else
    ['README.rdoc', 'ChangeLog', 'lib/**/*.rb', 'ext/**/*.c'].each do |pattern|
      doc.rdoc_files.include(pattern)
    end
  end
end
110
+
111
desc "Publish to RubyForge"
# Uploads through Rake::RubyForgePublisher after :rdoc and :package have run.
task :rubyforge => [:rdoc, :package] do
  require 'rubyforge'
  publisher = Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'comelade')
  publisher.upload
end
116
+
117
desc 'Package and upload the release to rubyforge.'
# Requires VERSION=x.y.z in the environment and aborts unless it equals VERS.
# Uploads pkg/<NAME>-<VERS>.tgz and .gem as a new RubyForge release.
task :release => [:clean, :package] do |t|
  # The RubyForge class is otherwise only loaded inside the :rubyforge task,
  # so running `rake release` on its own failed on the missing constant.
  require 'rubyforge'

  v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
  abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
  pkg = "pkg/#{NAME}-#{VERS}"

  rf = RubyForge.new
  puts "Logging in"
  rf.login

  c = rf.userconfig
  # c["release_notes"] = description if description
  # c["release_changes"] = changes if changes
  c["preformatted"] = true

  # Both entries are string literals, so there is nothing to compact away.
  files = [
    "#{pkg}.tgz",
    "#{pkg}.gem"
  ]

  puts "Releasing #{NAME} v. #{VERS}"
  rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
end
140
+
141
desc 'Show information about the gem.'
# Prints the Ruby source form of the gem specification.
task(:debug_gem) { puts spec.to_ruby }
145
+
146
desc 'Update gem spec'
# Writes the Ruby source form of the specification to <NAME>.gemspec.
task :gemspec do
  # Block form closes (and therefore flushes) the file as soon as the write
  # is done; the former open(...).write left the handle open.
  File.open("#{NAME}.gemspec", 'w') { |out| out.write spec.to_ruby }
end
@@ -0,0 +1,17 @@
1
+ require 'rubygems'
2
+ require 'chainsaw'
3
+
4
# Search term comes from the first command-line argument (optional).
query = ARGV.shift

# Open the start page, fill the search form, submit it and print every link text.
Chainsaw.launch('http://google.com/').
  open { |a|
    f = a.doc.xpath('//form[@name="f"]').first
    # ".//" keeps the lookup inside the form node; a leading "//" is an
    # absolute path and searches the whole document instead.
    q = f.xpath('.//input[@name="q"]').first
    q['value'] = query || 'Hello world'
    a.set_next f
  }.
  submit { |a|
    a.doc.search('//a').each do |l|
      puts l.content
    end
  }
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+ require 'chainsaw'
3
+
4
# Credentials and the (optional) status text come from the command line.
username, password, tweet = ARGV

# Each `>` step processes whatever the previous step reserved with set_next:
# fill in the sign-in form, then the status form, then send it.
Chainsaw.launch('https://twitter.com/home').> { |cs|
  signin = cs.doc.css('.signin').first
  signin.xpath('id("username_or_email")').first['value'] = username
  signin.xpath('id("session[password]")').first['value'] = password
  cs.set_next signin
}.> { |cs|
  status_form = cs.doc.css('#doingForm').first
  status_form.xpath('id("status")').first.content = tweet || "(o'-')-o fwip fwip"
  cs.set_next status_form
}.>
data/lib/chainsaw.rb ADDED
@@ -0,0 +1,40 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'httpclient'
4
+ require 'nkf'
5
+
6
+ require 'chainsaw/ext/httpclient'
7
+ require 'chainsaw/ext/nokogiri'
8
+
9
+ require 'chainsaw/common'
10
+ require 'chainsaw/element'
11
+
12
+ require 'chainsaw/browser'
13
+
14
+
15
# Mix Chainsaw::Element into Nokogiri's element class.
# `include` is invoked through send so it also works where it is private.
Nokogiri::XML::Element.send(:include, Chainsaw::Element)
18
+
19
module Chainsaw
  VERSION = '0.0.1'

  #
  # Return an instance of the Chainsaw::Browser class.
  #
  # All arguments are handed to Chainsaw::Browser.new, except that a trailing
  # Proc argument is dropped first. When a block is given, the new browser is
  # yielded to it before being returned.
  def self.launch(*args)
    # Array has no #pop!; plain #pop is already the destructive call.
    args.pop if args.last.is_a?(Proc)
    cs = Chainsaw::Browser.new(*args)
    yield cs if block_given?
    cs
  end
end
31
+
32
module Kernel
  #
  # alias for Chainsaw.launch
  #
  # Arguments and any block are forwarded unchanged, so
  # Chainsaw(url) { |cs| ... } behaves like Chainsaw.launch(url) { |cs| ... }.
  # (Without the explicit &block the caller's block was silently dropped.)
  def Chainsaw(*args, &block)
    Chainsaw.launch(*args, &block)
  end

  module_function :Chainsaw
end
@@ -0,0 +1,262 @@
1
+
2
+
3
+ module Chainsaw
4
+
5
+ class Browser
6
+ include Chainsaw::Util
7
+
8
+ DEFAULT_USER_AGENT = '' # TODO: set value
9
+
10
+ attr_accessor :user_agent, :ignore_redirect, :hide_referer, :max_history_count, :encoding # configurables
11
+ attr_accessor :request_headers, :url_base, :results
12
+ attr_reader :engine, :response, :history, :request_count # session
13
+
14
# Sets up a browser session.
#
# location:: first target, reserved through set_next
# options::  hash of settings; recognised keys are :user_agent,
#            :max_history_count (default 20), :ignore_redirect (default false),
#            :hide_referer (default false), :encoding, :url_base (default '')
#            and :request_headers (default {})
def initialize(location = '', options = {})
  # configurables
  @user_agent        = options[:user_agent] || DEFAULT_USER_AGENT
  @max_history_count = options[:max_history_count] || 20
  @ignore_redirect   = options[:ignore_redirect] || false
  @hide_referer      = options[:hide_referer] || false
  @encoding          = options[:encoding]
  @url_base          = options[:url_base] || ''
  @request_headers   = options[:request_headers] || {}

  # session state
  @history       = []
  @results       = []
  @request_count = 0
  @engine        = HTTPClient.new

  set_next location
  self
end
33
+
34
# Registers a request header in @request_headers.
#
# key::                 header name
# value::               header value
# overwrite_if_exists:: when false, an already registered header is kept
#
# Returns self so calls can be chained.
def add_header(key, value, overwrite_if_exists = true)
  # Guard the hash that is actually used. The former check read the
  # misspelled @request_header, which is always nil, so every call threw
  # away all previously registered headers.
  @request_headers = {} if @request_headers.nil?
  if overwrite_if_exists or !@request_headers.has_key?(key)
    @request_headers[key] = value
  end
  self
end
41
+
42
# Reserves +obj+ as the target of the next open/submit/iteration step.
# Returns self so calls can be chained.
def set_next(obj)
  @reserved = obj
  return self
end
46
+
47
# Registers credentials with the HTTP engine for the currently reserved
# target (converted with to_s). Returns self so calls can be chained.
def set_auth(username, password)
  target = @reserved.to_s
  @engine.set_auth(target, username, password)
  self
end
51
+
52
# Processes the reserved target: a reserved form element is submitted (falling
# back to open when submit raises TypeError); anything else is opened.
# The block, if any, is passed on to submit/open.
def process(&block)
  form_reserved = @reserved.is_a?(Nokogiri::XML::Element) && @reserved.is_form?
  return open(&block) unless form_reserved

  begin
    submit(&block)
  rescue TypeError
    open(&block)
  end
end

# Operator-style spelling of process, e.g. browser.> { |cs| ... }
alias > process
65
+
66
# Resolves the reserved target, fetches it with GET and stores the response.
# When a block is given it is run through process_chain afterwards.
# Returns self so calls can be chained.
def open(&block)
  uri = prepare_next
  @response = request(:get, uri)
  # Capture the block explicitly: a bare Proc.new no longer picks up the
  # method's block on current Ruby versions.
  process_chain(&block) if block
  self
end
72
+
73
# Submits the reserved form without naming a button (see submit_with).
# When a block is given it is run through process_chain afterwards.
# Returns self so calls can be chained.
def submit(&block)
  submit_with(nil)
  # Capture the block explicitly: a bare Proc.new no longer picks up the
  # method's block on current Ruby versions.
  process_chain(&block) if block
  self
end
78
+
79
# Submits the reserved form.
#
# button_name::      name of the submit control to include; when nil, the name
#                    of the first input[type=submit] inside the form is used,
#                    if there is one
# image_x, image_y:: passed through to the form's serialize_form
#
# The form's method attribute (default 'get') selects GET or POST; any other
# method performs no request. For POST, Pathname values in the serialized
# query are opened as binary files for the request and closed afterwards.
# Raises TypeError when the reserved target yields no form.
# Returns self so calls can be chained.
def submit_with(button_name, image_x = -1, image_y = -1, &block)
  uri, form = prepare_next
  raise TypeError, 'No form found.' unless form
  unless button_name
    submits = form.xpath('.//input[@type="submit"]')
    # A length of 0 is truthy in Ruby, so test for emptiness; otherwise a
    # form without a submit input ends up calling [] on nil.
    button_name = submits.first['name'] unless submits.empty?
  end
  query = form.serialize_form(button_name, image_x, image_y)
  method = (form['method'] || 'get').downcase
  if method == 'get'
    @response = request(:get, uri, query)
  elsif method == 'post'
    paths, q = query.partition do |name, value|
      value.instance_of? Pathname
    end
    files = []
    begin
      paths.each do |name, path|
        f = File.open(path, 'rb')
        q.push [name, f]
        files.push f
      end
      @response = request(:post, uri, q)
    ensure
      files.each { |f| f.close unless f.nil? }
    end
  end
  # Capture the block explicitly: a bare Proc.new no longer picks up the
  # method's block on current Ruby versions.
  process_chain(&block) if block
  self
end
109
+
110
# Steps back one entry in the history.
#
# The newest history entry is dropped and the entry before it is re-exposed
# as the current response (an OpenStruct with uri, contenttype and content).
# When no earlier entry exists, the response stays cleared.
# When a block is given and an earlier entry exists, the block is run through
# process_chain. Returns self so calls can be chained.
def back(&block)
  require 'ostruct'
  cleanup
  @history.shift
  previous = @history.first
  return self if previous.nil?
  @response = OpenStruct.new(
    'uri' => previous[:location],
    'contenttype' => previous[:content_type],
    'content' => previous[:document]
  )
  @reserved = nil
  # Capture the block explicitly: a bare Proc.new no longer picks up the
  # method's block on current Ruby versions.
  process_chain(&block) if block
  self
end
125
+
126
# URI of the current response, or nil while there is no response.
def uri
  @response.uri unless @response.nil?
end
130
+
131
# Visits every element of the reserved set, yielding the browser each time.
def each(&block)
  with_index = false
  iterate(with_index, &block)
end

# Same as each, but the element's index is yielded as a second argument.
def each_with_index(&block)
  with_index = true
  iterate(with_index, &block)
end
138
+
139
# Parsed document of the current response (memoised in @document).
#
# Returns nil when the response has no content or its content type is not
# accepted by is_xml_parsable?. The encoding is @encoding when set, otherwise
# the result of Encoding.guess on the content. After parsing, the href of the
# first <base> element (or '') is recorded as :base on the newest history entry.
def document
  if @document.nil?
    return nil if res.content.nil?
    return nil unless is_xml_parsable?(res.contenttype)

    charset = @encoding || Encoding.guess(res.content)
    @document = Nokogiri.parse(res.content, nil, charset)

    base_nodes = @document.xpath('//base')
    href = base_nodes.empty? ? '' : base_nodes[0]['href']
    @history.first.update(:base => href.to_s)
  end
  @document
end
152
+
153
# Tempfile holding the body of the current response (memoised in @file).
#
# Returns nil when the response has no content. The Tempfile basename is the
# last '/'-separated segment of the current URI; the body is written in
# binary mode.
# NOTE(review): the handle is kept after Tempfile.open's block has returned,
# so callers presumably rely on its path only - confirm.
def file
  if @file.nil?
    return nil if @response.content.nil?

    basename = uri.to_s.split('/').last
    Tempfile.open(basename) do |tmp|
      tmp.binmode.write @response.content
      @file = tmp
    end
  end
  @file
end
164
+
165
+ alias res response
166
+ alias doc document
167
+
168
+ private
169
+
170
# Runs the chain block and appends its value to results.
# A block declared without parameters is called bare; any other block
# receives the browser itself.
def process_chain(&block)
  sink = results
  outcome = block.arity == 0 ? yield : yield(self)
  sink.push outcome
end
177
+
178
# Performs one HTTP request through the engine and records it in the history.
#
# method:: :get or :post (nil falls back to 'get')
# uri::    must be a URI::HTTP, otherwise TypeError is raised
# query::  passed to the engine as-is
#
# Unless @ignore_redirect is set, the engine method with an '_r' suffix is
# called (presumably the redirect-following variants - confirm against
# chainsaw/ext/httpclient). Any error raised by the engine is re-raised as
# Chainsaw::RequestError. Returns the engine's response.
def request(method, uri, query = nil)
  raise TypeError, "Invalid URI: #{qq uri.to_s}" unless uri.kind_of?(URI::HTTP)

  # to_s never returns nil, so the default has to be applied before it.
  call = (method || 'get').to_s
  call += '_r' unless @ignore_redirect

  # Build the headers for this request only. The Referer belongs to a single
  # hop, so it is added to a copy instead of the persistent @request_headers;
  # a Referer registered explicitly by the caller still wins.
  headers = (@request_headers || {}).dup
  if !@hide_referer and @history.first and !headers.has_key?('Referer')
    headers['Referer'] = @history.first[:location]
  end

  @request_count += 1
  r = begin
    @engine.send(call, uri, query, headers)
  rescue
    raise(
      Chainsaw::RequestError,
      "Error occured during #{to_nth(@request_count)} request; url: #{qq uri.to_s}"
    )
  end

  # history
  set_history uri.to_s, r.contenttype, r.content
  r
end
198
+
199
# Visits every element of the reserved set.
#
# For each element: reserve it with set_next, yield the browser (plus the
# index when with_index is true), then call back.
# When the reserved target does not resolve to something iterable, a warning
# is printed and nothing is yielded. Returns self so calls can be chained.
def iterate(with_index, &block)
  set = prepare_next
  # prepare_next either raises or returns a non-nil value, so checking for
  # nil never triggered; test for the capability that is actually needed.
  unless set.respond_to?(:each_with_index)
    warn 'Iteration called without Enumerable.'
    return self
  end

  set.each_with_index do |e, index|
    set_next(e)
    with_index ? yield(self, index) : yield(self)
    back
  end
  self
end
213
+
214
# Resolves the reserved target (or the explicit +reserved+ argument) into
# something a request or an iteration can use, and clears the per-step state.
#
# String::  parsed as a URI and resolved again
# URI::     returned when it is an HTTP URI; otherwise joined onto the newest
#           history entry's :base (or its :location when the base is empty)
# Element:: its href, action or value attribute is resolved; for a form the
#           result is [uri, form]
# Enumerable:: returned unchanged
#
# Raises TypeError for anything else, including a relative URI that cannot
# be resolved against the history.
def prepare_next(reserved = nil)
  reserved ||= @reserved
  cleanup
  case reserved
  when String # expecting a url
    return prepare_next(URI.parse(reserved))
  when URI
    if reserved.is_a?(URI::HTTP)
      return reserved
    elsif @history.size > 0
      base = @history[0][:base] || ''
      base = @history[0][:location] if base.empty?
      l = URI.join base, reserved
      return prepare_next(l) if l.is_a?(URI::HTTP)
    end
  when Nokogiri::XML::Element
    url = reserved['href'] || reserved['action'] || reserved['value']
    if reserved.is_form?
      return [prepare_next(url), reserved]
    else
      return prepare_next(url)
    end
  when Enumerable
    return reserved
  end

  # Report the local value: cleanup above has already reset @reserved, so
  # reading it here always named NilClass.
  raise TypeError, "Unexpected value is set for next before #{to_nth(@request_count+1)} request: #{reserved.class.name}"
end
242
+
243
# Clears the per-step state: cached file, response, parsed document and the
# reserved target.
def cleanup
  @file = nil
  @response = nil
  @document = nil
  @reserved = nil
end
246
+
247
# Puts a new entry at the front of the history and trims the history to
# @max_history_count entries.
#
# The document is stored only when is_xml_parsable? accepts the content type;
# otherwise the entry's :document is nil.
def set_history(location, content_type, document, base = nil)
  stored_document = is_xml_parsable?(content_type) ? document : nil
  entry = {
    :location     => location,
    :content_type => content_type,
    :document     => stored_document,
    :base         => base
  }
  @history.unshift(entry)

  overflow = @history.size - @max_history_count
  @history.slice!(@max_history_count, overflow)
end
258
+
259
+
260
+
261
+ end
262
+ end