ucnv-chainsaw 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc ADDED
@@ -0,0 +1,42 @@
1
+ = Chainsaw
2
+
3
+ * http://github.com/ucnv/chainsaw/tree/master
4
+
5
+ == Description
6
+
7
+ A Ruby library for spidering web resources.
8
+
9
+ == Features/Problems
10
+
11
+
12
+ == Synopsis
13
+
14
+ Chainsaw.launch('http://example.com/').open { |cs|
15
+ cs.set_next cs.doc.css('#navi a')[2]
16
+ }.open { |cs|
17
+ form = cs.doc.xpath('//form')[0]
18
+ input = form.xpath('.//input[@type="text"]')[0]
19
+ input.set_attribute('value', 'blurblur')
20
+ cs.set_next form
21
+ }.submit { |cs|
22
+ puts cs.res.status
23
+ cs.doc.search('.//a').each do |link|
24
+ puts link.content
25
+ end
26
+ }
27
+
28
+ == Requirements
29
+
30
+ * nokogiri
31
+ * httpclient
32
+
33
+ == Installation
34
+
35
+ * gem install ucnv-chainsaw
36
+
37
+ == Copyright
38
+
39
+ Author:: ucnv <ucnvvv at gmail.com>
40
+ Copyright:: Copyright (c) 2009 ucnv
41
+ License:: MIT
42
+
data/Rakefile ADDED
@@ -0,0 +1,149 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/clean'
4
+ require 'rake/testtask'
5
+ require 'rake/packagetask'
6
+ require 'rake/gempackagetask'
7
+ require 'rake/rdoctask'
8
+ require 'rake/contrib/rubyforgepublisher'
9
+ require 'rake/contrib/sshpublisher'
10
+ require 'fileutils'
11
+ include FileUtils
12
+
13
+ NAME = "chainsaw"
14
+ AUTHOR = "ucnv"
15
+ EMAIL = "ucnvvv at gmail.com"
16
+ DESCRIPTION = "A Ruby library for spidering web resources."
17
+ RUBYFORGE_PROJECT = "chainsaw"
18
+ HOMEPATH = "http://github.com/ucnv/chainsaw/tree/master"
19
+ BIN_FILES = %w( )
20
+ VERS = "0.0.1"
21
+
22
# Subversion revision of the working copy, read from .svn/entries.
# Evaluates to nil when the file cannot be read or carries no
# committed-rev attribute. The capture group must use \d (digits); the
# previous pattern "(d+)" only matched the literal letter "d".
REV = File.read(".svn/entries")[/committed-rev="(\d+)"/, 1] rescue nil
23
+ CLEAN.include ['**/.*.sw?', '*.gem', '.config']
24
+ RDOC_OPTS = [
25
+ '--title', "#{NAME} documentation",
26
+ "--charset", "utf-8",
27
+ "--opname", "index.html",
28
+ "--line-numbers",
29
+ "--main", "README.rdoc",
30
+ "--inline-source",
31
+ ]
32
+
33
+ task :default => [:test]
34
+ task :package => [:clean]
35
+
36
+ Rake::TestTask.new("test") do |t|
37
+ t.libs << "test"
38
+ t.pattern = "test/**/test_*.rb"
39
+ t.verbose = true
40
+ end
41
+
42
+ spec = Gem::Specification.new do |s|
43
+ s.name = NAME
44
+ s.version = VERS
45
+ s.platform = Gem::Platform::RUBY
46
+ s.has_rdoc = true
47
+ s.extra_rdoc_files = ["README.rdoc", "ChangeLog"]
48
+ s.rdoc_options += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
49
+ s.summary = DESCRIPTION
50
+ s.description = DESCRIPTION
51
+ s.author = AUTHOR
52
+ s.email = EMAIL
53
+ s.homepage = HOMEPATH
54
+ s.executables = BIN_FILES
55
+ s.rubyforge_project = RUBYFORGE_PROJECT
56
+ s.bindir = "bin"
57
+ s.require_path = "lib"
58
+ s.autorequire = ""
59
+ s.test_files = Dir["test/test_*.rb"]
60
+
61
+ s.add_dependency('nokogiri', '>=1.2.1')
62
+ s.add_dependency('httpclient', '>=2.1.4')
63
+ s.required_ruby_version = '>= 1.8.6'
64
+
65
+ s.files = %w(README.rdoc ChangeLog Rakefile) +
66
+ Dir.glob("{bin,doc,test,lib,templates,generator,extras,website,script}/**/*") +
67
+ Dir.glob("ext/**/*.{h,c,rb}") +
68
+ Dir.glob("examples/**/*.rb") +
69
+ Dir.glob("tools/*.rb")
70
+
71
+ s.extensions = FileList["ext/**/extconf.rb"].to_a
72
+ end
73
+
74
+ Rake::GemPackageTask.new(spec) do |p|
75
+ p.need_tar = true
76
+ p.gem_spec = spec
77
+ end
78
+
79
+ task :install do
80
+ name = "#{NAME}-#{VERS}.gem"
81
+ sh %{rake package}
82
+ begin
83
+ sh %{sudo gem install pkg/#{name}}
84
+ rescue
85
+ sh %{gem install pkg/#{name}}
86
+ end
87
+ end
88
+
89
+ task :uninstall => [:clean] do
90
+ begin
91
+ sh %{sudo gem uninstall #{NAME}}
92
+ rescue
93
+ sh %{gem uninstall #{NAME}}
94
+ end
95
+ end
96
+
97
+ Rake::RDocTask.new do |rdoc|
98
+ rdoc.rdoc_dir = 'html'
99
+ rdoc.options += RDOC_OPTS
100
+ rdoc.template = "resh"
101
+ #rdoc.template = "#{ENV['template']}.rb" if ENV['template']
102
+ if ENV['DOC_FILES']
103
+ rdoc.rdoc_files.include(ENV['DOC_FILES'].split(/,\s*/))
104
+ else
105
+ rdoc.rdoc_files.include('README.rdoc', 'ChangeLog')
106
+ rdoc.rdoc_files.include('lib/**/*.rb')
107
+ rdoc.rdoc_files.include('ext/**/*.c')
108
+ end
109
+ end
110
+
111
+ desc "Publish to RubyForge"
112
+ task :rubyforge => [:rdoc, :package] do
113
+ require 'rubyforge'
114
+ Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'comelade').upload
115
+ end
116
+
117
+ desc 'Package and upload the release to rubyforge.'
118
task :release => [:clean, :package] do |t|
  # RubyForge is otherwise only required inside the :rubyforge task, so
  # running `rake release` on its own would fail with a NameError.
  require 'rubyforge'

  # The caller must confirm the version being released.
  v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
  abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
  pkg = "pkg/#{NAME}-#{VERS}"

  rf = RubyForge.new
  puts "Logging in"
  rf.login

  c = rf.userconfig
  # c["release_notes"] = description if description
  # c["release_changes"] = changes if changes
  c["preformatted"] = true

  # Artifacts produced by the :package prerequisite.
  files = [
    "#{pkg}.tgz",
    "#{pkg}.gem"
  ]

  puts "Releasing #{NAME} v. #{VERS}"
  rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
end
140
+
141
+ desc 'Show information about the gem.'
142
+ task :debug_gem do
143
+ puts spec.to_ruby
144
+ end
145
+
146
+ desc 'Update gem spec'
147
task :gemspec do
  # Block form closes (and therefore flushes) the file as soon as the
  # gemspec has been written instead of leaving the handle to the GC.
  File.open("#{NAME}.gemspec", 'w') { |f| f.write spec.to_ruby }
end
@@ -0,0 +1,17 @@
1
+ require 'rubygems'
2
+ require 'chainsaw'
3
+
4
+ query = ARGV.shift
5
+
6
# Open Google, fill the search form with the query and print the text of
# every link found on the result page.
Chainsaw.launch('http://google.com/').
  open { |a|
    f = a.doc.xpath('//form[@name="f"]').first
    # './/' keeps the lookup inside the form; a leading '//' would search
    # the whole document regardless of the context node.
    q = f.xpath('.//input[@name="q"]').first
    q['value'] = query || 'Hello world'
    a.set_next f
  }.
  submit { |a|
    a.doc.search('//a').each do |l|
      puts l.content
    end
  }
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+ require 'chainsaw'
3
+
4
+ username, password, tweet = ARGV
5
+
6
# Sign in with the username/password taken from the command line, then
# submit the status form with the given tweet (or a default text).
Chainsaw.launch('https://twitter.com/home').> { |browser|
  signin = browser.doc.css('.signin').first
  signin.xpath('id("username_or_email")').first['value'] = username
  signin.xpath('id("session[password]")').first['value'] = password
  browser.set_next signin
}.> { |browser|
  status_form = browser.doc.css('#doingForm').first
  status_form.xpath('id("status")').first.content = tweet || "(o'-')-o fwip fwip"
  browser.set_next status_form
}.>
data/lib/chainsaw.rb ADDED
@@ -0,0 +1,40 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'httpclient'
4
+ require 'nkf'
5
+
6
+ require 'chainsaw/ext/httpclient'
7
+ require 'chainsaw/ext/nokogiri'
8
+
9
+ require 'chainsaw/common'
10
+ require 'chainsaw/element'
11
+
12
+ require 'chainsaw/browser'
13
+
14
+
15
+ Nokogiri::XML::Element.module_eval do
16
+ include Chainsaw::Element
17
+ end
18
+
19
# Top-level namespace of the Chainsaw library.
module Chainsaw
  VERSION = '0.0.1'

  # Returns a new instance of the Chainsaw::Browser class.
  #
  # All arguments are forwarded unchanged to Chainsaw::Browser.new, except
  # that a trailing Proc argument is dropped first. (The previous code
  # called Array#pop!, which does not exist and raised NoMethodError.)
  #
  # When a block is given it is called with the new browser. The browser
  # itself, not the block's value, is always returned.
  def self.launch(*args)
    args.pop if args.last.is_a?(Proc)
    cs = Chainsaw::Browser.new(*args)
    yield cs if block_given?
    cs
  end
end
31
+
32
module Kernel
  #
  # Alias for Chainsaw.launch. Arguments and the optional block are both
  # forwarded (the block used to be silently dropped).
  def Chainsaw(*args, &block)
    Chainsaw.launch(*args, &block)
  end

  module_function :Chainsaw
end
@@ -0,0 +1,262 @@
1
+
2
+
3
+ module Chainsaw
4
+
5
+ class Browser
6
+ include Chainsaw::Util
7
+
8
+ DEFAULT_USER_AGENT = '' # TODO: set value
9
+
10
+ attr_accessor :user_agent, :ignore_redirect, :hide_referer, :max_history_count, :encoding # configurables
11
+ attr_accessor :request_headers, :url_base, :results
12
+ attr_reader :engine, :response, :history, :request_count # session
13
+
14
# Creates a browser session.
#
# location:: the first target, stored via set_next
# options::  hash of configurables (:user_agent, :max_history_count,
#            :ignore_redirect, :hide_referer, :encoding, :url_base,
#            :request_headers); missing entries fall back to the defaults
#            written below.
def initialize(location = '', options = {})
  # session state
  @engine = HTTPClient.new
  @request_count = 0
  @history = []
  @results = []

  # configurables
  @request_headers   = options[:request_headers] || {}
  @url_base          = options[:url_base] || ''
  @encoding          = options[:encoding]
  @hide_referer      = options[:hide_referer] || false
  @ignore_redirect   = options[:ignore_redirect] || false
  @max_history_count = options[:max_history_count] || 20
  @user_agent        = options[:user_agent] || DEFAULT_USER_AGENT

  set_next(location)
  self
end
33
+
34
# Adds a request header and returns self so calls can be chained.
#
# key, value::          header name and value
# overwrite_if_exists:: when false, an already present header is kept
#
# The header hash is only created when it is missing. The previous check
# tested the misspelled @request_header, which is always nil, so every
# call threw away all headers set before it.
def add_header(key, value, overwrite_if_exists = true)
  @request_headers ||= {}
  if overwrite_if_exists || !@request_headers.has_key?(key)
    @request_headers[key] = value
  end
  self
end
41
+
42
# Reserves +obj+ as the target of the next operation; returns self.
def set_next(obj)
  tap { @reserved = obj }
end
46
+
47
# Registers credentials with the HTTP engine for the currently reserved
# target (converted with to_s); returns self.
def set_auth(username, password)
  target = @reserved.to_s
  @engine.set_auth(target, username, password)
  self
end
51
+
52
# Performs the next step for whatever is reserved: a form element is
# submitted (falling back to open when submit raises TypeError), anything
# else is opened. The block is forwarded to the chosen method.
def process(&block)
  form_reserved = @reserved.is_a?(Nokogiri::XML::Element) && @reserved.is_form?
  return open(&block) unless form_reserved

  begin
    submit(&block)
  rescue TypeError
    open(&block)
  end
end
63
+
64
+ alias > process
65
+
66
# Issues a GET request for the reserved target and stores the response.
# When a block is given it is run through process_chain. Returns self.
#
# The block is captured explicitly: block-less Proc.new, used before,
# raises ArgumentError on Ruby 3.0 and later.
def open(&block)
  uri = prepare_next
  @response = request(:get, uri)
  process_chain(&block) if block
  self
end
72
+
73
# Submits the reserved form without naming a button (submit_with picks
# one), then runs the optional block through process_chain. Returns self.
#
# The block is captured explicitly: block-less Proc.new, used before,
# raises ArgumentError on Ruby 3.0 and later.
def submit(&block)
  submit_with(nil)
  process_chain(&block) if block
  self
end
78
+
79
# Submits the reserved form and stores the response; returns self.
#
# button_name::      name of the submit button to send; when nil, the name
#                    of the form's first input[type=submit] is used, if any
# image_x, image_y:: passed through to the form's serialize_form
#
# Raises TypeError when the reserved object does not yield a form.
# For POST, Pathname values in the serialized query are opened as binary
# files for the duration of the request and closed afterwards.
# A form method other than get/post performs no request.
def submit_with(button_name, image_x = -1, image_y = -1, &block)
  uri, form = prepare_next
  raise TypeError, 'No form found.' unless form
  unless button_name
    # `if s.length` was always true (0 is truthy), so a form without any
    # submit input crashed on nil['name'].
    first_submit = form.xpath('.//input[@type="submit"]').first
    button_name = first_submit['name'] if first_submit
  end
  query = form.serialize_form(button_name, image_x, image_y)
  http_method = (form['method'] || 'get').downcase
  if http_method == 'get'
    @response = request(:get, uri, query)
  elsif http_method == 'post'
    paths, q = query.partition do |name, value|
      value.instance_of? Pathname
    end
    files = []
    begin
      paths.each do |name, path|
        f = File.open(path, 'rb')
        q.push [name, f]
        files.push f
      end
      @response = request(:post, uri, q)
    ensure
      files.each { |f| f.close }
    end
  end
  # Explicit block capture: block-less Proc.new fails on Ruby 3.0+.
  process_chain(&block) if block
  self
end
109
+
110
# Steps one entry back in the history; returns self.
#
# The newest history entry is dropped and the entry before it is turned
# into a response-like OpenStruct (uri, contenttype, content). When no
# earlier entry exists the response stays cleared and the block is not run.
def back(&block)
  require 'ostruct'
  cleanup
  @history.shift
  previous = @history.first
  return self if previous.nil?
  @response = OpenStruct.new(
    'uri' => previous[:location],
    'contenttype' => previous[:content_type],
    'content' => previous[:document]
  )
  @reserved = nil
  # Explicit block capture: block-less Proc.new fails on Ruby 3.0+.
  process_chain(&block) if block
  self
end
125
+
126
# URI of the current response, or nil when there is no response.
def uri
  @response.nil? ? nil : @response.uri
end
130
+
131
# Iterates over the reserved collection, yielding the browser only.
def each(&block)
  with_index = false
  iterate(with_index, &block)
end
134
+
135
# Iterates over the reserved collection, yielding the browser and the
# element index.
def each_with_index(&block)
  with_index = true
  iterate(with_index, &block)
end
138
+
139
# Parsed document of the current response, memoized in @document.
#
# Returns nil when there is no response, the response has no content, or
# its content type is rejected by is_xml_parsable?. The encoding is the
# configured @encoding or, failing that, the result of Encoding.guess
# (presumably Chainsaw's own helper, not visible in this file).
#
# As a side effect the href of the first <base> element (or '') is stored
# in the newest history entry.
def document
  return @document unless @document.nil?
  # Guard against "no response yet", matching the nil check in #uri.
  return nil if res.nil? || res.content.nil?
  return nil unless is_xml_parsable?(res.contenttype)
  enc = @encoding || Encoding.guess(res.content)
  @document = Nokogiri.parse(res.content, nil, enc)

  base_nodes = @document.xpath('//base')
  base = base_nodes.empty? ? '' : base_nodes[0]['href']
  newest = @history.first
  # History may be empty (e.g. max_history_count of 0); skip the update then.
  newest.update(:base => base.to_s) if newest

  @document
end
152
+
153
# Response body written to a Tempfile, memoized in @file.
#
# Returns nil when there is no response or the response has no content.
# The temp file name is based on the last path segment of the current URI.
# Tempfile.open's block form closes the handle when the block returns, so
# the returned Tempfile is closed; use its path (or reopen it) to read.
def file
  return @file unless @file.nil?
  # Guard against "no response yet", matching the nil check in #uri.
  return nil if @response.nil? || @response.content.nil?

  Tempfile.open(uri.to_s.split('/').last) do |f|
    f.binmode.write @response.content
    @file = f
  end

  @file
end
164
+
165
+ alias res response
166
+ alias doc document
167
+
168
+ private
169
+
170
# Runs the chained block and appends its value to results. A block that
# declares no parameters is called bare; any other block receives self.
def process_chain(&block)
  results.push(block.arity.zero? ? yield : yield(self))
end
177
+
178
# Sends one HTTP request through the engine and records it in the history.
#
# method:: :get or :post; unless @ignore_redirect is set, the "_r" variant
#          of the engine method is called (presumably the redirect-following
#          helper added in chainsaw/ext/httpclient; not visible here)
# uri::    must be a URI::HTTP, otherwise TypeError is raised
# query::  passed to the engine unchanged
#
# Returns the engine's response. Any error raised by the engine is
# re-raised as Chainsaw::RequestError.
def request(method, uri, query = nil)
  raise TypeError, "Invalid URI: #{qq uri.to_s}" unless uri.kind_of?(URI::HTTP)

  call = (method || :get).to_s
  call += '_r' unless @ignore_redirect

  # Build the headers for this request only. The automatic Referer used to
  # be stored in the shared @request_headers, where it was never refreshed
  # once present; a caller-supplied Referer still takes precedence.
  headers = (@request_headers || {}).dup
  previous = @history.first
  if !@hide_referer && previous && !headers.has_key?('Referer')
    headers['Referer'] = previous[:location]
  end

  @request_count += 1
  r = begin
    @engine.send(call, uri, query, headers)
  rescue
    raise(
      Chainsaw::RequestError,
      "Error occured during #{to_nth(@request_count)} request; url: #{qq uri.to_s}"
    )
  end

  # history
  set_history uri.to_s, r.contenttype, r.content
  r
end
198
+
199
# Walks the reserved collection: each element is reserved in turn, the
# block is called with self (and the index when with_index is true), and
# back is invoked after every element. Returns self. Warns and does
# nothing when prepare_next returns a falsy value.
def iterate(with_index, &block)
  items = prepare_next
  if items
    items.each_with_index do |item, idx|
      set_next(item)
      if with_index
        yield(self, idx)
      else
        yield(self)
      end
      back
    end
  else
    warn 'Iteration called without Enumerable.'
  end
  self
end
213
+
214
# Resolves the reserved object (or the given one) into something the next
# operation can use, clearing the per-page state first.
#
# String::    parsed as a URI and resolved again
# URI::       returned as is when it is a URI::HTTP; otherwise joined with
#             the newest history entry's :base (or its :location when the
#             base is empty) and resolved again
# Element::   its href, action or value attribute is resolved; for a form
#             the result is the pair [uri, element]
# Enumerable:: returned unchanged
#
# Raises TypeError for anything else, including a relative URI that
# cannot be resolved.
def prepare_next(reserved = nil)
  reserved ||= @reserved
  cleanup
  case reserved
  when String # expecting a url
    return prepare_next(URI.parse(reserved))
  when URI
    return reserved if reserved.is_a?(URI::HTTP)
    if @history.size > 0
      base = @history[0][:base] || ''
      base = @history[0][:location] if base.empty?
      l = URI.join base, reserved
      return prepare_next(l) if l.is_a?(URI::HTTP)
    end
  when Nokogiri::XML::Element
    url = reserved['href'] || reserved['action'] || reserved['value']
    return [prepare_next(url), reserved] if reserved.is_form?
    return prepare_next(url)
  when Enumerable
    return reserved
  end

  # Report the class of the local value: @reserved has already been reset
  # by cleanup above, so the message used to say NilClass every time.
  raise TypeError, "Unexpected value is set for next before #{to_nth(@request_count+1)} request: #{reserved.class.name}"
end
242
+
243
# Clears the per-page state: reserved target, parsed document, response
# and temp file.
def cleanup
  @file = nil
  @response = nil
  @document = nil
  @reserved = nil
end
246
+
247
# Puts a new entry at the front of the history and trims the history to
# @max_history_count entries. The document is stored only when
# is_xml_parsable? accepts the content type.
def set_history(location, content_type, document, base = nil)
  stored_document = is_xml_parsable?(content_type) ? document : nil
  @history.unshift({
    :location => location,
    :content_type => content_type,
    :document => stored_document,
    :base => base
  })
  overflow = @history.size - @max_history_count
  @history.slice!(@max_history_count, overflow)
end
258
+
259
+
260
+
261
+ end
262
+ end