autoweb 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,91 @@
1
+ # Welcome to Autoweb
2
+
3
+ 1. [Autoweb][homepage]能让你更好地分析HTML中的数据
4
+ 你可以先使用Autoweb提供的DSL,针对HTML建模, 然后抓取数据
5
+ 2. 它能把你写的抓取程序分享出来, 他人可以通过命令行或者web界面使用
6
+
7
+ ### 集成进来的工具
8
+ 1. baidu mp3 下载器
9
+
10
+ 使用举例:
11
+
12
+ 下载 齐秦的大约在冬季, 在命令行输入:
13
+
14
+ autoweb baidump3 "大约在冬季 齐秦" ~/Download/mp3
15
+
16
+ autoweb会自动搜索歌曲, 然后下载到指定的目录
17
+
18
+
19
+ ## Install Autoweb
20
+
21
+ 安装前需要安装以下工具:
22
+
23
+ * curl
24
+ * wget
25
+ * hpricot
26
+
27
+ 然后安装autoweb
28
+
29
+ gem install autoweb
30
+
31
+
32
+ ## Contributing
33
+
34
+ ### 页面建模
35
+ 用到了css3作为页面元素定位语法, 参照: [css3语法介绍][w3c-css3-selector]
36
+
37
+ Page.define "BaiduMp3" do |page|
38
+
39
+ # 搜索url模板
40
+ page.url_tpl = 'http://mp3.baidu.com/m?f=3&rf=idx&tn=baidump3&ct=134217728&lf=&rn=&word=#{word}&lm=-1&oq=go&rsp=1'
41
+
42
+ page.subs "result", "#Tbs tr" do |sub| # 定义名为"result"的"sub page"
43
+
44
+ sub.ele "music", "td:nth(1) a" #音乐链接
45
+ sub.ele "artist", "td:nth(2) a" #演唱者
46
+ sub.ele "album", "td:nth(3) a" #专辑
47
+ sub.ele "lyrics", "td:nth(5) a" #歌词
48
+ sub.ele "size", "td:nth(7)" #文件大小
49
+ sub.ele "format", "td:nth(8)" #文件格式
50
+ end
51
+ end
52
+
53
+ 使用页面对象:
54
+
55
+ page = Page.pages["BaiduMp3"].parse(:word=>"大约在冬季")
56
+ first_mp3 = page["result"][1]
57
+ link = first_mp3["music"]
58
+ puts link[:href]
59
+
60
+ 更多实际代码, 参考 [commands/baidump3.rb][baidump3-code]
61
+
62
+ ### 新建一个命令(和baidump3类似)
63
+
64
+ 将以下代码放到autoweb/commands/helloworld.rb下
65
+
66
+ module Autoweb::Command
67
+ class HelloWorld < Base
68
+ def index
69
+ display "hello world!"
70
+ end
71
+ end
72
+ end
73
+
74
+ 直接运行 autoweb helloworld 即可
75
+
76
+ 更多实际代码, 参考 [commands/help.rb][help-code] 和 [commands/baidump3.rb][baidump3-code]
77
+
78
+ ### 将代码提交到[autoweb][homepage]
79
+
80
+ 请直接fork github上的autoweb, 提交ticket以及pull request即可
81
+
82
+ ## License
83
+
84
+ Autoweb is released under the MIT license.
85
+
86
+ [homepage]:http://dazuiba.github.com/autoweb
87
+ [w3c-css3-selector]:http://www.w3.org/TR/css3-selectors/
88
+ [baidump3-code]:http://github.com/dazuiba/autoweb/blob/master/commands/baidump3.rb
89
+ [help-code]:http://github.com/dazuiba/autoweb/blob/master/commands/help.rb
90
+
91
+
@@ -0,0 +1,22 @@
1
#!/usr/bin/env ruby
# Command-line entry point: loads the library, then dispatches the first
# argument as a command name with the remaining arguments.
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')

# Prints msg to STDOUT without a trailing newline and flushes immediately,
# so the "loading..." progress dots appear while the requires run.
def display(msg)
  STDOUT.print msg
  STDOUT.flush
end

display "loading"
require 'rubygems'
display "..."
require 'autoweb/command'
display "...\n"

at_exit { display "\n" }

args = ARGV.dup
# Emptied so that any later Kernel#gets call reads standard input rather than
# files named by the command-line arguments.
ARGV.clear

# A missing or blank first argument falls back to the help command. This
# replaces an inline `rescue 'help'`, which also swallowed unrelated errors.
command = args.shift.to_s.strip
command = 'help' if command.empty?

Autoweb::Command.run(command, args)
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env ruby
2
+ require 'autoweb/page'
3
+ require 'iconv'
4
+
5
+ include Autoweb
6
+
7
# Page model for the Baidu MP3 search-result listing.
Page.define "BaiduMp3" do |page|
  # Search URL template; the #{word} placeholder is left uninterpolated here
  # (single quotes) and is filled in later from the values given to the page.
  page.url_tpl = 'http://mp3.baidu.com/m?f=3&rf=idx&tn=baidump3&ct=134217728&lf=&rn=&word=#{word}&lm=-1&oq=go&rsp=1'

  # Element name and CSS selector pairs, each relative to one "#Tbs tr" row.
  columns = [
    ["music",  "td:nth(1) a"],
    ["artist", "td:nth(2) a"],
    ["album",  "td:nth(3) a"],
    ["lyrics", "td:nth(5) a"],
    ["size",   "td:nth(7)"],
    ["format", "td:nth(8)"]
  ]

  # "result" is a repeating sub-page: one entry per matching table row.
  page.subs "result", "#Tbs tr" do |row|
    columns.each { |name, css| row.ele name, css }
  end
end
19
+
20
module Autoweb
  module Command

    # `autoweb baidump3 MUSIC_NAME [SAVE_TO_DIR]`
    #
    # Searches the "BaiduMp3" page model for MUSIC_NAME and, after the user
    # confirms, downloads one hit with wget into SAVE_TO_DIR (or the current
    # directory when no directory is given).
    class Baidump3 < Autoweb::Command::Base
      MEGABYTE = 1024.0 * 1024.0
      attr_accessor :dest_dir, :word

      # Entry point of the command. Reads the search word and the optional
      # destination directory from args, validates them and starts the search.
      def index
        @word = args[0]
        @dest_dir = args[1]

        return usage if @word.nil?

        if @dest_dir
          if File.directory?(dest_dir)
            # Trailing slash so the file name can simply be appended.
            @dest_dir = File.expand_path(dest_dir) + "/"
          else
            error "#{@dest_dir} is not directory"
          end
        end

        search
      end

      # Runs the search, resolves the real mp3 address of one hit and asks
      # the user whether to download it. Aborts through `error` when the
      # result page contains nothing usable (previously a NoMethodError on nil).
      def search
        display("searching...")
        page = Page.pages["BaiduMp3"].parse(:word => word)

        # Index 1 is the second matched row; presumably row 0 is the table
        # header. TODO confirm against the live page.
        result = page["result"][1]
        error "no result found for #{word}" if result.nil?

        music_link = result["music"]
        error "no music link found for #{word}" if music_link.nil?
        music_url = music_link[:href]

        # NOTE(review): Kernel#open on URLs and URI.encode are the legacy
        # open-uri/uri APIs this gem was written against; both were removed
        # in Ruby 3.0.
        encoded = open(URI.encode(music_url)).read[/var encurl = "([^"]*)"/, 1]
        error "could not find the mp3 address for #{word}" if encoded.nil? || encoded.empty?
        mp3url = decode(encoded)

        display2("ok, parsing mp3...")
        #size = `curl -I #{mp3url} 2>/dev/null`[/Content-Length:\ (\d+)/,1]
        #display2(", size: %.2fM. " % (Integer(size)/MEGABYTE))
        confirm("sure to download?") do
          download_mp3(mp3url, word, result["format"].innerText)
        end
      end

      # Prints how to invoke this command.
      def usage
        display "usage: autoweb baidump3 MUSIC_NAME [SAVE_TO_DIR]"
      end

      # Downloads url with wget into "<dest_dir><word>.<format>", where the
      # characters "+", "|", " " and "_" in word are replaced by "-".
      #
      # The arguments are passed to wget as a list, so the remote URL and the
      # user-supplied word are never interpreted by a shell.
      #
      # Returns what Kernel#system returns: true on success, false on a
      # non-zero exit status, nil when wget could not be started.
      def download_mp3(url, word, format)
        file_name = "#{word.gsub(/[\+|\ |_]/, "-")}.#{format.to_s.strip}"
        system("wget", url, "-O", "#{dest_dir}#{file_name}")
      end

      # Decodes the obfuscated address found in the page script.
      #
      # It is a substitution over the alphabet 0-9A-Za-z: the alphabet rotated
      # to start at the first character of s is mapped onto the alphabet
      # rotated to start at "h" when s looks like "xxxx:/" (http), otherwise
      # onto the one starting at "f" (ftp).
      def decode(s)
        s.tr(_mktab(s[0].chr), s =~ /....:\// ? _mktab('h') : _mktab('f')) #http|ftp
      end

      # Returns the alphabet 0-9A-Za-z rotated so that it starts at x.
      # When x is not part of the alphabet the alphabet is returned unchanged.
      def _mktab(x)
        t0 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        head, pivot, tail = t0.partition(x)
        pivot + tail + head
      end

    end
  end
end
@@ -0,0 +1,16 @@
1
module Autoweb::Command
  # `autoweb help`: prints the list of available commands.
  class Help < Base
    # Default action: show the usage text.
    def index
      display(usage)
    end

    # Returns the command list as a multi-line string.
    def usage
      text = <<-EOTXT
=== Command List:
autoweb baidump3 $music-name $save-to-dir


      EOTXT
      text
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'hpricot'
2
+ require 'open-uri'
3
+ require 'ostruct'
4
+ require 'active_support'
5
+ require "autoweb/core.rb"
@@ -0,0 +1,77 @@
1
+ require 'autoweb/ui'
2
module Autoweb
  module Command
    # Parent class of every command. A command receives the remaining
    # command-line arguments and exposes its actions as public methods;
    # `index` is the action used when none is named.
    class Base
      include Autoweb::UI
      attr_accessor :args

      def initialize(args)
        @args = args
      end

      # Default usage text.
      # NOTE(review): the text lists "automan" commands, which looks like a
      # leftover from another project; confirm the intended list before
      # changing the string.
      def usage
        <<-EOTXT
=== Command List:
automan console
automan dbconsole
automan help
automan update


        EOTXT
      end

    end

    class InvalidCommand < RuntimeError; end
    class CommandFailed < RuntimeError; end

    class << self

      include Autoweb::UI

      # Runs `command` ("name" or "name:action") with a copy of args.
      # Unknown commands, CommandFailed and Ctrl-C are reported through
      # `error`, which prints the message and exits with status 1.
      # `retries` is accepted for compatibility and is not used.
      def run(command, args, retries = 0)
        run_internal(command, args.dup)
      rescue InvalidCommand
        error "Unknown command. Run 'autoweb help' for usage information."
      rescue CommandFailed => e
        error e.message
      rescue Interrupt
        error "\n[canceled]"
      end

      # Instantiates the command class and invokes the requested action.
      # Raises InvalidCommand when the class has no such public method.
      def run_internal(command, args)
        klass, method = parse(command)
        runner = klass.new(args)
        raise InvalidCommand unless runner.respond_to?(method)
        runner.send(method)
      end

      # Splits "name" or "name:action" into [command class, action].
      # The action defaults to :index. Raises InvalidCommand for anything
      # that does not resolve to a command class.
      def parse(command)
        parts = command.split(':')
        case parts.size
        when 1
          return command_class(parts[0]), :index
        when 2
          return command_class(parts[0]), parts[1]
        else
          raise InvalidCommand
        end
      end

      private

      # Looks up the command class for a user-supplied name.
      #
      # Uses const_get instead of eval, so the name is never executed as
      # code, and maps every lookup failure to InvalidCommand. The Base
      # check rejects constants found outside this namespace (for example
      # "object" resolving to ::Object).
      def command_class(name)
        klass = Autoweb::Command.const_get(name.capitalize)
        raise InvalidCommand unless klass.is_a?(Class) && klass < Base
        klass
      rescue NameError
        raise InvalidCommand
      end
    end
  end
end
72
+
73
# Require every Ruby file in the commands/ directory two levels above this
# file, except files whose name ends in "_helper.rb".
commands_glob = "#{File.dirname(__FILE__)}/../../commands/*.rb"
Dir[commands_glob].each do |path|
  next if /_helper\.rb$/ =~ path
  require path
end
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env ruby
2
+ require "ostruct"
3
+ require "open-uri"
4
+ require "hpricot"
5
module Autoweb
  # DSL shared by Page and SubPage for declaring named sub-pages and
  # elements. Includers must initialise @sub_pages and @elements as hashes.
  module Container
    attr_reader :name, :sub_pages, :elements

    # Declares a repeating sub-page: reading it yields one entry per node
    # matching sub_css.
    def subs(name, sub_css, &block)
      def_sub(name, sub_css, true, &block)
    end

    # Declares a single sub-page located by sub_css.
    def sub(name, sub_css, &block)
      def_sub(name, sub_css, false, &block)
    end

    # Declares an element located by css.
    def ele(name, css)
      @elements[name] = Element.new(self, name, css)
    end

    private

    # Builds the SubPage, lets the optional block configure it, then
    # registers it under name.
    def def_sub(name, sub_css, is_array, &block)
      sub = SubPage.new(self, name, sub_css, is_array)
      yield sub if block_given?
      @sub_pages[name] = sub
    end
  end

  # A named region of a page, itself a Container of elements and sub-pages.
  class SubPage
    attr_reader :parent, :css, :is_array
    include Container

    def initialize(parent, name, css, is_array = false)
      @parent = parent
      @css = css
      @name = name
      @sub_pages = {}
      @elements = {}
      @is_array = is_array
    end
  end

  # A named CSS selector belonging to a Page or SubPage.
  class Element
    attr_reader :parent, :name, :css

    def initialize(parent, name, css)
      @parent = parent
      @name = name
      @css = css
    end
  end

  # A page model: a URL template plus the declared sub-pages and elements.
  class Page
    class << self
      # Registry of defined pages, keyed by name.
      def pages
        @pages ||= {}
      end
    end

    attr_accessor :name, :url_tpl
    include Container

    def initialize(name)
      @name = name
      @sub_pages = {}
      @elements = {}
    end

    # Creates a page, yields it for configuration, registers it in
    # Page.pages under name and returns it.
    def self.define(name, &block)
      page = new(name)
      yield page
      pages[name] = page
      page
    end

    # Returns a Parser for this page. hash is either a complete URL string
    # or a Hash of values for the URL template.
    def parse(hash)
      Parser.new(self, hash).go
    end

    # Expands url_tpl with the given values: each #{name} placeholder is
    # replaced by locals[name].
    #
    # NOTE(review): the template is evaluated as a Ruby string literal, so
    # it must come from trusted code and never from user input.
    def url(locals)
      context = OpenStruct.new(locals.merge(:url_tpl => url_tpl))
      context.instance_eval do
        eval %Q{"#{url_tpl.gsub(/"/, '\"')}"}
      end
    end
  end

  # Fetches the document for a page lazily and exposes its parsed content.
  class Parser
    attr_accessor :page, :url

    def initialize(page, hash)
      @page = page
      @url = hash.is_a?(String) ? hash : page.url(hash)
    end

    # Forces the document to be loaded and returns self.
    def go
      content_parser
      self
    end

    # Looks up a declared sub-page or element by name.
    def [](key)
      content_parser[key]
    end

    # Memoised ContentParser for the fetched document.
    def content_parser
      @content_parser ||= ContentParser.new(doc, page)
    end
    # The misspelled original name is kept for existing callers.
    alias_method :contaent_parser, :content_parser

    # Memoised Hpricot document for url.
    # NOTE(review): Kernel#open on URLs and URI.encode are the legacy
    # open-uri/uri APIs; both were removed in Ruby 3.0.
    def doc
      @doc ||= Hpricot(open(URI.encode(url)))
    end

  end

  # Resolves the names declared on a Page/SubPage against a document node.
  class ContentParser
    attr_accessor :doc, :page

    def initialize(doc, page)
      @doc = doc
      @page = page
    end

    # Returns, for key:
    # * an Array of ContentParser, one per match, for a `subs` sub-page;
    # * one ContentParser for a `sub` sub-page;
    # * the node returned by doc.at for an element.
    # Raises RuntimeError when key was never declared.
    def [](key)
      if r = page.sub_pages[key]
        if r.is_array
          doc.search(r.css).map { |node| self.class.new(node, r) }
        else
          # Was `self.new(doc.at(page.css), r)`: instances have no `new`
          # and the selector belongs to the sub-page, not its parent.
          self.class.new(doc.at(r.css), r)
        end
      elsif r = page.elements[key]
        doc.at(r.css)
      else
        raise "key not found"
      end
    end

    # Anything else is forwarded to the wrapped document node.
    def method_missing(key, *args, &block)
      if doc.respond_to?(key)
        doc.send key, *args, &block
      else
        super
      end
    end

    # Keeps respond_to? consistent with the delegation in method_missing.
    def respond_to_missing?(key, include_private = false)
      doc.respond_to?(key, include_private) || super
    end
  end
end
@@ -0,0 +1,89 @@
1
require 'time'      # Time.parse, used by format_date
require 'fileutils' # FileUtils.cd, used by shell

module Autoweb
  # Console helpers mixed into commands: output, prompts and shelling out.
  module UI
    # Raised inside an ask_loop block to ask the same question again.
    class RetryError < RuntimeError
    end

    # One line typed by the user, with helpers to interpret it.
    class Input < String
      include UI

      # Handles every answer that is not "yes": "q" exits the program with
      # status 0, "h" or "?" prints options[:help] (or "no help"), and
      # everything except "q" ends by raising RetryError.
      def process_default(options = {})
        if is?("q")
          exit(0)
        elsif is?("h") || is?("?")
          display options[:help] || "no help"
          raise RetryError
        else
          raise RetryError
        end
      end

      def yes?
        is?("y")
      end

      # Case- and whitespace-insensitive comparison; returns nil for nil.
      def is?(str)
        return if str.nil?
        downcase.strip == str.downcase.strip
      end
    end

    # Prints msg to STDOUT with every underscore replaced by a space,
    # followed by a newline unless new_line is false, then flushes.
    def display(msg, new_line = true)
      msg = msg.to_s.gsub(/_/) { ' ' }
      if new_line
        STDOUT.puts msg
      else
        STDOUT.print msg
      end
      STDOUT.flush
    end

    # Prints msg and exits the program with status 1.
    def error(msg = "error")
      display msg
      exit(1)
    end

    # display without the trailing newline.
    def display2(msg)
      display(msg, false)
    end

    # Asks a y/q/h question and runs the given block once the user answers
    # "y". "q" exits, "h"/"?" prints options[:help], and any other answer
    # asks again. Returns true after the block has run.
    #
    # The prompt is built in a new string, so the caller's message is not
    # modified, and no further input is read after the answer. The previous
    # implementation appended to message and waited for one more line.
    def confirm(message = nil, options = {})
      message = "Are you sure you wish to continue?" if message.nil?
      prompt = "#{message}(y/q/h)"

      ask_loop(prompt) do |input|
        if input.yes?
          yield if block_given?
        else
          input.process_default(:help => options[:help])
        end
      end
      true
    end

    # Formats a Time, or a String parseable by Time.parse, as
    # "YYYY-MM-DD HH:MM ZONE".
    def format_date(date)
      date = Time.parse(date) if date.is_a?(String)
      date.strftime("%Y-%m-%d %H:%M %Z")
    end

    # Prints message, reads a line and yields it as an Input. The question
    # is repeated for as long as the block raises RetryError.
    def ask_loop(message, &block)
      prompt = "#{message} "
      display2 prompt
      begin
        yield Input.new(ask)
      rescue RetryError
        display2 prompt
        retry
      end
    end

    # Reads one line from standard input and strips it.
    #
    # Reads $stdin directly, because Kernel#gets would read from files named
    # in ARGV when it is not empty. Raises EOFError when the input has
    # ended, instead of failing with NoMethodError on nil.
    def ask
      line = $stdin.gets
      raise EOFError, "end of input reached while waiting for an answer" if line.nil?
      line.strip
    end

    # Runs cmd through the shell in the current directory and returns its
    # standard output.
    def shell(cmd)
      FileUtils.cd(Dir.pwd) { |_dir| return `#{cmd}` }
    end
  end
end
@@ -0,0 +1,3 @@
1
module Autoweb
  # Version of the autoweb gem. Frozen so the constant cannot be mutated.
  VERSION = '0.0.2'.freeze
end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: autoweb
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 2
9
+ version: 0.0.2
10
+ platform: ruby
11
+ authors:
12
+ - dazuiba
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-08-07 00:00:00 +08:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Automate the Internet. baidu music downloader
22
+ email: come2u@gmail.com
23
+ executables:
24
+ - autoweb
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - README.md
31
+ - bin/autoweb
32
+ - lib/autoweb/command.rb
33
+ - lib/autoweb/page.rb
34
+ - lib/autoweb/ui.rb
35
+ - lib/autoweb/version.rb
36
+ - lib/autoweb.rb
37
+ - commands/baidump3.rb
38
+ - commands/help.rb
39
+ has_rdoc: true
40
+ homepage: http://github.com/dazuiba/autoweb.git
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ segments:
53
+ - 1
54
+ - 8
55
+ - 7
56
+ version: 1.8.7
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ requirements: []
65
+
66
+ rubyforge_project:
67
+ rubygems_version: 1.3.6
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Gem for the rest
71
+ test_files: []
72
+