autoweb 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ # Welcome to Autoweb
2
+
3
+ 1. [Autoweb][homepage]能让你更好地分析HTML中的数据
4
+ 你可以先使用Autoweb提供的DSL,针对HTML建模, 然后抓取数据
5
+ 2. 它能把你写的抓取程序分享出来, 他人可以通过命令行或者web界面使用
6
+
7
+ ### 集成进来的工具
8
+ 1. baidu mp3 下载器
9
+
10
+ 使用举例:
11
+
12
+ 下载 齐秦的大约在冬季, 在命令行输入:
13
+
14
+ autoweb baidump3 "大约在冬季 齐秦" ~/Download/mp3
15
+
16
+ autoweb会自动搜索歌曲, 然后下载到指定的目录
17
+
18
+
19
+ ## Install Autoweb
20
+
21
+ 安装前需要安装以下工具:
22
+
23
+ * curl
24
+ * wget
25
+ * hpricot
26
+
27
+ 然后安装autoweb
28
+
29
+ gem install autoweb
30
+
31
+
32
+ ## Contributing
33
+
34
+ ### 页面建模
35
+ 用到了css3作为页面元素定位语法, 参照: [css3语法介绍][w3c-css3-selector]
36
+
37
+ Page.define "BaiduMp3" do |page|
38
+
39
+ # 搜索url模板
40
+ page.url_tpl = 'http://mp3.baidu.com/m?f=3&rf=idx&tn=baidump3&ct=134217728&lf=&rn=&word=#{word}&lm=-1&oq=go&rsp=1'
41
+
42
+ page.subs "result", "#Tbs tr" do |sub| # 定义名为"result"的"sub page"
43
+
44
+ sub.ele "music", "td:nth(1) a" #音乐链接
45
+ sub.ele "artist", "td:nth(2) a" #演唱者
46
+ sub.ele "album", "td:nth(3) a" #专辑
47
+ sub.ele "lyrics", "td:nth(5) a" #歌词
48
+ sub.ele "size", "td:nth(7)" #文件大小
49
+ sub.ele "format", "td:nth(8)" #文件格式
50
+ end
51
+ end
52
+
53
+ 使用页面对象:
54
+
55
+ page = Page.pages["BaiduMp3"].parse(:word=>"大约在冬季")
56
+ first_mp3 = page["result"][1]
57
+ link = first_mp3["music"]
58
+ puts link[:href]
59
+
60
+ 更多实际代码, 参考 [commands/baidump3.rb][baidump3-code]
61
+
62
+ ### 新建一个命令(和baidump3类似)
63
+
64
+ 将以下代码放到autoweb/commands/helloworld.rb下
65
+
66
+ module Autoweb::Command
67
+ class HelloWorld < Base
68
+ def index
69
+ display "hello world!"
70
+ end
71
+ end
72
+ end
73
+
74
+ 直接运行 autoweb helloworld 即可
75
+
76
+ 更多实际代码, 参考 [commands/help.rb][help-code] 和 [commands/baidump3.rb][baidump3-code]
77
+
78
+ ### 将代码提交到[autoweb][homepage]
79
+
80
+ 请直接fork github上的autoweb, 提交ticket以及pull request即可
81
+
82
+ ## License
83
+
84
+ Autoweb is released under the MIT license.
85
+
86
+ [homepage]:http://dazuiba.github.com/autoweb
87
+ [w3c-css3-selector]:http://www.w3.org/TR/css3-selectors/
88
+ [baidump3-code]:http://github.com/dazuiba/autoweb/blob/master/commands/baidump3.rb
89
+ [help-code]:http://github.com/dazuiba/autoweb/blob/master/commands/help.rb
90
+
91
+
@@ -0,0 +1,22 @@
1
#!/usr/bin/env ruby
# Command-line entry point for autoweb.
#
# Usage: autoweb COMMAND [ARGS...]
# The first argument selects the command (defaults to "help" when absent);
# the remaining arguments are handed to the command unchanged.
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')

# Writes +msg+ to standard output without appending a newline and flushes
# immediately, so the "loading..." progress dots show up while the libraries
# are still being required.
def display(msg)
  STDOUT.print msg
  STDOUT.flush
end

display "loading"
require 'rubygems'
display "..."
require 'autoweb/command'
display "...\n"

# Always finish the terminal output with a newline, whatever the exit path.
at_exit { display "\n" }

args = ARGV.dup
# ARGV is emptied before any command runs.
# NOTE(review): presumably so that Kernel#gets (used for interactive prompts)
# reads from stdin instead of treating the arguments as file names -- confirm.
ARGV.clear

# Fall back to "help" only when no command was given. The previous
# `args.shift.strip rescue 'help'` silently swallowed every StandardError.
command = (args.shift || 'help').strip

Autoweb::Command.run(command, args)
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env ruby
2
+ require 'autoweb/page'
3
+ require 'iconv'
4
+
5
+ include Autoweb
6
+
7
# Page model for the Baidu MP3 search result page.
Page.define "BaiduMp3" do |page|
  # Search URL template. It is single-quoted on purpose: #{word} is not
  # interpolated here but filled in later, when the page URL is built.
  page.url_tpl = 'http://mp3.baidu.com/m?f=3&rf=idx&tn=baidump3&ct=134217728&lf=&rn=&word=#{word}&lm=-1&oq=go&rsp=1'

  # Element name => CSS selector, relative to one result row.
  columns = [
    ["music",  "td:nth(1) a"], # link to the song
    ["artist", "td:nth(2) a"], # performer
    ["album",  "td:nth(3) a"], # album
    ["lyrics", "td:nth(5) a"], # lyrics
    ["size",   "td:nth(7)"],   # file size
    ["format", "td:nth(8)"]    # file format
  ]

  # Each table row matched by "#Tbs tr" becomes one entry of "result".
  page.subs "result", "#Tbs tr" do |row|
    columns.each { |name, css| row.ele name, css }
  end
end
19
+
20
module Autoweb
  module Command

    # `autoweb baidump3 MUSIC_NAME [SAVE_TO_DIR]`
    #
    # Searches the "BaiduMp3" page model for MUSIC_NAME, decodes the download
    # URL embedded in the song page and, after confirmation, downloads the
    # file with wget into SAVE_TO_DIR (current directory when omitted).
    class Baidump3 < Autoweb::Command::Base
      MEGABYTE = 1024.0 * 1024.0
      attr_accessor :dest_dir, :word

      # Command entry point: validates the arguments, then runs the search.
      def index
        @word = args[0]
        @dest_dir = args[1]

        return usage if @word.nil?

        if @dest_dir
          if File.directory?(@dest_dir)
            # Trailing slash so the file name can simply be appended.
            @dest_dir = File.expand_path(@dest_dir) + "/"
          else
            error "#{@dest_dir} is not directory"
          end
        end

        search
      end

      # Looks the song up, resolves its real URL and offers to download it.
      # Every step that depends on the remote page layout is checked, so a
      # changed or empty result page yields a message instead of a
      # NoMethodError on nil.
      def search
        display("searching...")
        page = Page.pages["BaiduMp3"].parse(:word => word)

        # Index 1 is used, not 0.
        # NOTE(review): presumably row 0 is the table header -- confirm.
        result = page["result"][1]
        return error("no result found for #{word}") if result.nil?

        link = result["music"]
        return error("no music link found for #{word}") if link.nil?

        # Block form closes the HTTP handle as soon as the body is read.
        body = open(URI.encode(link[:href])) { |io| io.read }
        encoded = body[/var encurl = "([^"]*)"/, 1]
        return error("can not find the mp3 url for #{word}") if encoded.nil? || encoded.empty?

        mp3url = decode(encoded)
        display2("ok, parsing mp3...")
        confirm("sure to download?") do
          download_mp3(mp3url, word, result["format"].innerText)
        end
      end

      # Prints the command synopsis (same notation as the help command).
      def usage
        display 'usage: autoweb baidump3 $music-name [$save-to-dir]'
      end

      # Downloads +url+ with wget to "<dest_dir><word>.<format>".
      #
      # The URL comes from a remote page and +word+ from the command line, so
      # wget is invoked with an argument vector (no shell involved) instead of
      # an interpolated backtick string; "--" keeps a URL that starts with "-"
      # from being read as an option.
      #
      # Returns what Kernel#system returns (true on success).
      def download_mp3(url, word, format)
        file_name = "#{word.gsub(/[+| _]/, '-')}.#{format.to_s.strip}"
        system("wget", "-O", "#{dest_dir}#{file_name}", "--", url)
      end

      # Undoes the substitution cipher applied to the download URL: the
      # alphabet rotated to start at the first character of +s+ is mapped onto
      # the alphabet rotated to start at 'h' (when four characters precede
      # ":/", i.e. http) or at 'f' (ftp).
      def decode(s)
        target = s =~ /....:\// ? _mktab('h') : _mktab('f')
        s.tr(_mktab(s[0].chr), target)
      end

      # Returns the alphabet 0-9A-Za-z rotated so that it starts at +x+
      # (unrotated when +x+ is not part of the alphabet).
      def _mktab(x)
        t0 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        head, pivot, tail = t0.partition(x)
        pivot + tail + head
      end

    end
  end
end
@@ -0,0 +1,16 @@
1
module Autoweb::Command
  # `autoweb help`: prints the list of available commands.
  class Help < Base
    # Text shown by the help command.
    def usage
      <<-EOTXT
=== Command List:
autoweb baidump3 $music-name $save-to-dir


      EOTXT
    end

    # Command entry point.
    def index
      display(usage)
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'hpricot'
2
+ require 'open-uri'
3
+ require 'ostruct'
4
+ require 'active_support'
5
+ require "autoweb/core.rb"
@@ -0,0 +1,77 @@
1
+ require 'autoweb/ui'
2
+ module Autoweb
3
+ module Command
4
# Common superclass of all commands. A command is instantiated with the
# remaining command-line arguments and one of its public methods (by default
# +index+) is invoked by the dispatcher.
class Base
  include Autoweb::UI
  attr_accessor :args

  # args - Array of command-line arguments that follow the command name.
  def initialize(args)
    @args = args
  end

  # Default usage text; subclasses may override it.
  #
  # The previous text listed "automan console/dbconsole/update", commands of
  # a different tool that do not exist in this gem.
  def usage
    <<-EOTXT
=== Command List:
autoweb baidump3 $music-name $save-to-dir
autoweb help


    EOTXT
  end

end
25
# Raised when the requested command (or one of its actions) does not exist.
class InvalidCommand < RuntimeError
end

# Raised by a command to abort with a message that is shown to the user.
class CommandFailed < RuntimeError
end
27
+
28
class << self

  include Autoweb::UI

  # Runs +command+ with +args+ and turns the known failure modes into a
  # message followed by a non-zero exit (via +error+).
  #
  # command - String, either "name" or "name:action".
  # args    - Array of remaining command-line arguments (copied before use).
  # retries - unused; kept so existing callers keep working.
  def run(command, args, retries=0)
    run_internal(command, args.dup)
  rescue InvalidCommand
    error "Unknown command. Run 'autoweb help' for usage information."
  rescue CommandFailed => e
    error e.message
  rescue Interrupt
    error "\n[canceled]"
  end

  # Instantiates the command class and invokes the requested action.
  # Raises InvalidCommand when the command or the action does not exist.
  def run_internal(command, args)
    klass, method = parse(command)
    runner = klass.new(args)
    raise InvalidCommand unless runner.respond_to?(method)
    runner.send(method)
  end

  # Splits +command+ into [command class, action].
  #
  #   "help"      -> [Autoweb::Command::Help, :index]
  #   "help:more" -> [Autoweb::Command::Help, "more"]
  #
  # Raises InvalidCommand for anything else, including unknown names.
  def parse(command)
    parts = command.split(':')
    case parts.size
    when 1 then [command_class(parts[0]), :index]
    when 2 then [command_class(parts[0]), parts[1]]
    else raise InvalidCommand
    end
  end

  private

  # Resolves a command name to its class.
  #
  # The name comes straight from the command line, so it is looked up with
  # const_get rather than eval (which would execute arbitrary Ruby), and
  # only real subclasses of Base are accepted: const_get may also find
  # unrelated constants such as String.
  def command_class(name)
    klass = Autoweb::Command.const_get(name.capitalize)
    raise InvalidCommand unless klass.is_a?(Class) && klass < Autoweb::Command::Base
    klass
  rescue NameError
    # Unknown constant or a string that is not a valid constant name.
    raise InvalidCommand
  end
end
70
+ end
71
+ end
72
+
73
# Load every command file shipped in the top-level commands/ directory,
# skipping files whose name ends in "_helper.rb".
command_files = Dir["#{File.dirname(__FILE__)}/../../commands/*.rb"]
command_files.reject { |path| /_helper\.rb$/ =~ path }.each do |path|
  require path
end
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env ruby
2
+ require "ostruct"
3
+ require "open-uri"
4
+ require "hpricot"
5
+ module Autoweb
6
# DSL shared by Page and SubPage for declaring nested sub pages and
# elements. The including class must initialise @sub_pages and @elements
# (both hashes keyed by name).
module Container
  attr_reader :name, :sub_pages, :elements

  # Declares an element +name+ located by the CSS selector +css+.
  # Returns the new Element.
  def ele(name, css)
    @elements[name] = Element.new(self, name, css)
  end

  # Declares a single sub page located by +sub_css+; the block receives the
  # new SubPage so that its own elements can be declared.
  def sub(name, sub_css, &block)
    def_sub(name, sub_css, false, &block)
  end

  # Same as #sub, but the sub page is flagged as an array (is_array = true).
  def subs(name, sub_css, &block)
    def_sub(name, sub_css, true, &block)
  end

  private

  # Builds the SubPage, lets the caller's block configure it, then registers
  # it under +name+. Returns the SubPage.
  def def_sub(name, sub_css, is_array, &block)
    child = SubPage.new(self, name, sub_css, is_array)
    yield child
    @sub_pages.store(name, child)
  end
end
29
+
30
# A region of a page located by a CSS selector. It can hold its own
# elements and nested sub pages (see Container).
class SubPage
  include Container
  attr_reader :parent, :css, :is_array

  # parent   - the Page or SubPage this sub page was declared in.
  # name     - key under which the parent registers it.
  # css      - CSS selector locating the region.
  # is_array - true when the selector is meant to match several nodes.
  def initialize(parent, name, css, is_array=false)
    @parent, @name, @css = parent, name, css
    @is_array = is_array
    @elements = {}
    @sub_pages = {}
  end
end
42
+
43
# A single node of interest inside a page or sub page, located by a CSS
# selector.
class Element
  attr_reader :parent, :name, :css

  # parent - the container the element was declared in.
  # name   - key under which the container registers it.
  # css    - CSS selector locating the node.
  def initialize(parent, name, css)
    @parent, @name, @css = parent, name, css
  end
end
51
+
52
# Root of a page model: a named URL template plus the sub pages and
# elements declared through the Container DSL.
class Page
  include Container
  attr_accessor :name, :url_tpl

  # Registry of all defined pages, keyed by name.
  def self.pages
    @pages ||= {}
  end

  # Creates a page called +name+, yields it for configuration, registers it
  # in Page.pages and returns it.
  def self.define(name, &block)
    definition = new(name)
    yield definition
    pages[name] = definition
  end

  def initialize(name)
    @name = name
    @elements = {}
    @sub_pages = {}
  end

  # Builds a Parser for this page and runs it. +hash+ is either the
  # template variables or a ready-made URL string.
  def parse(hash)
    Parser.new(self, hash).go
  end

  # Expands the URL template with +locals+ (a Hash of template variables).
  #
  # The template is evaluated as a double-quoted Ruby string with the locals
  # available as methods, so every #{...} expression in it is executed as
  # Ruby code; only use templates from trusted sources.
  def url(locals)
    scope = OpenStruct.new(locals.merge(:url_tpl => url_tpl))
    scope.instance_eval { eval %Q{"#{url_tpl.gsub(/"/, '\"')}"} }
  end
end
84
+
85
# Fetches the document behind a Page and exposes its sub pages and elements
# through #[].
class Parser
  attr_accessor :page, :url

  # page - the Page model describing the document.
  # hash - either a complete URL (String) or a Hash of variables for the
  #        page's URL template.
  def initialize(page, hash)
    @page = page
    @url = hash.is_a?(String) ? hash : page.url(hash)
  end

  # Fetches and parses the document right away. Returns self so that calls
  # can be chained (`page.parse(...)["result"]`).
  def go
    content_parser
    self
  end

  # Looks up the sub page or element registered under +key+.
  def [](key)
    content_parser[key]
  end

  # Memoized ContentParser over the fetched document.
  def content_parser
    @content_parser ||= ContentParser.new(doc, page)
  end
  # The method used to be published under this misspelt name; keep it
  # working for existing callers.
  alias_method :contaent_parser, :content_parser

  # Memoized parsed document. The block form of open closes the HTTP handle
  # once the document has been parsed.
  def doc
    @doc ||= open(URI.encode(url)) { |io| Hpricot(io) }
  end

end
114
+
115
# Pairs a parsed document node with the model (Page or SubPage) describing
# it and resolves declared names to nodes.
class ContentParser
  attr_accessor :doc, :page

  # doc  - document node; must respond to +search+ and +at+ with a CSS
  #        selector.
  # page - the Page or SubPage whose declarations apply to +doc+.
  def initialize(doc, page)
    @doc = doc
    @page = page
  end

  # Resolves +key+:
  # * array sub page  -> Array of ContentParser, one per matching node
  # * single sub page -> ContentParser for the first matching node
  # * element         -> the first node matching the element's selector
  #
  # Raises RuntimeError when +key+ was not declared.
  def [](key)
    if (sub = page.sub_pages[key])
      if sub.is_array
        doc.search(sub.css).map { |node| self.class.new(node, sub) }
      else
        # Was `self.new(doc.at(page.css), r)`: `new` does not exist on an
        # instance, and the selector has to be the sub page's own, exactly
        # as in the array branch above.
        self.class.new(doc.at(sub.css), sub)
      end
    elsif (element = page.elements[key])
      doc.at(element.css)
    else
      raise "key not found: #{key.inspect}"
    end
  end

  # Forwards anything the parser itself does not understand to the wrapped
  # node, so a ContentParser can be used like the node.
  def method_missing(key, *args, &block)
    if doc.respond_to?(key)
      doc.send(key, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? consistent with the delegation in method_missing.
  def respond_to_missing?(name, include_private = false)
    doc.respond_to?(name) || super
  end
end
145
+ end
@@ -0,0 +1,89 @@
1
require 'time' # Time.parse, used by format_date

module Autoweb
  # Console helpers (output, prompts, confirmation) mixed into commands.
  module UI
    # Raised inside an ask_loop block to ask the same question again.
    class RetryError < RuntimeError
    end

    # One line typed by the user, with helpers to interpret it.
    class Input < String
      include UI

      # Handles every answer other than "yes":
      # "q" exits the program, "h"/"?" prints options[:help] (or "no help").
      # Anything that does not exit raises RetryError so the question is
      # asked again.
      def process_default(options={})
        exit(0) if is?("q")
        display(options[:help] || "no help") if is?("h") || is?("?")
        raise RetryError
      end

      # True when the answer is "y" (case and surrounding blanks ignored).
      def yes?
        is?("y")
      end

      # Case-insensitive comparison ignoring surrounding whitespace.
      # Returns false for a nil +str+.
      def is?(str)
        return false if str.nil?
        downcase.strip == str.downcase.strip
      end
    end

    # Prints +msg+ to standard output and flushes. A newline is appended
    # unless +new_line+ is false.
    #
    # Every underscore in the message is replaced by a space.
    # NOTE(review): this also rewrites paths and URLs embedded in messages;
    # confirm whether that is intended.
    def display(msg, new_line = true)
      msg = msg.to_s.gsub(/_/) { ' ' }
      if new_line
        STDOUT.puts msg
      else
        STDOUT.print msg
      end
      STDOUT.flush
    end

    # Prints +msg+ and terminates the program with exit status 1.
    def error(msg="error")
      display msg
      exit(1)
    end

    # Prints +msg+ without a trailing newline.
    def display2(msg)
      display(msg, false)
    end

    # Asks a y/q/h question until the user answers "y" (the block, if any,
    # is then run) or quits with "q" (the program exits).
    #
    # message - question text; defaults to a generic one. The caller's
    #           string is no longer modified in place.
    # options - :help => text shown when the user answers "h" or "?".
    #
    # Returns true once the user has confirmed. The previous implementation
    # read an extra line from stdin after the loop had already finished.
    def confirm(message=nil, options={})
      message = "Are you sure you wish to continue?" if message.nil?
      prompt = "#{message}(y/q/h)"

      ask_loop(prompt) do |input|
        if input.yes?
          yield if block_given?
        else
          input.process_default(:help => options[:help])
        end
      end
      true
    end

    # Formats +date+ (a Time, or a String understood by Time.parse) as
    # "YYYY-MM-DD HH:MM ZONE".
    def format_date(date)
      date = Time.parse(date) if date.is_a?(String)
      date.strftime("%Y-%m-%d %H:%M %Z")
    end

    # Prints +message+, yields the user's answer as an Input and repeats the
    # question for as long as the block raises RetryError.
    def ask_loop(message, &block)
      display2 message + " "
      begin
        yield Input.new(ask)
      rescue RetryError
        display2 message
        retry
      end
    end

    # Reads one line from standard input, without surrounding whitespace.
    # Raises EOFError when the input is exhausted (previously this surfaced
    # as a NoMethodError on nil).
    def ask
      $stdin.readline.strip
    end

    # Runs +cmd+ through the shell and returns its standard output.
    # The former FileUtils.cd(Dir.pwd) wrapper changed into the directory
    # the process was already in and relied on FileUtils being loaded by
    # someone else.
    def shell(cmd)
      `#{cmd}`
    end
  end
end
@@ -0,0 +1,3 @@
1
module Autoweb
  # Gem version. Frozen so the shared constant cannot be modified in place.
  VERSION = '0.0.2'.freeze
end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: autoweb
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 2
9
+ version: 0.0.2
10
+ platform: ruby
11
+ authors:
12
+ - dazuiba
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-08-07 00:00:00 +08:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Automate the Internet. baidu music downloader
22
+ email: come2u@gmail.com
23
+ executables:
24
+ - autoweb
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - README.md
31
+ - bin/autoweb
32
+ - lib/autoweb/command.rb
33
+ - lib/autoweb/page.rb
34
+ - lib/autoweb/ui.rb
35
+ - lib/autoweb/version.rb
36
+ - lib/autoweb.rb
37
+ - commands/baidump3.rb
38
+ - commands/help.rb
39
+ has_rdoc: true
40
+ homepage: http://github.com/dazuiba/autoweb.git
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ segments:
53
+ - 1
54
+ - 8
55
+ - 7
56
+ version: 1.8.7
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ requirements: []
65
+
66
+ rubyforge_project:
67
+ rubygems_version: 1.3.6
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Gem for the rest
71
+ test_files: []
72
+