autoweb 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +91 -0
- data/bin/autoweb +22 -0
- data/commands/baidump3.rb +80 -0
- data/commands/help.rb +16 -0
- data/lib/autoweb.rb +5 -0
- data/lib/autoweb/command.rb +77 -0
- data/lib/autoweb/page.rb +145 -0
- data/lib/autoweb/ui.rb +89 -0
- data/lib/autoweb/version.rb +3 -0
- metadata +72 -0
data/README.md
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# Welcome to Autoweb
|
2
|
+
|
3
|
+
1. [Autoweb][homepage]能让你更好地分析HTML中的数据
|
4
|
+
你可以先使用Autoweb提供的DSL,针对HTML建模, 然后抓取数据
|
5
|
+
2. 它能把你写的抓取程序分享出来, 他人可以通过命令行或者web界面使用
|
6
|
+
|
7
|
+
### 集成进来的工具
|
8
|
+
1. baidu mp3 下载器
|
9
|
+
|
10
|
+
使用举例:
|
11
|
+
|
12
|
+
下载 齐秦的大约在冬季, 在命令行输入:
|
13
|
+
|
14
|
+
autoweb baidump3 "大约在冬季 齐秦" ~/Download/mp3
|
15
|
+
|
16
|
+
autoweb会自动搜索歌曲, 然后下载到指定的目录
|
17
|
+
|
18
|
+
|
19
|
+
## Install Autoweb
|
20
|
+
|
21
|
+
安装前需要安装以下工具:
|
22
|
+
|
23
|
+
* curl
|
24
|
+
* wget
|
25
|
+
* hpricot
|
26
|
+
|
27
|
+
然后安装autoweb
|
28
|
+
|
29
|
+
gem install autoweb
|
30
|
+
|
31
|
+
|
32
|
+
## Contributing
|
33
|
+
|
34
|
+
### 页面建模
|
35
|
+
用到了css3作为页面元素定位语法, 参照: [css3语法介绍][w3c-css3-selector]
|
36
|
+
|
37
|
+
Page.define "BaiduMp3" do |page|
|
38
|
+
|
39
|
+
# 搜索url模板
|
40
|
+
page.url_tpl = 'http://mp3.baidu.com/m?f=3&rf=idx&tn=baidump3&ct=134217728&lf=&rn=&word=#{word}&lm=-1&oq=go&rsp=1'
|
41
|
+
|
42
|
+
page.subs "result", "#Tbs tr" do |sub| # 定义名为"result"的"sub page"
|
43
|
+
|
44
|
+
sub.ele "music", "td:nth(1) a" #音乐链接
|
45
|
+
sub.ele "artist", "td:nth(2) a" #演唱者
|
46
|
+
sub.ele "album", "td:nth(3) a" #专辑
|
47
|
+
sub.ele "lyrics", "td:nth(5) a" #歌词
|
48
|
+
sub.ele "size", "td:nth(7)" #文件大小
|
49
|
+
sub.ele "format", "td:nth(8)" #文件格式
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
使用页面对象:
|
54
|
+
|
55
|
+
page = Page.pages["BaiduMp3"].parse(:word=>"大约在冬季")
|
56
|
+
first_mp3 = page["result"][1]
|
57
|
+
link = first_mp3["music"]
|
58
|
+
puts link[:href]
|
59
|
+
|
60
|
+
更多实际代码, 参考 [commands/baidump3.rb][baidump3-code]
|
61
|
+
|
62
|
+
### 新建一个命令(和baidump3类似)
|
63
|
+
|
64
|
+
将以下代码放到autoweb/commands/helloworld.rb下
|
65
|
+
|
66
|
+
module Autoweb::Command
|
67
|
+
class HelloWorld < Base
|
68
|
+
def index
|
69
|
+
display "hello world!"
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
直接运行 autoweb helloworld 即可
|
75
|
+
|
76
|
+
更多实际代码, 参考 [commands/help.rb][help-code] 和 [commands/baidump3.rb][baidump3-code]
|
77
|
+
|
78
|
+
### 将代码提交到[autoweb][homepage]
|
79
|
+
|
80
|
+
请直接fork github上的autoweb, 提交ticket以及pull request即可
|
81
|
+
|
82
|
+
## License
|
83
|
+
|
84
|
+
Autoweb is released under the MIT license.
|
85
|
+
|
86
|
+
[homepage]:http://dazuiba.github.com/autoweb
|
87
|
+
[w3c-css3-selector]:http://www.w3.org/TR/css3-selectors/
|
88
|
+
[baidump3-code]:http://github.com/dazuiba/autoweb/blob/master/commands/baidump3.rb
|
89
|
+
[help-code]:http://github.com/dazuiba/autoweb/blob/master/commands/help.rb
|
90
|
+
|
91
|
+
|
data/bin/autoweb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point: puts the gem's lib directory on the load path,
# loads the command framework and dispatches the first argument as a command.
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')

# Prints +msg+ without a trailing newline and flushes right away, so the
# loading progress shows up while the requires below are still running.
def display(msg)
  STDOUT.print msg
  STDOUT.flush
end

display "loading"
require 'rubygems'
display "..."
require 'autoweb/command'
display "...\n"

# Always finish the output with a newline, whatever way the process exits.
at_exit { display "\n" }

# Work on a copy and empty ARGV.
# NOTE(review): presumably ARGV is cleared so that Kernel#gets reads the
# terminal instead of treating the arguments as file names -- confirm.
args = ARGV.dup
ARGV.clear

# The first argument names the command. A missing or blank name falls back to
# 'help' explicitly instead of relying on a catch-all `rescue` modifier.
command = args.shift.to_s.strip
command = 'help' if command.empty?

Autoweb::Command.run(command, args)
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'autoweb/page'
|
3
|
+
require 'iconv'
|
4
|
+
|
5
|
+
include Autoweb
|
6
|
+
|
7
|
+
# Page model for the Baidu MP3 search result list.
Page.define "BaiduMp3" do |page|
  # Search URL template; the #{word} placeholder is filled in by Page#url.
  page.url_tpl = 'http://mp3.baidu.com/m?f=3&rf=idx&tn=baidump3&ct=134217728&lf=&rn=&word=#{word}&lm=-1&oq=go&rsp=1'

  # Every "#Tbs tr" row becomes one entry of the "result" sub page list;
  # each pair below is an element name and the selector locating it in a row.
  page.subs "result", "#Tbs tr" do |row|
    [
      ["music",  "td:nth(1) a"],
      ["artist", "td:nth(2) a"],
      ["album",  "td:nth(3) a"],
      ["lyrics", "td:nth(5) a"],
      ["size",   "td:nth(7)"],
      ["format", "td:nth(8)"]
    ].each { |name, css| row.ele name, css }
  end
end
|
19
|
+
|
20
|
+
require 'shellwords' # Shellwords.escape, used when building the wget command line

module Autoweb
  module Command

    # Searches Baidu MP3 for a song and downloads the selected hit with wget.
    #
    # Invoked as: autoweb baidump3 MUSIC_NAME [DEST_DIR]
    class Baidump3 < Autoweb::Command::Base
      MEGABYTE = 1024.0 * 1024.0
      attr_accessor :dest_dir, :word

      # Default action. Reads the search word (args[0]) and the optional
      # destination directory (args[1]), validates them and starts the search.
      def index
        @word = args[0]
        @dest_dir = args[1]

        return usage if @word.nil?

        if @dest_dir
          if File.directory?(dest_dir)
            # Normalise to an absolute path with a trailing slash so the file
            # name can simply be appended in download_mp3.
            @dest_dir = File.expand_path(dest_dir) + "/"
          else
            error "#{@dest_dir} is not directory"
          end
        end

        search
      end

      # Parses the search result page, resolves the real mp3 URL of the chosen
      # row and, after confirmation, downloads it.
      def search
        display("searching...")
        page = Page.pages["BaiduMp3"].parse(:word => word)

        # Index 1 is used as the hit, as in the original code.
        # NOTE(review): presumably row 0 is the table header -- confirm.
        result = page["result"][1]
        link = result && result["music"]
        return error("no result found") if link.nil?

        # The detail page carries the obfuscated mp3 URL in a JS variable.
        encoded = open(URI.encode(link[:href])).read[/var encurl = "([^"]*)"/, 1]
        return error("mp3 url not found in result page") if encoded.nil?

        mp3url = decode(encoded)
        display2("ok, parsing mp3...")
        #size = `curl -I #{mp3url} 2>/dev/null`[/Content-Length:\ (\d+)/,1]
        #display2(", size: %.2fM. " % (Integer(size)/MEGABYTE))
        confirm("sure to download?") do
          download_mp3(mp3url, word, result["format"].innerText)
        end
      end

      # Prints the command line synopsis.
      def usage
        display "usage: autoweb baidump3 MUSIC_NAME [DEST_DIR]"
      end

      # Downloads +url+ with wget into dest_dir (or the current directory when
      # dest_dir is nil). The file name is +word+ with '+', '|', ' ' and '_'
      # replaced by '-', plus +format+ as the extension.
      #
      # Both the URL and the target path are shell-escaped: they come from a
      # remote page and from user input respectively.
      #
      # Returns wget's standard output as a String.
      def download_mp3(url, word, format)
        target = "#{dest_dir}#{word.gsub(/[\+|\ |_]/, "-")}.#{format.to_s.strip}"
        `wget #{Shellwords.escape(url)} -O #{Shellwords.escape(target)}`
      end

      # Decodes an obfuscated URL by character substitution: the source table
      # is the alphabet rotated to start at the first character of +s+, the
      # target table is the alphabet rotated to start at 'h' when +s+ contains
      # four characters followed by ":/" (http), otherwise at 'f' (ftp).
      def decode(s)
        s.tr(_mktab(s[0].chr), s =~ /....:\// ? _mktab('h') : _mktab('f')) #http|ftp
      end

      # Returns the 0-9A-Za-z alphabet rotated so that it starts at +x+.
      def _mktab(x)
        t0 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        head, pivot, tail = t0.partition(x)
        pivot + tail + head
      end

    end
  end
end
|
data/commands/help.rb
ADDED
data/lib/autoweb.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'autoweb/ui'
|
2
|
+
module Autoweb
|
3
|
+
module Command
|
4
|
+
# Common superclass of all commands. Holds the command line arguments and
# mixes in the console helpers from Autoweb::UI.
class Base
  include Autoweb::UI
  attr_accessor :args

  # @param args [Array<String>] arguments that followed the command name
  def initialize(args)
    @args = args
  end

  # Returns the general help text as a String.
  #
  # The list names the commands shipped with this gem (commands/baidump3.rb
  # and commands/help.rb); the previous text advertised "automan" commands
  # that do not exist here.
  def usage
    <<-EOTXT
=== Command List:
  autoweb baidump3 MUSIC_NAME [DEST_DIR]
  autoweb help


    EOTXT
  end

end
|
25
|
+
# Raised when a command name or action cannot be resolved.
class InvalidCommand < RuntimeError
end

# Raised by a command to abort with a message for the user.
class CommandFailed < RuntimeError
end
|
27
|
+
|
28
|
+
class << self

  include Autoweb::UI

  # Runs +command+ ("name" or "name:action") with +args+ and turns the known
  # failure modes into a message printed through #error.
  #
  # @param command [String] command name, optionally followed by ":action"
  # @param args [Array<String>] remaining command line arguments
  # @param retries [Integer] unused; kept for interface compatibility
  def run(command, args, retries=0)
    run_internal(command, args.dup)
  rescue InvalidCommand
    error "Unknown command. Run 'autoweb help' for usage information."
  rescue CommandFailed => e
    error e.message
  rescue Interrupt
    error "\n[canceled]"
  end

  # Instantiates the command class and invokes the requested action.
  # Raises InvalidCommand when the instance does not respond to the action.
  def run_internal(command, args)
    klass, method = parse(command)
    runner = klass.new(args)
    raise InvalidCommand unless runner.respond_to?(method)
    runner.send(method)
  end

  # Splits +command+ into [command class, action]. A bare name maps to the
  # :index action; "name:action" maps to that action.
  # Raises InvalidCommand for anything else.
  def parse(command)
    parts = command.split(':')
    case parts.size
    when 1
      return command_class(parts[0]), :index
    when 2
      return command_class(parts[0]), parts[1]
    else
      raise InvalidCommand
    end
  end

  private

  # Resolves a command name to its class under Autoweb::Command.
  #
  # Uses const_get instead of eval so user input is never executed as Ruby.
  # Only subclasses of Base are accepted, because const_get may also find
  # unrelated top-level constants such as String.
  def command_class(name)
    klass = Autoweb::Command.const_get(name.capitalize)
    raise InvalidCommand unless klass.is_a?(Class) && klass < Base
    klass
  rescue NameError
    raise InvalidCommand
  end
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Load every command file from the gem's commands directory, skipping the
# *_helper.rb files.
Dir["#{File.dirname(__FILE__)}/../../commands/*.rb"].
  reject { |path| path =~ /_helper\.rb$/ }.
  each { |path| require path }
|
data/lib/autoweb/page.rb
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "ostruct"
|
3
|
+
require "open-uri"
|
4
|
+
require "hpricot"
|
5
|
+
module Autoweb
|
6
|
+
# DSL shared by Page and SubPage for declaring nested sub pages and elements.
# The including class is expected to initialise @sub_pages and @elements
# with hashes.
module Container
  attr_reader :name, :sub_pages, :elements

  # Declares a repeated sub page: every node matching +sub_css+ is one entry.
  # The new SubPage is yielded to the optional block and returned.
  def subs(name, sub_css, &block)
    def_sub(name, sub_css, true, &block)
  end

  # Declares a single sub page located by +sub_css+.
  # The new SubPage is yielded to the optional block and returned.
  def sub(name, sub_css, &block)
    def_sub(name, sub_css, false, &block)
  end

  # Declares an element +name+ located by the selector +css+.
  def ele(name, css)
    @elements[name] = Element.new(self, name, css)
  end

  private

  # Builds the SubPage, lets the block (if any) populate it, then registers it.
  # The block is optional so a sub page without children can be declared
  # without raising LocalJumpError.
  def def_sub(name, sub_css, is_array, &block)
    sub = SubPage.new(self, name, sub_css, is_array)
    yield sub if block_given?
    @sub_pages[name] = sub
  end
end
|
29
|
+
|
30
|
+
# A named region of a page, located by a selector, that can itself contain
# elements and further sub pages. +is_array+ tells whether the selector is
# meant to match many nodes or a single one.
class SubPage
  include Container
  attr_reader :parent, :css, :is_array

  def initialize(parent, name, css, is_array=false)
    @parent, @name, @css, @is_array = parent, name, css, is_array
    @elements = {}
    @sub_pages = {}
  end
end
|
42
|
+
|
43
|
+
# A named leaf of a page model: the selector +css+ locates it inside the
# container (+parent+) that declared it.
class Element
  attr_reader :parent, :name, :css

  def initialize(parent,name, css)
    @parent, @name, @css = parent, name, css
  end
end
|
51
|
+
|
52
|
+
# A page model: a name, a URL template and the sub pages / elements declared
# through the Container DSL. Defined pages are kept in a class-level registry.
class Page
  class << self
    # Registry of defined pages, keyed by name.
    def pages
      @pages ||= {}
    end

    # Creates a page called +name+, yields it for configuration, stores it in
    # the registry and returns it.
    def define(name, &block)
      page = new(name)
      yield page
      pages[name] = page
      page
    end
  end

  attr_accessor :name, :url_tpl
  include Container

  def initialize(name)
    @name = name
    @elements = {}
    @sub_pages = {}
  end

  # Builds a Parser for this page from +hash+ and runs it.
  def parse(hash)
    Parser.new(self, hash).go
  end

  # Expands url_tpl as a double-quoted Ruby string, with the entries of
  # +locals+ available as methods (via OpenStruct).
  #
  # NOTE(review): the template is eval'ed, so it must only ever come from
  # trusted page definitions, never from user input.
  def url(locals)
    OpenStruct.new(locals.merge(:url_tpl => url_tpl)).instance_eval do
      eval %Q{"#{url_tpl.gsub(/"/, '\"')}"}
    end
  end
end
|
84
|
+
|
85
|
+
# Fetches the document for a page and exposes its content through the
# page model (see #[]).
class Parser
  attr_accessor :page, :url

  # @param page the page model describing the document
  # @param hash [String, Hash] a ready-made URL, or the values handed to
  #   page.url to build one
  def initialize(page, hash)
    @page = page
    @url = hash.is_a?(String) ? hash : page.url(hash)
  end

  # Builds the content parser (which loads the document) and returns self,
  # so the result can be indexed directly.
  def go
    content_parser
    self
  end

  # Looks up the sub page or element +key+ in the parsed document.
  def [](key)
    content_parser[key]
  end

  # Memoised ContentParser for the fetched document.
  def content_parser
    @content_parser ||= ContentParser.new(doc, page)
  end
  # The original, misspelled name is kept so existing callers keep working.
  alias_method :contaent_parser, :content_parser

  # Memoised Hpricot document fetched from #url.
  def doc
    @doc ||= Hpricot(open(URI.encode(url)))
  end

end
|
114
|
+
|
115
|
+
# Pairs a document node with the page (or sub page) definition that describes
# it. Indexing resolves sub pages and elements by name; any other message is
# forwarded to the wrapped node.
class ContentParser
  attr_accessor :doc, :page

  # @param doc the document node to search in (must respond to #search / #at)
  # @param page the definition providing #sub_pages and #elements
  def initialize(doc, page)
    @doc = doc
    @page = page
  end

  # Resolves +key+ against the definition:
  # * repeated sub page -> Array of ContentParser, one per matching node
  # * single sub page   -> ContentParser for the first matching node
  # * element           -> the first node matching the element's selector
  #
  # Raises RuntimeError when +key+ names neither a sub page nor an element.
  def [](key)
    if (sub_page = page.sub_pages[key])
      if sub_page.is_array
        doc.search(sub_page.css).map { |node| self.class.new(node, sub_page) }
      else
        # Use the sub page's own selector; an instance has no `new`, so the
        # wrapper is built through the class.
        self.class.new(doc.at(sub_page.css), sub_page)
      end
    elsif (element = page.elements[key])
      doc.at(element.css)
    else
      raise "key not found"
    end
  end

  # Forwards unknown messages to the wrapped node when it understands them.
  def method_missing(key, *args, &block)
    if doc.respond_to?(key)
      doc.send key, *args, &block
    else
      super
    end
  end

  # Keeps respond_to? consistent with the delegation in method_missing.
  def respond_to_missing?(key, include_private = false)
    doc.respond_to?(key, include_private) || super
  end
end
|
145
|
+
end
|
data/lib/autoweb/ui.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'time' # Time.parse, used by format_date

module Autoweb
  # Console helpers mixed into the command classes: output, interactive
  # prompts, date formatting and shelling out.
  module UI
    # Raised inside an ask_loop block to have the prompt shown again.
    class RetryError < RuntimeError
    end

    # One line of user input, with helpers for the y / q / h answers.
    class Input < String
      include UI

      # Handles every answer that is not "yes":
      # "q" exits the process; "h" or "?" prints options[:help] (or "no help");
      # everything except "q" then raises RetryError so the question is
      # asked again.
      def process_default(options={})
        exit(0) if is?("q")
        display(options[:help] || "no help") if is?("h") || is?("?")
        raise RetryError
      end

      # True when the input is "y" (case and surrounding blanks ignored).
      def yes?
        is?("y")
      end

      # Case-insensitive, whitespace-insensitive comparison with +str+.
      # Returns nil when +str+ is nil.
      def is?(str)
        return if str.nil?
        downcase.strip == str.downcase.strip
      end
    end

    # Writes +msg+ to STDOUT, followed by a newline unless +new_line+ is false.
    # NOTE(review): every underscore in the message is replaced by a space;
    # kept as is, but this also alters paths and names -- confirm intent.
    def display(msg, new_line = true)
      msg = msg.to_s.gsub(/_/) { ' ' }
      if new_line
        STDOUT.puts msg
      else
        STDOUT.print msg
      end
      STDOUT.flush
    end

    # Prints +msg+ and terminates the process with exit status 1.
    def error(msg="error")
      display msg
      exit(1)
    end

    # Like display, but without the trailing newline.
    def display2(msg)
      display(msg,false)
    end

    # Asks +message+ until the user answers "y" (or quits with "q"), then
    # runs the block, if one was given.
    #
    # The caller's string is left untouched (it used to be appended to in
    # place), and the answer is read exactly once per prompt: the former
    # trailing `ask` waited for an extra line after the block had already run.
    #
    # @param message [String, nil] question; a default is used when nil
    # @param options [Hash] :help text shown when the user answers "h" or "?"
    # @return [true] once the user has confirmed
    def confirm(message=nil, options={})
      message = "Are you sure you wish to continue?" if message.nil?
      prompt = "#{message}(y/q/h)"

      ask_loop(prompt) do |input|
        if input.yes?
          yield if block_given?
        else
          input.process_default(:help=>options[:help])
        end
      end
      true
    end

    # Formats +date+ (a Time, or a String parsed with Time.parse) as
    # "YYYY-MM-DD HH:MM ZONE".
    def format_date(date)
      date = Time.parse(date) if date.is_a?(String)
      date.strftime("%Y-%m-%d %H:%M %Z")
    end

    # Prints +message+, reads an answer and yields it as an Input; repeats
    # for as long as the block raises RetryError.
    def ask_loop(message,&block)
      display2 message+" "
      begin
        yield Input.new(ask)
      rescue RetryError
        display2 message
        retry
      end
    end

    # Reads one line from standard input, stripped of surrounding whitespace.
    #
    # Reads from $stdin explicitly so that leftover ARGV entries are never
    # treated as input files. End of input raises Interrupt (the same signal
    # a user cancel produces) instead of failing with NoMethodError on nil or
    # looping forever in ask_loop.
    def ask
      line = $stdin.gets
      raise Interrupt, "end of input" if line.nil?
      line.strip
    end

    # Runs +cmd+ in a subshell and returns its standard output.
    # The former FileUtils.cd(Dir.pwd) wrapper changed into the directory the
    # process was already in and relied on an unrequired library.
    def shell(cmd)
      `#{cmd}`
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: autoweb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
version: 0.0.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- dazuiba
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-08-07 00:00:00 +08:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Automate the Internet. baidu music downloader
|
22
|
+
email: come2u@gmail.com
|
23
|
+
executables:
|
24
|
+
- autoweb
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- README.md
|
31
|
+
- bin/autoweb
|
32
|
+
- lib/autoweb/command.rb
|
33
|
+
- lib/autoweb/page.rb
|
34
|
+
- lib/autoweb/ui.rb
|
35
|
+
- lib/autoweb/version.rb
|
36
|
+
- lib/autoweb.rb
|
37
|
+
- commands/baidump3.rb
|
38
|
+
- commands/help.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: http://github.com/dazuiba/autoweb.git
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
segments:
|
53
|
+
- 1
|
54
|
+
- 8
|
55
|
+
- 7
|
56
|
+
version: 1.8.7
|
57
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 1.3.6
|
68
|
+
signing_key:
|
69
|
+
specification_version: 3
|
70
|
+
summary: Gem for the rest
|
71
|
+
test_files: []
|
72
|
+
|