autoweb 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +91 -0
- data/bin/autoweb +22 -0
- data/commands/baidump3.rb +80 -0
- data/commands/help.rb +16 -0
- data/lib/autoweb.rb +5 -0
- data/lib/autoweb/command.rb +77 -0
- data/lib/autoweb/page.rb +145 -0
- data/lib/autoweb/ui.rb +89 -0
- data/lib/autoweb/version.rb +3 -0
- metadata +72 -0
data/README.md
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# Welcome to Autoweb
|
2
|
+
|
3
|
+
1. [Autoweb][homepage]能让你更好地分析HTML中的数据
|
4
|
+
你可以先使用Autoweb提供的DSL,针对HTML建模, 然后抓取数据
|
5
|
+
2. 它能把你写的抓取程序分享出来, 他人可以通过命令行或者web界面使用
|
6
|
+
|
7
|
+
### 集成进来的工具
|
8
|
+
1. baidu mp3 下载器
|
9
|
+
|
10
|
+
使用举例:
|
11
|
+
|
12
|
+
下载 齐秦的大约在冬季, 在命令行输入:
|
13
|
+
|
14
|
+
autoweb baidump3 "大约在冬季 齐秦" ~/Download/mp3
|
15
|
+
|
16
|
+
autoweb会自动搜索歌曲, 然后下载到指定的目录
|
17
|
+
|
18
|
+
|
19
|
+
## Install Autoweb
|
20
|
+
|
21
|
+
安装前需要安装以下工具:
|
22
|
+
|
23
|
+
* curl
|
24
|
+
* wget
|
25
|
+
* hpricot
|
26
|
+
|
27
|
+
然后安装autoweb
|
28
|
+
|
29
|
+
gem install autoweb
|
30
|
+
|
31
|
+
|
32
|
+
## Contributing
|
33
|
+
|
34
|
+
### 页面建模
|
35
|
+
用到了css3作为页面元素定位语法, 参照: [css3语法介绍][w3c-css3-selector]
|
36
|
+
|
37
|
+
Page.define "BaiduMp3" do |page|
|
38
|
+
|
39
|
+
# 搜索url模板
|
40
|
+
page.url_tpl = 'http://mp3.baidu.com/m?f=3&rf=idx&tn=baidump3&ct=134217728&lf=&rn=&word=#{word}&lm=-1&oq=go&rsp=1'
|
41
|
+
|
42
|
+
page.subs "result", "#Tbs tr" do |sub| # 定义名为"result"的"sub page"
|
43
|
+
|
44
|
+
sub.ele "music", "td:nth(1) a" #音乐链接
|
45
|
+
sub.ele "artist", "td:nth(2) a" #演唱者
|
46
|
+
sub.ele "album", "td:nth(3) a" #专辑
|
47
|
+
sub.ele "lyrics", "td:nth(5) a" #歌词
|
48
|
+
sub.ele "size", "td:nth(7)" #文件大小
|
49
|
+
sub.ele "format", "td:nth(8)" #文件格式
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
使用页面对象:
|
54
|
+
|
55
|
+
page = Page.pages["BaiduMp3"].parse(:word=>"大约在冬季")
|
56
|
+
first_mp3 = page["result"][1]
|
57
|
+
link = first_mp3["music"]
|
58
|
+
puts link[:href]
|
59
|
+
|
60
|
+
更多实际代码, 参考 [commands/baidump3.rb][baidump3-code]
|
61
|
+
|
62
|
+
### 新建一个命令(和baidump3类似)
|
63
|
+
|
64
|
+
将以下代码放到autoweb/commands/helloworld.rb下
|
65
|
+
|
66
|
+
module Autoweb::Command
|
67
|
+
class HelloWorld < Base
|
68
|
+
def index
|
69
|
+
display "hello world!"
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
直接运行 autoweb helloworld 即可
|
75
|
+
|
76
|
+
更多实际代码, 参考 [commands/help.rb][help-code] 和 [commands/baidump3.rb][baidump3-code]
|
77
|
+
|
78
|
+
### 将代码提交到[autoweb][homepage]
|
79
|
+
|
80
|
+
请直接fork github上的autoweb, 提交ticket以及pull request即可
|
81
|
+
|
82
|
+
## License
|
83
|
+
|
84
|
+
Autoweb released under the MIT license.
|
85
|
+
|
86
|
+
[homepage]:http://dazuiba.github.com/autoweb
|
87
|
+
[w3c-css3-selector]:http://www.w3.org/TR/css3-selectors/
|
88
|
+
[baidump3-code]:http://github.com/dazuiba/autoweb/blob/master/commands/baidump3.rb
|
89
|
+
[help-code]:http://github.com/dazuiba/autoweb/blob/master/commands/help.rb
|
90
|
+
|
91
|
+
|
data/bin/autoweb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point: puts ../lib on the load path, loads the command
# framework while printing progress, then dispatches the first argument
# (default "help") together with the remaining arguments.
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')

# Print +msg+ to standard output without a trailing newline and flush at once,
# so the loading progress appears while the requires are still running.
def display(msg)
  STDOUT.print msg
  STDOUT.flush
end

display "loading"
require 'rubygems'
display "..."
require 'autoweb/command'
display "...\n"

# Always finish the session with a newline.
at_exit { display "\n" }

args = ARGV.dup
# NOTE(review): presumably cleared so that later Kernel#gets calls (used for
# interactive prompts) read standard input rather than files named in ARGV.
ARGV.clear

# No command given -> "help". An explicit default is used instead of a
# `rescue` modifier so that unrelated errors on this line are not swallowed.
command = (args.shift || 'help').strip

Autoweb::Command.run(command, args)
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'autoweb/page'
|
3
|
+
require 'iconv'
|
4
|
+
|
5
|
+
include Autoweb
|
6
|
+
|
7
|
+
Page.define "BaiduMp3" do |page|
|
8
|
+
page.url_tpl = 'http://mp3.baidu.com/m?f=3&rf=idx&tn=baidump3&ct=134217728&lf=&rn=&word=#{word}&lm=-1&oq=go&rsp=1'
|
9
|
+
|
10
|
+
page.subs "result", "#Tbs tr" do |sub|
|
11
|
+
sub.ele "music", "td:nth(1) a"
|
12
|
+
sub.ele "artist", "td:nth(2) a"
|
13
|
+
sub.ele "album", "td:nth(3) a"
|
14
|
+
sub.ele "lyrics", "td:nth(5) a"
|
15
|
+
sub.ele "size", "td:nth(7)"
|
16
|
+
sub.ele "format", "td:nth(8)"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
module Autoweb
|
21
|
+
module Command
|
22
|
+
|
23
|
+
class Baidump3 < Autoweb::Command::Base
|
24
|
+
MEGABYTE = 1024.0 * 1024.0
|
25
|
+
attr_accessor :dest_dir, :word
|
26
|
+
|
27
|
+
def index
|
28
|
+
@word = args[0]
|
29
|
+
@dest_dir = args[1]
|
30
|
+
|
31
|
+
if @word.nil?
|
32
|
+
return usage
|
33
|
+
end
|
34
|
+
|
35
|
+
if @dest_dir
|
36
|
+
if File.directory?(dest_dir)
|
37
|
+
@dest_dir = File.expand_path(dest_dir)+"/"
|
38
|
+
else
|
39
|
+
error "#{@dest_dir} is not directory"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
search
|
44
|
+
end
|
45
|
+
|
46
|
+
def search
|
47
|
+
display("searching...")
|
48
|
+
page = Page.pages["BaiduMp3"].parse(:word => word)
|
49
|
+
result = page["result"][1]
|
50
|
+
music_url = result["music"][:href]
|
51
|
+
mp3url = decode open(URI.encode music_url).read[/var encurl = "([^"]*)"/,1]
|
52
|
+
display2("ok, parsing mp3...")
|
53
|
+
#size = `curl -I #{mp3url} 2>/dev/null`[/Content-Length:\ (\d+)/,1]
|
54
|
+
#display2(", size: %.2fM. " % (Integer(size)/MEGABYTE))
|
55
|
+
confirm("sure to download?") do
|
56
|
+
download_mp3(mp3url, word, result["format"].innerText)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Print the command-line synopsis for this command. The command name is the
# one this class is dispatched under, and the two arguments are the ones
# `index` reads from args (search words, optional destination directory).
def usage
  display "usage: autoweb baidump3 MUSIC_NAME [DEST_DIR]"
end
|
63
|
+
|
64
|
+
# Download +url+ with wget into dest_dir, naming the file after the search
# words: '+', ' ', '|' and '_' in +word+ become '-', and +format+ (with
# surrounding whitespace removed) is used as the extension.
#
# wget is started with an argument vector rather than through a shell command
# string, so characters in the URL, the search words or the page-supplied
# format cannot be interpreted by the shell.
#
# Returns what Kernel#system returns: true when wget exits successfully,
# false on a non-zero exit status, nil when wget could not be started.
def download_mp3(url, word, format)
  file_name = "#{word.gsub(/[\+|\ |_]/, "-")}.#{format.to_s.strip}"
  system("wget", url, "-O", "#{dest_dir}#{file_name}")
end
|
67
|
+
|
68
|
+
def decode(s)
|
69
|
+
s.tr(_mktab(s[0].chr), s=~ /....:\// ? _mktab('h') : _mktab('f')) #http|ftp
|
70
|
+
end
|
71
|
+
|
72
|
+
def _mktab(x)
|
73
|
+
t0 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
74
|
+
p = t0.partition(x)
|
75
|
+
p[1] + p[2] + p[0]
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
data/commands/help.rb
ADDED
data/lib/autoweb.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'autoweb/ui'
|
2
|
+
module Autoweb
|
3
|
+
module Command
|
4
|
+
class Base
|
5
|
+
include Autoweb::UI
|
6
|
+
attr_accessor :args
|
7
|
+
|
8
|
+
def initialize(args)
|
9
|
+
@args = args
|
10
|
+
end
|
11
|
+
|
12
|
+
def usage
|
13
|
+
<<-EOTXT
|
14
|
+
=== Command List:
|
15
|
+
automan console
|
16
|
+
automan dbconsole
|
17
|
+
automan help
|
18
|
+
automan update
|
19
|
+
|
20
|
+
|
21
|
+
EOTXT
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
class InvalidCommand < RuntimeError; end
|
26
|
+
class CommandFailed < RuntimeError; end
|
27
|
+
|
28
|
+
class << self
|
29
|
+
|
30
|
+
include Autoweb::UI
|
31
|
+
def run(command, args, retries=0)
|
32
|
+
begin
|
33
|
+
run_internal(command, args.dup)
|
34
|
+
rescue InvalidCommand
|
35
|
+
error "Unknown command. Run 'autoweb help' for usage information."
|
36
|
+
rescue CommandFailed => e
|
37
|
+
error e.message
|
38
|
+
rescue Interrupt => e
|
39
|
+
error "\n[canceled]"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def run_internal(command, args)
|
44
|
+
klass, method = parse(command)
|
45
|
+
runner = klass.new(args)
|
46
|
+
raise InvalidCommand unless runner.respond_to?(method)
|
47
|
+
runner.send(method)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Resolve a command string into the command class and the action to invoke.
#
# Accepted forms:
#   "name"        -> [Autoweb::Command::Name, :index]
#   "name:action" -> [Autoweb::Command::Name, "action"]
#
# The class is looked up with const_get instead of eval, so text typed on the
# command line is never executed as Ruby code, and an unknown or malformed
# name is reported as InvalidCommand rather than escaping as NameError or
# SyntaxError.
#
# Raises InvalidCommand for an empty command, an unknown command name, or a
# command with more than one ':' separator.
def parse(command)
  parts = command.split(':')
  raise InvalidCommand unless [1, 2].include?(parts.size)

  action = parts.size == 1 ? :index : parts[1]
  begin
    return Autoweb::Command.const_get(parts[0].capitalize), action
  rescue NameError
    raise InvalidCommand
  end
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
Dir["#{File.dirname(__FILE__)}/../../commands/*.rb"].each { |c|
|
74
|
+
unless (/_helper\.rb$/=~c)
|
75
|
+
require c
|
76
|
+
end
|
77
|
+
}
|
data/lib/autoweb/page.rb
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "ostruct"
|
3
|
+
require "open-uri"
|
4
|
+
require "hpricot"
|
5
|
+
module Autoweb
|
6
|
+
module Container
|
7
|
+
attr_reader :name, :sub_pages, :elements
|
8
|
+
|
9
|
+
def subs(name,sub_css,&block)
|
10
|
+
def_sub(name, sub_css, true, &block)
|
11
|
+
end
|
12
|
+
|
13
|
+
def sub(name,sub_css,&block)
|
14
|
+
def_sub(name, sub_css, false, &block)
|
15
|
+
end
|
16
|
+
|
17
|
+
def ele(name, css)
|
18
|
+
@elements[name] = Element.new(self,name,css)
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def def_sub(name, sub_css, is_array, &block)
|
24
|
+
sub = SubPage.new(self,name,sub_css,is_array)
|
25
|
+
yield sub
|
26
|
+
@sub_pages[name] = sub
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
class SubPage
|
31
|
+
attr_reader :parent, :css, :is_array
|
32
|
+
include Container
|
33
|
+
def initialize(parent, name, css, is_array=false)
|
34
|
+
@parent = parent
|
35
|
+
@css = css
|
36
|
+
@name = name
|
37
|
+
@sub_pages = {}
|
38
|
+
@elements = {}
|
39
|
+
@is_array = is_array
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Element
|
44
|
+
attr_reader :parent, :name, :css
|
45
|
+
def initialize(parent,name, css)
|
46
|
+
@parent = parent
|
47
|
+
@name = name
|
48
|
+
@css = css
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
class Page
|
53
|
+
class << self
|
54
|
+
def pages
|
55
|
+
@pages||={}
|
56
|
+
end
|
57
|
+
end
|
58
|
+
attr_accessor :name, :url_tpl
|
59
|
+
include Container
|
60
|
+
|
61
|
+
def initialize(name)
|
62
|
+
@name = name
|
63
|
+
@sub_pages = {}
|
64
|
+
@elements = {}
|
65
|
+
end
|
66
|
+
|
67
|
+
def self.define(name,&block)
|
68
|
+
page = self.new(name)
|
69
|
+
yield page
|
70
|
+
self.pages[name] = page
|
71
|
+
page
|
72
|
+
end
|
73
|
+
|
74
|
+
def parse(hash)
|
75
|
+
Parser.new(self,hash).go
|
76
|
+
end
|
77
|
+
|
78
|
+
def url(locals)
|
79
|
+
OpenStruct.new(locals.merge(:url_tpl=>self.url_tpl)).instance_eval{
|
80
|
+
eval %Q{"#{url_tpl.gsub(/"/, '\"')}"}
|
81
|
+
}
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
class Parser
|
86
|
+
attr_accessor :page, :url
|
87
|
+
def initialize(page, hash)
|
88
|
+
@page = page
|
89
|
+
if hash.is_a?(String)
|
90
|
+
@url = hash
|
91
|
+
else
|
92
|
+
@url = page.url(hash)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def go
|
97
|
+
contaent_parser
|
98
|
+
self
|
99
|
+
end
|
100
|
+
|
101
|
+
def [](key)
|
102
|
+
contaent_parser[key]
|
103
|
+
end
|
104
|
+
|
105
|
+
def contaent_parser
|
106
|
+
@content_parser||=ContentParser.new(doc,page)
|
107
|
+
end
|
108
|
+
|
109
|
+
def doc
|
110
|
+
@doc||=Hpricot(open(URI.encode url))
|
111
|
+
end
|
112
|
+
|
113
|
+
end
|
114
|
+
|
115
|
+
# Wraps a parsed document node together with the page model that describes
# it, so that model keys can be resolved against the node.
#
# +doc+  is the node to query; it must answer +search+ and +at+ for a CSS
#        selector.
# +page+ is the model object exposing +sub_pages+ and +elements+ hashes.
class ContentParser
  attr_accessor :doc, :page

  def initialize(doc, page)
    @doc = doc
    @page = page
  end

  # Look up +key+ in the page model.
  #
  # * sub page flagged is_array -> Array of ContentParser, one per match of
  #   the sub page's selector
  # * other sub page            -> one ContentParser wrapping the first match
  # * element                   -> whatever +doc.at+ returns for its selector
  #
  # Raises RuntimeError ("key not found") when the model has no such key.
  def [](key)
    if (sub_page = page.sub_pages[key])
      if sub_page.is_array
        doc.search(sub_page.css).map { |node| self.class.new(node, sub_page) }
      else
        # Instances have no #new, and the selector to apply is the one of the
        # sub page being resolved, not of the model currently wrapped.
        self.class.new(doc.at(sub_page.css), sub_page)
      end
    elsif (element = page.elements[key])
      doc.at(element.css)
    else
      raise "key not found"
    end
  end

  # Forward any other message the wrapped node understands to that node.
  def method_missing(key, *args, &block)
    if doc.respond_to?(key)
      doc.send key, *args, &block
    else
      super
    end
  end

  # Keep respond_to? consistent with the delegation in method_missing.
  def respond_to_missing?(key, include_private = false)
    doc.respond_to?(key) || super
  end
end
|
145
|
+
end
|
data/lib/autoweb/ui.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'time'      # Time.parse, used by format_date
require 'fileutils' # FileUtils.cd, used by shell

module Autoweb
  # Console helpers mixed into commands: printing, prompting and shelling out.
  module UI
    # Raised inside an ask_loop block to make the loop prompt again.
    class RetryError < RuntimeError
    end

    # One line of user input with helpers for interpreting it.
    class Input < String
      include UI

      # Handle an answer the caller did not recognise: "q" exits the process
      # with status 0, "h" or "?" prints options[:help] (or "no help").
      # Every answer other than "q" ends by raising RetryError.
      def process_default(options={})
        exit(0) if is?("q")
        display(options[:help]||"no help") if is?("h")||is?("?")
        raise RetryError
      end

      # True when the input is "y" (case and surrounding blanks ignored).
      def yes?
        is?("y")
      end

      # Case-insensitive, whitespace-trimmed comparison with +str+.
      # Returns nil when +str+ is nil.
      def is?(str)
        return if str.nil?
        self.downcase.strip == str.downcase.strip
      end
    end

    # Write +msg+ to standard output and flush. Underscores in the message
    # are printed as spaces. A newline is appended unless +new_line+ is false.
    def display(msg, new_line = true)
      msg = msg.to_s.gsub(/_/) { ' ' }
      if new_line
        STDOUT.puts msg
      else
        STDOUT.print msg
      end
      STDOUT.flush
    end

    # Display +msg+ and terminate the process with exit status 1.
    def error(msg="error")
      display msg
      exit(1)
    end

    # Display +msg+ without a trailing newline.
    def display2(msg)
      display(msg,false)
    end

    # Ask the user to confirm and run the given block once they answer "y".
    # "q" exits the process; "h"/"?" shows options[:help]; any other answer
    # asks again.
    #
    # The prompt is built as a new string, so +message+ is never modified
    # (it may be frozen). The user is asked exactly once per attempt: no
    # further input is read after the block has run.
    #
    # Returns true; this method only returns after a "y" answer.
    def confirm(message=nil, options={})
      message = "Are you sure you wish to continue?" if message.nil?
      prompt = "#{message}(y/q/h)"

      ask_loop(prompt) do |input|
        if input.yes?
          yield
        else
          input.process_default(:help=>options[:help])
        end
      end
      # Reaching this point means the block ran for a "y" answer; every other
      # answer either exits or raises RetryError inside ask_loop.
      true
    end

    # Format +date+ (a Time, or a String parsed with Time.parse) as
    # "YYYY-MM-DD HH:MM ZONE".
    def format_date(date)
      date = Time.parse(date) if date.is_a?(String)
      date.strftime("%Y-%m-%d %H:%M %Z")
    end

    # Show +message+, read one answer and yield it as an Input. When the block
    # raises RetryError the message is shown again and a new answer is read.
    # Returns the block's value.
    def ask_loop(message,&block)
      display2 message+" "
      begin
        yield Input.new(ask)
      rescue RetryError => e
        display2 message
        retry
      end
    end

    # Read one line with Kernel#gets and strip surrounding whitespace.
    def ask
      gets.strip
    end

    # Run +cmd+ through the shell from the current working directory and
    # return its standard output.
    def shell(cmd)
      FileUtils.cd(Dir.pwd) {|d| return `#{cmd}`}
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: autoweb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
version: 0.0.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- dazuiba
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-08-07 00:00:00 +08:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Automate the Internet. baidu music downloader
|
22
|
+
email: come2u@gmail.com
|
23
|
+
executables:
|
24
|
+
- autoweb
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- README.md
|
31
|
+
- bin/autoweb
|
32
|
+
- lib/autoweb/command.rb
|
33
|
+
- lib/autoweb/page.rb
|
34
|
+
- lib/autoweb/ui.rb
|
35
|
+
- lib/autoweb/version.rb
|
36
|
+
- lib/autoweb.rb
|
37
|
+
- commands/baidump3.rb
|
38
|
+
- commands/help.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: http://github.com/dazuiba/autoweb.git
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
segments:
|
53
|
+
- 1
|
54
|
+
- 8
|
55
|
+
- 7
|
56
|
+
version: 1.8.7
|
57
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 1.3.6
|
68
|
+
signing_key:
|
69
|
+
specification_version: 3
|
70
|
+
summary: Gem for the rest
|
71
|
+
test_files: []
|
72
|
+
|