web-parser 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +23 -0
- data/TEMPLATE_SPEC +82 -0
- data/TODO +6 -0
- data/init.rb +4 -0
- data/lib/web_parser.rb +6 -0
- data/lib/web_parser/template.rb +37 -0
- data/lib/web_parser/web_agent.rb +20 -0
- data/lib/web_parser/web_parser.rb +170 -0
- data/spec/amazon.html +6177 -0
- data/spec/amazon.template +31 -0
- data/spec/dangdang.html +506 -0
- data/spec/product.template +112 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/template_spec.rb +13 -0
- data/spec/web_parser_spec.rb +23 -0
- data/web_parser.gemspec +39 -0
- metadata +109 -0
data/spec/product.template
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
---
|
2
|
+
vistor_purchased:
|
3
|
+
xpath: div[name=__BAV_bk] ul li
|
4
|
+
attributes:
|
5
|
+
purchased_product_id:
|
6
|
+
transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
|
7
|
+
= cate.match(value);matches[1] if matches}'
|
8
|
+
xpath: a
|
9
|
+
attr: href
|
10
|
+
title:
|
11
|
+
xpath: p a
|
12
|
+
attr: title
|
13
|
+
percent:
|
14
|
+
xpath: p span
|
15
|
+
attr: content
|
16
|
+
type: list
|
17
|
+
vistor_readed:
|
18
|
+
xpath: div[name=__alsoview_pub] ul li.detailed
|
19
|
+
attributes:
|
20
|
+
title:
|
21
|
+
xpath: a
|
22
|
+
attr: title
|
23
|
+
readed_product_id:
|
24
|
+
transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
|
25
|
+
= cate.match(value);matches[1] if matches}'
|
26
|
+
xpath: a
|
27
|
+
attr: href
|
28
|
+
type: list
|
29
|
+
customer_purchased:
|
30
|
+
xpath: div[name=__alsobuy_pub] ul li.detailed
|
31
|
+
attributes:
|
32
|
+
purchased_product_id:
|
33
|
+
transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
|
34
|
+
= cate.match(value);matches[1] if matches}'
|
35
|
+
xpath: a
|
36
|
+
attr: href
|
37
|
+
title:
|
38
|
+
xpath: a
|
39
|
+
attr: title
|
40
|
+
type: list
|
41
|
+
book:
|
42
|
+
xpath: div.dp_main
|
43
|
+
attributes:
|
44
|
+
isbn:
|
45
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
46
|
+
xpath: div.info div.book_detailed ul.clearfix li[3] span[2]
|
47
|
+
title:
|
48
|
+
xpath: div.h1_title h1
|
49
|
+
attr: content
|
50
|
+
price:
|
51
|
+
transform_rules: ! "{|object| object.children.first.content.split(':').last.gsub(\"\t\",'')}"
|
52
|
+
xpath: p.price_m
|
53
|
+
dimensions:
|
54
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
55
|
+
xpath: div.info div.book_detailed ul.clearfix li[2] span[2]
|
56
|
+
editions:
|
57
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
58
|
+
xpath: div.info div.book_detailed ul.clearfix li[1] span[1]
|
59
|
+
publish_date:
|
60
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
61
|
+
xpath: div.info div.book_detailed p[3]
|
62
|
+
translator:
|
63
|
+
xpath: div.info div.book_detailed p[1]
|
64
|
+
author:
|
65
|
+
transform_rules: ! '{|object| object.search(''a'').map{|a| a.content}.join('','')}'
|
66
|
+
xpath: div.info div.book_detailed p[1]
|
67
|
+
packaging:
|
68
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
69
|
+
xpath: div.info div.book_detailed ul.clearfix li[3] span[3]
|
70
|
+
print_time:
|
71
|
+
xpath: div.info div.book_detailed ul.clearfix li[2] span[1]
|
72
|
+
attr: content
|
73
|
+
word_count:
|
74
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
75
|
+
xpath: div.info div.book_detailed ul.clearfix li[1] span[3]
|
76
|
+
publisher:
|
77
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
78
|
+
xpath: div.info div.book_detailed p[2]
|
79
|
+
print_times:
|
80
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
81
|
+
xpath: div.info div.book_detailed ul.clearfix li[3] span[1]
|
82
|
+
pages:
|
83
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
84
|
+
xpath: div.info div.book_detailed ul.clearfix li[1] span[2]
|
85
|
+
paper:
|
86
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
87
|
+
xpath: div.info div.book_detailed ul.clearfix li[2] span[3]
|
88
|
+
cover:
|
89
|
+
transform_rules: ! '{|value| value.to_s.send(:gsub,''_b.'',''_e.''); }'
|
90
|
+
xpath: div.show div.pic a img
|
91
|
+
attr: src
|
92
|
+
type: single
|
93
|
+
product:
|
94
|
+
type: single
|
95
|
+
xpath: div.dp_wrap
|
96
|
+
attributes:
|
97
|
+
price:
|
98
|
+
xpath: p.price_d span
|
99
|
+
attr: content
|
100
|
+
product_id:
|
101
|
+
xpath: a#bookshelf
|
102
|
+
attr: href
|
103
|
+
category_id:
|
104
|
+
transform_rules: ! '{|value| cate = Regexp.new(''\/([\d.]+)\.''); matches =
|
105
|
+
cate.match(value); matches[1] if matches}'
|
106
|
+
xpath: div.dp_break a[last()]
|
107
|
+
attr: href
|
108
|
+
grade:
|
109
|
+
transform_rules: ! '{|object| object.search(''img'').inject(0){|f,i| {''images/star_red.gif''
|
110
|
+
=> 1,''images/star_red2.gif'' => 0.5}[i.attributes[''src''].value].to_i+f}}'
|
111
|
+
xpath: p.fraction span
|
112
|
+
|
data/spec/template_spec.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Template do
  # Loading an existing, well-formed template file is expected to hand back
  # the parsed template as a Hash.
  it '有效的模板文件,应该加载成功' do
    loaded = Template.load_template('product.template')
    loaded.should be_a_instance_of(Hash)
  end

  # A template that was loaded can be written back out under a new name;
  # the dump call is expected to return a truthy value.
  it '模板文件应该能被生成' do
    source_template = Template.load_template('product.template')
    dump_result = Template.dump_template(source_template, 'dangdang.template')
    dump_result.should be_true
  end
end
|
data/spec/web_parser_spec.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe WebParser do
  # Percent-encodes the non-ASCII product URL used by the network examples.
  # URI.escape is obsolete and was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape
  # is the method URI.escape delegated to, so the encoded result is unchanged.
  def escaped_product_url
    URI::DEFAULT_PARSER.escape('http://www.amazon.cn/Ruby-语言入门-园田裕贵/dp/B003L21YJA')
  end

  # Fetching a live URL with a template should yield the extracted data as a Hash.
  it '指定有效的url和模板应该返回提取信息' do
    url = escaped_product_url
    WebParser.extract_from_url(url,'amazon.template').should be_a_instance_of(Hash)
  end

  # Extraction from a local XHTML fixture.
  it '可以从xhtml文件中提取信息' do
    WebParser.extract_from_file('amazon.html','amazon.template').should be_a_instance_of(Hash)
  end

  # Extraction from a local fixture saved in the gb2312 encoding.
  it 'gb2312格式的文件能正常提取信息' do
    WebParser.extract_from_file('dangdang.html','product.template').should be_a_instance_of(Hash)
  end

  # Extraction when custom request headers are supplied.
  it '指定有效的头部参数,应该返回提取信息' do
    url = escaped_product_url
    # NOTE(review): this hash is built but never passed to extract_from_url, so the
    # example currently exercises the same path as the first one. Presumably it
    # should be handed over as a headers/options argument -- confirm against the
    # extract_from_url signature before changing the call.
    # NOTE(review): a literal session cookie is committed here; consider replacing
    # it with a placeholder or loading it from the environment.
    options = {'Cookie' => "session-id=478-2475059-0859041; ubid-acbcn=480-2057666-9756625; session-token=9Ny9/S+ZSraEq4tmdtT8hvyVmHxCc9+st0wiyIUinLhDNBfG6vTanMqh7TPbdxQrgFmaluE8DlrbP0ahMnbuzmr3PoakqF5HBd9k4sYYYqLBnG2xA1GN6sG1QzP5R3kxIxgV88kqoB1tDXx7cukohzkpTEVrm+jcrYkd27HLbe2C3UrMJ/Y7bJVucqjjB0oHiEMHhkKGMTSd2+u5iS24PgpAfwD/1VSukugVn6h/gQM=;"}
    WebParser.extract_from_url(url,'amazon.template').should be_a_instance_of(Hash)
  end
end
|
data/web_parser.gemspec
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path('../lib', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for web-parser: identity, dependencies and packaged files.
Gem::Specification.new do |s|
  s.name        = 'web-parser'
  s.version     = '0.2.1'
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['Aaron']
  s.email       = ['Aaron@nonobo.com']
  s.homepage    = ''
  s.summary     = %q{a tool for extract web information }
  s.description = %q{a tool for extract web information.}

  # The former `s.rubyforge_project = ''` assignment is intentionally dropped:
  # the setter is deprecated in RubyGems and an empty value carried no information.

  # Runtime dependencies.
  s.add_dependency('nokogiri', '~> 1')
  # NOTE(review): bundler is kept as a runtime dependency because lib/ is not
  # visible here; if nothing under lib/ requires it, make it a development dependency.
  s.add_dependency('bundler','>=1.0.5')

  # rspec is only needed to run the files under spec/, so installing the gem
  # should not force it onto consumers.
  s.add_development_dependency('rspec','~> 2')

  # Explicit list of packaged files.
  s.files = [
    'init.rb',
    'README',
    'TEMPLATE_SPEC',
    'TODO',
    'web_parser.gemspec',
    'lib/web_parser.rb',
    'lib/web_parser/template.rb',
    'lib/web_parser/web_agent.rb',
    'lib/web_parser/web_parser.rb',
    'spec/amazon.html',
    'spec/amazon.template',
    'spec/dangdang.html',
    'spec/product.template',
    'spec/spec_helper.rb',
    'spec/template_spec.rb',
    'spec/web_parser_spec.rb'
  ]
  s.require_paths = ['lib']
end
|
metadata
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: web-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aaron
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-04-08 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '2'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: bundler
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.0.5
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.0.5
|
62
|
+
description: a tool for extract web information.
|
63
|
+
email:
|
64
|
+
- Aaron@nonobo.com
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- init.rb
|
70
|
+
- README
|
71
|
+
- TEMPLATE_SPEC
|
72
|
+
- TODO
|
73
|
+
- web_parser.gemspec
|
74
|
+
- lib/web_parser.rb
|
75
|
+
- lib/web_parser/template.rb
|
76
|
+
- lib/web_parser/web_agent.rb
|
77
|
+
- lib/web_parser/web_parser.rb
|
78
|
+
- spec/amazon.html
|
79
|
+
- spec/amazon.template
|
80
|
+
- spec/dangdang.html
|
81
|
+
- spec/product.template
|
82
|
+
- spec/spec_helper.rb
|
83
|
+
- spec/template_spec.rb
|
84
|
+
- spec/web_parser_spec.rb
|
85
|
+
homepage: ''
|
86
|
+
licenses: []
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
requirements: []
|
104
|
+
rubyforge_project: ''
|
105
|
+
rubygems_version: 1.8.25
|
106
|
+
signing_key:
|
107
|
+
specification_version: 3
|
108
|
+
summary: a tool for extract web information
|
109
|
+
test_files: []
|