web-parser 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +23 -0
- data/TEMPLATE_SPEC +82 -0
- data/TODO +6 -0
- data/init.rb +4 -0
- data/lib/web_parser.rb +6 -0
- data/lib/web_parser/template.rb +37 -0
- data/lib/web_parser/web_agent.rb +20 -0
- data/lib/web_parser/web_parser.rb +170 -0
- data/spec/amazon.html +6177 -0
- data/spec/amazon.template +31 -0
- data/spec/dangdang.html +506 -0
- data/spec/product.template +112 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/template_spec.rb +13 -0
- data/spec/web_parser_spec.rb +23 -0
- data/web_parser.gemspec +39 -0
- metadata +109 -0
@@ -0,0 +1,112 @@
|
|
1
|
+
---
|
2
|
+
vistor_purchased:
|
3
|
+
xpath: div[name=__BAV_bk] ul li
|
4
|
+
attributes:
|
5
|
+
purchased_product_id:
|
6
|
+
transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
|
7
|
+
= cate.match(value);matches[1] if matches}'
|
8
|
+
xpath: a
|
9
|
+
attr: href
|
10
|
+
title:
|
11
|
+
xpath: p a
|
12
|
+
attr: title
|
13
|
+
percent:
|
14
|
+
xpath: p span
|
15
|
+
attr: content
|
16
|
+
type: list
|
17
|
+
vistor_readed:
|
18
|
+
xpath: div[name=__alsoview_pub] ul li.detailed
|
19
|
+
attributes:
|
20
|
+
title:
|
21
|
+
xpath: a
|
22
|
+
attr: title
|
23
|
+
readed_product_id:
|
24
|
+
transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
|
25
|
+
= cate.match(value);matches[1] if matches}'
|
26
|
+
xpath: a
|
27
|
+
attr: href
|
28
|
+
type: list
|
29
|
+
customer_purchased:
|
30
|
+
xpath: div[name=__alsobuy_pub] ul li.detailed
|
31
|
+
attributes:
|
32
|
+
purchased_product_id:
|
33
|
+
transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
|
34
|
+
= cate.match(value);matches[1] if matches}'
|
35
|
+
xpath: a
|
36
|
+
attr: href
|
37
|
+
title:
|
38
|
+
xpath: a
|
39
|
+
attr: title
|
40
|
+
type: list
|
41
|
+
book:
|
42
|
+
xpath: div.dp_main
|
43
|
+
attributes:
|
44
|
+
isbn:
|
45
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
46
|
+
xpath: div.info div.book_detailed ul.clearfix li[3] span[2]
|
47
|
+
title:
|
48
|
+
xpath: div.h1_title h1
|
49
|
+
attr: content
|
50
|
+
price:
|
51
|
+
transform_rules: ! "{|object| object.children.first.content.split(':').last.gsub(\"\t\",'')}"
|
52
|
+
xpath: p.price_m
|
53
|
+
dimensions:
|
54
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
55
|
+
xpath: div.info div.book_detailed ul.clearfix li[2] span[2]
|
56
|
+
editions:
|
57
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
58
|
+
xpath: div.info div.book_detailed ul.clearfix li[1] span[1]
|
59
|
+
publish_date:
|
60
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
61
|
+
xpath: div.info div.book_detailed p[3]
|
62
|
+
translator:
|
63
|
+
xpath: div.info div.book_detailed p[1]
|
64
|
+
author:
|
65
|
+
transform_rules: ! '{|object| object.search(''a'').map{|a| a.content}.join('','')}'
|
66
|
+
xpath: div.info div.book_detailed p[1]
|
67
|
+
packaging:
|
68
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
69
|
+
xpath: div.info div.book_detailed ul.clearfix li[3] span[3]
|
70
|
+
print_time:
|
71
|
+
xpath: div.info div.book_detailed ul.clearfix li[2] span[1]
|
72
|
+
attr: content
|
73
|
+
word_count:
|
74
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
75
|
+
xpath: div.info div.book_detailed ul.clearfix li[1] span[3]
|
76
|
+
publisher:
|
77
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
78
|
+
xpath: div.info div.book_detailed p[2]
|
79
|
+
print_times:
|
80
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
81
|
+
xpath: div.info div.book_detailed ul.clearfix li[3] span[1]
|
82
|
+
pages:
|
83
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
84
|
+
xpath: div.info div.book_detailed ul.clearfix li[1] span[2]
|
85
|
+
paper:
|
86
|
+
transform_rules: ! '{|object| object.content.split('':'').last}'
|
87
|
+
xpath: div.info div.book_detailed ul.clearfix li[2] span[3]
|
88
|
+
cover:
|
89
|
+
transform_rules: ! '{|value| value.to_s.send(:gsub,''_b.'',''_e.''); }'
|
90
|
+
xpath: div.show div.pic a img
|
91
|
+
attr: src
|
92
|
+
type: single
|
93
|
+
product:
|
94
|
+
type: single
|
95
|
+
xpath: div.dp_wrap
|
96
|
+
attributes:
|
97
|
+
price:
|
98
|
+
xpath: p.price_d span
|
99
|
+
attr: content
|
100
|
+
product_id:
|
101
|
+
xpath: a#bookshelf
|
102
|
+
attr: href
|
103
|
+
category_id:
|
104
|
+
transform_rules: ! '{|value| cate = Regexp.new(''\/([\d.]+)\.''); matches =
|
105
|
+
cate.match(value); matches[1] if matches}'
|
106
|
+
xpath: div.dp_break a[last()]
|
107
|
+
attr: href
|
108
|
+
grade:
|
109
|
+
transform_rules: ! '{|object| object.search(''img'').inject(0){|f,i| {''images/star_red.gif''
|
110
|
+
=> 1,''images/star_red2.gif'' => 0.5}[i.attributes[''src''].value].to_i+f}}'
|
111
|
+
xpath: p.fraction span
|
112
|
+
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Template do
|
5
|
+
it '有效的模板文件,应该加载成功' do
|
6
|
+
Template.load_template('product.template').should be_a_instance_of(Hash)
|
7
|
+
end
|
8
|
+
|
9
|
+
it '模板文件应该能被生成' do
|
10
|
+
template = Template.load_template('product.template')
|
11
|
+
Template.dump_template(template,'dangdang.template').should be_true
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe WebParser do
|
5
|
+
it '指定有效的url和模板应该返回提取信息' do
|
6
|
+
url = URI.escape('http://www.amazon.cn/Ruby-语言入门-园田裕贵/dp/B003L21YJA')
|
7
|
+
WebParser.extract_from_url(url,'amazon.template').should be_a_instance_of(Hash)
|
8
|
+
end
|
9
|
+
|
10
|
+
it '可以从xhtml文件中提取信息' do
|
11
|
+
WebParser.extract_from_file('amazon.html','amazon.template').should be_a_instance_of(Hash)
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'gb2312格式的文件能正常提取信息' do
|
15
|
+
WebParser.extract_from_file('dangdang.html','product.template').should be_a_instance_of(Hash)
|
16
|
+
end
|
17
|
+
|
18
|
+
it '指定有效的头部参数,应该返回提取信息' do
|
19
|
+
url = URI.escape('http://www.amazon.cn/Ruby-语言入门-园田裕贵/dp/B003L21YJA')
|
20
|
+
options = {'Cookie' => "session-id=478-2475059-0859041; ubid-acbcn=480-2057666-9756625; session-token=9Ny9/S+ZSraEq4tmdtT8hvyVmHxCc9+st0wiyIUinLhDNBfG6vTanMqh7TPbdxQrgFmaluE8DlrbP0ahMnbuzmr3PoakqF5HBd9k4sYYYqLBnG2xA1GN6sG1QzP5R3kxIxgV88kqoB1tDXx7cukohzkpTEVrm+jcrYkd27HLbe2C3UrMJ/Y7bJVucqjjB0oHiEMHhkKGMTSd2+u5iS24PgpAfwD/1VSukugVn6h/gQM=;"}
|
21
|
+
WebParser.extract_from_url(url,'amazon.template').should be_a_instance_of(Hash)
|
22
|
+
end
|
23
|
+
end
|
data/web_parser.gemspec
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path('../lib', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.name = 'web-parser'
|
6
|
+
s.version = '0.2.1'
|
7
|
+
s.platform = Gem::Platform::RUBY
|
8
|
+
s.authors = ['Aaron']
|
9
|
+
s.email = ['Aaron@nonobo.com']
|
10
|
+
s.homepage = ''
|
11
|
+
s.summary = %q{a tool for extract web information }
|
12
|
+
s.description = %q{a tool for extract web information.}
|
13
|
+
|
14
|
+
s.rubyforge_project = ''
|
15
|
+
|
16
|
+
s.add_dependency('nokogiri', '~> 1')
|
17
|
+
s.add_dependency('rspec','~> 2')
|
18
|
+
s.add_dependency('bundler','>=1.0.5')
|
19
|
+
|
20
|
+
s.files = [
|
21
|
+
'init.rb',
|
22
|
+
'README',
|
23
|
+
'TEMPLATE_SPEC',
|
24
|
+
'TODO',
|
25
|
+
'web_parser.gemspec',
|
26
|
+
'lib/web_parser.rb',
|
27
|
+
'lib/web_parser/template.rb',
|
28
|
+
'lib/web_parser/web_agent.rb',
|
29
|
+
'lib/web_parser/web_parser.rb',
|
30
|
+
'spec/amazon.html',
|
31
|
+
'spec/amazon.template',
|
32
|
+
'spec/dangdang.html',
|
33
|
+
'spec/product.template',
|
34
|
+
'spec/spec_helper.rb',
|
35
|
+
'spec/template_spec.rb',
|
36
|
+
'spec/web_parser_spec.rb'
|
37
|
+
]
|
38
|
+
s.require_paths = ['lib']
|
39
|
+
end
|
metadata
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: web-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aaron
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-04-08 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '2'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: bundler
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.0.5
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.0.5
|
62
|
+
description: a tool for extract web information.
|
63
|
+
email:
|
64
|
+
- Aaron@nonobo.com
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- init.rb
|
70
|
+
- README
|
71
|
+
- TEMPLATE_SPEC
|
72
|
+
- TODO
|
73
|
+
- web_parser.gemspec
|
74
|
+
- lib/web_parser.rb
|
75
|
+
- lib/web_parser/template.rb
|
76
|
+
- lib/web_parser/web_agent.rb
|
77
|
+
- lib/web_parser/web_parser.rb
|
78
|
+
- spec/amazon.html
|
79
|
+
- spec/amazon.template
|
80
|
+
- spec/dangdang.html
|
81
|
+
- spec/product.template
|
82
|
+
- spec/spec_helper.rb
|
83
|
+
- spec/template_spec.rb
|
84
|
+
- spec/web_parser_spec.rb
|
85
|
+
homepage: ''
|
86
|
+
licenses: []
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
requirements: []
|
104
|
+
rubyforge_project: ''
|
105
|
+
rubygems_version: 1.8.25
|
106
|
+
signing_key:
|
107
|
+
specification_version: 3
|
108
|
+
summary: a tool for extract web information
|
109
|
+
test_files: []
|