web-parser 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,112 @@
1
+ ---
2
+ vistor_purchased:
3
+ xpath: div[name=__BAV_bk] ul li
4
+ attributes:
5
+ purchased_product_id:
6
+ transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
7
+ = cate.match(value);matches[1] if matches}'
8
+ xpath: a
9
+ attr: href
10
+ title:
11
+ xpath: p a
12
+ attr: title
13
+ percent:
14
+ xpath: p span
15
+ attr: content
16
+ type: list
17
+ vistor_readed:
18
+ xpath: div[name=__alsoview_pub] ul li.detailed
19
+ attributes:
20
+ title:
21
+ xpath: a
22
+ attr: title
23
+ readed_product_id:
24
+ transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
25
+ = cate.match(value);matches[1] if matches}'
26
+ xpath: a
27
+ attr: href
28
+ type: list
29
+ customer_purchased:
30
+ xpath: div[name=__alsobuy_pub] ul li.detailed
31
+ attributes:
32
+ purchased_product_id:
33
+ transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
34
+ = cate.match(value);matches[1] if matches}'
35
+ xpath: a
36
+ attr: href
37
+ title:
38
+ xpath: a
39
+ attr: title
40
+ type: list
41
+ book:
42
+ xpath: div.dp_main
43
+ attributes:
44
+ isbn:
45
+ transform_rules: ! '{|object| object.content.split('':'').last}'
46
+ xpath: div.info div.book_detailed ul.clearfix li[3] span[2]
47
+ title:
48
+ xpath: div.h1_title h1
49
+ attr: content
50
+ price:
51
+ transform_rules: ! "{|object| object.children.first.content.split(':').last.gsub(\"\t\",'')}"
52
+ xpath: p.price_m
53
+ dimensions:
54
+ transform_rules: ! '{|object| object.content.split('':'').last}'
55
+ xpath: div.info div.book_detailed ul.clearfix li[2] span[2]
56
+ editions:
57
+ transform_rules: ! '{|object| object.content.split('':'').last}'
58
+ xpath: div.info div.book_detailed ul.clearfix li[1] span[1]
59
+ publish_date:
60
+ transform_rules: ! '{|object| object.content.split('':'').last}'
61
+ xpath: div.info div.book_detailed p[3]
62
+ translator:
63
+ xpath: div.info div.book_detailed p[1]
64
+ author:
65
+ transform_rules: ! '{|object| object.search(''a'').map{|a| a.content}.join('','')}'
66
+ xpath: div.info div.book_detailed p[1]
67
+ packaging:
68
+ transform_rules: ! '{|object| object.content.split('':'').last}'
69
+ xpath: div.info div.book_detailed ul.clearfix li[3] span[3]
70
+ print_time:
71
+ xpath: div.info div.book_detailed ul.clearfix li[2] span[1]
72
+ attr: content
73
+ word_count:
74
+ transform_rules: ! '{|object| object.content.split('':'').last}'
75
+ xpath: div.info div.book_detailed ul.clearfix li[1] span[3]
76
+ publisher:
77
+ transform_rules: ! '{|object| object.content.split('':'').last}'
78
+ xpath: div.info div.book_detailed p[2]
79
+ print_times:
80
+ transform_rules: ! '{|object| object.content.split('':'').last}'
81
+ xpath: div.info div.book_detailed ul.clearfix li[3] span[1]
82
+ pages:
83
+ transform_rules: ! '{|object| object.content.split('':'').last}'
84
+ xpath: div.info div.book_detailed ul.clearfix li[1] span[2]
85
+ paper:
86
+ transform_rules: ! '{|object| object.content.split('':'').last}'
87
+ xpath: div.info div.book_detailed ul.clearfix li[2] span[3]
88
+ cover:
89
+ transform_rules: ! '{|value| value.to_s.send(:gsub,''_b.'',''_e.''); }'
90
+ xpath: div.show div.pic a img
91
+ attr: src
92
+ type: single
93
+ product:
94
+ type: single
95
+ xpath: div.dp_wrap
96
+ attributes:
97
+ price:
98
+ xpath: p.price_d span
99
+ attr: content
100
+ product_id:
101
+ xpath: a#bookshelf
102
+ attr: href
103
+ category_id:
104
+ transform_rules: ! '{|value| cate = Regexp.new(''\/([\d.]+)\.''); matches =
105
+ cate.match(value); matches[1] if matches}'
106
+ xpath: div.dp_break a[last()]
107
+ attr: href
108
+ grade:
109
+ transform_rules: ! '{|object| object.search(''img'').inject(0){|f,i| {''images/star_red.gif''
110
+ => 1,''images/star_red2.gif'' => 0.5}[i.attributes[''src''].value].to_i+f}}'
111
+ xpath: p.fraction span
112
+
@@ -0,0 +1,4 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require File.expand_path('../../lib/web_parser', __FILE__)
@@ -0,0 +1,13 @@
# -*- encoding: utf-8 -*-
require 'spec_helper'

# Examples exercising Template.load_template and Template.dump_template.
describe Template do
  # Example name in English: "a valid template file should load successfully".
  it '有效的模板文件,应该加载成功' do
    loaded = Template.load_template('product.template')
    loaded.should be_a_instance_of(Hash)
  end

  # Example name in English: "a template file should be able to be generated".
  it '模板文件应该能被生成' do
    dumped = Template.dump_template(Template.load_template('product.template'), 'dangdang.template')
    dumped.should be_true
  end
end
@@ -0,0 +1,23 @@
# -*- encoding: utf-8 -*-
require 'spec_helper'

# Examples for the WebParser entry points extract_from_url and extract_from_file.
# Every example only checks that the call returns a Hash.
# NOTE(review): the URL examples look like they request a live amazon.cn page,
# so they presumably need network access - confirm, and consider stubbing.
describe WebParser do
  # Escaped product-page URL shared by both URL-based examples.
  let(:url) { URI.escape('http://www.amazon.cn/Ruby-语言入门-园田裕贵/dp/B003L21YJA') }

  # Example name in English: "given a valid url and template, extracted
  # information should be returned".
  it '指定有效的url和模板应该返回提取信息' do
    WebParser.extract_from_url(url, 'amazon.template').should be_a_instance_of(Hash)
  end

  # Example name in English: "information can be extracted from an xhtml file".
  it '可以从xhtml文件中提取信息' do
    WebParser.extract_from_file('amazon.html', 'amazon.template').should be_a_instance_of(Hash)
  end

  # Example name in English: "information is extracted normally from a
  # gb2312-encoded file".
  it 'gb2312格式的文件能正常提取信息' do
    WebParser.extract_from_file('dangdang.html', 'product.template').should be_a_instance_of(Hash)
  end

  # Example name in English: "given valid header parameters, extracted
  # information should be returned".
  it '指定有效的头部参数,应该返回提取信息' do
    # SECURITY(review): this is a literal session cookie checked into the
    # repository; it should be treated as leaked and swapped for a placeholder.
    options = {'Cookie' => "session-id=478-2475059-0859041; ubid-acbcn=480-2057666-9756625; session-token=9Ny9/S+ZSraEq4tmdtT8hvyVmHxCc9+st0wiyIUinLhDNBfG6vTanMqh7TPbdxQrgFmaluE8DlrbP0ahMnbuzmr3PoakqF5HBd9k4sYYYqLBnG2xA1GN6sG1QzP5R3kxIxgV88kqoB1tDXx7cukohzkpTEVrm+jcrYkd27HLbe2C3UrMJ/Y7bJVucqjjB0oHiEMHhkKGMTSd2+u5iS24PgpAfwD/1VSukugVn6h/gQM=;"}
    # The headers must actually be handed to the call; without them this
    # example is a copy of the first one and says nothing about header handling.
    # NOTE(review): assumes extract_from_url takes the header hash as an
    # optional third argument - confirm against lib/web_parser/web_parser.rb.
    WebParser.extract_from_url(url, 'amazon.template', options).should be_a_instance_of(Hash)
  end
end
@@ -0,0 +1,39 @@
# -*- encoding: utf-8 -*-
# Gem specification for web-parser: name, version, author details,
# dependencies and the explicit list of packaged files.
$:.push File.expand_path('../lib', __FILE__)

Gem::Specification.new do |s|
  s.name        = 'web-parser'
  s.version     = '0.2.1'
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['Aaron']
  s.email       = ['Aaron@nonobo.com']
  s.homepage    = ''
  s.summary     = %q{a tool for extract web information }
  s.description = %q{a tool for extract web information.}

  s.rubyforge_project = ''

  s.add_dependency('nokogiri', '~> 1')
  # rspec is what the files under spec/ are written against, so it is declared
  # as a development dependency: installing the gem should not pull in a test
  # framework.
  # NOTE(review): assumes nothing under lib/ requires rspec - confirm.
  s.add_development_dependency('rspec', '~> 2')
  # NOTE(review): spec/spec_helper.rb requires 'bundler/setup'; kept as a
  # runtime dependency because lib/ is not visible here - if only the specs
  # need bundler, this can become a development dependency as well.
  s.add_dependency('bundler', '>=1.0.5')

  # Files shipped in the gem, listed explicitly.
  s.files = [
    'init.rb',
    'README',
    'TEMPLATE_SPEC',
    'TODO',
    'web_parser.gemspec',
    'lib/web_parser.rb',
    'lib/web_parser/template.rb',
    'lib/web_parser/web_agent.rb',
    'lib/web_parser/web_parser.rb',
    'spec/amazon.html',
    'spec/amazon.template',
    'spec/dangdang.html',
    'spec/product.template',
    'spec/spec_helper.rb',
    'spec/template_spec.rb',
    'spec/web_parser_spec.rb'
  ]
  s.require_paths = ['lib']
end
metadata ADDED
@@ -0,0 +1,109 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Aaron
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '2'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '2'
46
+ - !ruby/object:Gem::Dependency
47
+ name: bundler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: 1.0.5
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.0.5
62
+ description: a tool for extract web information.
63
+ email:
64
+ - Aaron@nonobo.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - init.rb
70
+ - README
71
+ - TEMPLATE_SPEC
72
+ - TODO
73
+ - web_parser.gemspec
74
+ - lib/web_parser.rb
75
+ - lib/web_parser/template.rb
76
+ - lib/web_parser/web_agent.rb
77
+ - lib/web_parser/web_parser.rb
78
+ - spec/amazon.html
79
+ - spec/amazon.template
80
+ - spec/dangdang.html
81
+ - spec/product.template
82
+ - spec/spec_helper.rb
83
+ - spec/template_spec.rb
84
+ - spec/web_parser_spec.rb
85
+ homepage: ''
86
+ licenses: []
87
+ post_install_message:
88
+ rdoc_options: []
89
+ require_paths:
90
+ - lib
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ none: false
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ requirements: []
104
+ rubyforge_project: ''
105
+ rubygems_version: 1.8.25
106
+ signing_key:
107
+ specification_version: 3
108
+ summary: a tool for extract web information
109
+ test_files: []