web-parser 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,112 @@
1
+ ---
2
+ vistor_purchased:  # Extraction section. NOTE(review): "vistor" looks like a misspelling of "visitor"; the key is data, so it is kept as-is.
3
+ xpath: div[name=__BAV_bk] ul li
4
+ attributes:
5
+ purchased_product_id:  # The transform below returns the digits captured after "product_id=" (possibly an empty string), or nil when the pattern is absent.
6
+ transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
7
+ = cate.match(value);matches[1] if matches}'
8
+ xpath: a
9
+ attr: href  # NOTE(review): presumably the href string is what the transform receives as `value` - confirm against TEMPLATE_SPEC.
10
+ title:
11
+ xpath: p a
12
+ attr: title
13
+ percent:
14
+ xpath: p span
15
+ attr: content
16
+ type: list  # NOTE(review): presumably yields one entry per matched node - confirm against TEMPLATE_SPEC.
17
+ vistor_readed:  # Extraction section. NOTE(review): "vistor"/"readed" look like misspellings of "visitor"/"read"; keys are data, so they are kept as-is.
18
+ xpath: div[name=__alsoview_pub] ul li.detailed
19
+ attributes:
20
+ title:
21
+ xpath: a
22
+ attr: title
23
+ readed_product_id:  # The transform below returns the digits captured after "product_id=" (possibly an empty string), or nil when the pattern is absent.
24
+ transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
25
+ = cate.match(value);matches[1] if matches}'
26
+ xpath: a
27
+ attr: href
28
+ type: list  # NOTE(review): presumably yields one entry per matched node - confirm against TEMPLATE_SPEC.
29
+ customer_purchased:  # Extraction section with the same two fields (product id from href, title) as the other list sections in this template.
30
+ xpath: div[name=__alsobuy_pub] ul li.detailed
31
+ attributes:
32
+ purchased_product_id:  # The transform below returns the digits captured after "product_id=" (possibly an empty string), or nil when the pattern is absent.
33
+ transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
34
+ = cate.match(value);matches[1] if matches}'
35
+ xpath: a
36
+ attr: href
37
+ title:
38
+ xpath: a
39
+ attr: title
40
+ type: list  # NOTE(review): presumably yields one entry per matched node - confirm against TEMPLATE_SPEC.
41
+ book:  # Extraction section for a single book record (see "type: single" at the end of the section).
42
+ xpath: div.dp_main
43
+ attributes:
44
+ isbn:  # Transform keeps the node content after the last ':' (the whole content when there is no ':'). The same transform is reused by dimensions, editions, publish_date, packaging, word_count, publisher, print_times, pages and paper.
45
+ transform_rules: ! '{|object| object.content.split('':'').last}'
46
+ xpath: div.info div.book_detailed ul.clearfix li[3] span[2]
47
+ title:
48
+ xpath: div.h1_title h1
49
+ attr: content
50
+ price:  # Transform takes the first child's content, keeps the part after the last ':' and removes tab characters.
51
+ transform_rules: ! "{|object| object.children.first.content.split(':').last.gsub(\"\t\",'')}"
52
+ xpath: p.price_m
53
+ dimensions:
54
+ transform_rules: ! '{|object| object.content.split('':'').last}'
55
+ xpath: div.info div.book_detailed ul.clearfix li[2] span[2]
56
+ editions:
57
+ transform_rules: ! '{|object| object.content.split('':'').last}'
58
+ xpath: div.info div.book_detailed ul.clearfix li[1] span[1]
59
+ publish_date:
60
+ transform_rules: ! '{|object| object.content.split('':'').last}'
61
+ xpath: div.info div.book_detailed p[3]
62
+ translator:  # NOTE(review): uses the same selector as author (p[1]) and declares neither attr nor transform_rules - confirm what value this yields.
63
+ xpath: div.info div.book_detailed p[1]
64
+ author:  # Transform joins the content of every 'a' element found under the node with ','.
65
+ transform_rules: ! '{|object| object.search(''a'').map{|a| a.content}.join('','')}'
66
+ xpath: div.info div.book_detailed p[1]
67
+ packaging:
68
+ transform_rules: ! '{|object| object.content.split('':'').last}'
69
+ xpath: div.info div.book_detailed ul.clearfix li[3] span[3]
70
+ print_time:  # Distinct from print_times below: different selector, and no transform is applied here.
71
+ xpath: div.info div.book_detailed ul.clearfix li[2] span[1]
72
+ attr: content
73
+ word_count:
74
+ transform_rules: ! '{|object| object.content.split('':'').last}'
75
+ xpath: div.info div.book_detailed ul.clearfix li[1] span[3]
76
+ publisher:
77
+ transform_rules: ! '{|object| object.content.split('':'').last}'
78
+ xpath: div.info div.book_detailed p[2]
79
+ print_times:
80
+ transform_rules: ! '{|object| object.content.split('':'').last}'
81
+ xpath: div.info div.book_detailed ul.clearfix li[3] span[1]
82
+ pages:
83
+ transform_rules: ! '{|object| object.content.split('':'').last}'
84
+ xpath: div.info div.book_detailed ul.clearfix li[1] span[2]
85
+ paper:
86
+ transform_rules: ! '{|object| object.content.split('':'').last}'
87
+ xpath: div.info div.book_detailed ul.clearfix li[2] span[3]
88
+ cover:  # Transform converts the value to a string and replaces every "_b." with "_e."; NOTE(review): presumably this selects a different image variant - confirm.
89
+ transform_rules: ! '{|value| value.to_s.send(:gsub,''_b.'',''_e.''); }'
90
+ xpath: div.show div.pic a img
91
+ attr: src
92
+ type: single
93
+ product:  # Extraction section for a single product record (type: single).
94
+ type: single
95
+ xpath: div.dp_wrap
96
+ attributes:
97
+ price:
98
+ xpath: p.price_d span
99
+ attr: content
100
+ product_id:
101
+ xpath: a#bookshelf
102
+ attr: href
103
+ category_id:  # Transform returns the run of digits/dots that follows a '/' and precedes a '.', or nil when the pattern is absent.
104
+ transform_rules: ! '{|value| cate = Regexp.new(''\/([\d.]+)\.''); matches =
105
+ cate.match(value); matches[1] if matches}'
106
+ xpath: div.dp_break a[last()]
107
+ attr: href
108
+ grade:  # Transform sums the star images under the node: star_red.gif counts 1, star_red2.gif counts 0.5, any other src counts 0. The weight is no longer passed through to_i, which truncated 0.5 to 0.
109
+ transform_rules: ! '{|object| object.search(''img'').inject(0){|f,i| {''images/star_red.gif''
110
+ => 1,''images/star_red2.gif'' => 0.5}.fetch(i.attributes[''src''].value, 0)+f}}'
111
+ xpath: p.fraction span
112
+
@@ -0,0 +1,4 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require File.expand_path('../../lib/web_parser', __FILE__)
@@ -0,0 +1,13 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
+ describe Template do  # Examples for Template.load_template and Template.dump_template.
5
+ it '有效的模板文件,应该加载成功' do  # English: "a valid template file should load successfully"
6
+ Template.load_template('product.template').should be_a_instance_of(Hash)
7
+ end
8
+
9
+ it '模板文件应该能被生成' do  # English: "a template file should be able to be generated"
10
+ template = Template.load_template('product.template')
11
+ Template.dump_template(template,'dangdang.template').should be_true  # NOTE(review): presumably writes dangdang.template; nothing in this example removes it afterwards - confirm.
12
+ end
13
+ end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
+ describe WebParser do
5
+ it '指定有效的url和模板应该返回提取信息' do
6
+ url = URI.escape('http://www.amazon.cn/Ruby-语言入门-园田裕贵/dp/B003L21YJA')
7
+ WebParser.extract_from_url(url,'amazon.template').should be_a_instance_of(Hash)
8
+ end
9
+
10
+ it '可以从xhtml文件中提取信息' do
11
+ WebParser.extract_from_file('amazon.html','amazon.template').should be_a_instance_of(Hash)
12
+ end
13
+
14
+ it 'gb2312格式的文件能正常提取信息' do
15
+ WebParser.extract_from_file('dangdang.html','product.template').should be_a_instance_of(Hash)
16
+ end
17
+
18
+ it '指定有效的头部参数,应该返回提取信息' do
19
+ url = URI.escape('http://www.amazon.cn/Ruby-语言入门-园田裕贵/dp/B003L21YJA')
20
+ options = {'Cookie' => "session-id=478-2475059-0859041; ubid-acbcn=480-2057666-9756625; session-token=9Ny9/S+ZSraEq4tmdtT8hvyVmHxCc9+st0wiyIUinLhDNBfG6vTanMqh7TPbdxQrgFmaluE8DlrbP0ahMnbuzmr3PoakqF5HBd9k4sYYYqLBnG2xA1GN6sG1QzP5R3kxIxgV88kqoB1tDXx7cukohzkpTEVrm+jcrYkd27HLbe2C3UrMJ/Y7bJVucqjjB0oHiEMHhkKGMTSd2+u5iS24PgpAfwD/1VSukugVn6h/gQM=;"}
21
+ WebParser.extract_from_url(url,'amazon.template').should be_a_instance_of(Hash)
22
+ end
23
+ end
@@ -0,0 +1,39 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path('../lib', __FILE__)
3
+
4
+ Gem::Specification.new do |s|  # Gem specification for web-parser: identity, dependencies, and an explicit list of packaged files.
5
+ s.name = 'web-parser'
6
+ s.version = '0.2.1'
7
+ s.platform = Gem::Platform::RUBY
8
+ s.authors = ['Aaron']
9
+ s.email = ['Aaron@nonobo.com']
10
+ s.homepage = ''
11
+ s.summary = %q{a tool for extract web information }
12
+ s.description = %q{a tool for extract web information.}
13
+
14
+ s.rubyforge_project = ''
15
+
16
+ s.add_dependency('nokogiri', '~> 1')
17
+ s.add_dependency('rspec','~> 2')  # NOTE(review): declared as a runtime dependency; in the visible files rspec is only used by spec/ - presumably this should be add_development_dependency, confirm lib/ does not need it.
18
+ s.add_dependency('bundler','>=1.0.5')  # NOTE(review): declared as a runtime dependency; only spec/spec_helper.rb is seen requiring bundler/setup - confirm whether lib/ or init.rb needs it at runtime.
19
+
20
+ s.files = [  # Files are listed by hand, so a new file must be added here to be packaged.
21
+ 'init.rb',
22
+ 'README',
23
+ 'TEMPLATE_SPEC',
24
+ 'TODO',
25
+ 'web_parser.gemspec',
26
+ 'lib/web_parser.rb',
27
+ 'lib/web_parser/template.rb',
28
+ 'lib/web_parser/web_agent.rb',
29
+ 'lib/web_parser/web_parser.rb',
30
+ 'spec/amazon.html',
31
+ 'spec/amazon.template',
32
+ 'spec/dangdang.html',
33
+ 'spec/product.template',
34
+ 'spec/spec_helper.rb',
35
+ 'spec/template_spec.rb',
36
+ 'spec/web_parser_spec.rb'
37
+ ]
38
+ s.require_paths = ['lib']
39
+ end
metadata ADDED
@@ -0,0 +1,109 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Aaron
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '2'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '2'
46
+ - !ruby/object:Gem::Dependency
47
+ name: bundler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: 1.0.5
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.0.5
62
+ description: a tool for extract web information.
63
+ email:
64
+ - Aaron@nonobo.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - init.rb
70
+ - README
71
+ - TEMPLATE_SPEC
72
+ - TODO
73
+ - web_parser.gemspec
74
+ - lib/web_parser.rb
75
+ - lib/web_parser/template.rb
76
+ - lib/web_parser/web_agent.rb
77
+ - lib/web_parser/web_parser.rb
78
+ - spec/amazon.html
79
+ - spec/amazon.template
80
+ - spec/dangdang.html
81
+ - spec/product.template
82
+ - spec/spec_helper.rb
83
+ - spec/template_spec.rb
84
+ - spec/web_parser_spec.rb
85
+ homepage: ''
86
+ licenses: []
87
+ post_install_message:
88
+ rdoc_options: []
89
+ require_paths:
90
+ - lib
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ none: false
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ requirements: []
104
+ rubyforge_project: ''
105
+ rubygems_version: 1.8.25
106
+ signing_key:
107
+ specification_version: 3
108
+ summary: a tool for extract web information
109
+ test_files: []