RubyGems - web-parser - Versions diffs - 0.2.1 - Mend

web-parser 0.2.1

Files changed (17) hide show

data/README +23 -0
data/TEMPLATE_SPEC +82 -0
data/TODO +6 -0
data/init.rb +4 -0
data/lib/web_parser.rb +6 -0
data/lib/web_parser/template.rb +37 -0
data/lib/web_parser/web_agent.rb +20 -0
data/lib/web_parser/web_parser.rb +170 -0
data/spec/amazon.html +6177 -0
data/spec/amazon.template +31 -0
data/spec/dangdang.html +506 -0
data/spec/product.template +112 -0
data/spec/spec_helper.rb +4 -0
data/spec/template_spec.rb +13 -0
data/spec/web_parser_spec.rb +23 -0
data/web_parser.gemspec +39 -0
metadata +109 -0

@@ -0,0 +1,112 @@
+---
+vistor_purchased:  # list-type section; "vistor" is presumably a misspelling of "visitor", kept because the key is runtime data
+  xpath: div[name=__BAV_bk] ul li  # presumably one result entry per matched <li> - confirm in the parser
+  attributes:
+    purchased_product_id:  # taken from the href of the entry's <a>, then reduced by the transform below
+      transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)'');  matches
+        = cate.match(value);matches[1] if matches}'
+      xpath: a
+      attr: href  # the transform keeps the digits following "product_id=", or yields nil when the href does not match
+    title:
+      xpath: p a
+      attr: title
+    percent:
+      xpath: p span
+      attr: content  # NOTE(review): "content" presumably means the node text rather than an HTML attribute - confirm
+  type: list
+vistor_readed:  # list-type section; key spelling ("vistor", "readed") kept as-is because it is runtime data
+  xpath: div[name=__alsoview_pub] ul li.detailed  # presumably one result entry per matched li.detailed - confirm in the parser
+  attributes:
+    title:
+      xpath: a
+      attr: title
+    readed_product_id:  # taken from the href of the entry's <a>, then reduced by the transform below
+      transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
+        = cate.match(value);matches[1] if matches}'
+      xpath: a
+      attr: href  # the transform keeps the digits following "product_id=", or yields nil when the href does not match
+  type: list
+customer_purchased:  # list-type section built from the "__alsobuy_pub" block of the page
+  xpath: div[name=__alsobuy_pub] ul li.detailed  # presumably one result entry per matched li.detailed - confirm in the parser
+  attributes:
+    purchased_product_id:  # taken from the href of the entry's <a>, then reduced by the transform below
+      transform_rules: ! '{|value| cate = Regexp.new(''product_id=(\d*)''); matches
+        = cate.match(value);matches[1] if matches}'
+      xpath: a
+      attr: href  # the transform keeps the digits following "product_id=", or yields nil when the href does not match
+    title:
+      xpath: a
+      attr: title
+  type: list
+book:  # single-type section; most fields below keep the text after the last full-width colon of the selected node
+  xpath: div.dp_main
+  attributes:
+    isbn:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed ul.clearfix li[3] span[2]  # positional selectors: third <li>, second <span>
+    title:
+      xpath: div.h1_title h1
+      attr: content  # NOTE(review): "content" presumably means the node text rather than an HTML attribute - confirm
+    price:  # the transform takes the first child's text after the last full-width colon and removes tab characters
+      transform_rules: ! "{|object| object.children.first.content.split('：').last.gsub(\"\t\",'')}"
+      xpath: p.price_m
+    dimensions:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed ul.clearfix li[2] span[2]
+    editions:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed ul.clearfix li[1] span[1]
+    publish_date:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed p[3]
+    translator:  # no transform_rules and no attr are given for this field
+      xpath: div.info div.book_detailed p[1]  # NOTE(review): same selector as "author" below - confirm this is intended
+    author:  # the transform joins the text of every <a> inside the node with commas
+      transform_rules: ! '{|object| object.search(''a'').map{|a| a.content}.join('','')}'
+      xpath: div.info div.book_detailed p[1]
+    packaging:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed ul.clearfix li[3] span[3]
+    print_time:
+      xpath: div.info div.book_detailed ul.clearfix li[2] span[1]
+      attr: content  # unlike its neighbours, no colon-splitting transform is applied here
+    word_count:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed ul.clearfix li[1] span[3]
+    publisher:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed p[2]
+    print_times:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed ul.clearfix li[3] span[1]
+    pages:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed ul.clearfix li[1] span[2]
+    paper:
+      transform_rules: ! '{|object| object.content.split(''：'').last}'
+      xpath: div.info div.book_detailed ul.clearfix li[2] span[3]
+    cover:  # the transform rewrites "_b." to "_e." in the image src (presumably selecting another image size - confirm)
+      transform_rules: ! '{|value| value.to_s.send(:gsub,''_b.'',''_e.''); }'
+      xpath: div.show div.pic a img
+      attr: src
+  type: single
+# Single-type section describing the product itself (price, id, category, rating).
+product:
+  type: single
+  xpath: div.dp_wrap
+  attributes:
+    price:
+      xpath: p.price_d span
+      attr: content
+    # No transform_rules here, so the href of a#bookshelf is presumably used unmodified - confirm.
+    product_id:
+      xpath: a#bookshelf
+      attr: href
+    # Keeps the run of digits/dots that follows a "/" and precedes a "." in the
+    # href of the last link under div.dp_break; yields nil when nothing matches.
+    category_id:
+      transform_rules: ! '{|value| cate = Regexp.new(''\/([\d.]+)\.''); matches =
+        cate.match(value); matches[1] if matches}'
+      xpath: div.dp_break a[last()]
+      attr: href
+    # Sums the star images: a full star counts 1, a half star 0.5, any other
+    # image 0 (nil.to_f). The lookup result is converted with to_f rather than
+    # to_i because to_i truncates 0.5 to 0 and would silently drop half stars.
+    grade:
+      transform_rules: ! '{|object| object.search(''img'').inject(0){|f,i| {''images/star_red.gif''
+        => 1,''images/star_red2.gif'' => 0.5}[i.attributes[''src''].value].to_f+f}}'
+      xpath: p.fraction span

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,4 @@
+require 'rubygems'
+require 'bundler/setup'
+require File.expand_path('../../lib/web_parser', __FILE__)

data/spec/template_spec.rb ADDED

@@ -0,0 +1,13 @@
+# -*- encoding: utf-8 -*-
+require 'spec_helper'
+describe Template do
+  it '有效的模板文件，应该加载成功' do
+    Template.load_template('product.template').should be_a_instance_of(Hash)
+  end
+  it '模板文件应该能被生成' do
+    template = Template.load_template('product.template')
+    Template.dump_template(template,'dangdang.template').should be_true
+  end
+end

data/spec/web_parser_spec.rb ADDED

@@ -0,0 +1,23 @@
+# -*- encoding: utf-8 -*-
+require 'spec_helper'
+describe WebParser do
+  it '指定有效的url和模板应该返回提取信息' do
+    url = URI.escape('http://www.amazon.cn/Ruby-语言入门-园田裕贵/dp/B003L21YJA')
+    WebParser.extract_from_url(url,'amazon.template').should be_a_instance_of(Hash)
+  end
+  it '可以从xhtml文件中提取信息' do
+    WebParser.extract_from_file('amazon.html','amazon.template').should be_a_instance_of(Hash)
+  end
+  it 'gb2312格式的文件能正常提取信息' do
+    WebParser.extract_from_file('dangdang.html','product.template').should be_a_instance_of(Hash)
+  end
+  it '指定有效的头部参数，应该返回提取信息' do
+    url = URI.escape('http://www.amazon.cn/Ruby-语言入门-园田裕贵/dp/B003L21YJA')
+    options = {'Cookie' => "session-id=478-2475059-0859041; ubid-acbcn=480-2057666-9756625; session-token=9Ny9/S+ZSraEq4tmdtT8hvyVmHxCc9+st0wiyIUinLhDNBfG6vTanMqh7TPbdxQrgFmaluE8DlrbP0ahMnbuzmr3PoakqF5HBd9k4sYYYqLBnG2xA1GN6sG1QzP5R3kxIxgV88kqoB1tDXx7cukohzkpTEVrm+jcrYkd27HLbe2C3UrMJ/Y7bJVucqjjB0oHiEMHhkKGMTSd2+u5iS24PgpAfwD/1VSukugVn6h/gQM=;"}
+    WebParser.extract_from_url(url,'amazon.template').should be_a_instance_of(Hash)
+  end
+end

data/web_parser.gemspec ADDED

@@ -0,0 +1,39 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path('../lib', __FILE__)
+Gem::Specification.new do |s|
+  s.name        = 'web-parser'
+  s.version     = '0.2.1'
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ['Aaron']
+  s.email       = ['Aaron@nonobo.com']
+  s.homepage    = ''
+  s.summary     = %q{a tool for extract web information }
+  s.description = %q{a tool for extract web information.}
+  s.rubyforge_project = ''
+  s.add_dependency('nokogiri', '~> 1')
+  s.add_dependency('rspec','~> 2')
+  s.add_dependency('bundler','>=1.0.5')
+  s.files         = [
+    'init.rb',
+    'README',
+    'TEMPLATE_SPEC',
+    'TODO',
+    'web_parser.gemspec',
+    'lib/web_parser.rb',
+    'lib/web_parser/template.rb',
+    'lib/web_parser/web_agent.rb',
+    'lib/web_parser/web_parser.rb',
+    'spec/amazon.html',
+    'spec/amazon.template',
+    'spec/dangdang.html',
+    'spec/product.template',
+    'spec/spec_helper.rb',
+    'spec/template_spec.rb',
+    'spec/web_parser_spec.rb'
+  ]
+  s.require_paths = ['lib']
+end

metadata ADDED

@@ -0,0 +1,109 @@
+--- !ruby/object:Gem::Specification
+name: web-parser
+version: !ruby/object:Gem::Version
+  version: 0.2.1
+  prerelease:
+platform: ruby
+authors:
+- Aaron
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-04-08 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.0.5
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.0.5
+description: a tool for extract web information.
+email:
+- Aaron@nonobo.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- init.rb
+- README
+- TEMPLATE_SPEC
+- TODO
+- web_parser.gemspec
+- lib/web_parser.rb
+- lib/web_parser/template.rb
+- lib/web_parser/web_agent.rb
+- lib/web_parser/web_parser.rb
+- spec/amazon.html
+- spec/amazon.template
+- spec/dangdang.html
+- spec/product.template
+- spec/spec_helper.rb
+- spec/template_spec.rb
+- spec/web_parser_spec.rb
+homepage: ''
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: ''
+rubygems_version: 1.8.25
+signing_key:
+specification_version: 3
+summary: a tool for extract web information
+test_files: []